Repository URL to install this package:
|
Version:
4.5.4.dev1 ▾
|
import typing as t
from sarus_differential_privacy.query import (
ComposedQuery,
LaplaceQuery,
PrivateQuery,
)
from sarus_statistics.ops.std.local import std
from sarus_statistics.ops.utils import group_by_results_to_dataframe
import numpy as np
import pandas as pd
from sarus_data_spec.constants import PUBLIC, PU_COLUMN, WEIGHTS
from sarus_data_spec.sarus_statistics.ops.data_utils import (
compute_rescaled_data_from_dataspec,
compute_rescaled_groupby_data_from_dataspec,
)
import sarus_data_spec.typing as st
class StdOp:
    """Standard deviation operation on one column of a dataset.

    The instance carries three noise parameters (mean, square, count).
    They are forwarded to ``std`` when the value is computed and each
    one backs a ``LaplaceQuery`` in :meth:`private_query`.
    """

    def __init__(
        self,
        dataset: st.Dataset,
        noise_mean: float,
        noise_square: float,
        noise_count: float,
    ):
        """Store the dataset and the three noise parameters.

        Args:
            dataset: Dataset the operation is evaluated on.
            noise_mean: Noise parameter attached to the mean query.
            noise_square: Noise parameter attached to the square query.
            noise_count: Noise parameter attached to the count query.
        """
        self._dataset = dataset
        self.noise_mean, self.noise_square, self.noise_count = (
            noise_mean,
            noise_square,
            noise_count,
        )

    def dataset(self) -> st.Dataset:
        """Return the dataset given at construction time."""
        return self._dataset

    def value(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> float:
        """Compute the standard deviation of ``column_name``.

        The dataset's manager decides which backend runs: ``_sql`` when
        it reports the dataset as big data, ``_pandas`` otherwise.

        Args:
            column_name: Name of the column to aggregate.
            max_multiplicity: Forwarded unchanged to the backend.
            bounds: Pair of floats forwarded unchanged to the backend
                (presumably the value bounds of the column).
            random_generator: Optional NumPy generator forwarded to the
                backend.

        Returns:
            Whatever the selected backend returns.

        Raises:
            NotImplementedError: When the SQL backend is selected.
        """
        dataset = self.dataset()
        if dataset.manager().is_big_data(dataset):
            backend = self._sql
        else:
            backend = self._pandas
        return backend(column_name, max_multiplicity, bounds, random_generator)

    def _sql(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> float:
        """SQL backend placeholder; always raises NotImplementedError."""
        raise NotImplementedError

    def _pandas(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> float:
        """Pandas backend: rescale the data, then delegate to ``std``.

        The data is rescaled beforehand with ``max_multiplicity``, which
        is why ``std`` is told the data is already scaled.
        """
        rescaled = compute_rescaled_data_from_dataspec(
            self.dataset(), max_multiplicity
        )
        # Only the aggregated column and the bookkeeping columns are kept.
        selected_columns = [column_name, PU_COLUMN, PUBLIC, WEIGHTS]
        noises = (self.noise_mean, self.noise_square, self.noise_count)
        return std(
            rescaled[selected_columns],
            column_name,
            PU_COLUMN,
            PUBLIC,
            WEIGHTS,
            *noises,
            bounds,
            max_multiplicity,
            random_generator,
            is_data_already_scaled=True,
        )

    def private_query(self) -> PrivateQuery:
        """Return the composition of the mean, square and count queries.

        One ``LaplaceQuery`` is built per noise parameter, in that order.
        """
        noises = (self.noise_mean, self.noise_square, self.noise_count)
        return ComposedQuery([LaplaceQuery(noise) for noise in noises])
class GroupByStdOp:
    """Per-group standard deviation operation on one dataset column.

    The instance carries three noise parameters (mean, square, count).
    They are forwarded to ``std`` for every processed group and each
    one backs a ``LaplaceQuery`` in :meth:`private_query`.
    """

    def __init__(
        self,
        dataset: st.Dataset,
        noise_mean: float,
        noise_square: float,
        noise_count: float,
    ):
        """Store the dataset and the three noise parameters.

        Args:
            dataset: Dataset the operation is evaluated on.
            noise_mean: Noise parameter attached to the mean query.
            noise_square: Noise parameter attached to the square query.
            noise_count: Noise parameter attached to the count query.
        """
        self._dataset = dataset
        self.noise_mean, self.noise_square, self.noise_count = (
            noise_mean,
            noise_square,
            noise_count,
        )

    def dataset(self) -> st.Dataset:
        """Return the dataset given at construction time."""
        return self._dataset

    def value(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
        keys_values: t.Optional[t.List[t.Any]] = None,
    ) -> pd.DataFrame:
        """Compute the standard deviation of ``column_name`` per group.

        The dataset's manager decides which backend runs: ``_sql`` when
        it reports the dataset as big data, ``_pandas`` otherwise.

        Args:
            column_name: Name of the column to aggregate.
            max_multiplicity: Forwarded unchanged to the backend.
            bounds: Pair of floats forwarded unchanged to the backend
                (presumably the value bounds of the column).
            random_generator: Optional NumPy generator forwarded to the
                backend.
            keys_values: Optional list of group keys; when given, the
                pandas backend only processes groups whose key is in it.

        Returns:
            Whatever the selected backend returns.

        Raises:
            NotImplementedError: When the SQL backend is selected.
        """
        dataset = self.dataset()
        if dataset.manager().is_big_data(dataset):
            backend = self._sql
        else:
            backend = self._pandas
        return backend(
            column_name,
            max_multiplicity,
            bounds,
            random_generator,
            keys_values,
        )

    def _sql(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
        keys_values: t.Optional[t.List[t.Any]] = None,
    ) -> pd.DataFrame:
        """SQL backend placeholder; always raises NotImplementedError."""
        raise NotImplementedError

    def _pandas(
        self,
        column_name: str,
        max_multiplicity: float,
        bounds: t.Tuple[float, float],
        random_generator: t.Optional[np.random.Generator] = None,
        keys_values: t.Optional[t.List[t.Any]] = None,
    ) -> pd.DataFrame:
        """
        The pandas implementation of the group-by std operation.
        A dp implementation of the std is applied to every selected
        group, using a multi-dimensional Laplace mechanism.
        See the docstring of the '_pandas' method in 'GroupBySumOp'
        for more information about the noise used for each Laplace
        mechanism in each group.
        """
        grouped, keys_name = compute_rescaled_groupby_data_from_dataspec(
            self.dataset(), max_multiplicity
        )
        # Only the aggregated column and the bookkeeping columns are kept.
        selected_columns = [column_name, PU_COLUMN, PUBLIC, WEIGHTS]
        noises = (self.noise_mean, self.noise_square, self.noise_count)

        per_group = []
        for group_key, group_frame in grouped:
            # An explicit key list restricts the output to those groups.
            if keys_values is not None and group_key not in keys_values:
                continue
            group_std = std(
                group_frame[selected_columns],
                column_name,
                PU_COLUMN,
                PUBLIC,
                WEIGHTS,
                *noises,
                bounds,
                max_multiplicity,
                random_generator,
                is_data_already_scaled=True,
            )
            per_group.append((group_key, group_std))
        return group_by_results_to_dataframe(per_group, "Std", keys_name)

    def private_query(self) -> PrivateQuery:
        """Return the composition of the mean, square and count queries.

        One ``LaplaceQuery`` is built per noise parameter, in that order.
        """
        noises = (self.noise_mean, self.noise_square, self.noise_count)
        return ComposedQuery([LaplaceQuery(noise) for noise in noises])