# Repository URL to install this package:
# Version: 4.5.4.dev1
import typing as t
from sarus_differential_privacy.query import LaplaceQuery, PrivateQuery
from sarus_statistics.ops.bounds.local import automatic_column_range_pandas
import numpy as np
import pandas as pd
from sarus_data_spec.constants import (
OPTIONAL_VALUE,
PUBLIC,
PU_COLUMN,
WEIGHTS,
)
from sarus_data_spec.sarus_statistics.ops.data_utils import (
compute_rescaled_data_from_dataspec,
)
import sarus_data_spec.type as sdt
import sarus_data_spec.typing as st
class BoundOp:
    """Op computing noisy (lower, upper) bounds for one column of a dataset.

    A rough range can first be estimated on the synthetic variant of the
    dataset and widened by configurable margins; that estimate is then handed
    to ``automatic_column_range_pandas`` together with the rescaled data.
    """

    def __init__(
        self,
        dataset: st.Dataset,
        noise: float,
        estimate_on_synthetic_data: bool = True,
        relative_margin: float = 2.0,
        absolute_margin: float = 0.0,
    ):
        """Op for bound computation.

        Args:
            dataset: Dataset whose column bounds are computed.
            noise: Noise parameter forwarded to the range computation and to
                the ``LaplaceQuery`` returned by ``private_query``.
            estimate_on_synthetic_data: If True, bounds are first estimated
                on synthetic data and expanded using the given margins.
            relative_margin: Total widening of the synthetic range, expressed
                as a fraction of its width (half is applied on each side).
            absolute_margin: Total constant widening of the synthetic range
                (half is applied on each side).
        """
        self._dataset = dataset
        self.noise = noise
        self.estimate_on_synthetic_data = estimate_on_synthetic_data
        self.relative_margin = relative_margin
        self.absolute_margin = absolute_margin

    def dataset(self) -> st.Dataset:
        """Return the dataset this op was built on."""
        return self._dataset

    def value(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """Compute the bounds of ``column_name``.

        Dispatches to the SQL implementation when the dataset manager reports
        the dataset as big data, and to the pandas implementation otherwise.

        Args:
            column_name: Name of the column to bound.
            max_multiplicity: Forwarded to the data rescaling and to the
                range computation.
            random_generator: Optional numpy generator forwarded to the range
                computation.

        Returns:
            The ``(lower, upper)`` pair produced by the selected backend.

        Raises:
            NotImplementedError: If the dataset is big data (the SQL backend
                is not implemented).
        """
        dataset = self.dataset()
        if dataset.manager().is_big_data(dataset):
            return self._sql(column_name, max_multiplicity, random_generator)
        return self._pandas(column_name, max_multiplicity, random_generator)

    def _sql(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """SQL backend for big-data datasets; not implemented."""
        raise NotImplementedError

    def _pandas(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """Compute the bounds of ``column_name`` with the pandas backend.

        Args:
            column_name: Name of the column to bound.
            max_multiplicity: Forwarded to the data rescaling and to the
                range computation.
            random_generator: Optional numpy generator forwarded to the range
                computation.

        Returns:
            The result of ``automatic_column_range_pandas``.
        """
        # Without a synthetic estimate the range computation gets no hint.
        estimate: t.Tuple[t.Optional[float], t.Optional[float]] = (None, None)
        if self.estimate_on_synthetic_data:
            estimate = self.estimate(column_name)

        rescaled_data = compute_rescaled_data_from_dataspec(
            self.dataset(), max_multiplicity
        )

        column_type = (
            self.dataset().schema().data_type().children()[column_name]
        )
        if column_type.protobuf().WhichOneof("type") == "optional":
            # Optional columns: work on the wrapped type and discard nulls.
            column_type = column_type.children()[OPTIONAL_VALUE]
            # NOTE(review): this drops rows that are null in *any* column of
            # the rescaled frame, not only in `column_name` -- confirm that
            # this is intended.
            rescaled_data = rescaled_data.dropna()
        dtype = sdt.to_numeric_string(column_type)

        return automatic_column_range_pandas(
            rescaled_data[[column_name, PU_COLUMN, PUBLIC, WEIGHTS]],
            column_name,
            PU_COLUMN,
            PUBLIC,
            WEIGHTS,
            dtype,
            self.noise,
            max_multiplicity=max_multiplicity,
            estimate=estimate,
            random_generator=random_generator,
            # The frame was rescaled above, so it must not be rescaled again.
            is_data_already_scaled=True,
        )

    def private_query(self) -> PrivateQuery:
        """Return the ``LaplaceQuery`` built from this op's noise."""
        return LaplaceQuery(self.noise)

    def estimate(self, column_name: str) -> t.Tuple[float, float]:
        """Estimate bounds on synthetic data first to get a rough estimate.

        The observed (min, max) of the synthetic column is widened on each
        side by ``relative_margin / 2`` times its width plus
        ``absolute_margin / 2``.

        Null values are ignored: the builtin ``min``/``max`` give
        order-dependent (possibly NaN) results as soon as the column
        contains a NaN.

        Args:
            column_name: Name of the column to estimate bounds for.

        Returns:
            The widened ``(lower, upper)`` estimate.

        Raises:
            ValueError: If the synthetic column holds no non-null value.
        """
        synthetic_ds = t.cast(
            st.Dataset,
            self.dataset().variant(
                kind=st.ConstraintKind.SYNTHETIC,
                public_context=[],
            ),
        )
        # Each margin is a total widening, split evenly between both sides.
        r_margin = self.relative_margin / 2
        a_margin = self.absolute_margin / 2

        frame = t.cast(pd.DataFrame, synthetic_ds.to(pd.DataFrame))
        data = frame[column_name].dropna()
        if data.empty:
            raise ValueError(
                f"Cannot estimate bounds for column {column_name!r}: "
                "synthetic data contains no non-null value"
            )

        minimum, maximum = float(data.min()), float(data.max())
        spread = maximum - minimum
        return (
            minimum - r_margin * spread - a_margin,
            maximum + r_margin * spread + a_margin,
        )