Why Gemfury? Push, build, and install: RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Debian packages, RPM packages, NuGet packages

Repository URL to install this package:

Details    
Size: Mime:
import typing as t

from sarus_differential_privacy.query import LaplaceQuery, PrivateQuery
from sarus_statistics.ops.bounds.local import automatic_column_range_pandas
import numpy as np
import pandas as pd

from sarus_data_spec.constants import (
    OPTIONAL_VALUE,
    PUBLIC,
    PU_COLUMN,
    WEIGHTS,
)
from sarus_data_spec.sarus_statistics.ops.data_utils import (
    compute_rescaled_data_from_dataspec,
)
import sarus_data_spec.type as sdt
import sarus_data_spec.typing as st


class BoundOp:
    """Op computing a ``(lower, upper)`` range for one dataset column.

    When ``estimate_on_synthetic_data`` is set, a rough range is first
    computed on the synthetic variant of the dataset, widened with the
    configured margins, and handed to the range estimator as a hint.
    """

    def __init__(
        self,
        dataset: st.Dataset,
        noise: float,
        estimate_on_synthetic_data: bool = True,
        relative_margin: float = 2.0,
        absolute_margin: float = 0.0,
    ):
        """Op for bound computation.

        Args:
            dataset: dataset the bounds are computed on.
            noise: noise parameter; forwarded to the range estimator and
                used to build the ``LaplaceQuery`` in ``private_query``.
            estimate_on_synthetic_data: if True, bounds are first estimated
                on synthetic data, then expanded using the given margins.
            relative_margin: total widening of the synthetic estimate,
                expressed as a fraction of its ``max - min`` spread; half
                of it is applied on each side.
            absolute_margin: total constant widening of the synthetic
                estimate; half of it is applied on each side.
        """
        self._dataset = dataset
        self.noise = noise
        self.estimate_on_synthetic_data = estimate_on_synthetic_data
        self.relative_margin = relative_margin
        self.absolute_margin = absolute_margin

    def dataset(self) -> st.Dataset:
        """Return the dataset this op was built with."""
        return self._dataset

    def value(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """Compute the bounds of ``column_name``.

        Dispatches to the SQL implementation when the dataset manager
        reports the dataset as big data, and to the pandas implementation
        otherwise.

        Raises:
            NotImplementedError: for big-data datasets, as the SQL path is
                not implemented.
        """
        dataset = self.dataset()
        if dataset.manager().is_big_data(dataset):
            return self._sql(column_name, max_multiplicity, random_generator)
        return self._pandas(column_name, max_multiplicity, random_generator)

    def _sql(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """SQL implementation of the bounds computation (not implemented)."""
        raise NotImplementedError

    def _pandas(
        self,
        column_name: str,
        max_multiplicity: float,
        random_generator: t.Optional[np.random.Generator] = None,
    ) -> t.Tuple[float, float]:
        """Pandas implementation of the bounds computation.

        Builds the rescaled data for the dataset, resolves the numeric
        dtype of the column (unwrapping an optional type if needed) and
        delegates to ``automatic_column_range_pandas``.
        """
        if self.estimate_on_synthetic_data:
            estimate: t.Tuple[t.Optional[float], t.Optional[float]] = (
                self.estimate(column_name)
            )
        else:
            # No hint: let the range estimator work without a prior range.
            estimate = (None, None)

        rescaled_data = compute_rescaled_data_from_dataspec(
            self.dataset(), max_multiplicity
        )

        column_type = (
            self.dataset().schema().data_type().children()[column_name]
        )
        if column_type.protobuf().WhichOneof("type") == "optional":
            # Optional columns wrap their actual type; unwrap it and discard
            # the null rows before estimating the range.
            column_type = column_type.children()[OPTIONAL_VALUE]
            # NOTE(review): dropna() without `subset` removes rows having a
            # null in ANY column, not only in `column_name` — confirm this is
            # intended when other columns of the rescaled data are optional.
            rescaled_data = rescaled_data.dropna()
        dtype = sdt.to_numeric_string(column_type)
        return automatic_column_range_pandas(
            rescaled_data[[column_name, PU_COLUMN, PUBLIC, WEIGHTS]],
            column_name,
            PU_COLUMN,
            PUBLIC,
            WEIGHTS,
            dtype,
            self.noise,
            max_multiplicity=max_multiplicity,
            estimate=estimate,
            random_generator=random_generator,
            is_data_already_scaled=True,
        )

    def private_query(self) -> PrivateQuery:
        """Return the ``LaplaceQuery`` built from this op's noise."""
        return LaplaceQuery(self.noise)

    def estimate(self, column_name: str) -> t.Tuple[float, float]:
        """Estimate bounds on synthetic data first to get a rough estimate.

        The min and max of the non-null synthetic values are expanded on
        each side by ``relative_margin / 2`` times their spread plus
        ``absolute_margin / 2``.

        Raises:
            ValueError: if the synthetic column holds no non-null value.
        """
        synthetic_ds = t.cast(
            st.Dataset,
            self.dataset().variant(
                kind=st.ConstraintKind.SYNTHETIC,
                public_context=[],
            ),
        )
        synthetic_df = t.cast(pd.DataFrame, synthetic_ds.to(pd.DataFrame))

        # Nulls must be discarded: the builtin min()/max() give
        # order-dependent results when NaN is present and fail on None.
        data = synthetic_df[column_name].dropna()
        if data.empty:
            raise ValueError(
                f"Cannot estimate bounds for column {column_name!r}: "
                "the synthetic data holds no non-null value."
            )

        # Plain Python floats match the declared return type and avoid
        # fixed-width integer overflow when computing the spread.
        minimum, maximum = float(data.min()), float(data.max())

        # Each margin is split evenly between the two sides of the range.
        r_margin = self.relative_margin / 2
        a_margin = self.absolute_margin / 2
        spread = maximum - minimum
        bounds_with_margin = (
            minimum - r_margin * spread - a_margin,
            maximum + r_margin * spread + a_margin,
        )
        return bounds_with_margin