Gemfury

sarus / sarus_statistics python

Repository URL to install this package:
Details
sarus_statistics / sarus_statistics / ops / links / local.py
import typing as t

import numpy as np
import pandas as pd

try:
    from sklearn.isotonic import IsotonicRegression
except ModuleNotFoundError:
    pass

from sarus_statistics.ops.utils import check_is_private, rescale_weights


# pylint: disable=too-many-arguments, too-many-locals
def links(
    data: pd.DataFrame,
    col_name: str,
    user_col: str,
    weight_col: str,
    private_col: str,
    noise: float,
    mult_fk: float,
    mult_pk: float,
    size_fk: int,
    size_pk: int,
    nb_quantiles: int,
    max_categories: t.Optional[int] = None,
    peid_only: bool = False,
    iota: float = 0,
    isotonic_regression: bool = True,
) -> t.Dict[float, float]:
    """Estimate the link distribution of a foreign key column as quantiles.

    Steps:

    - check the privacy columns and rescale the weights
    - sum the weights per value of `col_name`, round them and clip them, then
      count how many values share each resulting count (histogram of
      histograms, a.k.a. prevalences)
    - hand the prevalences to `quantiles_from_prevalences`, which makes them
      cumulative and noisy, optionally fits an isotonic regression, and reads
      the quantiles off the result

    Reference: https://proceedings.neurips.cc/paper/2019/file/
               f06ae085fe74cd78ad5e89496b197fe1-Paper.pdf

    Parameters
    -----------
    data: pd.DataFrame
        data on which to compute the histogram
    col_name: str
        name of the value's column (the grouping key)
    user_col: str
        name of the users' column. If None, the row is public.
    weight_col: str
        name of the weight's column; weights are summed per `col_name` value
    private_col: str
        name of the column indicating the privacy status
    noise: float
        noise multiplier, forwarded to `quantiles_from_prevalences` where the
        Laplace noise scale is `noise * sensitivity`
    mult_fk: float
        multiplicity of the pointing column (Foreign Key); also passed to
        `rescale_weights` as `max_multiplicity`
    mult_pk: float
        multiplicity of the pointed column (Primary Key)
    size_fk: int
        (noisy) size of the pointing column (Foreign Key)
    size_pk: int
        (noisy) size of the pointed column (Primary Key)
    nb_quantiles: int
        number of quantiles to estimate the link distribution
    max_categories: t.Optional[int]
        max number of repetitions of the same primary key; larger counts are
        clipped. Any falsy value (None or 0) is replaced by `size_fk`.
    peid_only: bool
        if True, assume that the only way a peid can influence a row is if it
        is the foreign key. This selects `size_fk + 1` (instead of
        `mult_fk + 1`) as the count bound and `2 * mult_pk` (instead of
        `2 * mult_fk * mult_pk`) as the sensitivity.
    iota: float
        clip quantiles probabilities to (iota, 1 - iota).
        this is useful to force the 0 and 1 quantiles to be mapped to the
        expected bounds
    isotonic_regression: bool
        if True, ask for an isotonic regression on the cumulative prevalences

    Returns
    -------
    Dict[float, float]
        quantile dict, as returned by `quantiles_from_prevalences`
    """
    check_is_private(data, user_col, private_col)

    rescaled = rescale_weights(
        data=data,
        user_col=user_col,
        private_col=private_col,
        weight_col=weight_col,
        max_multiplicity=mult_fk,
    )

    if not max_categories:
        max_categories = size_fk

    # Both the largest representable count and the sensitivity depend on how
    # a protected entity is allowed to influence rows.
    if peid_only:
        count_bound = size_fk + 1
        sensitivity = 2 * mult_pk
    else:
        count_bound = mult_fk + 1
        sensitivity = 2 * mult_fk * mult_pk
    nb_bins = int(min(max_categories, count_bound))

    # Total weight per key, turned into an integer count capped at the last
    # bin, then the number of keys observed for each count.
    weight_per_key = rescaled.groupby(col_name, sort=False)[weight_col].sum()
    count_per_key = weight_per_key.apply(round).clip(upper=nb_bins - 1)
    histogram = count_per_key.value_counts()

    return quantiles_from_prevalences(
        prevalences=histogram,
        prevalences_length=nb_bins,
        noise=noise,
        sensitivity=sensitivity,
        nb_quantiles=nb_quantiles,
        size_pk=size_pk,
        iota=iota,
        isotonic_regression=isotonic_regression,
    )


def quantiles_from_prevalences(
    prevalences: pd.Series,
    prevalences_length: int,
    noise: float,
    sensitivity: float,
    nb_quantiles: int,
    size_pk: int,
    iota: float,
    isotonic_regression: bool,
) -> t.Dict[float, float]:
    """Compute quantiles of the links distribution from prevalences.

    The histogram is aligned on `range(prevalences_length)`, accumulated from
    the largest label down to 0, and perturbed with Laplace noise. The
    resulting curve (optionally smoothed by a decreasing isotonic regression)
    is then inverted: for each probability, the label whose curve value is
    closest to the interpolated target is reported.

    Parameters
    -----------
    prevalences: pd.Series
        histogram indexed by integer labels; labels of
        `range(prevalences_length)` that are missing count as 0
    prevalences_length: int
        number of labels (0 .. prevalences_length - 1) to consider
    noise: float
        noise multiplier; Laplace noise of scale `noise * sensitivity` is
        added to every cumulative value
    sensitivity: float
        sensitivity of the cumulative prevalences (see `noise`)
    nb_quantiles: int
        number of inner quantiles; their probabilities are
        `(q + 1) / (nb_quantiles + 1)` for q in `range(nb_quantiles)`
    size_pk: int
        upper bound (`y_max`) given to the isotonic regression; unused when
        no regression is performed
    iota: float
        inner probabilities are clipped to [iota, 1 - iota] before being used
        (both for the interpolation and as dictionary keys). Not applied in
        the constant case described below.
    isotonic_regression: bool
        if True and `IsotonicRegression` could be imported, fit a decreasing
        isotonic regression bounded to [0, size_pk] on the noisy cumulative
        values. If scikit-learn is unavailable the regression is skipped
        silently (best effort).

    Returns
    -------
    Dict[float, float]
        mapping from probability to label. Keys are 0, 1 and every (clipped)
        inner probability; several inner probabilities clipped to the same
        value share one entry. If the curve is constant, every key (0.0, 1.0
        and the unclipped inner probabilities) maps to `len(prevalences)`.
    """
    prevalences = prevalences.reindex(range(prevalences_length)).fillna(0)
    # Reversing before the cumulative sum accumulates from the largest label
    # downwards, so the value attached to label k is the total of labels >= k.
    prevalences = prevalences.iloc[::-1].cumsum()
    prevalences = prevalences + np.random.laplace(
        0, noise * sensitivity, prevalences_length
    )

    # scikit-learn is an optional dependency: the module-level import may have
    # left the name undefined. Only the name lookup is guarded, so that a
    # NameError raised while fitting is not mistaken for a missing package.
    regressor_cls = None
    if isotonic_regression:
        try:
            regressor_cls = IsotonicRegression
        except NameError:
            regressor_cls = None

    if regressor_cls is not None:
        iso_reg = regressor_cls(
            y_min=0, y_max=size_pk, increasing=False, out_of_bounds='clip'
        ).fit(prevalences.index, prevalences.values)
        prevalences = pd.Series(iso_reg.transform(range(prevalences_length)))
    else:
        # Back to ascending label order (0 .. prevalences_length - 1).
        prevalences = prevalences.iloc[::-1]

    # get quantiles
    quantiles_probabilities = [
        (q + 1) / (nb_quantiles + 1) for q in range(nb_quantiles)
    ]
    if len(prevalences.unique()) == 1:
        # constant
        unique_value = len(prevalences)
        return {q: unique_value for q in [0.0, 1.0] + quantiles_probabilities}

    # In both branches above the series is ordered by label starting at 0, so
    # the first/last positions are the values at the smallest/largest label.
    values = prevalences.values
    labels = prevalences.index.values
    first_value = values[0]
    last_value = values[-1]
    spread = first_value - last_value

    quantiles: t.Dict[float, float] = {
        # Last label still carrying the starting value.
        0: int(labels[values == first_value].max()),
        # First label already carrying the final value.
        1: int(labels[values == last_value].min()),
    }
    for raw_probability in quantiles_probabilities:
        # Builtin float keeps the keys consistent with the annotated type.
        probability = float(np.clip(raw_probability, iota, 1 - iota))
        target = first_value - probability * spread
        # argmin returns the first position in case of equal distances.
        closest_value = values[np.abs(values - target).argmin()]
        # Several labels can share the closest value: report their mean.
        quantiles[probability] = int(labels[values == closest_value].mean())

    return quantiles
sarus / sarus_statistics python

Products

About

Resources

Contact Gemfury