Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
Size: Mime:
from typing import Optional

import numpy as np
import pandas as pd

from sarus_statistics.ops.joint_exp.joint_exp import joint_exp
from sarus_statistics.ops.utils import check_is_private, generator_from_seed

# Scaling factor applied to the target quantile computed in max_multiplicity;
# without it the requested quantile would get too close to 1.
QUANTILE_DIVIDER = 0.8


# pylint: disable=too-many-arguments
def max_multiplicity(
    data: pd.DataFrame,
    user_col: str,
    private_col: str,
    weight_col: str,
    epsilon_queries: float,
    noise_user_count: float,
    noise_multiplicity: float,
    max_max_multiplicity: float = 1.0,
    random_generator: Optional[np.random.Generator] = None,
) -> float:
    """Compute the max_multiplicity that optimizes the bias / variance tradeoff.

    Given a dataset, the epsilon of future queries and privacy parameters,
    the function:

    1. counts the distinct users and perturbs that count with Laplace noise
       of scale ``noise_user_count``;
    2. sums ``weight_col`` per user;
    3. derives a target quantile
       ``QUANTILE_DIVIDER * (1 - 1 / (user_count * epsilon_queries))``,
       clamped to ``[0, QUANTILE_DIVIDER]``;
    4. asks ``joint_exp`` for that quantile of the per-user weight totals and
       returns it, floored at 1.

    The quantile is capped at ``QUANTILE_DIVIDER`` (0.8) to prevent critical
    failures. Because the user count is noisy it can come out tiny, zero or
    negative; in that case the unclamped formula would leave ``[0, 1]`` (or
    divide by zero), so a non-positive noisy count is treated as a target
    quantile of 0.

    NOTE: ``user_col`` and ``weight_col`` must name existing columns of
    ``data``; this implementation indexes and groups by them unconditionally
    and does not handle ``None``.

    This is inspired from http://proceedings.mlr.press/v97/amin19a/amin19a.pdf

    Parameters
    ----------
    data: pd.DataFrame
        dataframe with a data, user and weight columns
    user_col: str
        name of the user column
    private_col: str
        name of the column indicating the privacy status (only forwarded to
        ``check_is_private``)
    weight_col: str
        name of the weight column
    epsilon_queries: float
        epsilon of future counting queries. This is a parameter and NOT spent
    noise_user_count: float
        scale of the Laplace noise added when counting the number of users
    noise_multiplicity: float
        noise added when computing the quantile which gives the max_mult;
        its inverse is what gets passed to ``joint_exp``
    max_max_multiplicity: float
        maximum max_multiplicity considered.
        Upper bound passed to ``joint_exp`` (the lower bound is 0)
    random_generator: Optional[np.random.Generator]
        generator to use through the computation (for reproducibility).
        When omitted, one is obtained from ``generator_from_seed(None)``

    Returns
    -------
    float
        value of max_multiplicity for this table, never below 1.0
    """
    if random_generator is None:
        # Same call the original made implicitly: the helper receives None.
        random_generator = generator_from_seed(None)
    check_is_private(data, user_col, private_col)

    # Noisy number of distinct users (the draw happens before joint_exp so
    # the generator is consumed in a fixed order).
    user_count = len(data[user_col].unique()) + random_generator.laplace(
        0, noise_user_count
    )
    # Total weight contributed by each user.
    cumsum = data.groupby(user_col, sort=False)[weight_col].sum()

    # The Laplace noise can push the count to zero or below, which would
    # make the formula divide by zero or yield a quantile above the cap.
    if user_count > 0:
        fraction = 1 - (1 / user_count / epsilon_queries)
    else:
        fraction = 0.0
    # Keep the requested quantile inside [0, QUANTILE_DIVIDER].
    fraction = min(max(fraction, 0.0), 1.0)
    quantile = np.array([QUANTILE_DIVIDER * fraction])

    result: float = joint_exp(
        cumsum.values,
        np.ones_like(cumsum.values),
        0,
        max_max_multiplicity,
        quantile,
        1 / noise_multiplicity,
        False,
        max_multiplicity=1,
        random_generator=random_generator,
    )[0]

    # Honour the annotated return type: always a plain float, at least 1.0.
    return max(1.0, float(result))