Repository URL to install this package:
|
Version:
4.0.1 ▾
|
from typing import Optional
import numpy as np
import pandas as pd
from sarus_statistics.ops.joint_exp.joint_exp import joint_exp
from sarus_statistics.ops.utils import check_is_private, generator_from_seed
# Factor applied to the target quantile computed in max_multiplicity.
# Despite the name, the value is used as a multiplier: it scales the quantile
# down so that the requested quantile stays away from 1.
QUANTILE_DIVIDER = 0.8 # else we're too close to 1
# pylint: disable=too-many-arguments
def max_multiplicity(
    data: pd.DataFrame,
    user_col: str,
    private_col: str,
    weight_col: str,
    epsilon_queries: float,
    noise_user_count: float,
    noise_multiplicity: float,
    max_max_multiplicity: float = 1.0,
    random_generator: Optional[np.random.Generator] = None,
) -> float:
    """Estimate the max_multiplicity of a table.

    Given a dataset, the epsilon of future queries and privacy parameters,
    compute the max_multiplicity that optimizes the bias / variance
    tradeoff. The value is obtained as a noisy quantile of the per-user
    total weight.

    The target quantile is
    ``QUANTILE_DIVIDER * (1 - 1 / (noisy_user_count * epsilon_queries))``
    with the bracketed factor clamped to ``[0, 1]``, so the quantile always
    lies in ``[0, QUANTILE_DIVIDER]``. The cap prevents critical failures
    when the quantile gets too close to 1.

    This is inspired from
    http://proceedings.mlr.press/v97/amin19a/amin19a.pdf

    Parameters
    ----------
    data: pd.DataFrame
        dataframe with data, user and weight columns
    user_col: str
        name of the user column; ``data`` is grouped by this column
    private_col: str
        name of the column indicating the privacy status (only forwarded
        to ``check_is_private``)
    weight_col: str
        name of the weight column; it is summed per user
    epsilon_queries: float
        epsilon of future counting queries. This is a parameter and NOT
        spent
    noise_user_count: float
        scale of the Laplace noise added when counting the number of users
    noise_multiplicity: float
        noise added when computing the quantile which gives the max_mult
        (its inverse is what is handed to ``joint_exp``)
    max_max_multiplicity: float
        maximum max_multiplicity considered. Bounds for the quantiles
    random_generator: Optional[np.random.Generator]
        generator to use through the computation (for reproducibility).
        When None, one is obtained from ``generator_from_seed(None)``.

    Returns
    -------
    float
        value of max_multiplicity for this table, never smaller than 1

    Notes
    -----
    NOTE(review): earlier documentation stated that ``user_col`` and
    ``weight_col`` may be None. This implementation indexes ``data`` with
    both names unconditionally, so both must be existing column names.
    """
    if random_generator is None:
        random_generator = generator_from_seed(None)

    check_is_private(data, user_col, private_col)

    # Noisy number of distinct users. The Laplace noise is unbounded, so
    # this value can be arbitrarily small or even negative.
    noisy_user_count = len(data[user_col].unique()) + random_generator.laplace(
        0, noise_user_count
    )

    # Total weight contributed by each user.
    user_weights = data.groupby(user_col, sort=False)[weight_col].sum()

    # A non-positive noisy count would push the factor above 1 (or divide
    # by zero), and a tiny positive one would make it negative. Both give
    # an invalid quantile, so treat "no usable count" as factor 0 and clamp
    # into [0, 1]. This only post-processes the noisy count.
    if noisy_user_count > 0:
        factor = 1 - (1 / noisy_user_count / epsilon_queries)
    else:
        factor = 0.0
    factor = min(max(factor, 0.0), 1.0)
    quantile = np.array([QUANTILE_DIVIDER * factor])

    # Positional arguments are passed exactly as before; presumably they
    # are (values, weights, lower bound, upper bound, quantiles, epsilon,
    # flag) -- TODO confirm against joint_exp's signature.
    weights = user_weights.values
    result: float = joint_exp(
        weights,
        np.ones_like(weights),
        0,
        max_max_multiplicity,
        quantile,
        1 / noise_multiplicity,
        False,
        max_multiplicity=1,
        random_generator=random_generator,
    )[0]

    # A multiplicity below 1 is not meaningful: floor the estimate at 1.
    return max(1, result)