Repository URL to install this package:
|
Version:
4.0.1 ▾
|
from typing import Dict, Optional, Tuple, cast
import numpy as np
import pandas as pd
from sarus_statistics.ops.joint_exp.joint_exp import joint_exp
from sarus_statistics.ops.utils import (
check_is_private,
generator_from_seed,
rescale_weights,
)
def get_quantiles(  # pylint:disable=too-many-arguments,too-many-locals
    dataframe: pd.DataFrame,
    data_col: str,
    user_col: str,
    private_col: str,
    weight_col: str,
    sampling_ratio: float,
    noise: float,
    nb_quantiles: int,
    swap: bool,
    bounds: Tuple[float, float],
    rho: float = 1e-3,
    max_multiplicity: float = 1,
    random_generator: Optional[np.random.Generator] = None,
) -> Dict[float, float]:
    """Compute evenly spaced quantiles of one column via ``joint_exp``.

    The frame is validated with ``check_is_private``, its weights are
    rescaled with ``rescale_weights``, a sub-sample of rows is drawn without
    replacement, and the sampled values and weights are handed to
    ``joint_exp`` together with the quantile levels
    ``i / (nb_quantiles + 1)`` for ``i = 1 .. nb_quantiles``.

    Parameters
    ----------
    dataframe: pd.DataFrame
        dataset of continuous features
    data_col: str
        name of the value's column
    user_col: str
        name of the users' column
    private_col: str
        name of the column indicating the privacy status
    weight_col: str
        name of the weight's column
    sampling_ratio: float
        fraction of the input rows kept before computing the quantiles;
        must lie in ]0,1]
    noise: float
        target epsilon = 1 / noise
    nb_quantiles: int
        nb of quantiles to compute
    swap: bool
        if True: DP is considered in the "replace regime", else only
        "add/remove" (forwarded unchanged to ``joint_exp``)
    bounds: Tuple[float, float]
        (low, high) bounds on the data, forwarded to ``joint_exp`` as
        ``data_low`` and ``data_high``
    rho: float
        add small uniform noise to fix degenerate cases with a lot of
        equal values (forwarded unchanged to ``joint_exp``)
    max_multiplicity: float
        max number of identical users (forwarded to ``rescale_weights``
        and ``joint_exp``)
    random_generator: Optional[np.random.Generator]
        generator used for both the row sub-sampling and ``joint_exp``
        (for reproducibility); when None, one is obtained from
        ``generator_from_seed(None)``

    Returns
    -------
    Dict[float, float]
        mapping from each quantile level ``i / (nb_quantiles + 1)`` to the
        value returned for it by ``joint_exp``

    Raises
    ------
    ValueError
        if sampling_ratio is not in ]0,1] (NaN included)
    """
    # Written as a chained comparison so that NaN is rejected here as well,
    # instead of failing later inside ``int()`` with an unrelated message.
    if not 0 < sampling_ratio <= 1:
        raise ValueError("Sampling ratio should be in ]0,1]")

    if random_generator is None:
        random_generator = generator_from_seed(None)

    # The sample size is derived from the row count of the frame as it was
    # passed in, i.e. before ``rescale_weights`` runs.
    num_samples = int(len(dataframe) * sampling_ratio)

    check_is_private(dataframe, user_col, private_col)
    dataframe = rescale_weights(
        data=dataframe,
        user_col=user_col,
        private_col=private_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
    )

    # Draw the row positions from ``random_generator`` itself so that a
    # seeded generator makes the whole computation reproducible; an unseeded
    # ``DataFrame.sample`` would rely on NumPy's global random state.
    sampled_positions = random_generator.choice(
        len(dataframe), size=num_samples, replace=False
    )
    dataframe = cast(pd.DataFrame, dataframe.iloc[sampled_positions])

    # Evenly spaced levels strictly inside ]0,1[: 1/(k+1), ..., k/(k+1).
    quantile_indices = np.arange(1, nb_quantiles + 1) / (nb_quantiles + 1)

    # NOTE(review): the sampled values are passed in sampling order although
    # the parameter is called ``sorted_data`` -- confirm that ``joint_exp``
    # sorts its input (together with the weights) internally.
    estimates = joint_exp(
        sorted_data=dataframe[data_col].values,
        weights=dataframe[weight_col].values,
        data_low=bounds[0],
        data_high=bounds[1],
        qs=quantile_indices,
        eps=1 / noise,
        swap=swap,
        max_multiplicity=max_multiplicity,
        rho=rho,
        random_generator=random_generator,
    )

    return {
        cast(float, level): cast(float, value)
        for level, value in zip(quantile_indices, estimates)
    }