# Repository URL to install this package: (URL missing from this copy)
# Version: 4.0.1
from typing import Optional, Tuple, cast
import numpy as np
import pandas as pd
from sarus_statistics.ops.exp_quantile.exp_quantile import exp_quantile
from sarus_statistics.ops.utils import check_is_private, generator_from_seed
def get_quantile(  # pylint:disable=too-many-arguments,too-many-locals
    dataframe: pd.DataFrame,
    data_col: str,
    user_col: str,
    private_col: str,
    weight_col: str,
    sampling_ratio: float,
    noise: float,
    quantile: float,
    swap: bool,
    bounds: Tuple[float, float],
    rho: float = 1e-3,
    max_multiplicity: float = 1,
    random_generator: Optional[np.random.Generator] = None,
) -> float:
    """Compute the value of a quantile for a given dataframe.

    The rows are first capped to at most ``max_multiplicity`` rows per
    user, then sub-sampled down to ``int(len(dataframe) * sampling_ratio)``
    rows (computed on the input dataframe), and the sorted values of
    ``data_col`` are finally handed to ``exp_quantile``.

    Parameters
    ----------
    dataframe: pd.DataFrame
        dataset
    data_col: str
        name of the value's column
    user_col: str
        name of the users' column. If None, the row is public.
    private_col: str
        name of the column indicating the privacy status
    weight_col: str
        name of the weight's column (not read by this function)
    sampling_ratio: float
        sampling ratio to compute quantiles, must be in ]0,1]
    noise: float
        target epsilon = 1 / noise
    quantile: float
        quantile probability which value to compute, must be in [0,1]
    swap: bool
        if True: DP is considered in the "replace regime", else only
        "add/remove" (not read by this function)
    bounds: Tuple[float,float]
        bounds on each feature distribution, forwarded as
        ``data_low`` / ``data_high``
    rho: float
        add small uniform noise to fix degenerate cases with a lot of
        equal values
    max_multiplicity: float
        max number of rows kept per user; truncated to an integer row
        count when sampling
    random_generator: Optional[np.random.Generator]
        generator to use through the computation (for reproducibility).
        It drives both the row sampling and ``exp_quantile``. When None,
        one is obtained from ``generator_from_seed(None)``.

    Returns
    -------
    float
        private quantile value

    Raises
    ------
    ValueError
        if sampling_ratio is not in ]0,1] or quantile is not in [0,1]
        (NaN is rejected in both cases)
    """
    # Chained comparisons are False for NaN, so NaN is rejected here instead
    # of slipping through to the sampling / quantile computation.
    if not 0 < sampling_ratio <= 1:
        raise ValueError("Sampling ratio should be in ]0,1]")
    if not 0 <= quantile <= 1:
        raise ValueError("Quantile should be in [0,1]")

    rng: np.random.Generator = (
        random_generator
        if random_generator is not None
        else generator_from_seed(None)
    )

    # The sample budget is derived from the size of the *input* dataframe,
    # before the per-user cap below shrinks it.
    num_samples = int(len(dataframe) * sampling_ratio)
    check_is_private(dataframe, user_col, private_col)

    def _cap_user_rows(user_rows: pd.DataFrame) -> pd.DataFrame:
        """Keep at most ``max_multiplicity`` randomly chosen rows of a user."""
        # ``sample`` needs an integer count; take the min first so that an
        # infinite ``max_multiplicity`` still resolves to ``len(user_rows)``.
        n_rows = int(min(len(user_rows), max_multiplicity))
        return user_rows.sample(n=n_rows, replace=False, random_state=rng)

    capped = (
        dataframe.groupby(user_col, sort=False)
        .apply(_cap_user_rows)
        .droplevel(0)
    )
    sampled = cast(
        pd.DataFrame,
        capped.sample(
            n=min(len(capped), num_samples),
            replace=False,
            random_state=rng,
        ),
    )

    return exp_quantile(
        sorted_data=sampled[data_col].sort_values().values,
        data_low=bounds[0],
        data_high=bounds[1],
        quantile=quantile,
        eps=1 / noise,
        rho=rho,
        random_generator=rng,
    )