Repository URL to install this package:
|
Version:
4.0.1 ▾
|
from typing import Tuple
import sqlalchemy as sa
from sarus_statistics.ops.exp_quantile.exp_quantile_sql import exp_quantile
def get_quantile(  # pylint:disable=too-many-arguments,too-many-locals
    session: sa.orm.Session,
    table: sa.Table,
    data_col: str,
    user_col: str,
    private_col: str,
    multiplicity_col: str,
    sampling_ratio: float,
    noise: float,
    quantile: float,
    swap: bool,
    bounds: Tuple[float, float],
    rho: float = 1e-3,
    max_multiplicity: float = 1,
) -> sa.sql.ClauseElement:
    """Computes a quantile of a column over a random sample of a table

    A random subset of the rows of ``table`` is selected in SQL and handed
    to ``exp_quantile`` together with the bounds, the requested quantile
    and the budget ``eps = 1 / noise``.

    Parameters
    ----------
    session: sa.orm.Session
        session used to build the counting and sampling queries
    table: sa.Table
        table holding the data; its ``value`` column is the one counted
        to size the sample
    data_col: str
        name of the column forwarded to ``exp_quantile``
    user_col: str
        name of the users' column (accepted but not read yet)
    private_col: str
        name of the column indicating the privacy status (accepted but
        not read yet)
    multiplicity_col: str
        name of the multiplicity's column (accepted but not read yet)
    sampling_ratio: float
        fraction of the counted rows kept in the sample, in ]0,1]
    noise: float
        target epsilon = 1 / noise, must be strictly positive
    quantile: float
        quantile forwarded to ``exp_quantile``
    swap: bool
        if True: DP is considered in the "replace regime", else only
        "add/remove" (accepted but not read yet)
    bounds: Tuple[float, float]
        lower and upper bounds forwarded to ``exp_quantile`` as
        ``data_low`` and ``data_high``
    rho: float
        add small uniform noise to fix degenerate cases with a lot of
        equal values (forwarded to ``exp_quantile``)
    max_multiplicity: float
        max number of identical users (accepted but not read yet)

    Returns
    -------
    sa.sql.ClauseElement
        whatever ``exp_quantile`` returns for the sampled data

    Raises
    ------
    ValueError
        if sampling_ratio is not in ]0,1] or if noise is not strictly
        positive (NaN is rejected in both cases)
    """
    # Negated comparisons are used on purpose: every comparison with NaN is
    # False, so "not (valid range)" rejects NaN while "x > 1 or x <= 0"
    # would silently accept it.
    if not 0 < sampling_ratio <= 1:
        raise ValueError("Sampling ratio should be in ]0,1]")
    # Fail early with a clear message instead of a ZeroDivisionError (or a
    # non-positive epsilon) once the queries have already been built.
    if not noise > 0:
        raise ValueError("Noise should be strictly positive")

    # Number of rows to keep, computed by the database itself.
    # NOTE(review): the counted column is hard-coded to ``value`` while the
    # quantile is computed on ``data_col`` -- confirm that every table
    # passed here exposes a ``value`` column, or switch to
    # ``table.c[data_col]``.
    num_samples = session.query(
        sa.cast(sa.func.count(table.c.value) * sampling_ratio, sa.Integer)
    ).scalar_subquery()

    # TODO: rescale weights. The former pandas implementation grouped the
    # rows by ``user_col``, kept at most ``max_multiplicity`` rows per user
    # (sampled without replacement) and then dropped ``user_col`` and
    # ``multiplicity_col``. None of this is done by the SQL version yet,
    # which is why those parameters are currently unused.

    # Shuffle the rows and keep the first ``num_samples`` of them.
    # NOTE(review): ``rand()`` is emitted as a plain SQL function; some
    # backends (e.g. PostgreSQL, SQLite) spell it ``random()`` -- confirm
    # the target dialect.
    data = (
        session.query(table)
        .order_by(sa.func.rand())
        .limit(num_samples)
        .subquery()
    )

    return exp_quantile(
        session,
        data,
        data_col,
        data_low=bounds[0],
        data_high=bounds[1],
        quantile=quantile,
        eps=1 / noise,
        rho=rho,
    )