Repository URL to install this package:
|
Version:
4.0.1 ▾
|
import typing as t
import numpy as np
import pandas as pd
try:
from sklearn.isotonic import IsotonicRegression
except ModuleNotFoundError:
pass
from sarus_statistics.ops.utils import check_is_private, rescale_weights
# pylint: disable=too-many-arguments, too-many-locals
def links(
    data: pd.DataFrame,
    col_name: str,
    user_col: str,
    weight_col: str,
    private_col: str,
    noise: float,
    mult_fk: float,
    mult_pk: float,
    size_fk: int,
    size_pk: int,
    nb_quantiles: int,
    max_categories: t.Optional[int] = None,
    peid_only: bool = False,
    iota: float = 0,
    isotonic_regression: bool = True,
) -> t.Dict[float, float]:
    """Estimate quantiles of the link distribution of a column.

    The computation runs in three stages:

    1. the weights are rescaled with ``rescale_weights`` (using ``mult_fk``
       as ``max_multiplicity``);
    2. the weights are summed per distinct value of ``col_name``, rounded,
       clipped, and the occurrences of each resulting count are tallied
       (a histogram of histograms, the "prevalences");
    3. the prevalences are handed to ``quantiles_from_prevalences``, which
       accumulates them, adds noise, optionally applies an isotonic
       regression and reads the quantiles off the resulting curve.

    Reference: https://proceedings.neurips.cc/paper/2019/file/
    f06ae085fe74cd78ad5e89496b197fe1-Paper.pdf

    Parameters
    ----------
    data: pd.DataFrame
        data on which the prevalences are computed
    col_name: str
        name of the column holding the values that are grouped
    user_col: str
        name of the users' column. If None, the row is public.
    weight_col: str
        name of the column holding the weights that are summed
    private_col: str
        name of the column indicating the privacy status
    noise: float
        noise scale, forwarded unchanged to ``quantiles_from_prevalences``
    mult_fk: float
        multiplicity of the pointing column (Foreign Key)
    mult_pk: float
        multiplicity of the pointed column (Primary Key)
    size_fk: int
        (noisy) size of the pointing column (Foreign Key)
    size_pk: int
        (noisy) size of the pointed column (Primary Key)
    nb_quantiles: int
        number of quantiles used to describe the link distribution
    max_categories: t.Optional[int]
        largest number of repetitions of one primary key that is tracked;
        larger counts are clipped. Any falsy value (``None`` or ``0``) is
        replaced by ``size_fk``.
    peid_only: bool
        when True, assume a peid can only influence a row by being its
        foreign key; this selects ``size_fk + 1`` as the count bound and
        ``2 * mult_pk`` as the sensitivity, instead of ``mult_fk + 1`` and
        ``2 * mult_fk * mult_pk``.
    iota: float
        quantile probabilities are clipped to (iota, 1 - iota)
    isotonic_regression: bool
        when True, an isotonic regression is applied to the cumulative
        prevalences

    Returns
    -------
    Dict[float, float]
        quantile dict, mapping a probability to the estimated count
    """
    check_is_private(data, user_col, private_col)
    private_data = rescale_weights(
        data=data,
        user_col=user_col,
        private_col=private_col,
        weight_col=weight_col,
        max_multiplicity=mult_fk,
    )

    # The privacy assumption drives both the largest count worth tracking
    # and the sensitivity of the cumulative prevalences.
    if peid_only:
        count_bound = size_fk + 1
        sensitivity = 2 * mult_pk
    else:
        count_bound = mult_fk + 1
        sensitivity = 2 * mult_fk * mult_pk

    category_cap = max_categories if max_categories else size_fk
    prevalences_length = int(min(category_cap, count_bound))

    # Total (rounded) weight per distinct value of ``col_name``.
    weight_per_key = (
        private_data.groupby(col_name, sort=False)[weight_col]
        .sum()
        .apply(round)
    )
    # Counts beyond the last bin are folded into it before tallying how
    # many keys share each count.
    clipped_counts = weight_per_key.clip(upper=prevalences_length - 1)
    prevalences = clipped_counts.value_counts()

    return quantiles_from_prevalences(
        prevalences=prevalences,
        prevalences_length=prevalences_length,
        noise=noise,
        sensitivity=sensitivity,
        nb_quantiles=nb_quantiles,
        size_pk=size_pk,
        iota=iota,
        isotonic_regression=isotonic_regression,
    )
def quantiles_from_prevalences(
    prevalences: pd.Series,
    prevalences_length: int,
    noise: float,
    sensitivity: float,
    nb_quantiles: int,
    size_pk: int,
    iota: float,
    isotonic_regression: bool,
) -> t.Dict[float, float]:
    """Compute quantiles of the links distribution from prevalences.

    The prevalences are laid out over ``range(prevalences_length)``,
    accumulated from the last bin down to the first, and perturbed with
    Laplace noise of scale ``noise * sensitivity``. The resulting curve,
    indexed from 0 upwards, is then scanned: for each probability the bin
    whose value is closest to the matching level between the first and the
    last value of the curve is reported.

    Parameters
    ----------
    prevalences: pd.Series
        histogram indexed by count; bins missing from
        ``range(prevalences_length)`` are treated as zero
    prevalences_length: int
        number of bins of the histogram
    noise: float
        noise multiplier
    sensitivity: float
        sensitivity multiplier; the Laplace scale is ``noise * sensitivity``
    nb_quantiles: int
        number of evenly spaced probabilities ``(q + 1) / (nb_quantiles + 1)``
    size_pk: int
        upper bound (``y_max``) given to the isotonic regression
    iota: float
        probabilities are clipped to ``[iota, 1 - iota]`` (not applied when
        the curve is constant)
    isotonic_regression: bool
        when True, fit a decreasing isotonic regression on the noisy curve;
        if ``IsotonicRegression`` is not available (NameError) the noisy
        curve is used as is

    Returns
    -------
    Dict[float, float]
        quantile dict with keys 0, 1 and the (clipped) probabilities. When
        the curve is constant every key maps to the number of bins.
    """
    # Dense histogram: one bin per count, absent counts contribute zero.
    dense = prevalences.reindex(range(prevalences_length)).fillna(0)
    # Accumulate from the last bin down; the index now runs n-1 .. 0.
    tail_counts = dense.iloc[::-1].cumsum()
    laplace_scale = noise * sensitivity
    noisy = tail_counts + np.random.laplace(
        0, laplace_scale, prevalences_length
    )

    smoothed = None
    if isotonic_regression:
        try:
            regressor = IsotonicRegression(
                y_min=0,
                y_max=size_pk,
                increasing=False,
                out_of_bounds='clip',
            )
            regressor.fit(noisy.index, noisy.values)
            smoothed = pd.Series(
                regressor.transform(range(prevalences_length))
            )
        except NameError:
            # The sklearn import at the top of the file is optional; without
            # it the regression step is skipped.
            smoothed = None
    # Without a regression result, put the noisy curve back in 0 .. n-1 order.
    curve = noisy.iloc[::-1] if smoothed is None else smoothed

    probabilities = [
        (rank + 1) / (nb_quantiles + 1) for rank in range(nb_quantiles)
    ]

    if len(curve.unique()) == 1:
        # Constant curve: every quantile maps to the number of bins.
        return dict.fromkeys([0.0, 1.0] + probabilities, len(curve))

    first_level = curve.loc[0]
    last_level = curve.iloc[-1]
    quantiles: t.Dict[float, float] = {
        # Last bin still at the starting level / first bin at the final one.
        0: max(curve[curve == first_level].index),
        1: min(curve[curve == last_level].index),
    }

    span = first_level - last_level
    for raw_probability in probabilities:
        probability = np.clip(raw_probability, iota, 1 - iota)
        target = first_level - probability * span
        # Position of the curve value nearest to the target level.
        nearest_position = (curve - target).abs().argsort().iloc[0]
        nearest_level = curve.iloc[nearest_position]
        # Several bins may share that value: report their (truncated) mean.
        plateau = curve[curve == nearest_level].index
        quantiles[probability] = int(np.mean(plateau))
    return quantiles