Repository URL to install this package:
|
Version:
4.0.1 ▾
|
import math
from typing import Any, List, Optional, Union
import numpy as np
import pandas as pd
from sarus_statistics.ops.utils import check_is_private, generator_from_seed
# Type alias: either None or a list whose items are all str, all float,
# all int or all bool. Presumably describes categorical values (judging by
# the name); it is not used in the visible part of this file.
CatType = Optional[Union[List[str], List[float], List[int], List[bool]]]
def dataset_above_tau_threshold(
    data: pd.DataFrame,
    user_col: str,
    private_col: str,
    weight_col: str,
    keys: Optional[List[Any]],
    epsilon: float,
    delta: float,
    max_multiplicity: float = 1,
    random_generator: Optional[np.random.Generator] = None,
) -> pd.DataFrame:
    """Keep only the groups whose noisy distinct-user count exceeds tau.

    The data is grouped by ``keys``. For every group the number of distinct
    values in ``user_col`` is computed, Laplace noise with scale
    ``max_multiplicity / epsilon`` is added, and the group is kept only when
    the noisy count is strictly greater than
    ``compute_tau(epsilon, delta, max_multiplicity)``.

    Args:
        data: Input rows.
        user_col: Column holding the user identifier counted per group.
        private_col: Column forwarded to ``check_is_private`` together with
            ``user_col``.
        weight_col: Accepted for interface compatibility; this function
            never reads it.
        keys: Grouping keys handed to ``DataFrame.groupby``.
        epsilon: Privacy parameter; divides the Laplace noise scale.
        delta: Privacy parameter used to derive the threshold tau.
        max_multiplicity: Numerator of the noise scale; also passed to
            ``compute_tau`` as its ``max_ids`` argument.
        random_generator: Source of randomness. When ``None`` a generator is
            obtained from ``generator_from_seed(None)``.

    Returns:
        The noisy user counts of the surviving groups, indexed by group key,
        with the rejected groups removed.
        NOTE(review): the annotation says ``pd.DataFrame`` but a scalar
        returning ``groupby.apply`` usually yields a ``pd.Series`` --
        confirm against callers before changing the annotation.
    """
    if random_generator is None:
        random_generator = generator_from_seed(None)

    check_is_private(data, user_col, private_col)

    noise_scale = max_multiplicity / epsilon
    # The threshold depends only on the privacy parameters, so compute it
    # once instead of once per group.
    tau = compute_tau(epsilon, delta, max_multiplicity)

    def process_group(group: pd.DataFrame) -> Optional[float]:
        # dropna=False keeps a missing user id counted as one distinct value,
        # exactly like len(Series.unique()).
        distinct_users = group[user_col].nunique(dropna=False)
        noisy_count = distinct_users + random_generator.laplace(
            0, noise_scale
        )
        return noisy_count if noisy_count > tau else None

    noisy_counts = data.groupby(keys, sort=False).apply(process_group)
    # Groups rejected by the threshold were mapped to None; drop them.
    return noisy_counts.dropna()
def compute_tau(epsilon: float, delta: float, max_ids: float) -> float:
    """Return the threshold tau for the given privacy parameters.

    Evaluates::

        tau = 1 - max_ids * ln(2 - 2 * (1 - delta) ** (1 / max_ids)) / epsilon

    The inner term ``1 - (1 - delta) ** (1 / max_ids)`` is computed through
    ``log1p``/``expm1``. The direct form loses all precision for very small
    ``delta`` (``1 - delta`` rounds to 1.0, the logarithm's argument becomes
    0 and ``math.log`` raises), whereas this form stays accurate.

    Args:
        epsilon: Privacy parameter dividing the logarithmic term.
        delta: Privacy parameter; values in (0, 1] give a finite result.
        max_ids: Exponent/scale factor of the formula (presumably the maximum
            number of groups a single user can contribute to -- confirm with
            callers).

    Returns:
        The threshold as a float.

    Raises:
        ValueError: If the logarithm's argument is not positive
            (e.g. ``delta <= 0``).
        ZeroDivisionError: If ``max_ids`` or ``epsilon`` is zero.
    """
    if delta < 1:
        # 1 - (1 - delta) ** (1 / max_ids), without catastrophic cancellation.
        per_id_delta = -math.expm1(math.log1p(-delta) / max_ids)
    else:
        # log1p is undefined at -1 and below; keep the direct expression so
        # delta >= 1 behaves exactly as it always did.
        per_id_delta = 1 - (1 - delta) ** (1 / max_ids)
    return 1 - max_ids * math.log(2 * per_id_delta) / epsilon