Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
Size: Mime:
from typing import Dict, Optional, Tuple, cast

import numpy as np
import pandas as pd

from sarus_statistics.ops.joint_exp.joint_exp import joint_exp
from sarus_statistics.ops.utils import (
    check_is_private,
    generator_from_seed,
    rescale_weights,
)


def get_quantiles(  # pylint:disable=too-many-arguments,too-many-locals
    dataframe: pd.DataFrame,
    data_col: str,
    user_col: str,
    private_col: str,
    weight_col: str,
    sampling_ratio: float,
    noise: float,
    nb_quantiles: int,
    swap: bool,
    bounds: Tuple[float, float],
    rho: float = 1e-3,
    max_multiplicity: float = 1,
    random_generator: Optional[np.random.Generator] = None,
) -> Dict[float, float]:
    """Compute quantiles of one continuous column on a sub-sample of rows.

    The dataframe is checked with ``check_is_private``, its weights are
    rescaled with ``rescale_weights``, a sub-sample of rows is drawn without
    replacement, and the quantile values are obtained from ``joint_exp``.

    Parameters
    ----------
    dataframe: pd.DataFrame
        dataset of continuous features
    data_col: str
        name of the value's column
    user_col: str
        name of the users' column (per the original documentation, a None
        user marks the row as public)
    private_col: str
        name of the column indicating the privacy status
    weight_col: str
        name of the weight's column
    sampling_ratio: float
        fraction of rows, in ]0,1], kept to compute the quantiles. The
        number of sampled rows is ``int(len(dataframe) * sampling_ratio)``
        and can therefore be 0 for small inputs.
    noise: float
        target epsilon = 1 / noise (a zero value raises ZeroDivisionError)
    nb_quantiles: int
        nb of quantiles to compute; the levels are ``i / (nb_quantiles + 1)``
        for ``i`` in ``1..nb_quantiles``
    swap: bool
        if True: DP is considered in the "replace regime", else only
        "add/remove"
    bounds: Tuple[float, float]
        (low, high) bounds on the distribution of ``data_col``
    rho: float
        add small uniform noise to fix degenerate cases with a lot of
        equal values
    max_multiplicity: float
        max number of identical users
    random_generator: Optional[np.random.Generator]
        generator used for both the row sub-sampling and ``joint_exp`` (for
        reproducibility). When None, one is obtained from
        ``generator_from_seed(None)``.

    Returns
    -------
    Dict[float, float]
        mapping from each quantile level to the value returned by
        ``joint_exp`` for that level

    Raises
    ------
    ValueError
        if sampling_ratio is not in ]0,1]
    """
    # Chained comparison also rejects NaN, which the two-sided `or` test
    # would let through to the int() conversion below.
    if not 0 < sampling_ratio <= 1:
        raise ValueError("Sampling ratio should be in ]0,1]")

    if random_generator is None:
        # The helper receives None as its seed; presumably this yields a
        # freshly (non-deterministically) seeded generator -- confirm.
        random_generator = generator_from_seed(None)

    # Sample size is derived from the row count *before* weight rescaling.
    num_samples = int(len(dataframe) * sampling_ratio)

    check_is_private(dataframe, user_col, private_col)
    dataframe = rescale_weights(
        data=dataframe,
        user_col=user_col,
        private_col=private_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
    )

    # Draw the row positions from the caller's generator so that the
    # sub-sample is reproducible; a bare `DataFrame.sample` call would use
    # NumPy's global random state and ignore `random_generator`.
    sampled_positions = random_generator.choice(
        len(dataframe), size=num_samples, replace=False
    )
    dataframe = cast(pd.DataFrame, dataframe.iloc[sampled_positions])

    quantile_indices = np.array(
        [i / (nb_quantiles + 1) for i in range(1, nb_quantiles + 1)]
    )

    # NOTE(review): the values are handed over as `sorted_data` but are not
    # sorted here -- confirm that `joint_exp` sorts (or does not need to).
    quantile_values = joint_exp(
        sorted_data=dataframe[data_col].values,
        weights=dataframe[weight_col].values,
        data_low=bounds[0],
        data_high=bounds[1],
        qs=quantile_indices,
        eps=1 / noise,
        swap=swap,
        max_multiplicity=max_multiplicity,
        rho=rho,
        random_generator=random_generator,
    )

    return {
        cast(float, level): cast(float, value)
        for level, value in zip(quantile_indices, quantile_values)
    }