Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Debian packages, RPM packages, and NuGet packages.

Repository URL to install this package:

Details    
Size: Mime:
from typing import Optional, Tuple, cast

import numpy as np
import pandas as pd

from sarus_statistics.ops.exp_quantile.exp_quantile import exp_quantile
from sarus_statistics.ops.utils import check_is_private, generator_from_seed


def get_quantile(  # pylint:disable=too-many-arguments,too-many-locals
    dataframe: pd.DataFrame,
    data_col: str,
    user_col: str,
    private_col: str,
    weight_col: str,
    sampling_ratio: float,
    noise: float,
    quantile: float,
    swap: bool,
    bounds: Tuple[float, float],
    rho: float = 1e-3,
    max_multiplicity: float = 1,
    random_generator: Optional[np.random.Generator] = None,
) -> float:
    """Compute a private quantile value of one column of a dataframe.

    The rows are first capped to at most ``max_multiplicity`` rows per
    user, then subsampled without replacement, and the sorted values of
    ``data_col`` are finally handed to ``exp_quantile``.

    Parameters
    ----------
    dataframe: pd.DataFrame
        dataset
    data_col: str
        name of the value's column
    user_col: str
        name of the users' column, used to group rows per user.
        NOTE(review): rows whose user is missing are excluded by the
        default ``groupby`` NA handling; confirm this is intended for
        public rows.
    private_col: str
        name of the column indicating the privacy status (only forwarded
        to ``check_is_private``)
    weight_col: str
        name of the weight's column (not used by this function)
    sampling_ratio: float
        sampling ratio in ]0,1]; the number of rows kept is
        ``int(len(dataframe) * sampling_ratio)``, computed on the input
        dataframe before the per-user cap is applied
    noise: float
        target epsilon = 1 / noise
    quantile: float
        quantile probability which value to compute, in [0,1]
    swap: bool
        if True: DP is considered in the "replace regime", else only
        "add/remove" (not used by this function)
    bounds: Tuple[float,float]
        (low, high) bounds passed to ``exp_quantile``
    rho: float
        add small uniform noise to fix degenerate cases with a lot of
        equal values (forwarded to ``exp_quantile``)
    max_multiplicity: float
        max number of rows kept per user; non-integer values are
        truncated towards zero
    random_generator: Optional[np.random.Generator]
        generator used for every random draw of the computation (row
        sampling and ``exp_quantile``), for reproducibility. When None,
        one is obtained from ``generator_from_seed(None)``.

    Returns
    -------
    float
        private quantile value

    Raises
    ------
    ValueError
        if sampling_ratio is not in ]0,1] or quantile is not in [0,1]
        (NaN is rejected in both cases)
    """
    if random_generator is None:
        random_generator = generator_from_seed(None)

    # Chained comparisons are False for NaN, so NaN inputs are rejected
    # here instead of failing later with an unrelated error.
    if not 0 < sampling_ratio <= 1:
        raise ValueError("Sampling ratio should be in ]0,1]")
    if not 0 <= quantile <= 1:
        raise ValueError("Quantile should be in [0,1]")

    # Sample size is derived from the input size, before the per-user cap.
    num_samples = int(len(dataframe) * sampling_ratio)

    check_is_private(dataframe, user_col, private_col)

    def _cap_user_rows(user_rows: pd.DataFrame) -> pd.DataFrame:
        """Keep at most ``max_multiplicity`` random rows of one user."""
        # ``sample`` only accepts an integer ``n``; casting after ``min``
        # keeps ``float("inf")`` usable as "no cap".
        keep = int(min(len(user_rows), max_multiplicity))
        return user_rows.sample(
            n=keep, replace=False, random_state=random_generator
        )

    # group_keys=False keeps the original row index, so no index level has
    # to be dropped afterwards whatever shape ``apply`` returns.
    dataframe = dataframe.groupby(
        user_col, sort=False, group_keys=False
    ).apply(_cap_user_rows)

    # Draw with the caller's generator so that a seeded run is reproducible.
    dataframe = cast(
        pd.DataFrame,
        dataframe.sample(
            n=min(len(dataframe), num_samples),
            replace=False,
            random_state=random_generator,
        ),
    )

    quantile_value = exp_quantile(
        sorted_data=dataframe[data_col].sort_values().values,
        data_low=bounds[0],
        data_high=bounds[1],
        quantile=quantile,
        eps=1 / noise,
        rho=rho,
        random_generator=random_generator,
    )

    return quantile_value