# Package version: 4.0.1
from typing import Optional, Tuple, cast
import numpy as np
import pandas as pd
from sarus_statistics.ops.histograms.local import dataset_length
from sarus_statistics.ops.mean.local import mean
from sarus_statistics.ops.sum.local import sum_op
def std(
    data: pd.DataFrame,
    data_col: str,
    user_col: str,
    private_col: str,
    weight_col: str,
    noise_mean: float,
    noise_square: float,
    noise_count: float,
    bounds: Tuple[float, float],
    max_multiplicity: float,
    random_generator: Optional[np.random.Generator] = None,
    is_data_already_scaled: bool = False,
) -> float:
    """Compute the DP standard deviation of ``data_col`` according to
    ``max_multiplicity`` of user.

    This is a very naive implementation: it combines three independently
    noised quantities — a DP mean, a DP sum of squared deviations and a
    DP count — as ``sqrt(noisy_sum_of_squares / noisy_count)``.

    Args:
        data: Input dataframe. It is NOT modified by this function.
        data_col: Column whose standard deviation is estimated.
        user_col: Column identifying the contributing user.
        private_col: Column carrying the privacy unit.
        weight_col: Column with per-row weights.
        noise_mean: Noise scale forwarded to the DP mean.
        noise_square: Noise scale forwarded to the DP sum of squares.
        noise_count: Noise scale forwarded to the DP count.
        bounds: Clipping bounds for ``data_col``; order-insensitive
            (min/max are taken explicitly below).
        max_multiplicity: Maximum contribution of a single user.
        random_generator: Optional generator for reproducible noise.
        is_data_already_scaled: Forwarded to the mean/sum primitives.

    Returns:
        The noisy standard deviation estimate. NOTE(review): if the noisy
        sum of squares or the noisy count makes the ratio negative, this
        returns NaN — preserved from the original behavior.
    """
    # Work on a copy so the caller's dataframe is never mutated.
    copy = data.copy()
    # BUG FIX: the original clipped ``data`` (the caller's frame) AFTER
    # taking the copy, so the downstream mean/sum/count ran on UNCLIPPED
    # data and the caller's frame was mutated as a side effect. Clip the
    # working copy instead.
    copy[data_col] = np.clip(
        copy[data_col],
        a_min=min(bounds[0], bounds[1]),
        a_max=max(bounds[0], bounds[1]),
    )
    # DP estimate of the mean, used to center the data.
    mean_result = mean(
        copy,
        data_col,
        user_col,
        private_col,
        weight_col,
        noise_mean,
        bounds,
        max_multiplicity,
        random_generator,
        is_data_already_scaled,
    )
    copy[data_col] = copy[data_col] - mean_result
    # Squared deviations are bounded above by the largest squared distance
    # from the (noisy) mean to either clipping bound, and below by 0.
    squared_centered_bounds = (
        0,
        max((bounds[0] - mean_result) ** 2, (bounds[1] - mean_result) ** 2),
    )
    copy[data_col] = copy[data_col].apply(np.square)
    # DP sum of the squared deviations.
    sum_result = sum_op(
        copy,
        data_col,
        user_col,
        private_col,
        weight_col,
        noise_square,
        squared_centered_bounds,
        max_multiplicity,
        random_generator,
        is_data_already_scaled,
    )
    # DP count of rows: the denominator of the variance estimate.
    length = dataset_length(
        copy,
        user_col,
        private_col,
        weight_col,
        noise_count,
        max_multiplicity,
        random_generator,
    )
    std_result = cast(float, np.sqrt(sum_result / length))
    return std_result