# Repository URL to install this package:
# Version: 4.0.1
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_series_equal
from sarus_statistics.ops.histograms.local import NoiseKind, private_histogram
# Seed NumPy's global random generator so every run starts from the same state.
# NOTE(review): presumably private_histogram draws its noise from this
# generator -- confirm against its implementation.
np.random.seed(0)
# Value passed as ``noise`` to private_histogram in every test of this module;
# the tests compare results with an absolute tolerance of 1e-3 (or exactly), so
# it has to stay far below that.
NOISE = 1e-9
# Histogram bins, in the order used to index every expected Series.
CATEGORIES = ["England", "France", "Germany", "Italy", "Spain"]
# Name of the DataFrame column whose values are histogrammed.
COL_TO_EVAL = "countries"
@pytest.fixture()
def mixed_data(admin_cols):
    """Build the ten-row frame shared by the tests in this module.

    ``admin_cols`` is unpacked into three column names, used here for the
    boolean public flag, the user identifier and the row weight (in that
    order).  A fourth column, named by ``COL_TO_EVAL``, holds the country of
    each row.  Rows 0 and 5 are the only ones flagged public; both are
    "England" with weight 1.0.
    """
    public_col, user_col, weight_col = admin_cols

    # Column values are listed row by row; position i in each list is row i.
    public_flags = [
        True, False, False, False, False,
        True, False, False, False, False,
    ]
    users = ["A", "A", "B", "B", "C", "C", "A", "A", "B", "A"]
    row_weights = [1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 0.5, 0.5, 0.5, 0.5]
    countries = [
        "England", "Italy", "Spain", "France", "Italy",
        "England", "France", "France", "Italy", "Italy",
    ]

    columns = {
        public_col: public_flags,
        user_col: users,
        weight_col: row_weights,
        COL_TO_EVAL: countries,
    }
    return pd.DataFrame(data=columns)
@pytest.fixture()
def private_data(mixed_data, admin_cols):
    """Return the rows of ``mixed_data`` whose public flag is False."""
    public_col, _user_col, _weight_col = admin_cols
    # Work on the raw boolean array so the mask is applied positionally.
    is_public = mixed_data[public_col].values
    return mixed_data[~is_public]
@pytest.fixture()
def public_data(mixed_data, admin_cols):
    """Return the rows of ``mixed_data`` whose public flag is True."""
    # Only the public-flag column name is needed; the other two entries of
    # admin_cols are deliberately discarded (same convention as private_data).
    public, _, _ = admin_cols
    return mixed_data[mixed_data[public].values]
@pytest.mark.parametrize(
    "max_multiplicity,expected_hist",
    [
        # England (2.0) comes from the two public rows; the other bins are
        # consistent with each private user's weighted counts being rescaled
        # so that their sum does not exceed max_multiplicity.
        (1, pd.Series([2.0, 0.65, 0.0, 1.35, 0.5], index=CATEGORIES)),
        (2, pd.Series([2.0, 1.3, 0.0, 2.2, 1.0], index=CATEGORIES)),
    ],
)
def test_laplace_mixed_data(
    mixed_data, expected_hist, max_multiplicity, admin_cols
):
    """Laplace histogram over public and private rows together.

    With a noise value of ``NOISE`` the result must match ``expected_hist``
    within an absolute tolerance of 1e-3.
    """
    public_col, user_col, weight_col = admin_cols
    histogram_kwargs = dict(
        data=mixed_data,
        categories=CATEGORIES,
        noise=NOISE,
        data_col=COL_TO_EVAL,
        user_col=user_col,
        # NOTE(review): the keyword is ``private_col`` but it receives the
        # public-flag column -- confirm this is what private_histogram expects.
        private_col=public_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
        public_categories=["England"],
        noise_kind=NoiseKind.LAPLACE,
    )
    observed = pd.Series(private_histogram(**histogram_kwargs))
    assert_series_equal(observed, expected_hist, check_exact=False, atol=1e-3)
@pytest.mark.parametrize(
    "max_multiplicity,expected_hist",
    [
        # The expected histogram is the same for both multiplicities.
        (1, pd.Series([2.0, 0.0, 0.0, 0.0, 0.0], index=CATEGORIES)),
        (2, pd.Series([2.0, 0.0, 0.0, 0.0, 0.0], index=CATEGORIES)),
    ],
)
def test_laplace_gaussian__public_data(
    public_data, expected_hist, max_multiplicity, admin_cols
):
    """Histogram of public-only rows, checked for both noise kinds.

    Unlike the private/mixed tests, the comparison here is exact
    (``check_exact=True``): the result must equal ``expected_hist`` with no
    tolerance, first for Laplace and then for Gaussian noise.
    """
    public_col, user_col, weight_col = admin_cols
    # Same order as the original sequential checks: Laplace, then Gaussian.
    for kind in (NoiseKind.LAPLACE, NoiseKind.GAUSSIAN):
        result = private_histogram(
            data=public_data,
            categories=CATEGORIES,
            noise=NOISE,
            data_col=COL_TO_EVAL,
            user_col=user_col,
            private_col=public_col,
            weight_col=weight_col,
            max_multiplicity=max_multiplicity,
            public_categories=["England"],
            noise_kind=kind,
        )
        assert_series_equal(
            pd.Series(result), expected_hist, check_exact=True
        )
@pytest.mark.parametrize(
    "max_multiplicity,expected_hist",
    [
        # No public rows here, so England is 0.0; the remaining bins are
        # consistent with each user's weighted counts being rescaled so that
        # their sum does not exceed max_multiplicity.
        (1, pd.Series([0.0, 0.65, 0.0, 1.35, 0.5], index=CATEGORIES)),
        (2, pd.Series([0.0, 1.3, 0.0, 2.2, 1.0], index=CATEGORIES)),
    ],
)
def test_laplace_private_data(
    private_data, expected_hist, max_multiplicity, admin_cols
):
    """Laplace histogram over private rows only.

    With a noise value of ``NOISE`` the result must match ``expected_hist``
    within an absolute tolerance of 1e-3.
    """
    public_col, user_col, weight_col = admin_cols
    histogram_kwargs = dict(
        data=private_data,
        categories=CATEGORIES,
        noise=NOISE,
        data_col=COL_TO_EVAL,
        user_col=user_col,
        private_col=public_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
        public_categories=["England"],
        noise_kind=NoiseKind.LAPLACE,
    )
    observed = pd.Series(private_histogram(**histogram_kwargs))
    assert_series_equal(observed, expected_hist, check_exact=False, atol=1e-3)
@pytest.mark.parametrize(
    "max_multiplicity,expected_hist",
    [
        # The expected values are consistent with each user's vector of
        # weighted counts being rescaled to a Euclidean norm of at most
        # max_multiplicity (user A has norm sqrt(13)/2, user B sqrt(3/2),
        # user C 0.5).
        (
            1,
            pd.Series(
                [
                    0.0,
                    2 / np.sqrt(13) + 1 / np.sqrt(6),
                    0.0,
                    3 / np.sqrt(13) + 1 / np.sqrt(6) + 0.5,
                    np.sqrt(2 / 3),
                ],
                index=CATEGORIES,
            ),
        ),
        # With a bound of sqrt(2) only user A exceeds the bound.
        (
            np.sqrt(2),
            pd.Series(
                [
                    0.0,
                    2 * np.sqrt(2) / np.sqrt(13) + 0.5,
                    0.0,
                    3 * np.sqrt(2) / np.sqrt(13) + 0.5 + 0.5,
                    1,
                ],
                index=CATEGORIES,
            ),
        ),
    ],
)
def test_gaussian_private_data(
    private_data, expected_hist, max_multiplicity, admin_cols
):
    """Gaussian histogram over private rows only.

    With a noise value of ``NOISE`` the result must match ``expected_hist``
    within an absolute tolerance of 1e-3.
    """
    public_col, user_col, weight_col = admin_cols
    histogram_kwargs = dict(
        data=private_data,
        categories=CATEGORIES,
        noise=NOISE,
        data_col=COL_TO_EVAL,
        user_col=user_col,
        private_col=public_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
        public_categories=["England"],
        noise_kind=NoiseKind.GAUSSIAN,
    )
    observed = pd.Series(private_histogram(**histogram_kwargs))
    assert_series_equal(observed, expected_hist, check_exact=False, atol=1e-3)
@pytest.mark.parametrize(
    "max_multiplicity,expected_hist",
    [
        # England (2.0) comes from the two public rows.  The other bins are
        # consistent with each private user's vector of weighted counts being
        # rescaled to a Euclidean norm of at most max_multiplicity.
        (
            1,
            pd.Series(
                [
                    2.0,
                    2 / np.sqrt(13) + 1 / np.sqrt(6),
                    0.0,
                    3 / np.sqrt(13) + 1 / np.sqrt(6) + 0.5,
                    np.sqrt(2 / 3),
                ],
                index=CATEGORIES,
            ),
        ),
        (
            np.sqrt(2),
            pd.Series(
                [
                    2.0,
                    2 * np.sqrt(2) / np.sqrt(13) + 0.5,
                    0.0,
                    3 * np.sqrt(2) / np.sqrt(13) + 0.5 + 0.5,
                    1,
                ],
                index=CATEGORIES,
            ),
        ),
    ],
)
def test_gaussian_mixed_data(
    mixed_data, expected_hist, max_multiplicity, admin_cols
):
    """Gaussian histogram over public and private rows together.

    With a noise value of ``NOISE`` the result must match ``expected_hist``
    within an absolute tolerance of 1e-3.
    """
    public_col, user_col, weight_col = admin_cols
    histogram_kwargs = dict(
        data=mixed_data,
        categories=CATEGORIES,
        noise=NOISE,
        data_col=COL_TO_EVAL,
        user_col=user_col,
        private_col=public_col,
        weight_col=weight_col,
        max_multiplicity=max_multiplicity,
        public_categories=["England"],
        noise_kind=NoiseKind.GAUSSIAN,
    )
    observed = pd.Series(private_histogram(**histogram_kwargs))
    assert_series_equal(observed, expected_hist, check_exact=False, atol=1e-3)