Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Debian packages, RPM packages, and NuGet packages.

Repository URL to install this package:

Details    
sarus_statistics / tests / unit / test_ops / test_histograms.py
Size: Mime:
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_series_equal

from sarus_statistics.ops.histograms.local import NoiseKind, private_histogram

# Seed NumPy's global RNG once, at import time, so runs are repeatable.
# NOTE(review): presumably private_histogram draws its noise from this global
# RNG -- confirm. Seeding here also affects anything else in the same process
# that uses np.random.
np.random.seed(0)
# Noise level passed as ``noise=`` to every private_histogram call below.
# It is far smaller than the atol=1e-3 used by the approximate comparisons.
NOISE = 1e-9
# Category labels: passed as ``categories=`` and used as the index of every
# expected Series, so expected values are listed in this order.
CATEGORIES = ["England", "France", "Germany", "Italy", "Spain"]

# Name of the DataFrame column holding the category values (``data_col=``).
COL_TO_EVAL = "countries"


@pytest.fixture()
def mixed_data(admin_cols):
    """Build the ten-row frame that mixes public and private rows.

    ``admin_cols`` is a fixture defined outside this file; it is unpacked
    into three column names used for the public flag, the user label and the
    row weight. Category values are stored under ``COL_TO_EVAL``.
    """
    public, user_col, weights = admin_cols

    # One tuple per row: (public flag, user, weight, country).
    records = [
        (True, "A", 1.0, "England"),
        (False, "A", 1.0, "Italy"),
        (False, "B", 1.0, "Spain"),
        (False, "B", 0.5, "France"),
        (False, "C", 0.5, "Italy"),
        (True, "C", 1.0, "England"),
        (False, "A", 0.5, "France"),
        (False, "A", 0.5, "France"),
        (False, "B", 0.5, "Italy"),
        (False, "A", 0.5, "Italy"),
    ]
    # Transpose the rows back into one plain list per column.
    flags, users, row_weights, countries = map(list, zip(*records))

    return pd.DataFrame(
        data={
            public: flags,
            user_col: users,
            weights: row_weights,
            COL_TO_EVAL: countries,
        }
    )


@pytest.fixture()
def private_data(mixed_data, admin_cols):
    """Return the rows of ``mixed_data`` whose public flag is False."""
    flag_col, _, _ = admin_cols
    is_public = mixed_data[flag_col].values
    return mixed_data[~is_public]


@pytest.fixture()
def public_data(mixed_data, admin_cols):
    """Return the rows of ``mixed_data`` whose public flag is True."""
    # Only the public-flag column name is needed; the other two are discarded.
    public, _, _ = admin_cols

    return mixed_data[mixed_data[public].values]


@pytest.mark.parametrize(
    ("max_multiplicity", "expected_hist"),
    [
        (1, pd.Series([2.0, 0.65, 0.0, 1.35, 0.5], index=CATEGORIES)),
        (2, pd.Series([2.0, 1.3, 0.0, 2.2, 1.0], index=CATEGORIES)),
    ],
)
def test_laplace_mixed_data(
    mixed_data, expected_hist, max_multiplicity, admin_cols
):
    """Laplace histogram over the public and private rows together.

    With ``noise=NOISE`` the result must match ``expected_hist`` within
    ``atol=1e-3``. The expected numbers are consistent with "England" (the
    only public category) being the plain weight sum of the two public rows,
    and every other category summing per-user weighted counts scaled down so
    that a user's total is at most ``max_multiplicity`` -- e.g. user A's
    (France 1.0, Italy 1.5) becomes (0.4, 0.6) for ``max_multiplicity=1``.

    NOTE(review): the column unpacked as ``public`` is passed as
    ``private_col`` -- confirm the naming against private_histogram.
    """
    flag_col, uid_col, weight_name = admin_cols

    call_kwargs = {
        "data": mixed_data,
        "categories": CATEGORIES,
        "noise": NOISE,
        "data_col": COL_TO_EVAL,
        "user_col": uid_col,
        "private_col": flag_col,
        "weight_col": weight_name,
        "max_multiplicity": max_multiplicity,
        "public_categories": ["England"],
        "noise_kind": NoiseKind.LAPLACE,
    }
    observed = pd.Series(private_histogram(**call_kwargs))

    # Noise is negligible, so an approximate comparison is enough.
    assert_series_equal(observed, expected_hist, atol=1e-3, check_exact=False)


@pytest.mark.parametrize(
    ("max_multiplicity", "expected_hist"),
    [
        (1, pd.Series([2.0, 0.0, 0.0, 0.0, 0.0], index=CATEGORIES)),
        (2, pd.Series([2.0, 0.0, 0.0, 0.0, 0.0], index=CATEGORIES)),
    ],
)
def test_laplace_gaussian__public_data(
    public_data, expected_hist, max_multiplicity, admin_cols
):
    """Public-only rows must give exactly the same histogram for both noises.

    The comparison uses ``check_exact=True``: "England" must equal 2.0 and
    every other category 0.0, for Laplace and then for Gaussian noise.
    """
    flag_col, uid_col, weight_name = admin_cols

    # Same call twice, differing only in the noise kind (Laplace first).
    for kind in (NoiseKind.LAPLACE, NoiseKind.GAUSSIAN):
        observed = private_histogram(
            data=public_data,
            categories=CATEGORIES,
            noise=NOISE,
            data_col=COL_TO_EVAL,
            user_col=uid_col,
            private_col=flag_col,
            weight_col=weight_name,
            max_multiplicity=max_multiplicity,
            public_categories=["England"],
            noise_kind=kind,
        )
        assert_series_equal(
            pd.Series(observed), expected_hist, check_exact=True
        )


@pytest.mark.parametrize(
    ("max_multiplicity", "expected_hist"),
    [
        (1, pd.Series([0.0, 0.65, 0.0, 1.35, 0.5], index=CATEGORIES)),
        (2, pd.Series([0.0, 1.3, 0.0, 2.2, 1.0], index=CATEGORIES)),
    ],
)
def test_laplace_private_data(
    private_data, expected_hist, max_multiplicity, admin_cols
):
    """Laplace histogram over the private rows only.

    With ``noise=NOISE`` the result must match ``expected_hist`` within
    ``atol=1e-3``. "England" has no private rows, so it is expected to be
    0.0; the remaining values are consistent with per-user weighted counts
    scaled down so a user's total is at most ``max_multiplicity``.
    """
    flag_col, uid_col, weight_name = admin_cols

    call_kwargs = {
        "data": private_data,
        "categories": CATEGORIES,
        "noise": NOISE,
        "data_col": COL_TO_EVAL,
        "user_col": uid_col,
        "private_col": flag_col,
        "weight_col": weight_name,
        "max_multiplicity": max_multiplicity,
        "public_categories": ["England"],
        "noise_kind": NoiseKind.LAPLACE,
    }
    observed = pd.Series(private_histogram(**call_kwargs))

    # Noise is negligible, so an approximate comparison is enough.
    assert_series_equal(observed, expected_hist, atol=1e-3, check_exact=False)


@pytest.mark.parametrize(
    ("max_multiplicity", "expected_hist"),
    [
        (
            1,
            pd.Series(
                [
                    0.0,  # England
                    2 / np.sqrt(13) + 1 / np.sqrt(6),  # France
                    0.0,  # Germany
                    3 / np.sqrt(13) + 1 / np.sqrt(6) + 0.5,  # Italy
                    np.sqrt(2 / 3),  # Spain
                ],
                index=CATEGORIES,
            ),
        ),
        (
            np.sqrt(2),
            pd.Series(
                [
                    0.0,  # England
                    2 * np.sqrt(2) / np.sqrt(13) + 0.5,  # France
                    0.0,  # Germany
                    3 * np.sqrt(2) / np.sqrt(13) + 0.5 + 0.5,  # Italy
                    1,  # Spain
                ],
                index=CATEGORIES,
            ),
        ),
    ],
)
def test_gaussian_private_data(
    private_data, expected_hist, max_multiplicity, admin_cols
):
    """Gaussian histogram over the private rows only.

    With ``noise=NOISE`` the result must match ``expected_hist`` within
    ``atol=1e-3``. The expected values are consistent with rescaling each
    user's weighted-count vector to an L2 norm of at most
    ``max_multiplicity``: user A (France 1.0, Italy 1.5) has norm
    sqrt(13)/2, user B (France 0.5, Italy 0.5, Spain 1.0) has norm
    sqrt(6)/2, and user C (Italy 0.5) is below both bounds.
    """
    flag_col, uid_col, weight_name = admin_cols

    call_kwargs = {
        "data": private_data,
        "categories": CATEGORIES,
        "noise": NOISE,
        "data_col": COL_TO_EVAL,
        "user_col": uid_col,
        "private_col": flag_col,
        "weight_col": weight_name,
        "max_multiplicity": max_multiplicity,
        "public_categories": ["England"],
        "noise_kind": NoiseKind.GAUSSIAN,
    }
    observed = pd.Series(private_histogram(**call_kwargs))

    # Noise is negligible, so an approximate comparison is enough.
    assert_series_equal(observed, expected_hist, atol=1e-3, check_exact=False)


@pytest.mark.parametrize(
    ("max_multiplicity", "expected_hist"),
    [
        (
            1,
            pd.Series(
                [
                    2.0,  # England
                    2 / np.sqrt(13) + 1 / np.sqrt(6),  # France
                    0.0,  # Germany
                    3 / np.sqrt(13) + 1 / np.sqrt(6) + 0.5,  # Italy
                    np.sqrt(2 / 3),  # Spain
                ],
                index=CATEGORIES,
            ),
        ),
        (
            np.sqrt(2),
            pd.Series(
                [
                    2.0,  # England
                    2 * np.sqrt(2) / np.sqrt(13) + 0.5,  # France
                    0.0,  # Germany
                    3 * np.sqrt(2) / np.sqrt(13) + 0.5 + 0.5,  # Italy
                    1,  # Spain
                ],
                index=CATEGORIES,
            ),
        ),
    ],
)
def test_gaussian_mixed_data(
    mixed_data, expected_hist, max_multiplicity, admin_cols
):
    """Gaussian histogram over the public and private rows together.

    With ``noise=NOISE`` the result must match ``expected_hist`` within
    ``atol=1e-3``. "England" (the only public category) is expected to be
    the plain weight sum of the two public rows, 2.0. The other values are
    consistent with rescaling each user's weighted-count vector to an L2
    norm of at most ``max_multiplicity``.
    """
    flag_col, uid_col, weight_name = admin_cols

    call_kwargs = {
        "data": mixed_data,
        "categories": CATEGORIES,
        "noise": NOISE,
        "data_col": COL_TO_EVAL,
        "user_col": uid_col,
        "private_col": flag_col,
        "weight_col": weight_name,
        "max_multiplicity": max_multiplicity,
        "public_categories": ["England"],
        "noise_kind": NoiseKind.GAUSSIAN,
    }
    observed = pd.Series(private_histogram(**call_kwargs))

    # Noise is negligible, so an approximate comparison is enough.
    assert_series_equal(observed, expected_hist, atol=1e-3, check_exact=False)