# Package index page header (extraction residue, not part of the module):
# Repository URL to install this package: <not captured>
# Version: 4.0.1
import numpy as np
import pandas as pd
import pytest
from sarus_statistics.ops.bounds.local import automatic_column_range_pandas
# Seed NumPy's global RNG so the random draws made by the fixture below are
# reproducible for a given test execution order.
np.random.seed(0)

# Interval onto which the synthetic "data" column is rescaled.
LOWER_BOUND, UPPER_BOUND = 10, 100
# Value forwarded as `noise` to automatic_column_range_pandas.
NOISE = 1e-2
# Number of rows in the synthetic frame.
N_SAMPLES = 10_000
# Not referenced by the tests visible in this file.
N_QUERIES = [4092, 508, 128, 64, 32, 8]
# Pool of user ids from which the "user" column is sampled.
USERS = [*range(1000)]
@pytest.fixture()
def bounds_data():
    """Build a synthetic N_SAMPLES-row frame for the bounds tests.

    Columns:
        data: uniform draws rescaled onto the LOWER_BOUND..UPPER_BOUND
            interval.
        user: ids sampled from USERS.
        multiplicity: the constant 1 on every row.
    """
    span = UPPER_BOUND - LOWER_BOUND
    # Both draws consume the global NumPy RNG, so their order is kept:
    # first the data values, then the user ids.
    values = np.random.uniform(size=N_SAMPLES) * span + LOWER_BOUND
    user_ids = np.random.choice(USERS, size=N_SAMPLES)
    return pd.DataFrame(
        {
            "data": values,
            "user": user_ids,
            "multiplicity": [1] * N_SAMPLES,
        }
    )
@pytest.mark.parametrize(
    "dtype",
    ["float64", "float32", "int64", "int32", "int16", "int8"],
)
def test_automatic_column_range_pandas(bounds_data, dtype):
    """Check the range computed on the synthetic frame for each dtype.

    The frame is cast to the parametrized dtype, every row is flagged as
    non-public, and the returned pair is expected to be exactly (8, 128).
    """
    # NOTE(review): astype with a single dtype casts every column, so the
    # "user" ids (up to 999) are also cast; they do not fit in int8.
    # Confirm this is intended rather than casting only the "data" column.
    frame = bounds_data.astype(dtype=dtype)
    frame["is_public"] = [False] * N_SAMPLES

    lower, upper = automatic_column_range_pandas(
        data=frame,
        data_col="data",
        user_col="user",
        private_col="is_public",
        weight_col="multiplicity",
        dtype=dtype,
        noise=NOISE,
    )

    assert lower == 8
    assert upper == 128
def test_bounds(ops_data, admin_cols):
    """Check the range computed for the "integer" column of ``ops_data``.

    ``ops_data`` and ``admin_cols`` are fixtures defined outside this file;
    ``admin_cols`` unpacks to the public-flag column, the user column and
    the weight column, in that order. The returned pair is expected to be
    approximately (0, 1024).
    """
    public, user_col, weights = admin_cols
    min_value, max_value = automatic_column_range_pandas(
        ops_data,
        data_col="integer",
        user_col=user_col,
        private_col=public,
        weight_col=weights,
        dtype="int64",
        noise=1e-9,
        max_multiplicity=10,
    )
    # The expected values go inside approx so the relative tolerance is
    # derived from them, not from the values under test. For the expected 0
    # only approx's default absolute tolerance applies.
    assert [min_value, max_value] == pytest.approx([0, 1024], rel=1e-2)