Repository URL to install this package:
|
Version:
4.5.4.dev1 ▾
|
import typing as t
from sarus_statistics.ops.utils import rescale_weights
import pandas as pd
from sarus_data_spec.arrow.admin_utils import (
async_admin_data,
compute_admin_data,
)
from sarus_data_spec.constants import PUBLIC, PU_COLUMN, WEIGHTS
from sarus_data_spec.manager.async_utils import sync
from sarus_data_spec.manager.cache_utils import lru_caching
import sarus_data_spec.typing as st
@lru_caching("statistics", use_first_arg=True)
def compute_full_data_from_dataspec(dataset: st.Dataset) -> pd.DataFrame:
    """Return the dataset as a DataFrame joined with its admin data.

    The dataset is first converted to a pandas DataFrame. Its admin data is
    then fetched synchronously, processed by ``compute_admin_data`` together
    with that DataFrame, converted to pandas and joined onto the data
    (``DataFrame.join`` default: left join on the index).

    Args:
        dataset: The dataspec dataset to materialize. ``dataset.pup_token()``
            must not be ``None``.

    Returns:
        The data columns together with the admin columns.

    Raises:
        AssertionError: If ``dataset.pup_token()`` is ``None``.
    """
    frame = t.cast(pd.DataFrame, dataset.to(pd.DataFrame))
    assert dataset.pup_token() is not None
    # async_admin_data is a coroutine; `sync` runs it to completion here.
    raw_admin = sync(async_admin_data(dataset))
    admin_frame = compute_admin_data(raw_admin, frame).to_pandas()
    return frame.join(admin_frame)
@lru_caching("statistics", use_first_arg=True)
def compute_rescaled_data_from_dataspec(
    dataset: st.Dataset, max_multiplicity: float
) -> pd.DataFrame:
    """Return the full data of ``dataset`` after ``rescale_weights``.

    Args:
        dataset: The dataspec dataset whose full data (data plus admin
            columns) is rescaled.
        max_multiplicity: Forwarded unchanged to ``rescale_weights``.

    Returns:
        Whatever ``rescale_weights`` returns for the full data, using the
        ``PU_COLUMN``, ``PUBLIC`` and ``WEIGHTS`` columns.
    """
    # NOTE(review): the cache is declared with use_first_arg=True; if that
    # keys entries on `dataset` alone, calls differing only in
    # `max_multiplicity` would share a cached result -- confirm against
    # lru_caching.
    return rescale_weights(
        data=compute_full_data_from_dataspec(dataset),
        user_col=PU_COLUMN,
        private_col=PUBLIC,
        weight_col=WEIGHTS,
        max_multiplicity=max_multiplicity,
    )
@lru_caching("statistics", use_first_arg=True)
def compute_groupby_data_from_dataspec(
    dataset: st.Dataset,
) -> t.Tuple[t.List[t.Tuple[t.Any, pd.DataFrame]], t.Any]:
    """Split a grouped dataset into per-group frames with admin columns.

    The dataset is converted to a pandas ``DataFrameGroupBy``. Each group's
    frame is joined (left join on the index) with the ``PU_COLUMN``,
    ``PUBLIC`` and ``WEIGHTS`` columns of the dataset's full data.

    Args:
        dataset: A dataspec dataset convertible to a ``DataFrameGroupBy``.

    Returns:
        A pair ``(groups, keys_name)`` where ``groups`` is a list of
        ``(group_key, group_frame)`` tuples in groupby iteration order and
        ``keys_name`` is the groupby object's ``keys`` attribute, i.e. the
        grouping key(s), not a number.
    """
    grouped: pd.core.groupby.DataFrameGroupBy = dataset.to(
        pd.core.groupby.DataFrameGroupBy
    )
    keys_name = grouped.keys
    # The admin columns are the same for every group: select them once.
    admin_columns = compute_full_data_from_dataspec(dataset)[
        [PU_COLUMN, PUBLIC, WEIGHTS]
    ]
    groups = [
        (key, group_df.join(admin_columns)) for key, group_df in grouped
    ]
    return groups, keys_name
@lru_caching("statistics", use_first_arg=True)
def compute_rescaled_groupby_data_from_dataspec(
    dataset: st.Dataset, max_multiplicity: float
) -> t.Tuple[t.List[t.Tuple[t.Any, pd.DataFrame]], t.Any]:
    """Apply ``rescale_weights`` to every group of a grouped dataset.

    Args:
        dataset: A dataspec dataset convertible to a ``DataFrameGroupBy``.
        max_multiplicity: Forwarded unchanged to ``rescale_weights`` for
            each group.

    Returns:
        A pair ``(groups, keys_name)`` where ``groups`` is a list of
        ``(group_key, rescaled_group_frame)`` tuples, in the order produced
        by ``compute_groupby_data_from_dataspec``, and ``keys_name`` is the
        grouping key(s) passed through from that function.
    """
    # NOTE(review): the cache is declared with use_first_arg=True; if that
    # keys entries on `dataset` alone, calls differing only in
    # `max_multiplicity` would share a cached result -- confirm against
    # lru_caching.
    groups, keys_name = compute_groupby_data_from_dataspec(dataset)
    rescaled_groups = [
        (
            key,
            rescale_weights(
                data=group_data,
                user_col=PU_COLUMN,
                private_col=PUBLIC,
                weight_col=WEIGHTS,
                max_multiplicity=max_multiplicity,
            ),
        )
        for key, group_data in groups
    ]
    return rescaled_groups, keys_name