Repository URL to install this package:
|
Version:
4.5.4.dev1 ▾
|
import typing as t
import warnings
try:
from sarus_data_spec.sarus_statistics.tasks.bounds.base import (
BoundsParameters,
)
from sarus_data_spec.sarus_statistics.tasks.marginals.base import (
MarginalsParameters,
)
from sarus_data_spec.sarus_statistics.tasks.max_multiplicity.base import (
MaxMultiplicityParameters,
)
from sarus_data_spec.sarus_statistics.tasks.size.base import SizeParameters
import sarus_data_spec.sarus_statistics.tasks.links.links as links_classes
import sarus_data_spec.sarus_statistics.tasks.links.links_sql as links_sql
except ModuleNotFoundError:
warnings.warn(
"Sarus_statistics Module not found, marginals operations not "
"available "
)
from sarus_data_spec.bounds import bounds as bounds_builder
from sarus_data_spec.constants import (
BOUNDS_TASK,
LINKS_TASK,
MARGINALS_TASK,
MULTIPLICITY_TASK,
SIZE_TASK,
)
from sarus_data_spec.manager.ops.source.query_builder import (
qb_parameters,
size_parameters,
synthetic_parameters,
)
from sarus_data_spec.marginals import marginals as marginals_builder
from sarus_data_spec.multiplicity import multiplicity as multiplicity_builder
from sarus_data_spec.scalar import Scalar
from sarus_data_spec.size import size as size_builder
import sarus_data_spec.typing as st
from sarus_synthetic_data.configs.global_config import SyntheticConfig
async def get_multiplicity(dataset: st.Dataset) -> st.Multiplicity:
    """Build the multiplicity object for ``dataset``.

    Steps, in order:

    1. Fetch the attributes budget with ``get_attr_budget``.
    2. Ask the query builder (``size_parameters``) for the parameters of
       the ``MULTIPLICITY_TASK`` under that budget.
    3. Wrap them in ``MaxMultiplicityParameters`` and run ``compute`` on
       the dataset. (Per the original notes, the query-builder output is
       a serialized form that this wrapper turns back into protobufs --
       not verifiable from this module.)
    4. Hand the resulting statistics to ``multiplicity_builder``.
    """
    budget = await get_attr_budget(dataset)
    raw_params = await size_parameters(
        dataset, budget=budget, task=MULTIPLICITY_TASK
    )
    # Keep the raw query-builder output and the statistics task under
    # separate names instead of rebinding a single variable.
    multiplicity_task = MaxMultiplicityParameters(raw_params)
    computed = multiplicity_task.compute(dataset)
    return multiplicity_builder(dataset=dataset, statistics=computed)
async def get_size(dataset: st.Dataset) -> st.Size:
    """Build the size object for ``dataset``.

    Steps, in order:

    1. Fetch the attributes budget with ``get_attr_budget``.
    2. Ask the query builder (``size_parameters``) for the parameters of
       the ``SIZE_TASK`` under that budget.
    3. Wrap them in ``SizeParameters`` and run ``compute`` on the
       dataset. (Per the original notes, the query-builder output is a
       serialized form that this wrapper turns back into protobufs --
       not verifiable from this module.)
    4. Hand the resulting statistics to ``size_builder``.
    """
    budget = await get_attr_budget(dataset)
    raw_params = await size_parameters(
        dataset, budget=budget, task=SIZE_TASK
    )
    # Separate names for the raw parameters and the statistics task.
    size_task = SizeParameters(raw_params)
    computed = size_task.compute(dataset)
    return size_builder(dataset=dataset, statistics=computed)
async def get_bounds(dataset: st.Dataset) -> st.Bounds:
    """Build the bounds object for ``dataset``.

    Steps, in order:

    1. Fetch the attributes budget with ``get_attr_budget``.
    2. Ask the query builder (``qb_parameters``) for the parameters of
       the ``BOUNDS_TASK`` under that total budget.
    3. Wrap them in ``BoundsParameters`` and run ``compute`` on the
       dataset. (Per the original notes, the query-builder output is a
       serialized form that this wrapper turns back into protobufs --
       not verifiable from this module.)
    4. Hand the resulting statistics to ``bounds_builder``.
    """
    budget = await get_attr_budget(dataset)
    # Query-builder parameters for the bounds task.
    raw_params = await qb_parameters(
        dataset, total_budget=budget, task=BOUNDS_TASK
    )
    bounds_task = BoundsParameters(raw_params)
    computed = bounds_task.compute(dataset)
    return bounds_builder(dataset=dataset, statistics=computed)
async def get_marginals(dataset: st.Dataset) -> st.Marginals:
    """Build the marginals object for ``dataset``.

    Steps, in order:

    1. Fetch the attributes budget with ``get_attr_budget``.
    2. Ask the query builder (``qb_parameters``) for the parameters of
       the ``MARGINALS_TASK`` under that total budget.
    3. Wrap them in ``MarginalsParameters`` and run ``compute`` on the
       dataset.
    4. Hand the resulting statistics to ``marginals_builder``.
    """
    budget = await get_attr_budget(dataset)
    # Query-builder parameters for the marginals task.
    raw_params = await qb_parameters(
        dataset, total_budget=budget, task=MARGINALS_TASK
    )
    marginals_task = MarginalsParameters(raw_params)
    computed = marginals_task.compute(dataset)
    return marginals_builder(dataset=dataset, statistics=computed)
async def get_attr_budget(dataset: st.Dataset) -> t.Tuple[float, float]:
    """Return the total budget assigned to the attributes of ``dataset``.

    The dataset must be the output of a ``budget_assignment`` transform;
    this is checked with an assertion. The budget is the value of the
    parent registered under the ``"attributes_budget"`` key, resolved
    asynchronously through the dataset's manager.

    The result is typed as a pair of floats.
    NOTE(review): presumably an (epsilon, delta) pair -- confirm against
    the budget_assignment transform.
    """
    assert dataset.transform().name() == "budget_assignment"
    _, named_parents = dataset.parents()
    manager = dataset.manager()
    # The named parent is only cast for the type checker; no conversion
    # happens at runtime.
    budget_scalar = t.cast(Scalar, named_parents["attributes_budget"])
    budget_value = await manager.async_value(budget_scalar)
    return t.cast(t.Tuple[float, float], budget_value)
async def get_links(dataset: st.Dataset) -> st.Links:
    """Compute the distribution of links (foreign keys) for ``dataset``.

    The input should be the synthetic dataset, since links are used to
    generate it. Two named parents of the dataset are read:

    * ``"sd_budget"``: scalar whose value is the budget passed to the
      query builder.
    * ``"synthetic_model"``: scalar whose value, resolved through the
      dataset's manager, is treated as a ``SyntheticConfig``.

    The query builder (``synthetic_parameters``) is then asked for the
    ``LINKS_TASK`` parameters, and their ``random_links`` field is given
    to the SQL implementation when the manager reports the dataset as
    big data, or to the in-memory implementation otherwise.
    """
    # Budget: resolved directly on the parent scalar.
    budget_scalar = t.cast(Scalar, dataset.parents()[1]["sd_budget"])
    sd_budget = t.cast(
        t.Tuple[float, float],
        await budget_scalar.async_value(),
    )

    # Model: resolved through the dataset's manager.
    model_scalar = t.cast(Scalar, dataset.parents()[1]["synthetic_model"])
    config = t.cast(
        SyntheticConfig,
        await dataset.manager().async_value(model_scalar),
    )

    # Query-builder parameters for the links task.
    params = await synthetic_parameters(
        dataset,
        sd_budget=sd_budget,
        task=LINKS_TASK,
        synthetic_config=config,
    )

    # Pick the implementation, then cast once on the way out.
    if dataset.manager().is_big_data(dataset):
        link_stats = links_sql.links_statistics(
            dataset,
            params.random_links,
        )
    else:
        link_stats = links_classes.links_statistics(
            synthetic_dataset=dataset,
            links_dp_params=params.random_links,
        )
    return t.cast(st.Links, link_stats)