Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
Size: Mime:
import typing as t
import warnings

try:
    from sarus_data_spec.sarus_statistics.tasks.bounds.base import (
        BoundsParameters,
    )
    from sarus_data_spec.sarus_statistics.tasks.marginals.base import (
        MarginalsParameters,
    )
    from sarus_data_spec.sarus_statistics.tasks.max_multiplicity.base import (
        MaxMultiplicityParameters,
    )
    from sarus_data_spec.sarus_statistics.tasks.size.base import SizeParameters
    import sarus_data_spec.sarus_statistics.tasks.links.links as links_classes
    import sarus_data_spec.sarus_statistics.tasks.links.links_sql as links_sql

except ModuleNotFoundError:
    warnings.warn(
        "Sarus_statistics Module not found, marginals operations not "
        "available "
    )

from sarus_data_spec.bounds import bounds as bounds_builder
from sarus_data_spec.constants import (
    BOUNDS_TASK,
    LINKS_TASK,
    MARGINALS_TASK,
    MULTIPLICITY_TASK,
    SIZE_TASK,
)
from sarus_data_spec.manager.ops.source.query_builder import (
    qb_parameters,
    size_parameters,
    synthetic_parameters,
)
from sarus_data_spec.marginals import marginals as marginals_builder
from sarus_data_spec.multiplicity import multiplicity as multiplicity_builder
from sarus_data_spec.scalar import Scalar
from sarus_data_spec.size import size as size_builder
import sarus_data_spec.typing as st
from sarus_synthetic_data.configs.global_config import SyntheticConfig


async def get_multiplicity(dataset: st.Dataset) -> st.Multiplicity:
    """Build the multiplicity object for ``dataset``.

    The attributes budget is fetched with ``get_attr_budget`` and passed to
    ``size_parameters`` together with ``MULTIPLICITY_TASK``. The returned
    parameters are wrapped in ``MaxMultiplicityParameters``, whose
    ``compute`` method is run on the dataset; the resulting statistics are
    handed to ``multiplicity_builder``.
    """
    budget = await get_attr_budget(dataset)
    raw_params = await size_parameters(
        dataset, budget=budget, task=MULTIPLICITY_TASK
    )
    # Wrap the raw parameters in the statistics task that knows how to
    # compute on a dataset.
    task = MaxMultiplicityParameters(raw_params)
    computed = task.compute(dataset)
    return multiplicity_builder(dataset=dataset, statistics=computed)


async def get_size(dataset: st.Dataset) -> st.Size:
    """Build the size object for ``dataset``.

    The attributes budget is fetched with ``get_attr_budget`` and passed to
    ``size_parameters`` together with ``SIZE_TASK``. The returned parameters
    are wrapped in ``SizeParameters``, whose ``compute`` method is run on
    the dataset; the resulting statistics are handed to ``size_builder``.
    """
    budget = await get_attr_budget(dataset)
    raw_params = await size_parameters(dataset, budget=budget, task=SIZE_TASK)
    # Wrap the raw parameters in the statistics task that knows how to
    # compute on a dataset.
    task = SizeParameters(raw_params)
    computed = task.compute(dataset)
    return size_builder(dataset=dataset, statistics=computed)


async def get_bounds(dataset: st.Dataset) -> st.Bounds:
    """Build the bounds object for ``dataset``.

    The attributes budget is fetched with ``get_attr_budget`` and passed to
    ``qb_parameters`` together with ``BOUNDS_TASK``. The returned parameters
    are wrapped in ``BoundsParameters``, whose ``compute`` method is run on
    the dataset; the resulting statistics are handed to ``bounds_builder``.
    """
    budget = await get_attr_budget(dataset)
    # Query-builder parameters for the bounds task.
    raw_params = await qb_parameters(
        dataset, total_budget=budget, task=BOUNDS_TASK
    )
    task = BoundsParameters(raw_params)
    computed = task.compute(dataset)
    return bounds_builder(dataset=dataset, statistics=computed)


async def get_marginals(dataset: st.Dataset) -> st.Marginals:
    """Build the marginals object for ``dataset``.

    The attributes budget is fetched with ``get_attr_budget`` and passed to
    ``qb_parameters`` together with ``MARGINALS_TASK``. The returned
    parameters are wrapped in ``MarginalsParameters``, whose ``compute``
    method is run on the dataset; the resulting statistics are handed to
    ``marginals_builder``.
    """
    budget = await get_attr_budget(dataset)
    # Query-builder parameters for the marginals task.
    raw_params = await qb_parameters(
        dataset, total_budget=budget, task=MARGINALS_TASK
    )
    task = MarginalsParameters(raw_params)
    computed = task.compute(dataset)
    return marginals_builder(dataset=dataset, statistics=computed)


async def get_attr_budget(dataset: st.Dataset) -> t.Tuple[float, float]:
    """Return the value of the ``attributes_budget`` parent of ``dataset``.

    ``dataset`` must come from a ``budget_assignment`` transform. Its named
    parents are looked up for the ``attributes_budget`` entry, which is
    treated as a ``Scalar``; the scalar's value is awaited through the
    dataset manager and returned, typed as a pair of floats (the cast is
    for the type checker only, nothing is validated at runtime).

    Raises:
        AssertionError: if the transform of ``dataset`` is not named
            ``budget_assignment``. The check is an explicit ``raise``
            rather than an ``assert`` statement so that it is not
            stripped when Python runs with ``-O``.
    """
    transform_name = dataset.transform().name()
    if transform_name != "budget_assignment":
        raise AssertionError(
            "get_attr_budget expects a dataset produced by a "
            f"'budget_assignment' transform, got {transform_name!r}"
        )
    # parents() returns a pair; only the named parents are needed here.
    _, parents_dict = dataset.parents()
    budget_scalar = t.cast(Scalar, parents_dict["attributes_budget"])
    return t.cast(
        t.Tuple[float, float],
        await dataset.manager().async_value(budget_scalar),
    )


async def get_links(dataset: st.Dataset) -> st.Links:
    """Compute the links (foreign keys) statistics for ``dataset``.

    The input dataset should be the synthetic dataset, as links are used
    to generate it. Two named parents are read: ``sd_budget``, whose value
    is awaited directly on the scalar, and ``synthetic_model``, whose value
    is awaited through the dataset manager and treated as a
    ``SyntheticConfig``. Both feed ``synthetic_parameters`` for
    ``LINKS_TASK``; the ``random_links`` field of the result is then given
    to the SQL implementation when the manager reports the dataset as big
    data, and to the default implementation otherwise.
    """
    # Budget
    sd_budget = t.cast(
        t.Tuple[float, float],
        await t.cast(Scalar, dataset.parents()[1]["sd_budget"]).async_value(),
    )

    # Model
    model_scalar = t.cast(Scalar, dataset.parents()[1]["synthetic_model"])
    config = t.cast(
        SyntheticConfig,
        await dataset.manager().async_value(model_scalar),
    )

    # Query-builder parameters for the links task.
    params = await synthetic_parameters(
        dataset,
        sd_budget=sd_budget,
        task=LINKS_TASK,
        synthetic_config=config,
    )

    if not dataset.manager().is_big_data(dataset):
        stats = links_classes.links_statistics(
            synthetic_dataset=dataset,
            links_dp_params=params.random_links,
        )
    else:
        stats = links_sql.links_statistics(dataset, params.random_links)
    return t.cast(st.Links, stats)