Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Debian packages, RPM packages, and NuGet packages.

Repository URL to install this package:

Details    
sarus_sql / sarus_sql / merge_strategies / standard_merge.py
Size: Mime:
"""Some basics merging strategies"""

from typing import Any, List, Optional, Tuple, Union


class Simple:
    """Weighted average of DP and synthetic (SY) results.

    Merge formula:
        DP * epsilon / (1 + epsilon) + SY / (1 + epsilon) where DP and SY
        bins overlap, otherwise the only available row (DP or SY) is kept.

    Attributes:
        epsilon (float): the ``target_epsilon`` given to the constructor;
            it weights the DP value in the formula.
        keep_float (bool): if True merged values are returned as floats,
            else they are cast to the type of the synthetic results.
        dtypes (List[Any]): type of each column, refreshed by every call
            to ``merge``. Taken from the first synthetic row, or from the
            first DP row when there are no synthetic results.
    """

    def __init__(
        self, target_epsilon: float, keep_float: bool, **_kwargs: Any
    ):
        """Constructor

        Args:
            target_epsilon (float): epsilon used for the DP query.
            keep_float (bool): whether merged values stay floats.
            **_kwargs (Any): accepted and ignored.
        """
        self.epsilon = target_epsilon
        self.keep_float = keep_float
        # Only declared here; the value is assigned by ``merge``.
        self.dtypes: List[Any]

    @staticmethod
    def _rows_by_key(
        keys: List[Tuple[Any, ...]], values: List[List[Any]]
    ) -> dict:
        """Map each grouping key to the row at its first position in keys."""
        rows: dict = {}
        for position, key in enumerate(keys):
            # First occurrence wins, exactly like ``list.index`` would.
            if key not in rows:
                rows[key] = values[position]
        return rows

    def merging_formula(
        self,
        dp_value: Optional[float],
        synth_value: Optional[Union[int, float]],
    ) -> Optional[Union[int, float]]:
        """Apply the merging formula to a single pair of values.

        Args:
            dp_value (Optional[float]): single DP result
            synth_value (Optional[Union[int, float]]): single synthetic
                result

        Returns:
            Optional[Union[int, float]]: None when either input is None,
                otherwise the merged value. It is a float when
                self.keep_float is True, else it is converted with the
                type of synth_value.
        """
        if dp_value is None or synth_value is None:
            return None

        merged = (self.epsilon * float(dp_value) + float(synth_value)) / (
            1 + self.epsilon
        )
        if self.keep_float:
            return merged
        return type(synth_value)(merged)

    def merge_row(
        self,
        dp_row: List[Any],
        sy_row: List[Any],
        grouping_key: Tuple[Any, ...],
    ) -> List[Any]:
        """Merge a DP row with a SY row. For this strategy:
            if DP is empty return SY
            if SY is empty return DP (cast to self.dtypes unless
                keep_float is True)
            otherwise apply the formula column by column

        NOTE: grouping columns are recognised by value, not by position:
        any value that is equal to one of the values in grouping_key is
        treated as a grouping column and is not merged.

        Args:
            dp_row (List[Any]): DP row, empty if the key has no DP result
            sy_row (List[Any]): SY row, empty if the key has no SY result
            grouping_key (Tuple[Any, ...]): group by values of this row

        Returns:
            List[Any]: merged row
        """
        if not dp_row:
            return sy_row

        if not sy_row:
            if self.keep_float:
                return list(dp_row)
            none_type = type(None)
            converted = []
            for position, dp_res in enumerate(dp_row):
                dtype = self.dtypes[position]
                # Leave the value alone when the reference type is
                # NoneType or when it already has the expected type.
                if dtype is none_type or isinstance(
                    dp_res, (dtype, none_type)
                ):
                    converted.append(dp_res)
                else:
                    converted.append(dtype(dp_res))
            return converted

        return [
            synth_res
            if synth_res in grouping_key or dp_res in grouping_key
            else self.merging_formula(dp_res, synth_res)
            for synth_res, dp_res in zip(sy_row, dp_row)
        ]

    def merge(
        self,
        dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        sy_keys: List[Tuple[Any, ...]],
        sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """DP and synthetic results are merged here.

        Args:
            dp_keys (List[Tuple[Any, ...]]):
                list containing tuples with group by values for each line.
                They are used as keys. e.g. given a query with 'GROUP BY
                sex, education_num', dp_keys would be:
                [("Male", 1), ("Female", 1)]
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query like
                'SELECT sex, education_num, AVG(age), COUNT(*) FROM census
                GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            sy_keys (List[Tuple[Any, ...]]): similarly as dp_keys, list
                containing tuples with group by values for each line.
            sy_values (List[List[Any]]): similarly as dp_values, list with
                row values from synthetic results.

        Returns:
            List[List[Any]]: merged results, one row per distinct key, in
                order of first appearance (DP keys first, then the keys
                only present in the synthetic results). Empty when both
                result sets are empty.
        """
        # Column types come from the first synthetic row when there is
        # one, otherwise from the first DP row. With no row at all there
        # is no type to record.
        if sy_values:
            reference_row = sy_values[0]
        elif dp_values:
            reference_row = dp_values[0]
        else:
            reference_row = []
        self.dtypes = [type(value) for value in reference_row]

        # Index the rows once instead of scanning the key lists for
        # every grouping key.
        dp_rows = self._rows_by_key(dp_keys, dp_values)
        sy_rows = self._rows_by_key(sy_keys, sy_values)

        # dict.fromkeys de-duplicates while keeping a reproducible order.
        return [
            self.merge_row(
                dp_rows.get(grouping_key, []),
                sy_rows.get(grouping_key, []),
                grouping_key,
            )
            for grouping_key in dict.fromkeys(dp_keys + sy_keys)
        ]


class Bayesian(Simple):
    """Bayesian merge as described
    [here](https://colab.research.google.com/drive/
    1rrUhKS-PVQCFh3obMI9yeWtd8FRqOk7_?usp=sharing).

    Merge formula:
        DP * C*epsilon^2/(1+C*epsilon^2) + SY/(1+C*epsilon^2) where DP and SY
        bins overlap; when one side is missing it is replaced by zero.
        C is the ponderation_coefficient.

    Attributes:
        epsilon (float): the ``target_epsilon`` given to the constructor
        keep_float (bool): if True the returned result will be float
            else it will be of the same type as the synthetic results.
        ponderation_coefficient (float): the C factor of the formula
    """

    def __init__(
        self,
        target_epsilon: float,
        keep_float: bool,
        ponderation_coefficient: float = 8800.0,
        **_kwargs: Any,
    ):
        """Constructor

        Args:
            target_epsilon (float): epsilon used for the DP query
            keep_float (bool): whether merged values stay floats
            ponderation_coefficient (float, optional): Defaults to 8800.0.
            **_kwargs (Any): accepted and ignored
        """
        super().__init__(target_epsilon=target_epsilon, keep_float=keep_float)
        self.ponderation_coefficient = ponderation_coefficient

    def merging_formula(
        self,
        dp_value: Optional[float],
        synth_value: Optional[Union[int, float]],
    ) -> Optional[Union[int, float]]:
        """Apply the Bayesian merging formula to a single pair of values.

        Args:
            dp_value (Optional[float]): single differentially private result
            synth_value (Optional[Union[int, float]]): single synthetic result

        Returns:
            Optional[Union[int, float]]: None when either input is None,
            otherwise the merged value. It is a float when self.keep_float
            is True, else it is converted with the type of synth_value.
        """
        if dp_value is None or synth_value is None:
            return None

        # Weight given to the DP value: C * epsilon^2.
        dp_weight = self.epsilon**2 * self.ponderation_coefficient
        weighted_sum = dp_weight * float(dp_value) + float(synth_value)
        merged = weighted_sum / (
            1 + self.epsilon**2 * self.ponderation_coefficient
        )

        if self.keep_float:
            return merged
        return type(synth_value)(merged)

    def merge_row(
        self,
        dp_row: List[Any],
        sy_row: List[Any],
        grouping_key: Tuple[Any, ...],
    ) -> List[Any]:
        """Merge a DP row with a SY row. For this strategy we use the
        histogram database conception:
            if DP empty merge(0, SY)
            if SY empty merge(DP, 0)
            on common keys merge(DP, SY)

        Values that are equal to one of the values in grouping_key are
        treated as grouping columns and are copied instead of merged.

        Args:
            dp_row (List[Any]): line with dp results list
            sy_row (List[Any]): line with sy results list
            grouping_key (Tuple[Any, ...]): tuple with grouping keys

        Returns:
            List[Any]: merged row
        """
        merged_row: List[Any] = []

        if not dp_row:
            # No DP result for this key: the DP side counts as zero.
            for synth_res in sy_row:
                if synth_res in grouping_key:
                    merged_row.append(synth_res)
                else:
                    merged_row.append(self.merging_formula(0.0, synth_res))
            return merged_row

        if not sy_row:
            # No SY result for this key: the SY side is a zero of the
            # column type. Columns typed NoneType are copied unchanged.
            for position, dp_res in enumerate(dp_row):
                column_type = self.dtypes[position]
                if (
                    column_type == type(None)  # noqa: E721
                    or dp_res in grouping_key
                ):
                    merged_row.append(dp_res)
                else:
                    merged_row.append(
                        self.merging_formula(dp_res, column_type(0.0))
                    )
            return merged_row

        for synth_res, dp_res in zip(sy_row, dp_row):
            if synth_res in grouping_key or dp_res in grouping_key:
                merged_row.append(synth_res)
            else:
                merged_row.append(self.merging_formula(dp_res, synth_res))
        return merged_row


class SyntheticAsFallback:  # pylint: disable=too-few-public-methods
    """Merging strategy where synthetic results are returned only when
    groups are not present in dp results, otherwise dp results are returned.
    """

    @staticmethod
    def _rows_by_key(
        keys: List[Tuple[Any, ...]], values: List[List[Any]]
    ) -> dict:
        """Map each grouping key to the row at its first position in keys."""
        rows: dict = {}
        for position, key in enumerate(keys):
            # First occurrence wins, exactly like ``list.index`` would.
            if key not in rows:
                rows[key] = values[position]
        return rows

    def merge(  # pylint: disable=no-self-use
        self,
        dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        sy_keys: List[Tuple[Any, ...]],
        sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """DP and synthetic results are merged here.

        Args:
            dp_keys (List[Tuple[Any, ...]]):
                list containing tuples with group by values for each line.
                They are used as keys. e.g. given a query with 'GROUP BY sex,
                education_num', dp_keys would be:
                [("Male", 1), ("Female", 1)]
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query like
                'SELECT sex, education_num, AVG(age), COUNT(*) FROM census
                GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            sy_keys (List[Tuple[Any, ...]]): similarly as dp_keys, list
                containing tuples with group by values for each line.
            sy_values (List[List[Any]]): similarly as dp_values,
                list with row values from synthetic results.

        Returns:
            List[List[Any]]: one row per distinct key, in order of first
                appearance (DP keys first, then the keys only present in
                the synthetic results). The DP row is used when it exists
                and is not empty, otherwise the synthetic row (or an
                empty list when there is none).
        """
        # Index the rows once instead of scanning the key lists for
        # every grouping key.
        dp_rows = self._rows_by_key(dp_keys, dp_values)
        sy_rows = self._rows_by_key(sy_keys, sy_values)

        merge_results = []
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        for grouping_key in dict.fromkeys(dp_keys + sy_keys):
            dp_row = dp_rows.get(grouping_key, [])
            if dp_row:
                merge_results.append(dp_row)
            else:
                merge_results.append(sy_rows.get(grouping_key, []))
        return merge_results


class NoSynthetic:  # pylint: disable=too-few-public-methods
    """Merging strategy that ignores synthetic data: the DP results are
    handed back as they are. (No merging)
    """

    def merge(  # pylint: disable=no-self-use
        self,
        _dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        _sy_keys: List[Tuple[Any, ...]],
        _sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """Return the DP rows untouched. The signature matches the other
        merge strategies, but only dp_values is read.

        Args:
            _dp_keys (List[Tuple[Any, ...]]): not used in this merge strategy
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query like
                'SELECT sex, education_num, AVG(age), COUNT(*) FROM census
                GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            _sy_keys (List[Tuple[Any, ...]]): not used in this merge strategy
            _sy_values (List[List[Any]]): not used in this merge strategy

        Returns:
            List[List[Any]]: the very same list object received as
                dp_values (no copy is made)
        """
        dp_only_results = dp_values
        return dp_only_results