Repository URL to install this package:
|
Version:
3.0.0.dev3 ▾
|
"""Some basics merging strategies"""
from typing import Any, List, Optional, Tuple, Union
class Simple:
    """Weighted-average merge of DP and synthetic (SY) results.

    Merge formula, applied where a DP value and an SY value are both
    available:
        DP * epsilon / (1 + epsilon) + SY / (1 + epsilon)
    When a grouping key exists on only one side, that side's row is kept.

    Attributes:
        epsilon (float): the ``target_epsilon`` given to the constructor,
            used as the weight of the DP value.
        keep_float (bool): if True merged values are returned as floats,
            otherwise they are cast to the type of the synthetic value.
            If synthetic results are empty keep_float has no effect and
            the DP result types are returned.
        dtypes (List[Any]): per-column types taken from the first
            synthetic row (or the first DP row when there are no synthetic
            rows). Recomputed on every call to ``merge``.
    """

    def __init__(
        self, target_epsilon: float, keep_float: bool, **_kwargs: Any
    ):
        """Constructor

        Args:
            target_epsilon (float): epsilon used for the dp query.
            keep_float (bool): whether merged values stay floats.
            **_kwargs (Any): accepted and ignored.
        """
        self.epsilon = target_epsilon
        self.keep_float = keep_float
        # Filled by ``merge``; starts empty so the attribute always exists.
        self.dtypes: List[Any] = []

    def merging_formula(
        self,
        dp_value: Optional[float],
        synth_value: Optional[Union[int, float]],
    ) -> Optional[Union[int, float]]:
        """Apply the merging formula to a single pair of values.

        Args:
            dp_value (Optional[float]): single dp result
            synth_value (Optional[Union[int, float]]): single synthetic
                result

        Returns:
            Optional[Union[int, float]]: None if either input is None,
            otherwise the weighted average. It is a float when
            self.keep_float is True, else it is converted by calling
            ``type(synth_value)`` on the float result.
        """
        if dp_value is None or synth_value is None:
            return None
        res = (self.epsilon * float(dp_value) + float(synth_value)) / (
            1 + self.epsilon
        )
        if self.keep_float:
            return res
        return type(synth_value)(res)

    def _as_synthetic_type(self, value: Any, dtype: Any) -> Any:
        """Cast a DP value to ``dtype`` unless it must be left untouched.

        The value is returned unchanged when keep_float is True, when the
        column type is NoneType, or when the value is None or already an
        instance of ``dtype``.
        """
        if (
            self.keep_float
            or dtype is type(None)
            or isinstance(value, (dtype, type(None)))
        ):
            return value
        return dtype(value)

    def merge_row(
        self,
        dp_row: List[Any],
        sy_row: List[Any],
        grouping_key: Tuple[Any, ...],
    ) -> List[Any]:
        """Merge a DP row with an SY row.

        For this strategy:
            if DP empty return SY
            if SY empty return DP (cast to ``self.dtypes`` column types
                when keep_float is False)
            on common keys apply the formula

        Args:
            dp_row (List[Any]): DP row, or an empty list if the key is
                absent from the DP results.
            sy_row (List[Any]): SY row, or an empty list if the key is
                absent from the synthetic results.
            grouping_key (Tuple[Any, ...]): group-by values of the row.

        Returns:
            List[Any]: merged row
        """
        if not dp_row:
            return sy_row
        if not sy_row:
            # When the synthetic row is empty we return the DP row
            # with SY types if keep_float is False.
            return [
                self._as_synthetic_type(value, self.dtypes[pos])
                for pos, value in enumerate(dp_row)
            ]
        # NOTE(review): key columns are detected by value membership in
        # grouping_key, so a measure equal to one of the grouping values
        # is passed through unmerged.
        return [
            synth_res
            if synth_res in grouping_key or dp_res in grouping_key
            else self.merging_formula(dp_res, synth_res)
            for synth_res, dp_res in zip(sy_row, dp_row)
        ]

    def merge(
        self,
        dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        sy_keys: List[Tuple[Any, ...]],
        sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """Dp and synthetic results are merged here.

        Args:
            dp_keys (List[Tuple[Any, ...]]):
                list containing tuples with group by values for each line.
                They are used as keys. e.g. given a query with
                'GROUP BY sex, education_num', dp_keys would be:
                [("Male", 1), ("Female", 1)]
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query
                like 'SELECT sex, education_num, AVG(age), COUNT(*) FROM
                census GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            sy_keys (List[Tuple[Any, ...]]): similarly as dp_keys, list
                containing tuples with group by values for each line.
            sy_values (List[List[Any]]): similarly as dp_values, list with
                row values from synthetic results.

        Returns:
            List[List[Any]]: merged results, one row per distinct grouping
            key, in first-seen order (DP keys first, then the keys found
            only in the synthetic results). Empty when there is no key.
        """
        # Column types come from the synthetic rows when there are any,
        # else from the DP rows; with no rows at all there is nothing to
        # inspect (indexing row 0 would raise IndexError).
        reference_rows = sy_values if sy_values else dp_values
        self.dtypes = (
            [type(value) for value in reference_rows[0]]
            if reference_rows
            else []
        )

        # key -> position maps give O(1) lookups. Enumerating in reverse
        # makes the first occurrence win, like list.index does.
        dp_index = {
            key: pos for pos, key in reversed(list(enumerate(dp_keys)))
        }
        sy_index = {
            key: pos for pos, key in reversed(list(enumerate(sy_keys)))
        }

        merge_results = []
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        for grouping_key in dict.fromkeys(dp_keys + sy_keys):
            dp_row = (
                dp_values[dp_index[grouping_key]]
                if grouping_key in dp_index
                else []
            )
            sy_row = (
                sy_values[sy_index[grouping_key]]
                if grouping_key in sy_index
                else []
            )
            merge_results.append(self.merge_row(dp_row, sy_row, grouping_key))
        return merge_results
class Bayesian(Simple):
    """Bayesian merge as described
    [here](https://colab.research.google.com/drive/
    1rrUhKS-PVQCFh3obMI9yeWtd8FRqOk7_?usp=sharing).

    Merge formula, with C the ponderation coefficient:
        DP * C*epsilon^2 / (1 + C*epsilon^2) + SY / (1 + C*epsilon^2)
    A side that has no row for a grouping key contributes zero to the
    formula.

    Attributes:
        epsilon (float): the ``target_epsilon`` given to the constructor.
        keep_float (bool): if True merged values are returned as floats,
            otherwise they are cast to the type of the synthetic value.
        ponderation_coefficient (float): the coefficient C of the formula.
    """

    def __init__(
        self,
        target_epsilon: float,
        keep_float: bool,
        ponderation_coefficient: float = 8800.0,
        **_kwargs: Any,
    ):
        """Constructor

        Args:
            target_epsilon (float): epsilon used for the dp query.
            keep_float (bool): whether merged values stay floats.
            ponderation_coefficient (float, optional): Defaults to 8800.0.
            **_kwargs (Any): accepted and ignored.
        """
        super().__init__(target_epsilon=target_epsilon, keep_float=keep_float)
        self.ponderation_coefficient = ponderation_coefficient

    def merging_formula(
        self,
        dp_value: Optional[float],
        synth_value: Optional[Union[int, float]],
    ) -> Optional[Union[int, float]]:
        """Apply the Bayesian merging formula to a single pair of values.

        Args:
            dp_value (Optional[float]): single differentially private
                result
            synth_value (Optional[Union[int, float]]): single synthetic
                result

        Returns:
            Optional[Union[int, float]]: None if either input is None,
            otherwise the merged value. It is a float when self.keep_float
            is True, else it is converted with ``type(synth_value)``.
        """
        if dp_value is None or synth_value is None:
            return None
        # Weight of the DP value relative to the synthetic one.
        dp_weight = self.epsilon**2 * self.ponderation_coefficient
        blended = (dp_weight * float(dp_value) + float(synth_value)) / (
            1 + dp_weight
        )
        if self.keep_float:
            return blended
        return type(synth_value)(blended)

    def merge_row(
        self,
        dp_row: List[Any],
        sy_row: List[Any],
        grouping_key: Tuple[Any, ...],
    ) -> List[Any]:
        """Merge a DP row with an SY row, histogram-database style:
            if DP empty merge(0, SY)
            if SY empty merge(DP, 0)
            on common keys merge(DP, SY)

        Values found in ``grouping_key`` are passed through unmerged.

        Args:
            dp_row (List[Any]): line with dp results, empty if missing
            sy_row (List[Any]): line with sy results, empty if missing
            grouping_key (Tuple[Any, ...]): tuple with grouping keys

        Returns:
            List[Any]: merged row
        """
        if not dp_row:
            # No DP row for this key: the DP contribution is zero.
            return [
                sy_val
                if sy_val in grouping_key
                else self.merging_formula(0.0, sy_val)
                for sy_val in sy_row
            ]

        if not sy_row:
            # No SY row for this key: use a zero of the column type as
            # the synthetic value. NoneType columns are left untouched.
            return [
                self.merging_formula(dp_val, self.dtypes[pos](0.0))
                if self.dtypes[pos] != type(None)  # noqa: E721
                and dp_val not in grouping_key
                else dp_val
                for pos, dp_val in enumerate(dp_row)
            ]

        return [
            sy_val
            if sy_val in grouping_key or dp_val in grouping_key
            else self.merging_formula(dp_val, sy_val)
            for sy_val, dp_val in zip(sy_row, dp_row)
        ]
class SyntheticAsFallback:  # pylint: disable=too-few-public-methods
    """Merging strategy where synthetic results are returned only when groups
    are not present in dp results, otherwise the dp result is returned.
    """

    def merge(  # pylint: disable=no-self-use
        self,
        dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        sy_keys: List[Tuple[Any, ...]],
        sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """Dp and synthetic results are merged here.

        Args:
            dp_keys (List[Tuple[Any, ...]]):
                list containing tuples with group by values for each line.
                They are used as keys. e.g. given a query with
                'GROUP BY sex, education_num', dp_keys would be:
                [("Male", 1), ("Female", 1)]
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query
                like 'SELECT sex, education_num, AVG(age), COUNT(*) FROM
                census GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            sy_keys (List[Tuple[Any, ...]]): similarly as dp_keys, list
                containing tuples with group by values for each line.
            sy_values (List[List[Any]]): similarly as dp_values,
                list with row values from synthetic results.

        Returns:
            List[List[Any]]: one row per distinct grouping key, in
            first-seen order (DP keys first, then the keys found only in
            the synthetic results). The DP row is used when it exists and
            is non-empty, otherwise the synthetic row.
        """
        # key -> position maps give O(1) lookups. Enumerating in reverse
        # makes the first occurrence win, like list.index does.
        dp_index = {
            key: pos for pos, key in reversed(list(enumerate(dp_keys)))
        }
        sy_index = {
            key: pos for pos, key in reversed(list(enumerate(sy_keys)))
        }

        merge_results = []
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        for grouping_key in dict.fromkeys(dp_keys + sy_keys):
            dp_row = (
                dp_values[dp_index[grouping_key]]
                if grouping_key in dp_index
                else []
            )
            sy_row = (
                sy_values[sy_index[grouping_key]]
                if grouping_key in sy_index
                else []
            )
            # Synthetic data is only a fallback for a missing DP row.
            merge_results.append(dp_row or sy_row)
        return merge_results
class NoSynthetic:  # pylint: disable=too-few-public-methods
    """Merging strategy that performs no merging: the DP results are
    returned as they are."""

    def merge(  # pylint: disable=no-self-use
        self,
        _dp_keys: List[Tuple[Any, ...]],
        dp_values: List[List[Any]],
        _sy_keys: List[Tuple[Any, ...]],
        _sy_values: List[List[Any]],
    ) -> List[List[Any]]:
        """Return the DP rows; every other argument is ignored.

        Args:
            _dp_keys (List[Tuple[Any, ...]]): not used in this merge strategy
            dp_values (List[List[Any]]):
                list with row values from dp results. e.g. given a query
                like 'SELECT sex, education_num, AVG(age), COUNT(*) FROM
                census GROUP BY sex, education_num' dp_values would be:
                [["Male", 1, 32.3, 444], ["Female", 1, 44.4, 654]]
            _sy_keys (List[Tuple[Any, ...]]): not used in this merge strategy
            _sy_values (List[List[Any]]): not used in this merge strategy

        Returns:
            List[List[Any]]: the ``dp_values`` list itself (not a copy)
        """
        dp_only_results = dp_values
        return dp_only_results