Repository URL to install this package:
|
Version:
0.21.1 ▾
|
"""
Table Schema builders
http://specs.frictionlessdata.io/json-table-schema/
"""
from pandas.core.common import _all_not_none
from pandas.core.dtypes.common import (
is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
is_categorical_dtype, is_period_dtype, is_string_dtype
)
def as_json_table_type(x):
"""
Convert a NumPy / pandas type to its corresponding json_table.
Parameters
----------
x : array or dtype
Returns
-------
t : str
the Table Schema data types
Notes
-----
This table shows the relationship between NumPy / pandas dtypes,
and Table Schema dtypes.
============== =================
Pandas type Table Schema type
============== =================
int64 integer
float64 number
bool boolean
datetime64[ns] datetime
timedelta64[ns] duration
object str
categorical any
=============== =================
"""
if is_integer_dtype(x):
return 'integer'
elif is_bool_dtype(x):
return 'boolean'
elif is_numeric_dtype(x):
return 'number'
elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
is_period_dtype(x)):
return 'datetime'
elif is_timedelta64_dtype(x):
return 'duration'
elif is_categorical_dtype(x):
return 'any'
elif is_string_dtype(x):
return 'string'
else:
return 'any'
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if _all_not_none(*data.index.names):
return data
data = data.copy()
if data.index.nlevels > 1:
names = [name if name is not None else 'level_{}'.format(i)
for i, name in enumerate(data.index.names)]
data.index.names = names
else:
data.index.name = data.index.name or 'index'
return data
def make_field(arr, dtype=None):
dtype = dtype or arr.dtype
if arr.name is None:
name = 'values'
else:
name = arr.name
field = {'name': name,
'type': as_json_table_type(dtype)}
if is_categorical_dtype(arr):
if hasattr(arr, 'categories'):
cats = arr.categories
ordered = arr.ordered
else:
cats = arr.cat.categories
ordered = arr.cat.ordered
field['constraints'] = {"enum": list(cats)}
field['ordered'] = ordered
elif is_period_dtype(arr):
field['freq'] = arr.freqstr
elif is_datetime64tz_dtype(arr):
if hasattr(arr, 'dt'):
field['tz'] = arr.dt.tz.zone
else:
field['tz'] = arr.tz.zone
return field
def build_table_schema(data, index=True, primary_key=None, version=True):
"""
Create a Table schema from ``data``.
Parameters
----------
data : Series, DataFrame
index : bool, default True
Whether to include ``data.index`` in the schema.
primary_key : bool or None, default True
column names to designate as the primary key.
The default `None` will set `'primaryKey'` to the index
level or levels if the index is unique.
version : bool, default True
Whether to include a field `pandas_version` with the version
of pandas that generated the schema.
Returns
-------
schema : dict
Examples
--------
>>> df = pd.DataFrame(
... {'A': [1, 2, 3],
... 'B': ['a', 'b', 'c'],
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
... }, index=pd.Index(range(3), name='idx'))
>>> build_table_schema(df)
{'fields': [{'name': 'idx', 'type': 'integer'},
{'name': 'A', 'type': 'integer'},
{'name': 'B', 'type': 'string'},
{'name': 'C', 'type': 'datetime'}],
'pandas_version': '0.20.0',
'primaryKey': ['idx']}
Notes
-----
See `_as_json_table_type` for conversion types.
Timedeltas as converted to ISO8601 duration format with
9 decimal places after the secnods field for nanosecond precision.
Categoricals are converted to the `any` dtype, and use the `enum` field
constraint to list the allowed values. The `ordered` attribute is included
in an `ordered` field.
"""
if index is True:
data = set_default_names(data)
schema = {}
fields = []
if index:
if data.index.nlevels > 1:
for level in data.index.levels:
fields.append(make_field(level))
else:
fields.append(make_field(data.index))
if data.ndim > 1:
for column, s in data.iteritems():
fields.append(make_field(s))
else:
fields.append(make_field(data))
schema['fields'] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
schema['primaryKey'] = [data.index.name]
else:
schema['primaryKey'] = data.index.names
elif primary_key is not None:
schema['primaryKey'] = primary_key
if version:
schema['pandas_version'] = '0.20.0'
return schema