Repository URL to install this package:
|
Version:
0.4.29 ▾
|
# -*- coding: utf-8 -*-
import atexit
import os
import shutil
import tempfile
from typing import Any, Mapping, Type, Union
from kiara.data_types import DataTypeConfig
from kiara.data_types.included_core_types import AnyType
from kiara.defaults import DEFAULT_PRETTY_PRINT_CONFIG
from kiara.models.values.value import SerializationResult, SerializedData, Value
from kiara.utils.output import ArrowTabularWrap
from kiara_plugin.tabular.data_types.array import store_array
from kiara_plugin.tabular.defaults import TABLE_SCHEMA_CHUNKS_NAME
from kiara_plugin.tabular.models.table import KiaraTable
from kiara_plugin.tabular.modules.table import EMPTY_COLUMN_NAME_MARKER
class TableType(AnyType[KiaraTable, DataTypeConfig]):
"""Tabular data (table, spreadsheet, data_frame, what have you).
The table data is organized in sets of columns (arrays of data of the same type), with each column having a string identifier.
*kiara* uses an instance of the [`KiaraTable`][kiara_plugin.tabular.models.table.KiaraTable]
class to manage the table data, which let's developers access it in different formats ([Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html), [Pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), Python dict of lists, more to follow...).
Please consult the API doc of the `KiaraTable` class for more information about how to access and query the data:
- [`KiaraTable` API doc](https://dharpa.org/kiara_plugin.tabular/latest/reference/kiara_plugin/tabular/models/__init__/#kiara_plugin.tabular.models.table.KiaraTable)
Internally, the data is stored in [Apache Feather format](https://arrow.apache.org/docs/python/feather.html) -- both
in memory and on disk when saved, which enables some advanced usage to preserve memory and compute overhead.
"""
_data_type_name = "table"
@classmethod
def python_class(cls) -> Type:
return KiaraTable
def parse_python_obj(self, data: Any) -> KiaraTable:
return KiaraTable.create_table(data)
def _validate(cls, value: Any) -> None:
pass
if not isinstance(value, KiaraTable):
raise Exception(
f"invalid type '{type(value).__name__}', must be 'KiaraTable'."
)
def serialize(self, data: KiaraTable) -> SerializedData:
import pyarrow as pa
chunk_map = {}
# TODO: make sure temp dir is in the same partition as file store
temp_f = tempfile.mkdtemp()
def cleanup():
shutil.rmtree(temp_f, ignore_errors=True)
atexit.register(cleanup)
table = data.arrow_table
schema_bytes = table.schema.serialize().to_pybytes()
chunk_map[TABLE_SCHEMA_CHUNKS_NAME] = {
"type": "chunk",
"chunk": schema_bytes,
"codec": "raw",
}
for column_name in table.column_names:
assert column_name != TABLE_SCHEMA_CHUNKS_NAME
column: pa.Array = table.column(column_name)
if not column_name:
file_name = os.path.join(temp_f, EMPTY_COLUMN_NAME_MARKER)
else:
file_name = os.path.join(temp_f, column_name)
store_array(array_obj=column, file_name=file_name, column_name=column_name)
chunk_map[column_name] = {"type": "file", "file": file_name, "codec": "raw"}
serialized_data = {
"data_type": self.data_type_name,
"data_type_config": self.type_config.dict(),
"data": chunk_map,
"serialization_profile": "feather",
"metadata": {
"environment": {},
"deserialize": {
"python_object": {
"module_type": "load.table",
"module_config": {
"value_type": "table",
"target_profile": "python_object",
"serialization_profile": "feather",
},
}
},
},
}
serialized = SerializationResult(**serialized_data)
return serialized
def pretty_print_as__terminal_renderable(
self, value: "Value", render_config: Mapping[str, Any]
) -> Any:
max_rows = render_config.get(
"max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
)
max_row_height = render_config.get(
"max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
)
max_cell_length = render_config.get(
"max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
)
half_lines: Union[int, None] = None
if max_rows:
half_lines = int(max_rows / 2)
atw = ArrowTabularWrap(value.data.arrow_table)
result = atw.as_terminal_renderable(
rows_head=half_lines,
rows_tail=half_lines,
max_row_height=max_row_height,
max_cell_length=max_cell_length,
)
return result
def pretty_print_as__string(
self, value: "Value", render_config: Mapping[str, Any]
) -> Any:
max_rows = render_config.get(
"max_no_rows", DEFAULT_PRETTY_PRINT_CONFIG["max_no_rows"]
)
max_row_height = render_config.get(
"max_row_height", DEFAULT_PRETTY_PRINT_CONFIG["max_row_height"]
)
max_cell_length = render_config.get(
"max_cell_length", DEFAULT_PRETTY_PRINT_CONFIG["max_cell_length"]
)
half_lines: Union[int, None] = None
if max_rows:
half_lines = int(max_rows / 2)
atw = ArrowTabularWrap(value.data.arrow_table)
result = atw.as_string(
rows_head=half_lines,
rows_tail=half_lines,
max_row_height=max_row_height,
max_cell_length=max_cell_length,
)
return result