# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from cpython.pycapsule cimport (
PyCapsule_CheckExact,
PyCapsule_GetPointer,
PyCapsule_GetName,
PyCapsule_New,
PyCapsule_IsValid
)
import atexit
from collections.abc import Mapping
import pickle
import re
import sys
import warnings
from cython import sizeof
# These are imprecise because the type (in pandas 0.x) depends on the presence
# of nulls
cdef dict _pandas_type_map = {}


def _get_pandas_type_map():
    """
    Return the Arrow-type-id -> NumPy type mapping, building it on first use.

    The table is cached in the module-level ``_pandas_type_map`` dict and
    reused on every later call.
    """
    global _pandas_type_map
    if _pandas_type_map:
        return _pandas_type_map
    _pandas_type_map.update({
        _Type_NA: np.object_,  # NaNs
        _Type_BOOL: np.bool_,
        _Type_INT8: np.int8,
        _Type_INT16: np.int16,
        _Type_INT32: np.int32,
        _Type_INT64: np.int64,
        _Type_UINT8: np.uint8,
        _Type_UINT16: np.uint16,
        _Type_UINT32: np.uint32,
        _Type_UINT64: np.uint64,
        _Type_HALF_FLOAT: np.float16,
        _Type_FLOAT: np.float32,
        _Type_DOUBLE: np.float64,
        # Pandas does not support [D]ay, so default to [ms] for date32
        _Type_DATE32: np.dtype('datetime64[ms]'),
        _Type_DATE64: np.dtype('datetime64[ms]'),
        # Unit-parameterized types map to a per-unit sub-dict;
        # _get_pandas_type picks the entry for the type's unit.
        _Type_TIMESTAMP: {
            's': np.dtype('datetime64[s]'),
            'ms': np.dtype('datetime64[ms]'),
            'us': np.dtype('datetime64[us]'),
            'ns': np.dtype('datetime64[ns]'),
        },
        _Type_DURATION: {
            's': np.dtype('timedelta64[s]'),
            'ms': np.dtype('timedelta64[ms]'),
            'us': np.dtype('timedelta64[us]'),
            'ns': np.dtype('timedelta64[ns]'),
        },
        _Type_BINARY: np.object_,
        _Type_FIXED_SIZE_BINARY: np.object_,
        _Type_STRING: np.object_,
        _Type_LIST: np.object_,
        _Type_MAP: np.object_,
        _Type_DECIMAL128: np.object_,
    })
    return _pandas_type_map
# Mapping from Arrow fixed-width type ids to single-character PEP 3118
# (buffer protocol) format codes; consumed by _datatype_to_pep3118 below.
cdef dict _pep3118_type_map = {
    _Type_INT8: b'b',
    _Type_INT16: b'h',
    _Type_INT32: b'i',
    _Type_INT64: b'q',
    _Type_UINT8: b'B',
    _Type_UINT16: b'H',
    _Type_UINT32: b'I',
    _Type_UINT64: b'Q',
    _Type_HALF_FLOAT: b'e',
    _Type_FLOAT: b'f',
    _Type_DOUBLE: b'd',
}
cdef bytes _datatype_to_pep3118(CDataType* type):
    """
    Construct a PEP 3118 format string describing the given datatype.
    None is returned for unsupported types.
    """
    fmt = _pep3118_type_map.get(type.id())
    if fmt is None:
        # No buffer-protocol equivalent for this type id.
        return None
    if fmt in b'bBhHiIqQ':
        # Use "standard" int widths, not native
        return b'=' + fmt
    return fmt
cdef void* _as_c_pointer(v, allow_null=False) except *:
    """
    Convert a Python object to a raw C pointer.

    Used mainly for the C data interface.
    Integers are accepted as well as capsule objects with a NULL name.
    (the latter for compatibility with raw pointers exported by reticulate)

    Raises ValueError for a NULL pointer unless ``allow_null`` is True,
    ValueError for a capsule with an unrecognized name, and TypeError for
    any other input.
    """
    cdef void* c_ptr
    cdef const char* capsule_name
    if isinstance(v, int):
        # A plain Python int is interpreted directly as an address.
        c_ptr = <void*> <uintptr_t > v
    elif isinstance(v, float):
        # Accepted for backward compatibility only; truncation to an
        # integer address may lose information, hence the warning.
        warnings.warn(
            "Passing a pointer value as a float is unsafe and only "
            "supported for compatibility with older versions of the R "
            "Arrow library", UserWarning, stacklevel=2)
        c_ptr = <void*> <uintptr_t > v
    elif PyCapsule_CheckExact(v):
        # An R external pointer was how the R bindings passed pointer values to
        # Python from versions 7 to 15 (inclusive); however, the reticulate 1.35.0
        # update changed the name of the capsule from NULL to "r_extptr".
        # Newer versions of the R package pass a Python integer; however, this
        # workaround ensures that old versions of the R package continue to work
        # with newer versions of pyarrow.
        capsule_name = PyCapsule_GetName(v)
        if capsule_name == NULL or capsule_name == b"r_extptr":
            c_ptr = PyCapsule_GetPointer(v, capsule_name)
        else:
            capsule_name_str = capsule_name.decode()
            raise ValueError(
                f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address"
            )
    else:
        raise TypeError(f"Expected a pointer value, got {type(v)!r}")
    if not allow_null and c_ptr == NULL:
        # Guard against silently propagating a null pointer through the
        # C data interface unless the caller explicitly opts in.
        raise ValueError(f"Null pointer (value before cast = {v!r})")
    return c_ptr
def _is_primitive(Type type):
    """
    Internal helper delegating to ``is_primitive``.

    This is simply a redirect, the official API is in pyarrow.types.
    """
    return is_primitive(type)
def _get_pandas_type(arrow_type, coerce_to_ns=False):
    """
    Translate an Arrow type to a NumPy type/dtype, or None if unsupported.

    Parameters
    ----------
    arrow_type : DataType
        The Arrow type to translate.
    coerce_to_ns : bool, default False
        If True, return a nanosecond-resolution datetime64/timedelta64
        dtype regardless of the type's own unit (ARROW-3789).
    """
    cdef Type type_id = arrow_type.id
    cdef dict pandas_type_map = _get_pandas_type_map()
    if type_id not in pandas_type_map:
        return None
    if coerce_to_ns:
        # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
        if type_id == _Type_DURATION:
            return np.dtype('timedelta64[ns]')
        return np.dtype('datetime64[ns]')
    dtype = pandas_type_map[type_id]
    if isinstance(dtype, dict):
        # Unit-parameterized entries (timestamp/duration) hold one dtype
        # per unit; pick the one matching this type's unit.
        dtype = dtype.get(getattr(arrow_type, 'unit', None), None)
    return dtype
def _get_pandas_tz_type(arrow_type, coerce_to_ns=False):
    """
    Build the pandas dtype for a timezone-aware Arrow timestamp type
    via ``pyarrow.pandas_compat.make_datetimetz``.
    """
    from pyarrow.pandas_compat import make_datetimetz
    if coerce_to_ns:
        # ARROW-3789: force nanosecond resolution.
        unit = 'ns'
    else:
        unit = arrow_type.unit
    return make_datetimetz(unit, arrow_type.tz)
def _to_pandas_dtype(arrow_type, options=None):
    """
    Resolve the pandas dtype for *arrow_type*.

    Raises
    ------
    NotImplementedError
        If no pandas dtype is known for this Arrow type.
    """
    coerce_to_ns = bool(
        options and options.get('coerce_temporal_nanoseconds', False))
    if not coerce_to_ns and _pandas_api.is_v1():
        # On pandas 1.x, temporal types are always coerced to [ns].
        coerce_to_ns = arrow_type.id in (
            _Type_DATE32, _Type_DATE64, _Type_TIMESTAMP, _Type_DURATION)
    if getattr(arrow_type, 'tz', None):
        # Timezone-aware timestamps map to a pandas tz-aware dtype.
        dtype = _get_pandas_tz_type(arrow_type, coerce_to_ns)
    else:
        dtype = _get_pandas_type(arrow_type, coerce_to_ns)
    if not dtype:
        raise NotImplementedError(str(arrow_type))
    return dtype
# Workaround for Cython parsing bug
# https://github.com/cython/cython/issues/2143
# (alias used with dynamic_cast in DataType.bit_width / byte_width below)
ctypedef CFixedWidthType* _CFixedWidthTypePtr
cdef class DataType(_Weakrefable):
"""
Base class of all Arrow data types.
Each data type is an *instance* of this class.
Examples
--------
Instance of int64 type:
>>> import pyarrow as pa
>>> pa.int64()
DataType(int64)
"""
def __cinit__(self):
    # Nothing to allocate at the C level; the wrapper is bound to a
    # concrete C++ type later via the `init` method.
    pass
def __init__(self):
    """Always raises: instances must come from the public factory functions."""
    raise TypeError(
        f"Do not call {self.__class__.__name__}'s constructor directly, "
        "use public functions like pyarrow.int64, pyarrow.list_, etc. "
        "instead.")
cdef void init(self, const shared_ptr[CDataType]& type) except *:
    # Bind this wrapper to a concrete C++ DataType: keep the shared_ptr
    # (owning reference) alongside the raw pointer used for fast access,
    # and precompute the PEP 3118 format string for the buffer protocol.
    assert type != nullptr
    self.sp_type = type
    self.type = type.get()
    self.pep3118_format = _datatype_to_pep3118(self.type)
cpdef Field field(self, i):
    """
    Return the child field at the given index.

    Parameters
    ----------
    i : int

    Returns
    -------
    pyarrow.Field

    Raises
    ------
    TypeError
        If `i` is not an int.
    """
    cdef int index
    if not isinstance(i, int):
        raise TypeError(f"Expected int index, got type '{type(i)}'")
    # _normalize_index also validates the range against num_fields.
    index = <int> _normalize_index(i, self.type.num_fields())
    return pyarrow_wrap_field(self.type.field(index))
@property
def id(self):
    """The type id (``Type`` enum value) of the underlying C++ DataType."""
    return self.type.id()
@property
def bit_width(self):
    """
    Bit width for fixed width type.

    Raises
    ------
    ValueError
        If the type is not a fixed-width type.

    Examples
    --------
    >>> import pyarrow as pa
    >>> pa.int64()
    DataType(int64)
    >>> pa.int64().bit_width
    64
    """
    cdef _CFixedWidthTypePtr ty
    # dynamic_cast yields NULL when the underlying C++ type is not a
    # CFixedWidthType, i.e. the type has no fixed bit width.
    ty = dynamic_cast[_CFixedWidthTypePtr](self.type)
    if ty == nullptr:
        raise ValueError("Non-fixed width type")
    return ty.bit_width()
@property
def byte_width(self):
    """
    Byte width for fixed width type.

    Raises
    ------
    ValueError
        If the type is not fixed-width, or if its width is less than a
        full byte (byte_width of 0 with a nonzero bit_width).

    Examples
    --------
    >>> import pyarrow as pa
    >>> pa.int64()
    DataType(int64)
    >>> pa.int64().byte_width
    8
    """
    cdef _CFixedWidthTypePtr ty
    # dynamic_cast yields NULL when the underlying C++ type is not a
    # CFixedWidthType, i.e. the type has no fixed width.
    ty = dynamic_cast[_CFixedWidthTypePtr](self.type)
    if ty == nullptr:
        raise ValueError("Non-fixed width type")
    byte_width = ty.byte_width()
    # A sub-byte type (e.g. one with bit_width < 8) cannot be expressed
    # as a whole number of bytes.
    if byte_width == 0 and self.bit_width != 0:
        raise ValueError("Less than one byte")
    return byte_width
@property
def num_fields(self):
    """
    The number of child fields.

    Examples
    --------
    >>> import pyarrow as pa
    >>> pa.int64()
    DataType(int64)
    >>> pa.int64().num_fields
    0
    >>> pa.list_(pa.string())
    ListType(list<item: string>)
    >>> pa.list_(pa.string()).num_fields
    1
    >>> struct = pa.struct({'x': pa.int32(), 'y': pa.string()})
    >>> struct.num_fields
    2
    """
    # Delegates directly to the C++ DataType.
    return self.type.num_fields()
@property
def num_buffers(self):
    """
    Number of data buffers required to construct Array type
    excluding children.

    Examples
    --------
    >>> import pyarrow as pa
    >>> pa.int64().num_buffers
    2
    >>> pa.string().num_buffers
    3
    """
    # Size of the buffer list in the C++ type's physical layout.
    return self.type.layout().buffers.size()
def __str__(self):
    # Use the C++ ToString() representation, decoded leniently
    # (safe=True) so undecodable bytes cannot raise here.
    return frombytes(self.type.ToString(), safe=True)
def __hash__(self):
    # Hash the string representation so that types comparing equal via
    # __eq__/equals hash equally.
    return hash(str(self))
def __reduce__(self):
    # Pickle support: reconstruct the type from its string alias via
    # type_for_alias rather than serializing any C++ state.
    return type_for_alias, (str(self),)
def __repr__(self):
    # E.g. "DataType(int64)": concrete class name plus str() of the type.
    return f'{self.__class__.__name__}({self})'
def __eq__(self, other):
    try:
        return self.equals(other)
    except (TypeError, ValueError):
        # `other` could not be interpreted as a type; let Python fall
        # back to the reflected comparison.
        return NotImplemented
Loading ...