Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev65 

/ types.pxi

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from cpython.pycapsule cimport (
    PyCapsule_CheckExact,
    PyCapsule_GetPointer,
    PyCapsule_GetName,
    PyCapsule_New,
    PyCapsule_IsValid
)

import atexit
from collections.abc import Mapping
import pickle
import re
import sys
import warnings
from cython import sizeof

# Cache mapping Arrow type ids to the NumPy dtypes used on the pandas side.
# It starts out empty and is filled in by _get_pandas_type_map() the first
# time that function is called.
# These are imprecise because the type (in pandas 0.x) depends on the presence
# of nulls
cdef dict _pandas_type_map = {}


def _get_pandas_type_map():
    """
    Return the Arrow type id -> NumPy dtype table used for pandas.

    The module-level dict is populated on the first call and the very same
    dict object is handed back on every call.  Types whose dtype depends on
    the time unit (timestamp, duration) map to a nested dict keyed by the
    unit string.
    """
    global _pandas_type_map
    if _pandas_type_map:
        # Already populated by an earlier call.
        return _pandas_type_map

    object_dtype = np.object_
    timestamp_by_unit = {
        's': np.dtype('datetime64[s]'),
        'ms': np.dtype('datetime64[ms]'),
        'us': np.dtype('datetime64[us]'),
        'ns': np.dtype('datetime64[ns]'),
    }
    duration_by_unit = {
        's': np.dtype('timedelta64[s]'),
        'ms': np.dtype('timedelta64[ms]'),
        'us': np.dtype('timedelta64[us]'),
        'ns': np.dtype('timedelta64[ns]'),
    }

    _pandas_type_map.update({
        _Type_NA: object_dtype,  # NaNs
        _Type_BOOL: np.bool_,
        _Type_INT8: np.int8,
        _Type_INT16: np.int16,
        _Type_INT32: np.int32,
        _Type_INT64: np.int64,
        _Type_UINT8: np.uint8,
        _Type_UINT16: np.uint16,
        _Type_UINT32: np.uint32,
        _Type_UINT64: np.uint64,
        _Type_HALF_FLOAT: np.float16,
        _Type_FLOAT: np.float32,
        _Type_DOUBLE: np.float64,
        # Pandas does not support [D]ay, so default to [ms] for date32
        _Type_DATE32: np.dtype('datetime64[ms]'),
        _Type_DATE64: np.dtype('datetime64[ms]'),
        _Type_TIMESTAMP: timestamp_by_unit,
        _Type_DURATION: duration_by_unit,
        _Type_BINARY: object_dtype,
        _Type_FIXED_SIZE_BINARY: object_dtype,
        _Type_STRING: object_dtype,
        _Type_LIST: object_dtype,
        _Type_MAP: object_dtype,
        _Type_DECIMAL128: object_dtype,
    })
    return _pandas_type_map


# Arrow type id -> single-character format code used when building PEP 3118
# format strings (see _datatype_to_pep3118).  Only fixed-width integer and
# floating-point types are listed; any other type id has no entry.
cdef dict _pep3118_type_map = {
    _Type_INT8: b'b',
    _Type_INT16: b'h',
    _Type_INT32: b'i',
    _Type_INT64: b'q',
    _Type_UINT8: b'B',
    _Type_UINT16: b'H',
    _Type_UINT32: b'I',
    _Type_UINT64: b'Q',
    _Type_HALF_FLOAT: b'e',
    _Type_FLOAT: b'f',
    _Type_DOUBLE: b'd',
}


cdef bytes _datatype_to_pep3118(CDataType* type):
    """
    Build the PEP 3118 format string for the given datatype.

    Integer codes get a ``=`` prefix so that they denote "standard" rather
    than native widths; all other codes are returned unchanged.
    None is returned when the type id has no entry in ``_pep3118_type_map``.
    """
    fmt = _pep3118_type_map.get(type.id())
    if fmt is None:
        # Unsupported type
        return None
    if fmt in b'bBhHiIqQ':
        # Use "standard" int widths, not native
        return b'=' + fmt
    return fmt


cdef void* _as_c_pointer(v, allow_null=False) except *:
    """
    Convert a Python object to a raw C pointer.

    Used mainly for the C data interface.
    Integers are accepted as well as capsule objects whose name is NULL or
    "r_extptr" (the latter two for compatibility with raw pointers exported
    by reticulate).  Floats are also accepted, with a UserWarning, for
    compatibility with older versions of the R Arrow library.

    Parameters
    ----------
    v : int, float or PyCapsule
        The pointer value to convert.
    allow_null : bool, default False
        Whether a resulting NULL pointer is returned instead of rejected.

    Returns
    -------
    void*
        The address carried by ``v``.

    Raises
    ------
    TypeError
        If ``v`` is not an int, a float or a capsule.
    ValueError
        If ``v`` is a capsule with any other name, or if the resulting
        pointer is NULL while ``allow_null`` is false.
    """
    cdef void* c_ptr
    cdef const char* capsule_name
    if isinstance(v, int):
        c_ptr = <void*> <uintptr_t > v
    elif isinstance(v, float):
        warnings.warn(
            "Passing a pointer value as a float is unsafe and only "
            "supported for compatibility with older versions of the R "
            "Arrow library", UserWarning, stacklevel=2)
        c_ptr = <void*> <uintptr_t > v
    elif PyCapsule_CheckExact(v):
        # An R external pointer was how the R bindings passed pointer values to
        # Python from versions 7 to 15 (inclusive); however, the reticulate 1.35.0
        # update changed the name of the capsule from NULL to "r_extptr".
        # Newer versions of the R package pass a Python integer; however, this
        # workaround ensures that old versions of the R package continue to work
        # with newer versions of pyarrow.
        capsule_name = PyCapsule_GetName(v)
        if capsule_name == NULL or capsule_name == b"r_extptr":
            c_ptr = PyCapsule_GetPointer(v, capsule_name)
        else:
            capsule_name_str = capsule_name.decode()
            raise ValueError(
                f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address"
            )
    else:
        raise TypeError(f"Expected a pointer value, got {type(v)!r}")
    # The null check runs after conversion so it covers every accepted input
    # kind (int, float and capsule alike).
    if not allow_null and c_ptr == NULL:
        raise ValueError(f"Null pointer (value before cast = {v!r})")
    return c_ptr


def _is_primitive(Type type):
    """
    Return the result of ``is_primitive`` for the given type id.
    """
    # This is simply a redirect, the official API is in pyarrow.types.
    return is_primitive(type)


def _get_pandas_type(arrow_type, coerce_to_ns=False):
    """
    Look up the NumPy dtype used on the pandas side for an Arrow type.

    Parameters
    ----------
    arrow_type : object
        Arrow type to translate.  Its ``id`` attribute is read, and its
        ``unit`` attribute as well for unit-dependent types.
    coerce_to_ns : bool, default False
        If true, date, timestamp and duration types are reported with
        nanosecond resolution.  Non-temporal types are not affected.

    Returns
    -------
    dtype or None
        None when the type id has no entry in the pandas type map, or when
        a unit-dependent type carries a unit missing from its table.
    """
    cdef Type type_id = arrow_type.id
    cdef dict pandas_type_map = _get_pandas_type_map()
    if type_id not in pandas_type_map:
        return None
    if coerce_to_ns:
        # ARROW-3789: Coerce date/timestamp types to datetime64[ns].
        # Only temporal ids are coerced: any other mapped type (int64,
        # string, ...) must keep its regular dtype rather than being
        # reported as datetime64[ns].
        if type_id == _Type_DURATION:
            return np.dtype('timedelta64[ns]')
        if type_id in (_Type_DATE32, _Type_DATE64, _Type_TIMESTAMP):
            return np.dtype('datetime64[ns]')
    pandas_type = pandas_type_map[type_id]
    if isinstance(pandas_type, dict):
        # Unit-dependent entry: pick the dtype matching the type's unit.
        unit = getattr(arrow_type, 'unit', None)
        pandas_type = pandas_type.get(unit, None)
    return pandas_type


def _get_pandas_tz_type(arrow_type, coerce_to_ns=False):
    """
    Build the timezone-aware pandas dtype for an Arrow type.

    The dtype is produced by ``pyarrow.pandas_compat.make_datetimetz`` from
    the type's ``tz`` and either its own ``unit`` or ``'ns'`` when
    ``coerce_to_ns`` is true.
    """
    from pyarrow.pandas_compat import make_datetimetz
    if coerce_to_ns:
        unit = 'ns'
    else:
        unit = arrow_type.unit
    timezone = arrow_type.tz
    return make_datetimetz(unit, timezone)


def _to_pandas_dtype(arrow_type, options=None):
    """
    Translate an Arrow type into the dtype used on the pandas side.

    Parameters
    ----------
    arrow_type : object
        Arrow type to translate.
    options : mapping, optional
        Only the ``'coerce_temporal_nanoseconds'`` key is consulted.

    Returns
    -------
    The dtype from ``_get_pandas_tz_type`` when the type has a truthy ``tz``
    attribute, otherwise the one from ``_get_pandas_type``.

    Raises
    ------
    NotImplementedError
        If no dtype could be determined.
    """
    coerce_to_ns = options and options.get('coerce_temporal_nanoseconds', False)
    if not coerce_to_ns:
        # Not requested through the options: still coerce temporal types
        # when _pandas_api.is_v1() reports true.
        temporal_ids = [_Type_DATE32, _Type_DATE64, _Type_TIMESTAMP,
                        _Type_DURATION]
        coerce_to_ns = _pandas_api.is_v1() and arrow_type.id in temporal_ids

    has_tz = getattr(arrow_type, 'tz', None)
    resolve = _get_pandas_tz_type if has_tz else _get_pandas_type
    dtype = resolve(arrow_type, coerce_to_ns)

    if dtype:
        return dtype
    raise NotImplementedError(str(arrow_type))


# Workaround for Cython parsing bug
# https://github.com/cython/cython/issues/2143
# The alias gives the pointer type a plain name that can be used as the
# template argument of dynamic_cast[...] (see DataType.bit_width).
ctypedef CFixedWidthType* _CFixedWidthTypePtr


cdef class DataType(_Weakrefable):
    """
    Base class of all Arrow data types.

    Each data type is an *instance* of this class.

    Examples
    --------
    Instance of int64 type:

    >>> import pyarrow as pa
    >>> pa.int64()
    DataType(int64)
    """

    def __cinit__(self):
        # Nothing to set up at allocation time; the wrapped C++ type is
        # attached afterwards through init().
        pass

    def __init__(self):
        # Direct construction is always rejected; the message points the
        # caller at the public factory functions instead.
        cls_name = self.__class__.__name__
        message = ("Do not call {}'s constructor directly, use public "
                   "functions like pyarrow.int64, pyarrow.list_, etc. "
                   "instead.").format(cls_name)
        raise TypeError(message)

    cdef void init(self, const shared_ptr[CDataType]& type) except *:
        """
        Attach the wrapped C++ datatype to this instance.

        Stores the shared pointer, caches the raw pointer obtained from it
        and precomputes the PEP 3118 format via ``_datatype_to_pep3118``.
        The shared pointer is asserted to be non-null.
        """
        assert type != nullptr
        self.sp_type = type
        self.type = type.get()
        self.pep3118_format = _datatype_to_pep3118(self.type)

    cpdef Field field(self, i):
        """
        Return the child field at the given index.

        Parameters
        ----------
        i : int
            Index of the child field; it is passed through
            ``_normalize_index`` together with the number of fields.

        Returns
        -------
        pyarrow.Field

        Raises
        ------
        TypeError
            If ``i`` is not an ``int``.
        """
        cdef int index
        if not isinstance(i, int):
            raise TypeError(f"Expected int index, got type '{type(i)}'")
        index = <int> _normalize_index(i, self.type.num_fields())
        return pyarrow_wrap_field(self.type.field(index))

    @property
    def id(self):
        """
        The type id, as returned by the wrapped C++ type's ``id()``.
        """
        return self.type.id()

    @property
    def bit_width(self):
        """
        Bit width for fixed width type.

        Raises
        ------
        ValueError
            If the type is not a fixed width type.

        Examples
        --------
        >>> import pyarrow as pa
        >>> pa.int64()
        DataType(int64)
        >>> pa.int64().bit_width
        64
        """
        cdef _CFixedWidthTypePtr ty
        # The cast yields nullptr when the wrapped type is not a
        # CFixedWidthType, which is how non-fixed-width types are detected.
        ty = dynamic_cast[_CFixedWidthTypePtr](self.type)
        if ty == nullptr:
            raise ValueError("Non-fixed width type")
        return ty.bit_width()

    @property
    def byte_width(self):
        """
        Byte width for fixed width type.

        Raises
        ------
        ValueError
            If the type is not a fixed width type, or if its byte width is
            reported as 0 while its bit width is non-zero.

        Examples
        --------
        >>> import pyarrow as pa
        >>> pa.int64()
        DataType(int64)
        >>> pa.int64().byte_width
        8
        """
        cdef _CFixedWidthTypePtr ty
        # The cast yields nullptr when the wrapped type is not a
        # CFixedWidthType, which is how non-fixed-width types are detected.
        ty = dynamic_cast[_CFixedWidthTypePtr](self.type)
        if ty == nullptr:
            raise ValueError("Non-fixed width type")
        byte_width = ty.byte_width()
        # A zero byte width with a non-zero bit width means the type is
        # narrower than one byte, so no meaningful byte width exists.
        if byte_width == 0 and self.bit_width != 0:
            raise ValueError("Less than one byte")
        return byte_width

    @property
    def num_fields(self):
        """
        The number of child fields.

        The value comes straight from the wrapped C++ type; as the examples
        show, a type without children reports 0.

        Examples
        --------
        >>> import pyarrow as pa
        >>> pa.int64()
        DataType(int64)
        >>> pa.int64().num_fields
        0
        >>> pa.list_(pa.string())
        ListType(list<item: string>)
        >>> pa.list_(pa.string()).num_fields
        1
        >>> struct = pa.struct({'x': pa.int32(), 'y': pa.string()})
        >>> struct.num_fields
        2
        """
        return self.type.num_fields()

    @property
    def num_buffers(self):
        """
        Number of data buffers required to construct Array type
        excluding children.

        The count is the size of the ``buffers`` sequence in the wrapped
        type's layout.

        Examples
        --------
        >>> import pyarrow as pa
        >>> pa.int64().num_buffers
        2
        >>> pa.string().num_buffers
        3
        """
        return self.type.layout().buffers.size()

    def __str__(self):
        # The text is the wrapped C++ type's ToString() output, converted to
        # a Python str by frombytes().
        return frombytes(self.type.ToString(), safe=True)

    def __hash__(self):
        # The hash is derived from the string form, so two types that print
        # identically hash identically.
        return hash(str(self))

    def __reduce__(self):
        # Pickle support: the instance is rebuilt by calling
        # type_for_alias() with the string form of this type.
        # NOTE(review): presumably this only round-trips for types whose
        # string form is a recognised alias -- confirm that other types
        # override __reduce__.
        return type_for_alias, (str(self),)

    def __repr__(self):
        # Rendered as "<class name>(<str form>)", e.g. "DataType(int64)".
        template = '{0.__class__.__name__}({0})'
        return template.format(self)

    def __eq__(self, other):
        # Delegate to equals(); if it rejects the operand with a TypeError
        # or ValueError, hand the comparison back to Python instead of
        # propagating the error.
        try:
            same = self.equals(other)
        except (TypeError, ValueError):
            return NotImplemented
        return same
Loading ...