Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

/ array.pxi

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New

import os
import warnings
from cython import sizeof


cdef _sequence_to_array(object sequence, object mask, object size,
                        DataType type, CMemoryPool* pool, c_bool from_pandas):
    """
    Convert a Python sequence into a pyarrow Array or ChunkedArray.

    Parameters
    ----------
    sequence : object
        The Python object handed to ConvertPySequence.
    mask : object
        Mask forwarded unchanged to ConvertPySequence (may be None).
    size : object
        If not None, stored as the ``size`` conversion option.
    type : DataType
        If not None, its underlying C type is set as the target type.
    pool : CMemoryPool*
        Memory pool passed to the conversion.
    from_pandas : c_bool
        Stored as the ``from_pandas`` conversion option.

    Returns
    -------
    Array when the conversion produced exactly one chunk, otherwise a
    ChunkedArray wrapping the full result.
    """
    cdef:
        PyConversionOptions options
        shared_ptr[CChunkedArray] chunked

    if type is not None:
        options.type = type.sp_type

    if size is not None:
        options.size = size

    options.from_pandas = from_pandas
    # The flag is the truthiness of the raw environment string: any
    # non-empty value (even "0") switches it on, unset or empty leaves it
    # off.
    options.ignore_timezone = bool(
        os.environ.get('PYARROW_IGNORE_TIMEZONE', False))

    with nogil:
        chunked = GetResultValue(
            ConvertPySequence(sequence, mask, options, pool)
        )

    if chunked.get().num_chunks() == 1:
        return pyarrow_wrap_array(chunked.get().chunk(0))
    else:
        return pyarrow_wrap_chunked_array(chunked)


cdef inline _is_array_like(obj):
    # NumPy arrays always qualify; anything else is only array-like when
    # pandas is available and pandas_api recognises it as such.
    return isinstance(obj, np.ndarray) or (
        pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
    )


def _ndarray_to_arrow_type(object values, DataType type):
    """
    Python-visible wrapper around ``_ndarray_to_type``.

    Resolves the C-level data type for ``values`` / ``type`` and returns it
    wrapped as a Python object via ``pyarrow_wrap_data_type``.
    """
    cdef shared_ptr[CDataType] resolved = _ndarray_to_type(values, type)
    return pyarrow_wrap_data_type(resolved)


cdef shared_ptr[CDataType] _ndarray_to_type(object values,
                                            DataType type) except *:
    # Pick the C data type to convert ``values`` to:
    #   * an explicit ``type`` always wins;
    #   * otherwise a non-object dtype is mapped through NumPyDtypeToArrow;
    #   * otherwise the default-constructed pointer is returned as is.
    cdef shared_ptr[CDataType] resolved

    # Read the dtype up front, whether or not an explicit type was given.
    np_dtype = values.dtype

    if type is not None:
        resolved = type.sp_type
    elif np_dtype != object:
        resolved = GetResultValue(NumPyDtypeToArrow(np_dtype))

    return resolved


cdef _ndarray_to_array(object values, object mask, DataType type,
                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
    """
    Convert an ndarray (plus optional mask) through NdarrayToArrow.

    The target type comes from ``_ndarray_to_type(values, type)`` and the
    cast options are built from ``safe``. Returns a ChunkedArray when the
    conversion produced more than one chunk, otherwise the first chunk
    wrapped as an Array.
    """
    cdef:
        shared_ptr[CDataType] target_type = _ndarray_to_type(values, type)
        CCastOptions cast_opts = CCastOptions(safe)
        shared_ptr[CChunkedArray] converted

    with nogil:
        check_status(NdarrayToArrow(pool, values, mask, from_pandas,
                                    target_type, cast_opts, &converted))

    if converted.get().num_chunks() <= 1:
        return pyarrow_wrap_array(converted.get().chunk(0))
    return pyarrow_wrap_chunked_array(converted)


cdef _codes_to_indices(object codes, object mask, DataType type,
                       MemoryPool memory_pool):
    """
    Build the indices of a pyarrow DictionaryArray from the codes of a
    pandas Categorical.

    A code of -1 marks a missing value; those positions are nulled out in
    addition to any positions flagged by the caller-supplied ``mask``.
    """
    null_positions = codes == -1
    if mask is not None:
        null_positions = mask | null_positions
    return array(codes, mask=null_positions, type=type,
                 memory_pool=memory_pool)


def _handle_arrow_array_protocol(obj, type, mask, size):
    """
    Convert ``obj`` through its ``__arrow_array__`` protocol method.

    Parameters
    ----------
    obj : object
        Object exposing ``__arrow_array__(type=...)``.
    type : object
        Forwarded to ``__arrow_array__`` as the ``type`` keyword.
    mask, size : object
        Must both be None; they are not supported on this path.

    Returns
    -------
    The Array or ChunkedArray produced by the protocol method. A
    ChunkedArray holding exactly one chunk is unwrapped to that chunk.

    Raises
    ------
    ValueError
        If ``mask`` or ``size`` is given.
    TypeError
        If the protocol method returns neither an Array nor a ChunkedArray.
    """
    if not (mask is None and size is None):
        raise ValueError(
            "Cannot specify a mask or a size when passing an object that is "
            "converted with the __arrow_array__ protocol.")

    converted = obj.__arrow_array__(type=type)

    if isinstance(converted, ChunkedArray):
        # A lone chunk is handed back as a plain Array.
        if converted.num_chunks == 1:
            return converted.chunk(0)
        return converted
    if isinstance(converted, Array):
        return converted

    raise TypeError("The object's __arrow_array__ method does not "
                    "return a pyarrow Array or ChunkedArray.")


def array(object obj, type=None, mask=None, size=None, from_pandas=None,
          bint safe=True, MemoryPool memory_pool=None):
    """
    Create pyarrow.Array instance from a Python object.

    Parameters
    ----------
    obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array
        If both type and size are specified may be a single use iterable. If
        not strongly-typed, Arrow type will be inferred for resulting array.
        Any Arrow-compatible array that implements the Arrow PyCapsule Protocol
        (has an ``__arrow_c_array__`` method) can be passed as well.
    type : pyarrow.DataType
        Explicit type to attempt to coerce to, otherwise will be inferred from
        the data.
    mask : array[bool], optional
        Indicate which values are null (True) or not null (False).
    size : int64, optional
        Size of the elements. If the input is larger than size bail at this
        length. For iterators, if size is larger than the input iterator this
        will be treated as a "max size", but will involve an initial allocation
        of size followed by a resize to the actual size (so if you know the
        exact size specifying it correctly will give you better performance).
    from_pandas : bool, default None
        Use pandas's semantics for inferring nulls from values in
        ndarray-like data. If passed, the mask takes precedence, but
        if a value is unmasked (not-null), but still null according to
        pandas semantics, then it is null. Defaults to False if not
        passed explicitly by user, or True if a pandas object is
        passed in.
    safe : bool, default True
        Check for overflows or other unsafe conversions.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the currently-set default
        memory pool.

    Returns
    -------
    array : pyarrow.Array or pyarrow.ChunkedArray
        A ChunkedArray instead of an Array is returned if:

        - the object data overflowed binary storage.
        - the object's ``__arrow_array__`` protocol method returned a chunked
          array.

    Notes
    -----
    Timezone will be preserved in the returned array for timezone-aware data,
    else no timezone will be returned for naive timestamps.
    Internally, UTC values are stored for timezone-aware data with the
    timezone set in the data type.

    Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by
    default converted as MonthDayNanoIntervalArray. relativedelta leapdays
    are ignored as are all absolute fields on both objects. datetime.timedelta
    can also be converted to MonthDayNanoIntervalArray but this requires
    passing MonthDayNanoIntervalType explicitly.

    Converting to dictionary array will promote to a wider integer type for
    indices if the number of distinct values cannot be represented, even if
    the index type was explicitly set. This means that if there are more than
    127 values the returned dictionary array's index type will be at least
    pa.int16() even if pa.int8() was passed to the function. Note that an
    explicit index type will not be demoted even if it is wider than required.

    Examples
    --------
    >>> import pandas as pd
    >>> import pyarrow as pa
    >>> pa.array(pd.Series([1, 2]))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2
    ]

    >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
    <pyarrow.lib.DictionaryArray object at ...>
    ...
    -- dictionary:
      [
        "a",
        "b"
      ]
    -- indices:
      [
        0,
        1,
        0
      ]

    >>> import numpy as np
    >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      null
    ]

    >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()))
    >>> arr.type.index_type
    DataType(int16)
    """
    cdef:
        CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        bint is_pandas_object = False
        bint c_from_pandas

    type = ensure_type(type, allow_none=True)

    extension_type = None
    if type is not None and type.id == _Type_EXTENSION:
        extension_type = type
        type = type.storage_type

    if from_pandas is None:
        c_from_pandas = False
    else:
        c_from_pandas = from_pandas

    if isinstance(obj, Array):
        if type is not None and not obj.type.equals(type):
            obj = obj.cast(type, safe=safe, memory_pool=memory_pool)
        return obj

    if hasattr(obj, '__arrow_array__'):
        return _handle_arrow_array_protocol(obj, type, mask, size)
    elif hasattr(obj, '__arrow_c_array__'):
        if type is not None:
            requested_type = type.__arrow_c_schema__()
        else:
            requested_type = None
        schema_capsule, array_capsule = obj.__arrow_c_array__(requested_type)
        out_array = Array._import_from_c_capsule(schema_capsule, array_capsule)
        if type is not None and out_array.type != type:
            # PyCapsule interface type coercion is best effort, so we need to
            # check the type of the returned array and cast if necessary
            out_array = array.cast(type, safe=safe, memory_pool=memory_pool)
        return out_array
    elif _is_array_like(obj):
        if mask is not None:
            if _is_array_like(mask):
                mask = get_values(mask, &is_pandas_object)
            else:
                raise TypeError("Mask must be a numpy array "
                                "when converting numpy arrays")

        values = get_values(obj, &is_pandas_object)
        if is_pandas_object and from_pandas is None:
            c_from_pandas = True

        if isinstance(values, np.ma.MaskedArray):
            if mask is not None:
                raise ValueError("Cannot pass a numpy masked array and "
                                 "specify a mask at the same time")
            else:
                # don't use shrunken masks
                mask = None if values.mask is np.ma.nomask else values.mask
                values = values.data

        if mask is not None:
            if mask.dtype != np.bool_:
                raise TypeError("Mask must be boolean dtype")
            if mask.ndim != 1:
                raise ValueError("Mask must be 1D array")
            if len(values) != len(mask):
                raise ValueError(
                    "Mask is a different length from sequence being converted")

        if hasattr(values, '__arrow_array__'):
            return _handle_arrow_array_protocol(values, type, mask, size)
        elif (pandas_api.is_categorical(values) and
              type is not None and type.id != Type_DICTIONARY):
            result = _ndarray_to_array(
                np.asarray(values), mask, type, c_from_pandas, safe, pool
            )
        elif pandas_api.is_categorical(values):
            if type is not None:
                index_type = type.index_type
                value_type = type.value_type
                if values.ordered != type.ordered:
                    raise ValueError(
                        "The 'ordered' flag of the passed categorical values "
                        "does not match the 'ordered' of the specified type. ")
            else:
                index_type = None
                value_type = None

            indices = _codes_to_indices(
                values.codes, mask, index_type, memory_pool)
            try:
                dictionary = array(
                    values.categories.values, type=value_type,
                    memory_pool=memory_pool)
            except TypeError:
                # TODO when removing the deprecation warning, this whole
                # try/except can be removed (to bubble the TypeError of
                # the first array(..) call)
                if value_type is not None:
                    warnings.warn(
                        "The dtype of the 'categories' of the passed "
                        "categorical values ({0}) does not match the "
                        "specified type ({1}). For now ignoring the specified "
                        "type, but in the future this mismatch will raise a "
                        "TypeError".format(
                            values.categories.dtype, value_type),
                        FutureWarning, stacklevel=2)
                    dictionary = array(
                        values.categories.values, memory_pool=memory_pool)
                else:
                    raise

            return DictionaryArray.from_arrays(
                indices, dictionary, ordered=values.ordered, safe=safe)
        else:
            if pandas_api.have_pandas:
                values, type = pandas_api.compat.get_datetimetz_type(
                    values, obj.dtype, type)
            if type and type.id == _Type_RUN_END_ENCODED:
                arr = _ndarray_to_array(
                    values, mask, type.value_type, c_from_pandas, safe, pool)
                result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
                                              memory_pool=memory_pool)
            else:
                result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
Loading ...