Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

/ table.pxi

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New

import warnings
from cython import sizeof

cdef class ChunkedArray(_PandasConvertible):
    """
    An array-like composed from a (possibly empty) collection of pyarrow.Arrays

    Warnings
    --------
    Do not call this class's constructor directly.

    Examples
    --------
    To construct a ChunkedArray object use :func:`pyarrow.chunked_array`:

    >>> import pyarrow as pa
    >>> pa.chunked_array([], type=pa.int8())
    <pyarrow.lib.ChunkedArray object at ...>
    [
    ...
    ]

    >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]])
    <pyarrow.lib.ChunkedArray object at ...>
    [
      [
        2,
        2,
        4
      ],
      [
        4,
        5,
        100
      ]
    ]
    >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray)
    True
    """

    def __cinit__(self):
        self.chunked_array = NULL

    def __init__(self):
        raise TypeError("Do not call ChunkedArray's constructor directly, use "
                        "`chunked_array` function instead.")

    cdef void init(self, const shared_ptr[CChunkedArray]& chunked_array):
        self.sp_chunked_array = chunked_array
        self.chunked_array = chunked_array.get()

    def __reduce__(self):
        return chunked_array, (self.chunks, self.type)

    @property
    def data(self):
        import warnings
        warnings.warn("Calling .data on ChunkedArray is provided for "
                      "compatibility after Column was removed, simply drop "
                      "this attribute", FutureWarning)
        return self

    @property
    def type(self):
        """
        Return data type of a ChunkedArray.

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
        >>> n_legs.type
        DataType(int64)
        """
        return pyarrow_wrap_data_type(self.sp_chunked_array.get().type())

    def length(self):
        """
        Return length of a ChunkedArray.

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
        >>> n_legs.length()
        6
        """
        return self.chunked_array.length()

    def __len__(self):
        return self.length()

    def __repr__(self):
        type_format = object.__repr__(self)
        return '{0}\n{1}'.format(type_format, str(self))

    def to_string(self, *, int indent=0, int window=5, int container_window=2,
                  c_bool skip_new_lines=False):
        """
        Render a "pretty-printed" string representation of the ChunkedArray

        Parameters
        ----------
        indent : int
            How much to indent right the content of the array,
            by default ``0``.
        window : int
            How many items to preview within each chunk at the begin and end
            of the chunk when the chunk is bigger than the window.
            The other elements will be ellipsed.
        container_window : int
            How many chunks to preview at the begin and end
            of the array when the array is bigger than the window.
            The other elements will be ellipsed.
            This setting also applies to list columns.
        skip_new_lines : bool
            If the array should be rendered as a single line of text
            or if each element should be on its own line.

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
        >>> n_legs.to_string(skip_new_lines=True)
        '[[2,2,4],[4,5,100]]'
        """
        cdef:
            c_string result
            PrettyPrintOptions options

        with nogil:
            options = PrettyPrintOptions(indent, window)
            options.skip_new_lines = skip_new_lines
            options.container_window = container_window
            check_status(
                PrettyPrint(
                    deref(self.chunked_array),
                    options,
                    &result
                )
            )

        return frombytes(result, safe=True)

    def format(self, **kwargs):
        """
        DEPRECATED, use pyarrow.ChunkedArray.to_string

        Parameters
        ----------
        **kwargs : dict

        Returns
        -------
        str
        """
        import warnings
        warnings.warn('ChunkedArray.format is deprecated, '
                      'use ChunkedArray.to_string')
        return self.to_string(**kwargs)

    def __str__(self):
        return self.to_string()

    def validate(self, *, full=False):
        """
        Perform validation checks.  An exception is raised if validation fails.

        By default only cheap validation checks are run.  Pass `full=True`
        for thorough validation checks (potentially O(n)).

        Parameters
        ----------
        full : bool, default False
            If True, run expensive checks, otherwise cheap checks only.

        Raises
        ------
        ArrowInvalid
        """
        if full:
            with nogil:
                check_status(self.sp_chunked_array.get().ValidateFull())
        else:
            with nogil:
                check_status(self.sp_chunked_array.get().Validate())

    @property
    def null_count(self):
        """
        Number of null entries

        Returns
        -------
        int

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
        >>> n_legs.null_count
        1
        """
        return self.chunked_array.null_count()

    @property
    def nbytes(self):
        """
        Total number of bytes consumed by the elements of the chunked array.

        In other words, the sum of bytes from all buffer ranges referenced.

        Unlike `get_total_buffer_size` this method will account for array
        offsets.

        If buffers are shared between arrays then the shared
        portion will only be counted multiple times.

        The dictionary of dictionary arrays will always be counted in their
        entirety even if the array only references a portion of the dictionary.

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
        >>> n_legs.nbytes
        49
        """
        cdef:
            CResult[int64_t] c_res_buffer

        with nogil:
            c_res_buffer = ReferencedBufferSize(deref(self.chunked_array))
            size = GetResultValue(c_res_buffer)
        return size

    def get_total_buffer_size(self):
        """
        The sum of bytes in each buffer referenced by the chunked array.

        An array may only reference a portion of a buffer.
        This method will overestimate in this case and return the
        byte size of the entire buffer.

        If a buffer is referenced multiple times then it will
        only be counted once.

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
        >>> n_legs.get_total_buffer_size()
        49
        """
        cdef:
            int64_t total_buffer_size

        total_buffer_size = TotalBufferSize(deref(self.chunked_array))
        return total_buffer_size

    def __sizeof__(self):
        return super(ChunkedArray, self).__sizeof__() + self.nbytes

    def __iter__(self):
        for chunk in self.iterchunks():
            for item in chunk:
                yield item

    def __getitem__(self, key):
        """
        Slice or return value at given index

        Parameters
        ----------
        key : integer or slice
            Slices with step not equal to 1 (or None) will produce a copy
            rather than a zero-copy view

        Returns
        -------
        value : Scalar (index) or ChunkedArray (slice)
        """

        if isinstance(key, slice):
            return _normalize_slice(self, key)

        return self.getitem(_normalize_index(key, self.chunked_array.length()))

    cdef getitem(self, int64_t i):
        return Scalar.wrap(GetResultValue(self.chunked_array.GetScalar(i)))

    def is_null(self, *, nan_is_null=False):
        """
        Return boolean array indicating the null values.

        Parameters
        ----------
        nan_is_null : bool (optional, default False)
            Whether floating-point NaN values should also be considered null.

        Returns
        -------
        array : boolean Array or ChunkedArray

        Examples
        --------
        >>> import pyarrow as pa
        >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
        >>> n_legs.is_null()
        <pyarrow.lib.ChunkedArray object at ...>
        [
          [
            false,
            false,
            false,
            false,
            true,
            false
          ]
        ]
        """
        options = _pc().NullOptions(nan_is_null=nan_is_null)
        return _pc().call_function('is_null', [self], options)

    def is_nan(self):
        """
Loading ...