# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
import warnings
from cython import sizeof
cdef class ChunkedArray(_PandasConvertible):
"""
An array-like composed from a (possibly empty) collection of pyarrow.Arrays
Warnings
--------
Do not call this class's constructor directly.
Examples
--------
To construct a ChunkedArray object use :func:`pyarrow.chunked_array`:
>>> import pyarrow as pa
>>> pa.chunked_array([], type=pa.int8())
<pyarrow.lib.ChunkedArray object at ...>
[
...
]
>>> pa.chunked_array([[2, 2, 4], [4, 5, 100]])
<pyarrow.lib.ChunkedArray object at ...>
[
[
2,
2,
4
],
[
4,
5,
100
]
]
>>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray)
True
"""
def __cinit__(self):
    # Start with a NULL raw pointer; init() is what assigns the real one.
    self.chunked_array = NULL
def __init__(self):
    # Direct instantiation is not supported: always fail and point the
    # caller at the `chunked_array` factory function instead.
    message = ("Do not call ChunkedArray's constructor directly, use "
               "`chunked_array` function instead.")
    raise TypeError(message)
cdef void init(self, const shared_ptr[CChunkedArray]& chunked_array):
    # Store the shared_ptr on the instance and cache the raw pointer
    # obtained from it; other methods dereference `self.chunked_array`.
    self.sp_chunked_array = chunked_array
    self.chunked_array = chunked_array.get()
def __reduce__(self):
    """
    Pickle support: describe this object as a call to ``chunked_array``
    with the current chunks and data type as arguments.
    """
    rebuild_args = (self.chunks, self.type)
    return chunked_array, rebuild_args
@property
def data(self):
    """
    Return this ChunkedArray itself, emitting a FutureWarning.

    Per the warning text, the attribute exists only for compatibility
    after Column was removed; callers can simply drop ``.data``.
    """
    import warnings

    message = ("Calling .data on ChunkedArray is provided for "
               "compatibility after Column was removed, simply drop "
               "this attribute")
    warnings.warn(message, FutureWarning)
    return self
@property
def type(self):
    """
    Return data type of a ChunkedArray.

    The C++ type held by the underlying chunked array is wrapped with
    ``pyarrow_wrap_data_type`` before being returned.

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
    >>> n_legs.type
    DataType(int64)
    """
    return pyarrow_wrap_data_type(self.sp_chunked_array.get().type())
def length(self):
    """
    Return length of a ChunkedArray.

    The value is the one reported by the underlying chunked array.

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
    >>> n_legs.length()
    6
    """
    total_length = self.chunked_array.length()
    return total_length
def __len__(self):
    # len(obj) is defined as whatever length() reports.
    n_items = self.length()
    return n_items
def __repr__(self):
    # First line: the default object repr (type name and address);
    # after a newline: the pretty-printed contents from str(self).
    header = object.__repr__(self)
    body = str(self)
    return "\n".join((header, body))
def to_string(self, *, int indent=0, int window=5, int container_window=2,
              c_bool skip_new_lines=False):
    """
    Render a "pretty-printed" string representation of the ChunkedArray

    Parameters
    ----------
    indent : int, default 0
        How much to indent right the content of the array.
    window : int, default 5
        How many items to preview within each chunk at the begin and end
        of the chunk when the chunk is bigger than the window.
        The other elements will be ellipsed.
    container_window : int, default 2
        How many chunks to preview at the begin and end
        of the array when the array is bigger than the window.
        The other elements will be ellipsed.
        This setting also applies to list columns.
    skip_new_lines : bool, default False
        If the array should be rendered as a single line of text
        or if each element should be on its own line.

    Returns
    -------
    str

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
    >>> n_legs.to_string(skip_new_lines=True)
    '[[2,2,4],[4,5,100]]'
    """
    cdef:
        PrettyPrintOptions options
        c_string result

    with nogil:
        # Build the printing options, then let the C++ pretty-printer
        # write its output into `result`.
        options = PrettyPrintOptions(indent, window)
        options.container_window = container_window
        options.skip_new_lines = skip_new_lines
        check_status(
            PrettyPrint(deref(self.chunked_array), options, &result)
        )

    return frombytes(result, safe=True)
def format(self, **kwargs):
    """
    DEPRECATED, use pyarrow.ChunkedArray.to_string

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments passed through unchanged to ``to_string``.

    Returns
    -------
    str
    """
    import warnings

    deprecation_message = ('ChunkedArray.format is deprecated, '
                           'use ChunkedArray.to_string')
    warnings.warn(deprecation_message)
    rendered = self.to_string(**kwargs)
    return rendered
def __str__(self):
    # str(obj) is the pretty-printed form with all default options.
    rendered = self.to_string()
    return rendered
def validate(self, *, full=False):
    """
    Perform validation checks.  An exception is raised if validation fails.

    By default only cheap validation checks are run.  Pass `full=True`
    for thorough validation checks (potentially O(n)).

    Parameters
    ----------
    full : bool, default False
        If True, run expensive checks, otherwise cheap checks only.

    Raises
    ------
    ArrowInvalid
    """
    if not full:
        # Default path: cheap checks only.
        with nogil:
            check_status(self.sp_chunked_array.get().Validate())
        return

    # Thorough path, explicitly requested by the caller.
    with nogil:
        check_status(self.sp_chunked_array.get().ValidateFull())
@property
def null_count(self):
    """
    Number of null entries

    Returns
    -------
    int

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
    >>> n_legs.null_count
    1
    """
    n_nulls = self.chunked_array.null_count()
    return n_nulls
@property
def nbytes(self):
    """
    Total number of bytes consumed by the elements of the chunked array.

    In other words, the sum of bytes from all buffer ranges referenced.

    Unlike `get_total_buffer_size` this method will account for array
    offsets.

    If buffers are shared between arrays then the shared
    portion will be counted multiple times.

    The dictionary of dictionary arrays will always be counted in their
    entirety even if the array only references a portion of the dictionary.

    Returns
    -------
    int

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
    >>> n_legs.nbytes
    49
    """
    cdef:
        CResult[int64_t] c_res_buffer
        # Declared as a C integer explicitly: it is assigned inside the
        # nogil section below, where Python objects cannot be created.
        int64_t size

    with nogil:
        c_res_buffer = ReferencedBufferSize(deref(self.chunked_array))
        # Unwrap the Result returned by the C++ call.
        size = GetResultValue(c_res_buffer)
    return size
def get_total_buffer_size(self):
    """
    The sum of bytes in each buffer referenced by the chunked array.

    An array may only reference a portion of a buffer.
    This method will overestimate in this case and return the
    byte size of the entire buffer.

    If a buffer is referenced multiple times then it will
    only be counted once.

    Returns
    -------
    int

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
    >>> n_legs.get_total_buffer_size()
    49
    """
    cdef int64_t n_bytes = TotalBufferSize(deref(self.chunked_array))
    return n_bytes
def __sizeof__(self):
    # Size reported by the parent class for the Python object itself,
    # plus the bytes reported by the `nbytes` property.
    object_size = super(ChunkedArray, self).__sizeof__()
    return object_size + self.nbytes
def __iter__(self):
    # Yield every item of every chunk, chunk by chunk, in order.
    for chunk in self.iterchunks():
        yield from chunk
def __getitem__(self, key):
    """
    Slice or return value at given index

    Parameters
    ----------
    key : integer or slice
        Slices with step not equal to 1 (or None) will produce a copy
        rather than a zero-copy view

    Returns
    -------
    value : Scalar (index) or ChunkedArray (slice)
    """
    if not isinstance(key, slice):
        # Single element: normalize the index against the current
        # length, then fetch that element.
        index = _normalize_index(key, self.chunked_array.length())
        return self.getitem(index)

    return _normalize_slice(self, key)
cdef getitem(self, int64_t i):
    # Ask the underlying chunked array for the scalar at position i,
    # unwrap the returned Result, and wrap the value as a Python Scalar.
    return Scalar.wrap(GetResultValue(self.chunked_array.GetScalar(i)))
def is_null(self, *, nan_is_null=False):
    """
    Return boolean array indicating the null values.

    Parameters
    ----------
    nan_is_null : bool (optional, default False)
        Whether floating-point NaN values should also be considered null.

    Returns
    -------
    array : boolean Array or ChunkedArray

    Examples
    --------
    >>> import pyarrow as pa
    >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]])
    >>> n_legs.is_null()
    <pyarrow.lib.ChunkedArray object at ...>
    [
      [
        false,
        false,
        false,
        false,
        true,
        false
      ]
    ]
    """
    # Dispatch to the 'is_null' compute function with the NaN handling
    # chosen by the caller.
    null_options = _pc().NullOptions(nan_is_null=nan_is_null)
    return _pc().call_function('is_null', [self], null_options)
def is_nan(self):
"""
Loading ...