Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

/ includes / libarrow_dataset.pxd

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++

from libcpp.unordered_map cimport unordered_map
from libcpp cimport bool as c_bool

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_acero cimport *
from pyarrow.includes.libarrow_fs cimport *


# Declarations taken from arrow/dataset/plan.h, resolved in the C++ namespace
# arrow::dataset::internal. The block is marked nogil, so everything declared
# in it may be called without holding the GIL.
cdef extern from "arrow/dataset/plan.h" namespace "arrow::dataset::internal" nogil:

    # Takes no arguments and returns nothing.
    cdef void Initialize()


# Function-type aliases for file-writer callbacks.
#
# cb_writer_finish_internal: takes a CFileWriter* and returns a CStatus. This
# is the signature wrapped by function[...] for the writer_pre_finish and
# writer_post_finish fields of CFileSystemDatasetWriteOptions.
#
# cb_writer_finish: takes a Python dict plus a CFileWriter* and returns
# nothing.
ctypedef CStatus cb_writer_finish_internal(CFileWriter*)
ctypedef void cb_writer_finish(dict, CFileWriter*)

cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:

    # Mirror of the C++ enum arrow::dataset::ExistingDataBehavior. Each
    # Cython-side name maps to the C++ enumerator given in its quoted string:
    #   ExistingDataBehavior_DELETE_MATCHING     -> kDeleteMatchingPartitions
    #   ExistingDataBehavior_OVERWRITE_OR_IGNORE -> kOverwriteOrIgnore
    #   ExistingDataBehavior_ERROR               -> kError
    # NOTE: the backslash line breaks sit inside the quoted C names, so a
    # comment cannot be placed between a member and its continuation line.
    cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior":
        ExistingDataBehavior_DELETE_MATCHING" \
            arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions"
        ExistingDataBehavior_OVERWRITE_OR_IGNORE" \
            arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore"
        ExistingDataBehavior_ERROR" \
            arrow::dataset::ExistingDataBehavior::kError"

    # Binding for the C++ type arrow::dataset::ScanOptions. Only the data
    # members listed here are visible to Cython code.
    cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
        shared_ptr[CSchema] dataset_schema
        shared_ptr[CSchema] projected_schema
        c_bool use_threads
        CExpression filter

    # Binding for arrow::dataset::ScanNodeOptions, declared as a subclass of
    # CExecNodeOptions.
    cdef cppclass CScanNodeOptions "arrow::dataset::ScanNodeOptions"(CExecNodeOptions):
        # Constructor: takes the dataset and the scan options, both as
        # shared pointers.
        CScanNodeOptions(shared_ptr[CDataset] dataset, shared_ptr[CScanOptions] scan_options)

        # Data member exposing the scan options held by the node options.
        shared_ptr[CScanOptions] scan_options

    # Binding for arrow::dataset::FragmentScanOptions. It is the declared base
    # of the CSV and JSON fragment scan option classes in this file.
    cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions":
        # Returns a type name as a C++ string.
        c_string type_name() const

    # Alias for an iterator over shared pointers to CScanTask, bound to the
    # C++ name arrow::dataset::ScanTaskIterator.
    ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
        "arrow::dataset::ScanTaskIterator"

    # Binding for arrow::dataset::ScanTask.
    cdef cppclass CScanTask" arrow::dataset::ScanTask":
        # Returns a CResult wrapping a record batch iterator.
        CResult[CRecordBatchIterator] Execute()

    # Binding for arrow::dataset::Fragment. It is the declared base of
    # CInMemoryFragment and CFileFragment in this file.
    cdef cppclass CFragment "arrow::dataset::Fragment":
        # Returns a CResult wrapping a shared pointer to a schema.
        CResult[shared_ptr[CSchema]] ReadPhysicalSchema()
        # Returns a CResult wrapping a scan task iterator for the given
        # scan options.
        CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options)
        c_bool splittable() const
        c_string type_name() const
        # Returned by const reference, not by value.
        const CExpression& partition_expression() const

    # Alias for a vector of shared pointers to CFragment, bound to the C++
    # name arrow::dataset::FragmentVector.
    ctypedef vector[shared_ptr[CFragment]] CFragmentVector \
        "arrow::dataset::FragmentVector"

    # Alias for an iterator over shared pointers to CFragment, bound to the
    # C++ name arrow::dataset::FragmentIterator.
    ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \
        "arrow::dataset::FragmentIterator"

    # Binding for arrow::dataset::InMemoryFragment, declared as a subclass of
    # CFragment.
    cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"(
            CFragment):
        # Constructor: takes a vector of record batches and a partition
        # expression, both passed by value.
        CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches,
                          CExpression partition_expression)

    # Binding for arrow::dataset::TaggedRecordBatch. It exposes two data
    # members: a record batch and a fragment, each held by shared pointer.
    cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch":
        shared_ptr[CRecordBatch] record_batch
        shared_ptr[CFragment] fragment

    # Alias for an iterator over CTaggedRecordBatch values, bound to the C++
    # name arrow::dataset::TaggedRecordBatchIterator.
    ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \
        "arrow::dataset::TaggedRecordBatchIterator"

    # Binding for arrow::dataset::Scanner. Every method except options()
    # returns its value wrapped in a CResult.
    cdef cppclass CScanner "arrow::dataset::Scanner":
        # Two constructor overloads: one takes a dataset, the other a single
        # fragment. Both take the scan options as the second argument.
        CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions])
        CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions])
        CResult[CScanTaskIterator] Scan()
        CResult[CTaggedRecordBatchIterator] ScanBatches()
        CResult[shared_ptr[CTable]] ToTable()
        # Takes the indices as a const reference to an array and returns a
        # table.
        CResult[shared_ptr[CTable]] TakeRows(const CArray& indices)
        CResult[shared_ptr[CTable]] Head(int64_t num_rows)
        CResult[int64_t] CountRows()
        CResult[CFragmentIterator] GetFragments()
        CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader()
        # Returns a const reference to the shared pointer holding the scan
        # options.
        const shared_ptr[CScanOptions]& options()

    # Binding for arrow::dataset::ScannerBuilder. The configuration methods
    # (Project, Filter, UseThreads, ...) each return a CStatus.
    cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder":
        # Two constructor overloads: one takes a dataset, the other a schema
        # together with a single fragment. Both take scan options last.
        CScannerBuilder(shared_ptr[CDataset],
                        shared_ptr[CScanOptions] scan_options)
        CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment],
                        shared_ptr[CScanOptions] scan_options)

        # Static factory: returns a builder created from a record batch
        # reader.
        @staticmethod
        shared_ptr[CScannerBuilder] FromRecordBatchReader(
            shared_ptr[CRecordBatchReader] reader)
        # ProjectColumns is a Cython-side name only; it binds to the C++
        # method "Project", as does the Project declaration below. The two
        # differ in their parameter lists (column names only vs. expressions
        # plus column names).
        CStatus ProjectColumns "Project"(const vector[c_string]& columns)
        CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns)
        CStatus Filter(CExpression filter)
        CStatus UseThreads(c_bool use_threads)
        CStatus Pool(CMemoryPool* pool)
        CStatus BatchSize(int64_t batch_size)
        CStatus BatchReadahead(int32_t batch_readahead)
        CStatus FragmentReadahead(int32_t fragment_readahead)
        CStatus FragmentScanOptions(
            shared_ptr[CFragmentScanOptions] fragment_scan_options)
        # The two methods below return their value wrapped in a CResult.
        CResult[shared_ptr[CScanOptions]] GetScanOptions()
        CResult[shared_ptr[CScanner]] Finish()
        shared_ptr[CSchema] schema() const

    # Alias for a vector of shared pointers to CDataset, bound to the C++
    # name arrow::dataset::DatasetVector.
    ctypedef vector[shared_ptr[CDataset]] CDatasetVector \
        "arrow::dataset::DatasetVector"

    # Binding for arrow::dataset::Dataset. It is the declared base of
    # CInMemoryDataset, CUnionDataset and CFileSystemDataset in this file.
    cdef cppclass CDataset "arrow::dataset::Dataset":
        # Returns a const reference to the shared pointer holding the schema.
        const shared_ptr[CSchema] & schema()
        # Two overloads: without arguments, or with a predicate expression.
        CResult[CFragmentIterator] GetFragments()
        CResult[CFragmentIterator] GetFragments(CExpression predicate)
        const CExpression & partition_expression()
        c_string type_name()

        # Takes a schema and returns a dataset wrapped in a CResult.
        CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])

        # Returns a scanner builder wrapped in a CResult.
        CResult[shared_ptr[CScannerBuilder]] NewScan()

    # Binding for arrow::dataset::InMemoryDataset, declared as a subclass of
    # CDataset.
    cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"(
            CDataset):
        # Two constructor overloads: from a record batch reader or from a
        # table.
        CInMemoryDataset(shared_ptr[CRecordBatchReader])
        CInMemoryDataset(shared_ptr[CTable])

    # Binding for arrow::dataset::UnionDataset, declared as a subclass of
    # CDataset.
    cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"(
            CDataset):
        # Static factory: takes a schema and a vector of child datasets and
        # returns the union dataset wrapped in a CResult.
        @staticmethod
        CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema,
                                                CDatasetVector children)

        # Returns the child datasets by const reference.
        const CDatasetVector& children() const

    # Binding for arrow::dataset::InspectOptions; exposes a single integer
    # field.
    cdef cppclass CInspectOptions "arrow::dataset::InspectOptions":
        # NOTE(review): presumably the number of fragments to inspect --
        # confirm against the C++ header.
        int fragments

    # Binding for arrow::dataset::FinishOptions. It exposes a schema, a
    # nested CInspectOptions value and a boolean flag.
    cdef cppclass CFinishOptions "arrow::dataset::FinishOptions":
        shared_ptr[CSchema] schema
        CInspectOptions inspect_options
        c_bool validate_fragments

    # Binding for arrow::dataset::DatasetFactory.
    cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory":
        # Both inspection methods take a CInspectOptions by value:
        # InspectSchemas returns a vector of schemas, Inspect a single one.
        CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions)
        CResult[shared_ptr[CSchema]] Inspect(CInspectOptions)
        # FinishWithSchema is a Cython-side name only; it binds to the C++
        # method "Finish", as does the no-argument Finish() below.
        CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"(
            const shared_ptr[CSchema]& schema)
        CResult[shared_ptr[CDataset]] Finish()
        # Getter and setter for the root partition expression; the setter
        # returns a CStatus.
        const CExpression& root_partition()
        CStatus SetRootPartition(CExpression partition)

    # Binding for arrow::dataset::UnionDatasetFactory. No base class is
    # declared on the Cython side.
    cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory":
        # Static factory: takes a vector of dataset factories and returns a
        # CDatasetFactory (not a CUnionDatasetFactory) wrapped in a CResult.
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] Make(
            vector[shared_ptr[CDatasetFactory]] factories)

    # Binding for arrow::dataset::FileSource. The accessors expose a path, a
    # filesystem, a buffer and a size.
    cdef cppclass CFileSource "arrow::dataset::FileSource":
        const c_string& path() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CBuffer]& buffer() const
        const int64_t size() const
        # HACK: Cython can't handle all the overloads so don't declare them.
        # This means invalid construction of CFileSource won't be caught in
        # the C++ generation phase (though it will still be caught when
        # the generated C++ is compiled).
        CFileSource(...)

    # Binding for arrow::dataset::FileWriteOptions. It is the declared base
    # of the IPC and CSV write option classes in this file.
    cdef cppclass CFileWriteOptions \
            "arrow::dataset::FileWriteOptions":
        # Returns a const reference to the shared pointer holding the file
        # format.
        const shared_ptr[CFileFormat]& format() const
        c_string type_name() const

    # Binding for arrow::dataset::FileWriter. A pointer to this type is the
    # argument of the cb_writer_finish* callback typedefs declared at module
    # level.
    cdef cppclass CFileWriter \
            "arrow::dataset::FileWriter":
        # The four accessors below return const references.
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CSchema]& schema() const
        const shared_ptr[CFileWriteOptions]& options() const
        const CFileLocator& destination() const
        # Returns a 64-bit integer wrapped in a CResult.
        CResult[int64_t] GetBytesWritten()

    # Binding for arrow::dataset::FileFormat. It is the declared base of the
    # IPC, ORC, CSV and JSON file format classes in this file.
    cdef cppclass CFileFormat "arrow::dataset::FileFormat":
        # Data member (not a method) holding the fragment scan options.
        shared_ptr[CFragmentScanOptions] default_fragment_scan_options
        c_string type_name() const
        # Takes a file source by const reference and returns a schema wrapped
        # in a CResult.
        CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const
        # Returns a CFileFragment wrapped in a CResult. All three arguments
        # are passed by value.
        CResult[shared_ptr[CFileFragment]] MakeFragment(
            CFileSource source,
            CExpression partition_expression,
            shared_ptr[CSchema] physical_schema)
        shared_ptr[CFileWriteOptions] DefaultWriteOptions()

    # Binding for arrow::dataset::FileFragment, declared as a subclass of
    # CFragment.
    cdef cppclass CFileFragment "arrow::dataset::FileFragment"(
            CFragment):
        # Both accessors return const references.
        const CFileSource& source() const
        const shared_ptr[CFileFormat]& format() const

    # Binding for arrow::dataset::FileSystemDatasetWriteOptions. Only data
    # members are declared. A const reference to this type is the first
    # argument of CFileSystemDataset.Write.
    cdef cppclass CFileSystemDatasetWriteOptions \
            "arrow::dataset::FileSystemDatasetWriteOptions":
        shared_ptr[CFileWriteOptions] file_write_options
        shared_ptr[CFileSystem] filesystem
        c_string base_dir
        shared_ptr[CPartitioning] partitioning
        int max_partitions
        c_string basename_template
        # Callback slots; each holds a function object with the
        # cb_writer_finish_internal signature (CFileWriter* -> CStatus).
        function[cb_writer_finish_internal] writer_pre_finish
        function[cb_writer_finish_internal] writer_post_finish
        # One of the ExistingDataBehavior_* enumerators.
        ExistingDataBehavior existing_data_behavior
        c_bool create_dir
        # Note the differing widths: max_open_files is 32-bit unsigned, while
        # the row limits below are 64-bit unsigned.
        uint32_t max_open_files
        uint64_t max_rows_per_file
        uint64_t min_rows_per_group
        uint64_t max_rows_per_group

    # Binding for arrow::dataset::FileSystemDataset, declared as a subclass
    # of CDataset.
    cdef cppclass CFileSystemDataset \
            "arrow::dataset::FileSystemDataset"(CDataset):
        # Static factory: returns a CDataset (the base type) wrapped in a
        # CResult.
        @staticmethod
        CResult[shared_ptr[CDataset]] Make(
            shared_ptr[CSchema] schema,
            CExpression source_partition,
            shared_ptr[CFileFormat] format,
            shared_ptr[CFileSystem] filesystem,
            vector[shared_ptr[CFileFragment]] fragments)

        # Static method: takes the write options by const reference and a
        # scanner, and returns a CStatus.
        @staticmethod
        CStatus Write(
            const CFileSystemDatasetWriteOptions& write_options,
            shared_ptr[CScanner] scanner)

        c_string type()
        # Returns a vector of strings by value.
        vector[c_string] files()
        # The three accessors below return const references.
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CPartitioning]& partitioning() const

    # Binding for arrow::dataset::IpcFileWriteOptions, declared as a subclass
    # of CFileWriteOptions.
    cdef cppclass CIpcFileWriteOptions \
            "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions):
        # Data member holding the IPC write options.
        shared_ptr[CIpcWriteOptions] options

    # Binding for arrow::dataset::IpcFileFormat, declared as a subclass of
    # CFileFormat. No members of its own are declared here.
    cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(
            CFileFormat):
        pass

    # Binding for arrow::dataset::OrcFileFormat, declared as a subclass of
    # CFileFormat. No members of its own are declared here.
    cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"(
            CFileFormat):
        pass

    # Binding for arrow::dataset::CsvFileWriteOptions, declared as a subclass
    # of CFileWriteOptions.
    cdef cppclass CCsvFileWriteOptions \
            "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions):
        # Data members: the CSV write options (shared pointer) and a raw
        # pointer to a memory pool.
        shared_ptr[CCSVWriteOptions] write_options
        CMemoryPool* pool

    # Binding for arrow::dataset::CsvFileFormat, declared as a subclass of
    # CFileFormat.
    cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"(
            CFileFormat):
        # Data member holding the CSV parse options by value.
        CCSVParseOptions parse_options

    # Binding for arrow::dataset::CsvFragmentScanOptions, declared as a
    # subclass of CFragmentScanOptions.
    cdef cppclass CCsvFragmentScanOptions \
            "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions):
        CCSVConvertOptions convert_options
        CCSVReadOptions read_options
        # Function-object slot with the StreamWrapFunc signature; that type
        # is not declared in this file (it comes from one of the cimports).
        function[StreamWrapFunc] stream_transform_func

    # Binding for arrow::dataset::JsonFileFormat, declared as a subclass of
    # CFileFormat. No members of its own are declared here.
    cdef cppclass CJsonFileFormat "arrow::dataset::JsonFileFormat"(CFileFormat):
        pass

    # Binding for arrow::dataset::JsonFragmentScanOptions, declared as a
    # subclass of CFragmentScanOptions. It exposes the JSON parse and read
    # options as by-value data members.
    cdef cppclass CJsonFragmentScanOptions "arrow::dataset::JsonFragmentScanOptions"(CFragmentScanOptions):
        CJSONParseOptions parse_options
        CJSONReadOptions read_options

    # Binding for arrow::dataset::Partitioning. It is the declared base of
    # CKeyValuePartitioning and CDirectoryPartitioning in this file.
    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
        c_string type_name() const
        # Takes a path string by const reference and returns an expression
        # wrapped in a CResult.
        CResult[CExpression] Parse(const c_string & path) const
        const shared_ptr[CSchema] & schema()
        # Compares against another partitioning and returns a boolean.
        c_bool Equals(const CPartitioning& other) const

    # Binding for arrow::dataset::SegmentEncoding, declared to Cython as a
    # class with only an equality operator.
    cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
        bint operator==(CSegmentEncoding)

    # The two values below are declared as variables of type
    # CSegmentEncoding. Their quoted C names resolve to
    # arrow::dataset::SegmentEncoding::None and ::Uri respectively.
    CSegmentEncoding CSegmentEncoding_None\
        " arrow::dataset::SegmentEncoding::None"
    CSegmentEncoding CSegmentEncoding_Uri\
        " arrow::dataset::SegmentEncoding::Uri"

    # Binding for arrow::dataset::KeyValuePartitioningOptions. It is passed
    # by value to the CKeyValuePartitioning constructor.
    cdef cppclass CKeyValuePartitioningOptions \
            "arrow::dataset::KeyValuePartitioningOptions":
        CSegmentEncoding segment_encoding

    # Binding for arrow::dataset::HivePartitioningOptions. No base class is
    # declared on the Cython side; segment_encoding is declared directly,
    # alongside null_fallback.
    cdef cppclass CHivePartitioningOptions \
            "arrow::dataset::HivePartitioningOptions":
        CSegmentEncoding segment_encoding
        c_string null_fallback

    # Binding for arrow::dataset::PartitioningFactoryOptions. It is passed by
    # value to CDirectoryPartitioning.MakeFactory.
    cdef cppclass CPartitioningFactoryOptions \
            "arrow::dataset::PartitioningFactoryOptions":
        c_bool infer_dictionary
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    # Binding for arrow::dataset::HivePartitioningFactoryOptions. No base
    # class is declared on the Cython side: the fields that also appear in
    # CPartitioningFactoryOptions are repeated here, plus null_fallback.
    cdef cppclass CHivePartitioningFactoryOptions \
            "arrow::dataset::HivePartitioningFactoryOptions":
        c_bool infer_dictionary
        c_string null_fallback
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    # Binding for arrow::dataset::PartitioningFactory. It is the type
    # returned (by shared pointer) from CDirectoryPartitioning.MakeFactory.
    cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
        c_string type_name() const

    # Binding for arrow::dataset::KeyValuePartitioning, declared as a
    # subclass of CPartitioning.
    cdef cppclass CKeyValuePartitioning \
            "arrow::dataset::KeyValuePartitioning"(CPartitioning):
        # Constructor: takes a schema, a vector of dictionary arrays and the
        # key-value partitioning options.
        CKeyValuePartitioning(shared_ptr[CSchema] schema,
                              vector[shared_ptr[CArray]] dictionaries,
                              CKeyValuePartitioningOptions options)

        # Returns the dictionary arrays by value.
        vector[shared_ptr[CArray]] dictionaries() const
        CSegmentEncoding segment_encoding()

    # Binding for arrow::dataset::DirectoryPartitioning, declared here as a
    # direct subclass of CPartitioning.
    cdef cppclass CDirectoryPartitioning \
            "arrow::dataset::DirectoryPartitioning"(CPartitioning):
        # Constructor: takes a schema and a vector of dictionary arrays.
        CDirectoryPartitioning(shared_ptr[CSchema] schema,
                               vector[shared_ptr[CArray]] dictionaries)

        # Static factory: takes the field names and factory options and
        # returns a partitioning factory by shared pointer.
        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            vector[c_string] field_names, CPartitioningFactoryOptions)

        # Returns the dictionary arrays by value.
        vector[shared_ptr[CArray]] dictionaries() const
Loading ...