Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev259 

/ include / arrow / compute / row / grouper.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <vector>

#include "arrow/compute/kernel.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace compute {

/// \brief A segment
/// A segment group is a chunk of continuous rows that have the same segment key. (For
/// example, in ordered time series processing, segment key can be "date", and a segment
/// group can be all the rows that belong to the same date.) A segment group can span
/// across multiple exec batches. A segment is a chunk of continuous rows that has the
/// same segment key within a given batch. When a segment group span cross batches, it
/// will have multiple segments. A segment never spans cross batches. The segment data
/// structure only makes sense when used along with a exec batch.
struct ARROW_EXPORT Segment {
  /// \brief the offset into the batch where the segment starts
  int64_t offset;
  /// \brief the length of the segment
  int64_t length;
  /// \brief whether the segment may be extended by a next one
  bool is_open;
  /// \brief whether the segment extends a preceeding one
  bool extends;
};

inline bool operator==(const Segment& segment1, const Segment& segment2) {
  return segment1.offset == segment2.offset && segment1.length == segment2.length &&
         segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
}
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
  return !(segment1 == segment2);
}

/// \brief a helper class to divide a batch into segments of equal values
///
/// For example, given a batch with two columns specifed as segment keys:
///
/// A A [other columns]...
/// A A ...
/// A B ...
/// A B ...
/// A A ...
///
/// Then the batch could be divided into 3 segments.  The first would be rows 0 & 1,
/// the second would be rows 2 & 3, and the third would be row 4.
///
/// Further, a segmenter keeps track of the last value seen.  This allows it to calculate
/// segments which span batches.  In our above example the last batch we emit would set
/// the "open" flag, which indicates whether the segment may extend into the next batch.
///
/// If the next call to the segmenter starts with `A A` then that segment would set the
/// "extends" flag, which indicates whether the segment continues the last open batch.
class ARROW_EXPORT RowSegmenter {
 public:
  virtual ~RowSegmenter() = default;

  /// \brief Construct a Segmenter which segments on the specified key types
  ///
  /// \param[in] key_types the specified key types
  /// \param[in] nullable_keys whether values of the specified keys may be null
  /// \param[in] ctx the execution context to use
  static Result<std::unique_ptr<RowSegmenter>> Make(
      const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);

  /// \brief Return the key types of this segmenter
  virtual const std::vector<TypeHolder>& key_types() const = 0;

  /// \brief Reset this segmenter
  ///
  /// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
  /// If segment-extension is undesirable, for example when each batch is processed
  /// independently, then `Reset` should be invoked before processing the next batch.
  virtual Status Reset() = 0;

  /// \brief Get the next segment for the given batch starting from the given offset
  /// DEPRECATED: Due to its inefficiency, use GetSegments instead.
  ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.")
  virtual Result<Segment> GetNextSegment(const ExecSpan& batch, int64_t offset) = 0;

  /// \brief Get all segments for the given batch
  virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
};

/// Consumes batches of keys and yields batches of the group ids.
class ARROW_EXPORT Grouper {
 public:
  virtual ~Grouper() = default;

  /// Construct a Grouper which receives the specified key types
  static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
                                               ExecContext* ctx = default_exec_context());

  /// Reset all intermediate state, make the grouper logically as just `Make`ed.
  /// The underlying buffers, if any, may or may not be released though.
  virtual Status Reset() = 0;

  /// Consume a batch of keys, producing the corresponding group ids as an integer array,
  /// over a slice defined by an offset and length, which defaults to the batch length.
  /// Currently only uint32 indices will be produced, eventually the bit width will only
  /// be as wide as necessary.
  virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
                                int64_t length = -1) = 0;

  /// Get current unique keys. May be called multiple times.
  virtual Result<ExecBatch> GetUniques() = 0;

  /// Get the current number of groups.
  virtual uint32_t num_groups() const = 0;

  /// \brief Assemble lists of indices of identical elements.
  ///
  /// \param[in] ids An unsigned, all-valid integral array which will be
  ///                used as grouping criteria.
  /// \param[in] num_groups An upper bound for the elements of ids
  /// \param[in] ctx Execution context to use during the operation
  /// \return A num_groups-long ListArray where the slot at i contains a
  ///         list of indices where i appears in ids.
  ///
  ///   MakeGroupings([
  ///       2,
  ///       2,
  ///       5,
  ///       5,
  ///       2,
  ///       3
  ///   ], 8) == [
  ///       [],
  ///       [],
  ///       [0, 1, 4],
  ///       [5],
  ///       [],
  ///       [2, 3],
  ///       [],
  ///       []
  ///   ]
  static Result<std::shared_ptr<ListArray>> MakeGroupings(
      const UInt32Array& ids, uint32_t num_groups,
      ExecContext* ctx = default_exec_context());

  /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
  /// the provided groupings.
  ///
  /// For example,
  ///   ApplyGroupings([
  ///       [],
  ///       [],
  ///       [0, 1, 4],
  ///       [5],
  ///       [],
  ///       [2, 3],
  ///       [],
  ///       []
  ///   ], [2, 2, 5, 5, 2, 3]) == [
  ///       [],
  ///       [],
  ///       [2, 2, 2],
  ///       [3],
  ///       [],
  ///       [5, 5],
  ///       [],
  ///       []
  ///   ]
  static Result<std::shared_ptr<ListArray>> ApplyGroupings(
      const ListArray& groupings, const Array& array,
      ExecContext* ctx = default_exec_context());
};

}  // namespace compute
}  // namespace arrow