Gemfury

flet / pandas python

Repository URL to install this package:
Details
pandas / _libs / hashtable_class_helper.pxi.in
"""
Template for each `dtype` helper function for hashtable

WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""


{{py:

# name
complex_types = ['complex64',
                 'complex128']
}}

{{for name in complex_types}}
cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
    cdef kh{{name}}_t res
    res.real = val.real
    res.imag = val.imag
    return res

{{endfor}}


{{py:


# name
c_types = ['khcomplex128_t',
           'khcomplex64_t',
           'float64_t',
           'float32_t',
           'int64_t',
           'int32_t',
           'int16_t',
           'int8_t',
           'uint64_t',
           'uint32_t',
           'uint16_t',
           'uint8_t']
}}

{{for c_type in c_types}}

cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
    {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
    return val.real != val.real or val.imag != val.imag
    {{elif c_type in {'float64_t', 'float32_t'} }}
    return val != val
    {{else}}
    return False
    {{endif}}


{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
    return val1 == val2
{{endif}}

{{endfor}}


{{py:

# name
cimported_types = ['complex64',
                   'complex128',
                   'float32',
                   'float64',
                   'int8',
                   'int16',
                   'int32',
                   'int64',
                   'pymap',
                   'str',
                   'strbox',
                   'uint8',
                   'uint16',
                   'uint32',
                   'uint64']
}}

{{for name in cimported_types}}
from pandas._libs.khash cimport (
    kh_destroy_{{name}},
    kh_exist_{{name}},
    kh_get_{{name}},
    kh_init_{{name}},
    kh_put_{{name}},
    kh_resize_{{name}},
)

{{endfor}}

# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA


{{py:

# name, dtype, c_type
# the generated StringVector is not actually used
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)

dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Int32', 'int32', 'int32_t'),
          ('Int16', 'int16', 'int16_t'),
          ('Int8', 'int8', 'int8_t'),
          ('String', 'string', 'char *'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('UInt8', 'uint8', 'uint8_t')]
}}

{{for name, dtype, c_type in dtypes}}


{{if dtype != 'int64'}}
# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
#  by IntervalTree

ctypedef struct {{name}}VectorData:
    {{c_type}} *data
    Py_ssize_t n, m

{{endif}}


@cython.wraparound(False)
@cython.boundscheck(False)
cdef void append_data_{{dtype}}({{name}}VectorData *data,
                                       {{c_type}} x) nogil:

    data.data[data.n] = x
    data.n += 1

{{endfor}}

ctypedef fused vector_data:
    Int64VectorData
    Int32VectorData
    Int16VectorData
    Int8VectorData
    UInt64VectorData
    UInt32VectorData
    UInt16VectorData
    UInt8VectorData
    Float64VectorData
    Float32VectorData
    Complex128VectorData
    Complex64VectorData
    StringVectorData

cdef bint needs_resize(vector_data *data) nogil:
    return data.n == data.m

# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------

cdef class Vector:
    # cdef readonly:
    #    bint external_view_exists

    def __cinit__(self):
        self.external_view_exists = False


{{py:

# name, dtype, c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('Int32', 'int32', 'int32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('Int16', 'int16', 'int16_t'),
          ('UInt8', 'uint8', 'uint8_t'),
          ('Int8', 'int8', 'int8_t')]

}}

{{for name, dtype, c_type in dtypes}}

cdef class {{name}}Vector(Vector):

    # For int64 we have to put this declaration in the .pxd file;
    # Int64Vector is the only one we need exposed for other cython files.
    {{if dtype != 'int64'}}
    cdef:
        {{name}}VectorData *data
        ndarray ao
    {{endif}}

    def __cinit__(self):
        self.data = <{{name}}VectorData *>PyMem_Malloc(
            sizeof({{name}}VectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
        self.data.data = <{{c_type}}*>self.ao.data

    cdef resize(self):
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
        self.ao.resize(self.data.m, refcheck=False)
        self.data.data = <{{c_type}}*>self.ao.data

    def __dealloc__(self):
        if self.data is not NULL:
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray to_array(self):
        if self.data.m != self.data.n:
            if self.external_view_exists:
                # should never happen
                raise ValueError("should have raised on append()")
            self.ao.resize(self.data.n, refcheck=False)
            self.data.m = self.data.n
        self.external_view_exists = True
        return self.ao

    cdef void append(self, {{c_type}} x):

        if needs_resize(self.data):
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.resize()

        append_data_{{dtype}}(self.data, x)

    cdef extend(self, const {{c_type}}[:] x):
        for i in range(len(x)):
            self.append(x[i])

{{endfor}}

cdef class StringVector(Vector):

    cdef:
        StringVectorData *data

    def __cinit__(self):
        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()

    cdef resize(self):
        cdef:
            char **orig_data
            Py_ssize_t i, m

        m = self.data.m
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

        orig_data = self.data.data
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()
        for i in range(m):
            self.data.data[i] = orig_data[i]

    def __dealloc__(self):
        if self.data is not NULL:
            if self.data.data is not NULL:
                free(self.data.data)
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray[object, ndim=1] to_array(self):
        cdef:
            ndarray ao
            Py_ssize_t n
            object val

        ao = np.empty(self.data.n, dtype=object)
        for i in range(self.data.n):
            val = self.data.data[i]
            ao[i] = val
        self.external_view_exists = True
        self.data.m = self.data.n
        return ao

    cdef void append(self, char *x):

        if needs_resize(self.data):
            self.resize()

        append_data_string(self.data, x)

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])


cdef class ObjectVector(Vector):

    cdef:
        PyObject **data
        Py_ssize_t n, m
        ndarray ao

    def __cinit__(self):
        self.n = 0
        self.m = _INIT_VEC_CAP
        self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
        self.data = <PyObject**>self.ao.data

    def __len__(self) -> int:
        return self.n

    cdef append(self, object obj):
        if self.n == self.m:
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.m = max(self.m * 2, _INIT_VEC_CAP)
            self.ao.resize(self.m, refcheck=False)
            self.data = <PyObject**>self.ao.data

        Py_INCREF(obj)
        self.data[self.n] = <PyObject*>obj
        self.n += 1

    cpdef ndarray[object, ndim=1] to_array(self):
        if self.m != self.n:
            if self.external_view_exists:
                raise ValueError("should have raised on append()")
            self.ao.resize(self.n, refcheck=False)
            self.m = self.n
        self.external_view_exists = True
        return self.ao

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])

# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------


cdef class HashTable:

    pass

{{py:

# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
          ('Float64', 'float64', 'float64_t', ''),
          ('UInt64', 'uint64', 'uint64_t', ''),
          ('Int64', 'int64', 'int64_t', ''),
          ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
          ('Float32', 'float32', 'float32_t', ''),
          ('UInt32', 'uint32', 'uint32_t', ''),
          ('Int32', 'int32', 'int32_t', ''),
          ('UInt16', 'uint16', 'uint16_t', ''),
          ('Int16', 'int16', 'int16_t', ''),
          ('UInt8', 'uint8', 'uint8_t', ''),
          ('Int8', 'int8', 'int8_t', '')]

}}


{{for name, dtype, c_type, to_c_type in dtypes}}

cdef class {{name}}HashTable(HashTable):

    def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
        self.table = kh_init_{{dtype}}()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_{{dtype}}(self.table, size_hint)

        self.uses_mask = uses_mask
        self.na_position = -1

    def __len__(self) -> int:
        return self.table.size + (0 if self.na_position == -1 else 1)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_{{dtype}}(self.table)
            self.table = NULL

    def __contains__(self, object key) -> bool:
        # The caller is responsible to check for compatible NA values in case
        # of masked arrays.
        cdef:
            khiter_t k
            {{c_type}} ckey

        if self.uses_mask and checknull(key):
            return -1 != self.na_position

        ckey = {{to_c_type}}(key)
        k = kh_get_{{dtype}}(self.table, ckey)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs =  self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
                                             sizeof(Py_ssize_t))   # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ returns infos about the state of the hashtable"""
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }

    cpdef get_item(self, {{dtype}}_t val):
        """Extracts the position of val from the hashtable.

        Parameters
        ----------
        val : Scalar
            The value that is looked up in the hashtable

        Returns
        -------
        The position of the requested integer.
        """

        # Used in core.sorting, IndexEngine.get_loc
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            {{c_type}} cval

        cval = {{to_c_type}}(val)
        k = kh_get_{{dtype}}(self.table, cval)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef get_na(self):
        """Extracts the position of na_value from the hashtable.

        Returns
        -------
        The position of the last na value.
        """

        if not self.uses_mask:
            raise NotImplementedError

        if self.na_position == -1:
            raise KeyError("NA")
        return self.na_position

    cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
        # Used in libjoin
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        ckey = {{to_c_type}}(key)
        k = kh_put_{{dtype}}(self.table, ckey, &ret)
        if kh_exist_{{dtype}}(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    cpdef set_na(self, Py_ssize_t val):
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        if not self.uses_mask:
            raise NotImplementedError

        self.na_position = val

    {{if dtype == "int64" }}
    # We only use this for int64, can reduce build size and make .pyi
    #  more accurate by only implementing it for int64
    @cython.boundscheck(False)
    def map_keys_to_values(
        self, const {{dtype}}_t[:] keys, const int64_t[:] values
    ) -> None:
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} key
            khiter_t k

        with nogil:
            for i in range(n):
                key = {{to_c_type}}(keys[i])
                k = kh_put_{{dtype}}(self.table, key, &ret)
                self.table.vals[k] = <Py_ssize_t>values[i]
    {{endif}}

    @cython.boundscheck(False)
    def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
        # Used in libindex, safe_sort
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            if self.uses_mask:
                for i in range(n):
                    if mask[i]:
                        na_position = i
                    else:
                        val= {{to_c_type}}(values[i])
                        k = kh_put_{{dtype}}(self.table, val, &ret)
                        self.table.vals[k] = i
            else:
                for i in range(n):
                    val= {{to_c_type}}(values[i])
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = i
        self.na_position = na_position

    @cython.boundscheck(False)
    def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # Used in safe_sort, IndexEngine.get_indexer
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            for i in range(n):
                if self.uses_mask and mask[i]:
                    locs[i] = na_position
                else:
                    val = {{to_c_type}}(values[i])
                    k = kh_get_{{dtype}}(self.table, val)
                    if k != self.table.n_buckets:
                        locs[i] = self.table.vals[k]
                    else:
                        locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                object mask=None, bint return_inverse=False, bint use_result_mask=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        uniques : {{name}}Vector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        use_result_mask: bool, default False
            Whether to create a result mask for the unique values. Not supported
            with return_inverse=True.

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        result_mask: ndarray[bool], if use_result_mask is true
            The mask for the result values.
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            {{c_type}} val, na_value2
            khiter_t k
            {{name}}VectorData *ud
            UInt8Vector result_mask
            UInt8VectorData *rmd
            bint use_na_value, use_mask, seen_na = False
            uint8_t[:] mask_values

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        ud = uniques.data
        use_na_value = na_value is not None
        use_mask = mask is not None
        if not use_mask and use_result_mask:
            raise NotImplementedError  # pragma: no cover

        if use_result_mask and return_inverse:
            raise NotImplementedError  # pragma: no cover

        result_mask = UInt8Vector()
        rmd = result_mask.data

        if use_mask:
            mask_values = mask.view("uint8")

        if use_na_value:
            # We need this na_value2 because we want to allow users
            # to *optionally* specify an NA sentinel *of the correct* type.
            # We use None, to make it optional, which requires `object` type
            # for the parameter. To please the compiler, we use na_value2,
            # which is only used if it's *specified*.
            na_value2 = {{to_c_type}}(na_value)
        else:
            na_value2 = {{to_c_type}}(0)

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                if ignore_na and use_mask:
                    if mask_values[i]:
                        labels[i] = na_sentinel
                        continue
                elif ignore_na and (
                   is_nan_{{c_type}}(val) or
                   (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
                ):
                    # if missing values do not count as unique values (i.e. if
                    # ignore_na is True), skip the hashtable entry for them,
                    # and replace the corresponding label with na_sentinel
                    labels[i] = na_sentinel
                    continue
                elif not ignore_na and use_result_mask:
                    if mask_values[i]:
                        if seen_na:
                            continue

                        seen_na = True
                        if needs_resize(ud):
                            with gil:
                                if uniques.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "uniques held, but "
                                                     "Vector.resize() needed")
                                uniques.resize()
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                        append_data_{{dtype}}(ud, val)
                        append_data_uint8(rmd, 1)
                        continue

                k = kh_get_{{dtype}}(self.table, val)

                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_{{dtype}}(self.table, val, &ret)

                    if needs_resize(ud):
                        with gil:
                            if uniques.external_view_exists:
                                raise ValueError("external reference to "
                                                 "uniques held, but "
                                                 "Vector.resize() needed")
                            uniques.resize()
                            if use_result_mask:
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                    append_data_{{dtype}}(ud, val)
                    if use_result_mask:
                        append_data_uint8(rmd, 0)

                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                        count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        if use_result_mask:
            return uniques.to_array(), result_mask.to_array()
        return uniques.to_array()

    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        result_mask: ndarray[bool], if mask is given as input
            The mask for the result values.
        """
        uniques = {{name}}Vector()
        use_result_mask = True if mask is not None else False
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)

    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = {{name}}Vector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                            return_inverse=True)

    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None, object mask=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True, mask=mask)
        return labels

    {{if dtype == 'int64'}}
    @cython.boundscheck(False)
    def get_labels_groupby(
        self, const {{dtype}}_t[:] values
    ) -> tuple[ndarray, ndarray]:
        # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
        cdef:
            Py_ssize_t i, n = len(values)
            intp_t[::1] labels
            Py_ssize_t idx, count = 0
            int ret = 0
            {{c_type}} val
            khiter_t k
            {{name}}Vector uniques = {{name}}Vector()
            {{name}}VectorData *ud

        labels = np.empty(n, dtype=np.intp)
        ud = uniques.data

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                # specific for groupby
                if val < 0:
                    labels[i] = -1
                    continue

                k = kh_get_{{dtype}}(self.table, val)
                if k != self.table.n_buckets:
                    idx = self.table.vals[k]
                    labels[i] = idx
                else:
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = count

                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{{dtype}}(ud, val)
                    labels[i] = count
                    count += 1

        arr_uniques = uniques.to_array()

        return np.asarray(labels), arr_uniques
    {{endif}}


cdef class {{name}}Factorizer(Factorizer):
    cdef public:
        {{name}}HashTable table
        {{name}}Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = {{name}}HashTable(size_hint)
        self.uniques = {{name}}Vector()

    def factorize(self, const {{c_type}}[:] values,
                  na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
        """
        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = {{name}}Factorizer(3)
        >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            uniques = {{name}}Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value, mask=mask)
        self.count = len(self.uniques)
        return labels

{{endfor}}


cdef class StringHashTable(HashTable):
    # these by-definition *must* be strings
    # or a sentinel np.nan / None missing value
    na_string_sentinel = '__nan__'

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_str()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_str(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_str(self.table)
            self.table = NULL

    def sizeof(self, deep: bool = False) -> int:
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs =  self.table.n_buckets * (sizeof(char *) +      # keys
                                             sizeof(Py_ssize_t))   # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ returns infos about the state of the hashtable"""
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }

    cpdef get_item(self, str val):
        cdef:
            khiter_t k
            const char *v
        v = get_c_string(val)

        k = kh_get_str(self.table, v)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, str key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            const char *v

        v = get_c_string(key)

        k = kh_put_str(self.table, v, &ret)
        if kh_exist_str(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    @cython.boundscheck(False)
    def get_indexer(self, ndarray[object] values) -> ndarray:
        # -> np.ndarray[np.intp]
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
            intp_t *resbuf = <intp_t*>labels.data
            khiter_t k
            kh_str_t *table = self.table
            const char *v
            const char **vecs

        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            v = get_c_string(val)
            vecs[i] = v

        with nogil:
            for i in range(n):
                k = kh_get_str(table, vecs[i])
                if k != table.n_buckets:
                    resbuf[i] = table.vals[k]
                else:
                    resbuf[i] = -1

        free(vecs)
        return labels

    @cython.boundscheck(False)
    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                #  it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        free(vecs)
        return np.asarray(locs)

    @cython.boundscheck(False)
    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                #  it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_put_str(self.table, v, &ret)
                self.table.vals[k] = i
        free(vecs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int64_t[::1] uindexer
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.zeros(n, dtype=np.intp)
        uindexer = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        # assign pointers and pre-filter out missing (if ignore_na)
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if (ignore_na
                and (not isinstance(val, str)
                     or (use_na_value and val == na_value))):
                # if missing values do not count as unique values (i.e. if
                # ignore_na is True), we can skip the actual value, and
                # replace the label with na_sentinel directly
                labels[i] = na_sentinel
            else:
                # if ignore_na is False, we also stringify NaN/None/etc.
                try:
                    v = get_c_string(<str>val)
                except UnicodeEncodeError:
                    v = get_c_string(<str>repr(val))
                vecs[i] = v

        # compute
        with nogil:
            for i in range(n):
                if ignore_na and labels[i] == na_sentinel:
                    # skip entries for ignored missing values (see above)
                    continue

                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        free(vecs)

        # uniques
        for i in range(count):
            uniques.append(values[uindexer[i]])

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels


cdef class PyObjectHashTable(HashTable):

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_pymap()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_pymap(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_pymap(self.table)
            self.table = NULL

    def __len__(self) -> int:
        return self.table.size

    def __contains__(self, object key) -> bool:
        cdef:
            khiter_t k
        hash(key)

        k = kh_get_pymap(self.table, <PyObject*>key)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs =  self.table.n_buckets * (sizeof(PyObject *) +  # keys
                                             sizeof(Py_ssize_t))   # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """
        returns infos about the current state of the hashtable like size,
        number of buckets and so on.
        """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }

    cpdef get_item(self, object val):
        cdef:
            khiter_t k

        k = kh_get_pymap(self.table, <PyObject*>val)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, object key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            char* buf

        hash(key)

        k = kh_put_pymap(self.table, <PyObject*>key, &ret)
        if kh_exist_pymap(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_put_pymap(self.table, <PyObject*>val, &ret)
            self.table.vals[k] = i

    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            object val
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        use_na_value = na_value is not None

        for i in range(n):
            val = values[i]
            hash(val)

            if ignore_na and (
                checknull(val)
                or (use_na_value and val == na_value)
            ):
                # if missing values do not count as unique values (i.e. if
                # ignore_na is True), skip the hashtable entry for them, and
                # replace the corresponding label with na_sentinel
                labels[i] = na_sentinel
                continue

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k == self.table.n_buckets:
                # k hasn't been seen yet
                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                uniques.append(val)
                if return_inverse:
                    self.table.vals[k] = count
                    labels[i] = count
                    count += 1
            elif return_inverse:
                # k falls into a previous bucket
                # only relevant in case we need to construct the inverse
                idx = self.table.vals[k]
                labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels
flet / pandas python

Products

About

Resources

Contact Gemfury