Gemfury

flet / pandas python

Repository URL to install this package:
Details
pandas / _libs / tslibs / np_datetime.pyx
cimport cython
from cpython.datetime cimport (
    PyDateTime_CheckExact,
    PyDateTime_DATE_GET_HOUR,
    PyDateTime_DATE_GET_MICROSECOND,
    PyDateTime_DATE_GET_MINUTE,
    PyDateTime_DATE_GET_SECOND,
    PyDateTime_GET_DAY,
    PyDateTime_GET_MONTH,
    PyDateTime_GET_YEAR,
    import_datetime,
)
from cpython.object cimport (
    Py_EQ,
    Py_GE,
    Py_GT,
    Py_LE,
    Py_LT,
    Py_NE,
)

import_datetime()

import numpy as np

cimport numpy as cnp

cnp.import_array()
from numpy cimport (
    int64_t,
    ndarray,
    uint8_t,
)

from pandas._libs.tslibs.util cimport get_c_string_buf_and_size


cdef extern from "src/datetime/np_datetime.h":
    # Comparison of two datetime structs; code in this module tests the
    #  result against -1, 0 and 1.
    int cmp_npy_datetimestruct(npy_datetimestruct *a,
                               npy_datetimestruct *b)

    # Lower/upper bound structs, one pair per resolution; they are handed out
    #  by get_implementation_bounds for NPY_FR_ns/us/ms/s/m respectively.
    # AS, FS, PS versions exist but are not imported because they are not used.
    npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
    npy_datetimestruct _US_MIN_DTS, _US_MAX_DTS
    npy_datetimestruct _MS_MIN_DTS, _MS_MAX_DTS
    npy_datetimestruct _S_MIN_DTS, _S_MAX_DTS
    npy_datetimestruct _M_MIN_DTS, _M_MAX_DTS

    # Returns the datetime metadata of a dtype descriptor; get_unit_from_dtype
    #  reads its ``base`` field to obtain the unit.
    PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype)

cdef extern from "src/datetime/np_datetime_strings.h":
    # C-level ISO 8601 parser; string_to_dts is the Cython wrapper that
    #  prepares the string/format buffers and forwards to it.
    int parse_iso_8601_datetime(const char *str, int len, int want_exc,
                                npy_datetimestruct *out,
                                NPY_DATETIMEUNIT *out_bestunit,
                                int *out_local, int *out_tzoffset,
                                const char *format, int format_len,
                                FormatRequirement exact)


# ----------------------------------------------------------------------
# numpy object inspection

cdef npy_datetime get_datetime64_value(object obj) nogil:
    """
    Return the raw integer stored inside a numpy datetime64 scalar.

    The integer alone is not a datetime: pair it with the unit reported by
    `get_datetime64_unit` to interpret it.
    """
    cdef PyDatetimeScalarObject* scalar = <PyDatetimeScalarObject*>obj

    return scalar.obval


cdef npy_timedelta get_timedelta64_value(object obj) nogil:
    """
    Return the raw integer stored inside a numpy timedelta64 scalar.
    """
    cdef PyTimedeltaScalarObject* scalar = <PyTimedeltaScalarObject*>obj

    return scalar.obval


cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
    """
    Return the unit recorded in the metadata of a numpy datetime64 scalar.
    """
    cdef PyDatetimeScalarObject* scalar = <PyDatetimeScalarObject*>obj

    return <NPY_DATETIMEUNIT>(scalar.obmeta.base)


cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype):
    """
    Return the unit stored in the datetime metadata of ``dtype``.
    """
    # NB: caller is responsible for ensuring this is *some* datetime64 or
    #  timedelta64 dtype, otherwise we can segfault
    cdef:
        PyArray_DatetimeMetaData meta

    meta = get_datetime_metadata_from_dtype(<cnp.PyArray_Descr*>dtype)
    return meta.base


def py_get_unit_from_dtype(dtype):
    # Python-visible wrapper that exists only so tests can reach
    #  get_unit_from_dtype; adds 896 bytes to the .so file.
    cdef NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype)

    return unit


def is_unitless(dtype: cnp.dtype) -> bool:
    """
    Report whether a datetime64/timedelta64 dtype carries no unit.

    Raises ValueError for any dtype that is neither datetime64 nor
    timedelta64.
    """
    cdef:
        NPY_DATETIMEUNIT unit

    if dtype.type_num != cnp.NPY_DATETIME and dtype.type_num != cnp.NPY_TIMEDELTA:
        raise ValueError("is_unitless dtype must be datetime64 or timedelta64")

    unit = get_unit_from_dtype(dtype)
    return unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC


# ----------------------------------------------------------------------
# Comparison


cdef bint cmp_dtstructs(
    npy_datetimestruct* left, npy_datetimestruct* right, int op
):
    """
    Apply the rich-comparison code ``op`` to two datetime structs.

    The structs are compared once with cmp_npy_datetimestruct and the
    outcome is then mapped onto the requested operator.
    """
    cdef:
        int ordering = cmp_npy_datetimestruct(left, right)

    if op == Py_EQ:
        return ordering == 0
    elif op == Py_NE:
        return ordering != 0
    elif op == Py_LT:
        return ordering == -1
    elif op == Py_GT:
        return ordering == 1
    elif op == Py_GE:
        return ordering == 0 or ordering == 1

    # Anything left over is handled as Py_LE.
    return ordering == 0 or ordering == -1


cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1:
    """
    Evaluate the rich-comparison code ``op`` on two int64_t values.

    Serves as a faster, int64_t-typed alternative to PyObject_RichCompare.
    """
    if op == Py_LT:
        return lhs < rhs
    if op == Py_LE:
        return lhs <= rhs
    if op == Py_EQ:
        return lhs == rhs
    if op == Py_NE:
        return lhs != rhs
    if op == Py_GE:
        return lhs >= rhs
    if op == Py_GT:
        return lhs > rhs
    # NOTE(review): any other ``op`` matches no branch and nothing is raised;
    #  callers are presumably expected to pass only the six codes above.


class OutOfBoundsDatetime(ValueError):
    """
    Error for a datetime that falls outside the representable range.
    """


class OutOfBoundsTimedelta(ValueError):
    """
    Error for a timedelta value that cannot be represented.

    The value must fit within a timedelta64[ns].
    """
    # Timedelta counterpart of OutOfBoundsDatetime


cdef get_implementation_bounds(
    NPY_DATETIMEUNIT reso,
    npy_datetimestruct *lower,
    npy_datetimestruct *upper,
):
    """
    Write the minimum and maximum datetime structs for ``reso`` into
    ``lower`` and ``upper``.

    Only ns, us, ms, s and m resolutions are handled; any other value
    raises NotImplementedError.
    """
    if reso == NPY_FR_ns:
        lower[0] = _NS_MIN_DTS
        upper[0] = _NS_MAX_DTS
    elif reso == NPY_FR_us:
        lower[0] = _US_MIN_DTS
        upper[0] = _US_MAX_DTS
    elif reso == NPY_FR_ms:
        lower[0] = _MS_MIN_DTS
        upper[0] = _MS_MAX_DTS
    elif reso == NPY_FR_s:
        lower[0] = _S_MIN_DTS
        upper[0] = _S_MAX_DTS
    elif reso == NPY_FR_m:
        lower[0] = _M_MIN_DTS
        upper[0] = _M_MAX_DTS
    else:
        raise NotImplementedError(reso)


cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns):
    """
    Raise OutOfBoundsDatetime if ``dts`` lies outside the implementation
    bounds for ``unit`` (nanoseconds unless specified otherwise).
    """
    cdef:
        npy_datetimestruct lower_bound, upper_bound

    get_implementation_bounds(unit, &lower_bound, &upper_bound)

    if (
        cmp_npy_datetimestruct(dts, &lower_bound) == -1
        or cmp_npy_datetimestruct(dts, &upper_bound) == 1
    ):
        stamp = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} "
                 f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}")
        # TODO: "nanosecond" in the message assumes NPY_FR_ns
        raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {stamp}")


# ----------------------------------------------------------------------
# Conversion


# just exposed for testing at the moment
def py_td64_to_tdstruct(int64_t td64, NPY_DATETIMEUNIT unit):
    # Break the integer ``td64`` (expressed in ``unit``) into a
    #  pandas_timedeltastruct and hand it back to Python.
    cdef pandas_timedeltastruct fields

    pandas_timedelta_to_timedeltastruct(td64, unit, &fields)
    return fields  # <- returned as a dict to python


cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts):
    """
    Copy the fields of a datetime object into ``dts``.

    Sub-microsecond fields (ps, as) are set to zero.
    """
    dts.as = 0
    dts.ps = 0
    dts.us = PyDateTime_DATE_GET_MICROSECOND(dt)
    dts.sec = PyDateTime_DATE_GET_SECOND(dt)
    dts.min = PyDateTime_DATE_GET_MINUTE(dt)
    dts.hour = PyDateTime_DATE_GET_HOUR(dt)
    dts.day = PyDateTime_GET_DAY(dt)
    dts.month = PyDateTime_GET_MONTH(dt)

    if not PyDateTime_CheckExact(dt):
        # We use dt.year instead of PyDateTime_GET_YEAR because with Timestamp
        #  we override year such that PyDateTime_GET_YEAR is incorrect.
        dts.year = dt.year
    else:
        dts.year = PyDateTime_GET_YEAR(dt)


cdef int64_t pydatetime_to_dt64(datetime val,
                                npy_datetimestruct *dts,
                                NPY_DATETIMEUNIT reso=NPY_FR_ns):
    """
    Fill ``dts`` from ``val`` and return the matching integer at ``reso``.

    Note we are assuming that the datetime object is timezone-naive.
    """
    cdef int64_t result

    pydatetime_to_dtstruct(val, dts)
    result = npy_datetimestruct_to_datetime(reso, dts)
    return result


cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts):
    """
    Copy year/month/day from a date object into ``dts``; every time-of-day
    field is set to zero.
    """
    # Time-of-day components do not exist on a date.
    dts.hour = 0
    dts.min = 0
    dts.sec = 0
    dts.us = 0
    dts.ps = 0
    dts.as = 0

    dts.day = PyDateTime_GET_DAY(val)
    dts.month = PyDateTime_GET_MONTH(val)
    dts.year = PyDateTime_GET_YEAR(val)

cdef int64_t pydate_to_dt64(
    date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns
):
    """
    Fill ``dts`` from the date ``val`` and return the matching integer at
    ``reso``.
    """
    cdef int64_t result

    pydate_to_dtstruct(val, dts)
    result = npy_datetimestruct_to_datetime(reso, dts)
    return result


cdef int string_to_dts(
    str val,
    npy_datetimestruct* dts,
    NPY_DATETIMEUNIT* out_bestunit,
    int* out_local,
    int* out_tzoffset,
    bint want_exc,
    format: str | None=None,
    bint exact=True,
) except? -1:
    """
    Parse ``val`` with parse_iso_8601_datetime, writing the result through
    the output pointers, and return that function's return code.

    Without ``format`` the parser is asked to infer the format; with one,
    ``exact`` is passed along as the format requirement.
    """
    cdef:
        Py_ssize_t val_len
        const char* val_buf
        Py_ssize_t fmt_len
        const char* fmt_buf
        FormatRequirement requirement

    val_buf = get_c_string_buf_and_size(val, &val_len)

    # Defaults used when no format is supplied.
    fmt_buf = b""
    fmt_len = 0
    requirement = INFER_FORMAT

    if format is not None:
        fmt_buf = get_c_string_buf_and_size(format, &fmt_len)
        requirement = <FormatRequirement>exact

    return parse_iso_8601_datetime(
        val_buf, val_len, want_exc,
        dts, out_bestunit, out_local, out_tzoffset,
        fmt_buf, fmt_len,
        requirement,
    )


cpdef ndarray astype_overflowsafe(
    ndarray values,
    cnp.dtype dtype,
    bint copy=True,
    bint round_ok=True,
    bint is_coerce=False,
):
    """
    Convert an ndarray with datetime64[X] to datetime64[Y]
    or timedelta64[X] to timedelta64[Y],
    raising on overflow.

    Parameters
    ----------
    values : ndarray
        datetime64 or timedelta64 array with a unit.
    dtype : np.dtype
        Target dtype; must be of the same kind as ``values`` and have a unit.
    copy : bool, default True
        Only consulted when the units already match: return a copy rather
        than ``values`` itself.
    round_ok : bool, default True
        Only consulted when converting to a lower-resolution unit
        (e.g. ns -> us). If False, the conversion goes through
        astype_round_check, which raises ValueError on truncation.
    is_coerce : bool, default False
        If True, out-of-bounds entries become NaT instead of raising.

    Returns
    -------
    ndarray
        Result viewed as ``dtype`` (or ``values`` / a copy of it when the
        units match).

    Raises
    ------
    TypeError
        If ``values`` and ``dtype`` are not both datetime64 or both
        timedelta64, or if ``values`` has no unit.
    ValueError
        If ``dtype`` has no unit.
    OutOfBoundsDatetime
        If a datetime64 entry is out of bounds for the target unit and
        ``is_coerce`` is False.
    OutOfBoundsTimedelta
        If a timedelta64 entry is out of bounds for the target unit and
        ``is_coerce`` is False.
    """
    if values.descr.type_num == dtype.type_num == cnp.NPY_DATETIME:
        # i.e. dtype.kind == "M"
        dtype_name = "datetime64"
    elif values.descr.type_num == dtype.type_num == cnp.NPY_TIMEDELTA:
        # i.e. dtype.kind == "m"
        dtype_name = "timedelta64"
    else:
        raise TypeError(
            "astype_overflowsafe values.dtype and dtype must be either "
            "both-datetime64 or both-timedelta64."
        )

    cdef:
        NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype)
        NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype)

    if from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
        raise TypeError(f"{dtype_name} values must have a unit specified")

    if to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
        # without raising explicitly here, we end up with a SystemError
        # built-in function [...] returned a result with an error
        raise ValueError(
            f"{dtype_name} dtype must have a unit specified"
        )

    if from_unit == to_unit:
        # Check this before allocating result for perf, might save some memory
        if copy:
            return values.copy()
        return values

    elif from_unit > to_unit:
        if round_ok:
            # e.g. ns -> us, so there is no risk of overflow, so we can use
            #  numpy's astype safely. Note there _is_ risk of truncation.
            return values.astype(dtype)
        else:
            # Lossless-only path: astype_round_check raises on truncation.
            iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit)
            return iresult2.view(dtype)

    # From here on we are converting to a higher-resolution unit, where
    #  overflow is possible and every element is bounds-checked.
    if (<object>values).dtype.byteorder == ">":
        # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap
        values = values.astype(values.dtype.newbyteorder("<"))

    cdef:
        ndarray i8values = values.view("i8")

        # equiv: result = np.empty((<object>values).shape, dtype="i8")
        ndarray iresult = cnp.PyArray_EMPTY(
            values.ndim, values.shape, cnp.NPY_INT64, 0
        )

        # Joint iterator: slot 0 is the output, slot 1 is the input.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
        Py_ssize_t i, N = values.size
        int64_t value, new_value
        npy_datetimestruct dts
        bint is_td = dtype.type_num == cnp.NPY_TIMEDELTA

    for i in range(N):
        # Analogous to: item = values[i]
        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if value == NPY_DATETIME_NAT:
            # NaT passes through unchanged.
            new_value = NPY_DATETIME_NAT
        else:
            # Decompose at the source unit, then verify the struct fits in
            #  the target unit before re-encoding it.
            pandas_datetime_to_datetimestruct(value, from_unit, &dts)

            try:
                check_dts_bounds(&dts, to_unit)
            except OutOfBoundsDatetime as err:
                if is_coerce:
                    new_value = NPY_DATETIME_NAT
                elif is_td:
                    # Re-raise with a timedelta-specific exception and message.
                    from_abbrev = np.datetime_data(values.dtype)[0]
                    np_val = np.timedelta64(value, from_abbrev)
                    msg = (
                        "Cannot convert {np_val} to {dtype} without overflow"
                        .format(np_val=str(np_val), dtype=str(dtype))
                    )
                    raise OutOfBoundsTimedelta(msg) from err
                else:
                    raise
            else:
                new_value = npy_datetimestruct_to_datetime(to_unit, &dts)

        # Analogous to: iresult[i] = new_value
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return iresult.view(dtype)


# TODO: try to upstream this fix to numpy
def compare_mismatched_resolutions(ndarray left, ndarray right, op):
    """
    Overflow-safe comparison of timedelta64/datetime64 with mismatched resolutions.

    Each pair of elements is converted to datetime structs at its own unit
    and the structs are compared, instead of casting one array to the
    other's unit. Any comparison involving NaT is False, except ``!=``
    which is True.

    Parameters
    ----------
    left : ndarray
        datetime64 or timedelta64 array.
    right : ndarray
        Array of the same kind as ``left``.
    op : callable
        One of the comparison functions from the ``operator`` module.

    Returns
    -------
    ndarray
        Boolean array allocated with the shape of ``left``.

    Raises
    ------
    ValueError
        If ``left`` and ``right`` are not both timedelta64 or both datetime64.

    Examples
    --------
    >>> left = np.array([500], dtype="M8[Y]")
    >>> right = np.array([0], dtype="M8[ns]")
    >>> left < right  # <- wrong!
    array([ True])
    """

    if left.dtype.kind != right.dtype.kind or left.dtype.kind not in ["m", "M"]:
        raise ValueError("left and right must both be timedelta64 or both datetime64")

    cdef:
        int op_code = op_to_op_code(op)
        NPY_DATETIMEUNIT left_unit = get_unit_from_dtype(left.dtype)
        NPY_DATETIMEUNIT right_unit = get_unit_from_dtype(right.dtype)

        # equiv: result = np.empty((<object>left).shape, dtype="bool")
        ndarray result = cnp.PyArray_EMPTY(
            left.ndim, left.shape, cnp.NPY_BOOL, 0
        )

        ndarray lvalues = left.view("i8")
        ndarray rvalues = right.view("i8")

        # Joint iterator: slot 0 is the output, slots 1 and 2 are the inputs.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew3(result, lvalues, rvalues)
        int64_t lval, rval
        bint res_value

        Py_ssize_t i, N = left.size
        npy_datetimestruct ldts, rdts

    for i in range(N):
        # Analogous to: lval = lvalues[i]
        lval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        # Analogous to: rval = rvalues[i]
        rval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 2))[0]

        if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT:
            # NaT compares unequal to everything: only != yields True.
            res_value = op_code == Py_NE

        else:
            # Decompose each side at its own unit so no cast can overflow.
            pandas_datetime_to_datetimestruct(lval, left_unit, &ldts)
            pandas_datetime_to_datetimestruct(rval, right_unit, &rdts)

            res_value = cmp_dtstructs(&ldts, &rdts, op_code)

        # Analogous to: result[i] = res_value
        (<uint8_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return result


import operator


cdef int op_to_op_code(op):
    """
    Translate a comparison function from the ``operator`` module into the
    matching rich-comparison code (Py_EQ, Py_NE, ...).
    """
    # TODO: should exist somewhere?
    if op is operator.lt:
        return Py_LT
    elif op is operator.le:
        return Py_LE
    elif op is operator.eq:
        return Py_EQ
    elif op is operator.ne:
        return Py_NE
    elif op is operator.gt:
        return Py_GT
    elif op is operator.ge:
        return Py_GE
    # NOTE(review): an unrecognized ``op`` matches no branch and nothing is
    #  raised here -- confirm callers only ever pass the six operators above.


cdef ndarray astype_round_check(
    ndarray i8values,
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit
):
    # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion
    #  involves truncation, e.g. 1500ns->1us
    #
    # Takes the int64 view of the values and returns a new int64 array of
    #  the same shape holding the values expressed in to_unit. NaT entries
    #  are passed through unchanged. Raises ValueError on the first entry
    #  that is not an exact multiple of the conversion factor.
    cdef:
        Py_ssize_t i, N = i8values.size

        # equiv: iresult = np.empty((<object>i8values).shape, dtype="i8")
        ndarray iresult = cnp.PyArray_EMPTY(
            i8values.ndim, i8values.shape, cnp.NPY_INT64, 0
        )
        # Joint iterator: slot 0 is the output, slot 1 is the input.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)

        # Note the arguments to_unit, from unit are swapped vs how they
        #  are passed when going to a higher-frequency reso.
        int64_t mult = get_conversion_factor(to_unit, from_unit)
        int64_t value, mod

    # NB: new_value has no cdef declaration above, so it is an untyped
    #  (Python object) local that is converted when stored into iresult.
    for i in range(N):
        # Analogous to: item = i8values[i]
        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if value == NPY_DATETIME_NAT:
            new_value = NPY_DATETIME_NAT
        else:
            new_value, mod = divmod(value, mult)
            if mod != 0:
                # Only reached on the error path, so the import cost is not
                #  paid per element.
                # TODO: avoid runtime import
                from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev
                from_abbrev = npy_unit_to_abbrev(from_unit)
                to_abbrev = npy_unit_to_abbrev(to_unit)
                raise ValueError(
                    f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}"
                )

        # Analogous to: iresult[i] = new_value
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return iresult


@cython.overflowcheck(True)
cdef int64_t get_conversion_factor(
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit
) except? -1:
    """
    Find the factor by which we need to multiply to convert from from_unit to to_unit.

    Parameters
    ----------
    from_unit : NPY_DATETIMEUNIT
        The coarser (or equal) unit; must be one of W, D, h, m, s, ms, us,
        ns, ps, fs, or equal to ``to_unit``.
    to_unit : NPY_DATETIMEUNIT
        The finer (or equal) unit.

    Returns
    -------
    int64_t
        1 if the units match, otherwise the product of the per-step factors
        between ``from_unit`` and ``to_unit``.

    Raises
    ------
    ValueError
        If either unit is generic (unit-less), if ``from_unit`` is finer than
        ``to_unit``, or if ``from_unit`` has no fixed-length conversion
        (month and year units).
    """
    if (
        from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
        or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
    ):
        raise ValueError("unit-less resolutions are not supported")
    if from_unit > to_unit:
        raise ValueError("from_unit must be <= to_unit")

    if from_unit == to_unit:
        return 1

    # Walk one unit at a time toward to_unit, multiplying the per-step factors.
    if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
        return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
        return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
        return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
        return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
    else:
        # Months and years have no fixed length, so no constant factor
        #  exists. Raise instead of falling off the end of the function
        #  without an explicit return value, which would hand callers a
        #  meaningless factor to divide or multiply by.
        raise ValueError("Converting from M or Y units is not supported.")


cdef int64_t convert_reso(
    int64_t value,
    NPY_DATETIMEUNIT from_reso,
    NPY_DATETIMEUNIT to_reso,
    bint round_ok,
) except? -1:
    """
    Re-express the integer ``value`` from ``from_reso`` in ``to_reso``.

    Going to a coarser unit divides and rounds down; ValueError is raised
    when that discards a positive remainder and ``round_ok`` is False.
    Going to a finer unit multiplies with overflow checking, except when a
    year or month unit is involved, in which case the value is routed
    through a datetime struct.
    """
    cdef:
        int64_t factor, quotient, remainder, converted

    if from_reso == to_reso:
        return value

    if to_reso < from_reso:
        # e.g. ns -> us, no risk of overflow, but can be lossy rounding
        factor = get_conversion_factor(to_reso, from_reso)
        quotient, remainder = divmod(value, factor)
        if remainder > 0 and not round_ok:
            raise ValueError("Cannot losslessly convert units")

        # Note that when the remainder is positive, we follow np.timedelta64
        #  in always rounding down.
        return quotient

    if (
        from_reso == NPY_FR_Y
        or from_reso == NPY_FR_M
        or to_reso == NPY_FR_Y
        or to_reso == NPY_FR_M
    ):
        # Converting by multiplying isn't _quite_ right bc the number of
        #  seconds in a month/year isn't fixed.
        return _convert_reso_with_dtstruct(value, from_reso, to_reso)

    # e.g. us -> ns, risk of overflow, but no risk of lossy rounding
    factor = get_conversion_factor(from_reso, to_reso)
    with cython.overflowcheck(True):
        # Note: caller is responsible for re-raising as OutOfBoundsTimedelta
        converted = value * factor

    return converted


cdef int64_t _convert_reso_with_dtstruct(
    int64_t value,
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit,
) except? -1:
    """
    Convert ``value`` between units by decomposing it into a datetime
    struct at ``from_unit``, bounds-checking the struct for ``to_unit``,
    and re-encoding it at ``to_unit``.
    """
    cdef:
        npy_datetimestruct fields
        int64_t converted

    pandas_datetime_to_datetimestruct(value, from_unit, &fields)
    check_dts_bounds(&fields, to_unit)
    converted = npy_datetimestruct_to_datetime(to_unit, &fields)
    return converted
flet / pandas python

Products

About

Resources

Contact Gemfury