Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

/ _fs.pyx

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from cpython.datetime cimport datetime, PyDateTime_DateTime
from cython cimport binding

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint
from pyarrow.lib import _detect_compression, frombytes, tobytes
from pyarrow.lib cimport *
from pyarrow.util import _stringify_path

from abc import ABC, abstractmethod
from datetime import datetime, timezone
import os
import pathlib
import sys


cdef _init_ca_paths():
    """
    Initialize the filesystem layer with Python's default TLS CA locations.

    The CA file and CA directory reported by
    ``ssl.get_default_verify_paths()`` are filesystem-encoded and copied
    into the global filesystem options (each only when it is set), then
    the options are handed to ``CFileSystemsInitialize``.
    """
    cdef CFileSystemGlobalOptions global_options

    import ssl
    verify_paths = ssl.get_default_verify_paths()

    ca_file = verify_paths.cafile
    if ca_file:
        global_options.tls_ca_file_path = os.fsencode(ca_file)

    ca_dir = verify_paths.capath
    if ca_dir:
        global_options.tls_ca_dir_path = os.fsencode(ca_dir)

    check_status(CFileSystemsInitialize(global_options))


if sys.platform == 'linux':
    # ARROW-9261: On Linux, we may need to fixup the paths to TLS CA certs
    # (especially in manylinux packages) since the values hardcoded at
    # compile-time in libcurl may be wrong.
    # This is module-level code, so it executes when the module is imported.
    _init_ca_paths()


cdef inline c_string _path_as_bytes(path) except *:
    """
    Convert an abstract filesystem path given as ``str`` or ``bytes`` into
    a C++ string.

    Raises
    ------
    TypeError
        If `path` is neither ``str`` nor ``bytes``.
    """
    # Paths here are abstract: unlike pathlib objects they are not bound to
    # any particular filesystem, which is why only plain strings are
    # accepted.
    if isinstance(path, (bytes, str)):
        # tobytes always uses utf-8, which is more or less ok, at least on
        # Windows since the C++ side then decodes from utf-8. On Unix,
        # os.fsencode may be better.
        return tobytes(path)
    raise TypeError('Path must be a string')


cdef object _wrap_file_type(CFileType ty):
    """Return the ``FileType`` value corresponding to a C++ file type."""
    cdef int8_t raw_value = <int8_t> ty
    return FileType(raw_value)


cdef CFileType _unwrap_file_type(FileType ty) except *:
    """
    Translate a ``FileType`` value into the matching C++ ``CFileType``.

    Raises
    ------
    AssertionError
        If `ty` is none of Unknown, NotFound, File or Directory.
    """
    if ty == FileType.Unknown:
        return CFileType_Unknown
    elif ty == FileType.NotFound:
        return CFileType_NotFound
    elif ty == FileType.File:
        return CFileType_File
    elif ty == FileType.Directory:
        return CFileType_Directory
    # Raise explicitly instead of `assert 0`: an assert statement can be
    # compiled out, which would let control fall off the end of the function
    # without returning a meaningful value. The exception type is kept as
    # AssertionError so the failure mode is unchanged for callers.
    raise AssertionError(
        "Unexpected FileType value: {}".format(<int> ty))


def _file_type_to_string(ty):
    """
    Format an enum member as ``ClassName.MEMBER_NAME``.

    Parameters
    ----------
    ty : enum member
        The member to format (a ``FileType`` value in this module).

    Returns
    -------
    str
    """
    # Python 3.11 changed str(IntEnum) to return the string representation
    # of the integer value (https://github.com/python/cpython/issues/94763),
    # so the qualified name is assembled by hand rather than via str().
    enum_name = ty.__class__.__name__
    member_name = ty._name_
    return "{}.{}".format(enum_name, member_name)


cdef class FileInfo(_Weakrefable):
    """
    FileSystem entry info.

    Parameters
    ----------
    path : str
        The full path to the filesystem entry.
    type : FileType
        The type of the filesystem entry.
    mtime : datetime or float, default None
        If given, the modification time of the filesystem entry.
        If a float is given, it is the number of seconds since the
        Unix epoch.
    mtime_ns : int, default None
        If given, the modification time of the filesystem entry,
        in nanoseconds since the Unix epoch.
        `mtime` and `mtime_ns` are mutually exclusive.
    size : int, default None
        If given, the filesystem entry size in bytes.  This should only
        be given if `type` is `FileType.File`.

    Examples
    --------
    Generate a file:

    >>> from pyarrow import fs
    >>> local = fs.LocalFileSystem()
    >>> path_fs = local_path + '/pyarrow-fs-example.dat'
    >>> with local.open_output_stream(path_fs) as stream:
    ...     stream.write(b'data')
    4

    Get FileInfo object using ``get_file_info()``:

    >>> file_info = local.get_file_info(path_fs)
    >>> file_info
    <FileInfo for '.../pyarrow-fs-example.dat': type=FileType.File, size=4>

    Inspect FileInfo attributes:

    >>> file_info.type
    <FileType.File: 2>

    >>> file_info.is_file
    True

    >>> file_info.path
    '/.../pyarrow-fs-example.dat'

    >>> file_info.base_name
    'pyarrow-fs-example.dat'

    >>> file_info.size
    4

    >>> file_info.extension
    'dat'

    >>> file_info.mtime # doctest: +SKIP
    datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc)

    >>> file_info.mtime_ns # doctest: +SKIP
    1656489370873922073
    """

    def __init__(self, path, FileType type=FileType.Unknown, *,
                 mtime=None, mtime_ns=None, size=None):
        self.info.set_path(tobytes(path))
        self.info.set_type(_unwrap_file_type(type))
        if mtime is not None:
            # The modification time may be given in only one of the two forms.
            if mtime_ns is not None:
                raise TypeError("Only one of mtime and mtime_ns "
                                "can be given")
            if isinstance(mtime, datetime):
                self.info.set_mtime(PyDateTime_to_TimePoint(
                    <PyDateTime_DateTime*> mtime))
            else:
                # Anything that is not a datetime is taken as seconds.
                self.info.set_mtime(TimePoint_from_s(mtime))
        elif mtime_ns is not None:
            self.info.set_mtime(TimePoint_from_ns(mtime_ns))
        if size is not None:
            self.info.set_size(size)

    @staticmethod
    cdef wrap(CFileInfo info):
        # Bypass __init__: the C++ object is moved in as-is.
        cdef FileInfo self = FileInfo.__new__(FileInfo)
        self.info = move(info)
        return self

    cdef inline CFileInfo unwrap(self) nogil:
        return self.info

    @staticmethod
    cdef CFileInfo unwrap_safe(obj):
        if not isinstance(obj, FileInfo):
            raise TypeError("Expected FileInfo instance, got {0}"
                            .format(type(obj)))
        return (<FileInfo> obj).unwrap()

    def __repr__(self):
        s = (f'<FileInfo for {self.path!r}: '
             f'type={_file_type_to_string(self.type)}')
        # The size is only shown for regular files.
        if self.is_file:
            s += f', size={self.size}'
        s += '>'
        return s

    @property
    def type(self):
        """
        Type of the file.

        The returned enum values can be the following:

        - FileType.NotFound: target does not exist
        - FileType.Unknown: target exists but its type is unknown (could be a
          special file such as a Unix socket or character device, or
          Windows NUL / CON / ...)
        - FileType.File: target is a regular file
        - FileType.Directory: target is a regular directory

        Returns
        -------
        type : FileType
        """
        return _wrap_file_type(self.info.type())

    @property
    def is_file(self):
        """
        Whether the entry is a regular file, i.e. whether its type is
        ``FileType.File``.

        Returns
        -------
        is_file : bool
        """
        return self.type == FileType.File

    @property
    def path(self):
        """
        The full file path in the filesystem.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.path
        '/.../pyarrow-fs-example.dat'
        """
        return frombytes(self.info.path())

    @property
    def base_name(self):
        """
        The file base name.

        Component after the last directory separator.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.base_name
        'pyarrow-fs-example.dat'
        """
        return frombytes(self.info.base_name())

    @property
    def size(self):
        """
        The size in bytes, if available.

        Only regular files are guaranteed to have a size.

        Returns
        -------
        size : int or None
        """
        cdef int64_t size
        size = self.info.size()
        # A stored size of -1 is reported as None.
        return (size if size != -1 else None)

    @property
    def extension(self):
        """
        The file extension.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.extension
        'dat'
        """
        return frombytes(self.info.extension())

    @property
    def mtime(self):
        """
        The time of last modification, if available.

        Returns
        -------
        mtime : datetime.datetime or None

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.mtime # doctest: +SKIP
        datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc)
        """
        cdef int64_t nanoseconds
        nanoseconds = TimePoint_to_ns(self.info.mtime())
        # A nanosecond value of -1 is reported as None; otherwise the value
        # is converted to a UTC-aware datetime.
        return (datetime.fromtimestamp(nanoseconds / 1.0e9, timezone.utc)
                if nanoseconds != -1 else None)

    @property
    def mtime_ns(self):
        """
        The time of last modification, if available, expressed in nanoseconds
        since the Unix epoch.

        Returns
        -------
        mtime_ns : int or None

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.mtime_ns # doctest: +SKIP
        1656489370873922073
        """
        cdef int64_t nanoseconds
        nanoseconds = TimePoint_to_ns(self.info.mtime())
        # A nanosecond value of -1 is reported as None.
        return (nanoseconds if nanoseconds != -1 else None)


cdef class FileSelector(_Weakrefable):
    """
    File and directory selector.

    It contains a set of options that describes how to search for files and
    directories.

    Parameters
    ----------
    base_dir : str
        The directory in which to select files. Relative paths also work, use
        '.' for the current directory and '..' for the parent.
    allow_not_found : bool, default False
        The behavior if `base_dir` doesn't exist in the filesystem.
        If false, an error is returned.
        If true, an empty selection is returned.
    recursive : bool, default False
        Whether to recurse into subdirectories.

    Examples
    --------
Loading ...