# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from cpython.datetime cimport datetime, PyDateTime_DateTime
from cython cimport binding
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint
from pyarrow.lib import _detect_compression, frombytes, tobytes
from pyarrow.lib cimport *
from pyarrow.util import _stringify_path
from abc import ABC, abstractmethod
from datetime import datetime, timezone
import os
import pathlib
import sys
cdef _init_ca_paths():
    # Initialize the C++ filesystem layer with the TLS CA certificate
    # locations that Python's own `ssl` module reports as defaults.
    cdef CFileSystemGlobalOptions options

    import ssl

    verify_paths = ssl.get_default_verify_paths()
    ca_file = verify_paths.cafile
    ca_dir = verify_paths.capath

    # Only override a setting when `ssl` actually reports a value for it.
    if ca_file:
        options.tls_ca_file_path = os.fsencode(ca_file)
    if ca_dir:
        options.tls_ca_dir_path = os.fsencode(ca_dir)

    check_status(CFileSystemsInitialize(options))
if sys.platform == 'linux':
    # ARROW-9261: the TLS CA certificate paths hardcoded into libcurl at
    # compile time may be wrong on Linux (especially in manylinux packages),
    # so they may need to be fixed up when this module is loaded.
    _init_ca_paths()
cdef inline c_string _path_as_bytes(path) except *:
    # Convert an abstract path to the byte string handed to the C++ layer.
    #
    # These paths are abstract, i.e. not bound to any filesystem the way
    # pathlib objects are, so only plain `str` / `bytes` are accepted;
    # anything else raises TypeError.
    if isinstance(path, (bytes, str)):
        # tobytes always encodes as utf-8.  That is more or less ok, at
        # least on Windows where the C++ side decodes from utf-8 again.
        # On Unix, os.fsencode may be a better choice.
        return tobytes(path)
    raise TypeError('Path must be a string')
cdef object _wrap_file_type(CFileType ty):
    # Turn the C++ file type value into the corresponding `FileType` object
    # by going through its 8-bit integer value.
    cdef int8_t raw_value = <int8_t> ty
    return FileType(raw_value)
cdef CFileType _unwrap_file_type(FileType ty) except *:
    # Map a `FileType` value onto the matching C++ `CFileType` constant.
    # Each known member is handled by its own guard; the values are
    # mutually exclusive, so the order of the checks does not matter.
    if ty == FileType.Directory:
        return CFileType_Directory
    if ty == FileType.File:
        return CFileType_File
    if ty == FileType.NotFound:
        return CFileType_NotFound
    if ty == FileType.Unknown:
        return CFileType_Unknown
    # No known member matched: this is not expected to happen.
    assert 0
def _file_type_to_string(ty):
    """
    Return ``"<ClassName>.<member name>"`` for an enum member.

    This is built explicitly instead of relying on ``str(ty)`` because
    Python 3.11 changed ``str(IntEnum)`` to return the string representation
    of the integer value: https://github.com/python/cpython/issues/94763
    """
    class_name = ty.__class__.__name__
    member_name = ty._name_
    return "{}.{}".format(class_name, member_name)
cdef class FileInfo(_Weakrefable):
    """
    FileSystem entry info.

    Parameters
    ----------
    path : str
        The full path to the filesystem entry.
    type : FileType
        The type of the filesystem entry.
    mtime : datetime or float, default None
        If given, the modification time of the filesystem entry.
        If a float is given, it is the number of seconds since the
        Unix epoch.
    mtime_ns : int, default None
        If given, the modification time of the filesystem entry,
        in nanoseconds since the Unix epoch.
        `mtime` and `mtime_ns` are mutually exclusive.
    size : int, default None
        If given, the filesystem entry size in bytes. This should only
        be given if `type` is `FileType.File`.

    Examples
    --------
    Generate a file:

    >>> from pyarrow import fs
    >>> local = fs.LocalFileSystem()
    >>> path_fs = local_path + '/pyarrow-fs-example.dat'
    >>> with local.open_output_stream(path_fs) as stream:
    ...     stream.write(b'data')
    4

    Get FileInfo object using ``get_file_info()``:

    >>> file_info = local.get_file_info(path_fs)
    >>> file_info
    <FileInfo for '.../pyarrow-fs-example.dat': type=FileType.File, size=4>

    Inspect FileInfo attributes:

    >>> file_info.type
    <FileType.File: 2>

    >>> file_info.is_file
    True

    >>> file_info.path
    '/.../pyarrow-fs-example.dat'

    >>> file_info.base_name
    'pyarrow-fs-example.dat'

    >>> file_info.size
    4

    >>> file_info.extension
    'dat'

    >>> file_info.mtime # doctest: +SKIP
    datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc)

    >>> file_info.mtime_ns # doctest: +SKIP
    1656489370873922073
    """

    def __init__(self, path, FileType type=FileType.Unknown, *,
                 mtime=None, mtime_ns=None, size=None):
        self.info.set_path(tobytes(path))
        self.info.set_type(_unwrap_file_type(type))
        if mtime is not None:
            # `mtime` and `mtime_ns` are two spellings of the same value,
            # so passing both is rejected.
            if mtime_ns is not None:
                raise TypeError("Only one of mtime and mtime_ns "
                                "can be given")
            if isinstance(mtime, datetime):
                self.info.set_mtime(PyDateTime_to_TimePoint(
                    <PyDateTime_DateTime*> mtime))
            else:
                # Anything that is not a datetime is taken as a number of
                # seconds since the Unix epoch.
                self.info.set_mtime(TimePoint_from_s(mtime))
        elif mtime_ns is not None:
            self.info.set_mtime(TimePoint_from_ns(mtime_ns))
        if size is not None:
            self.info.set_size(size)

    @staticmethod
    cdef wrap(CFileInfo info):
        # Build a FileInfo around an existing C++ value; `__new__` is used
        # so that `__init__` (and its argument handling) is bypassed.
        cdef FileInfo self = FileInfo.__new__(FileInfo)
        self.info = move(info)
        return self

    cdef inline CFileInfo unwrap(self) nogil:
        # Return the wrapped C++ value.
        return self.info

    @staticmethod
    cdef CFileInfo unwrap_safe(obj):
        # Type-checked variant of `unwrap` for arbitrary Python objects:
        # raises TypeError unless `obj` is a FileInfo.
        if not isinstance(obj, FileInfo):
            raise TypeError("Expected FileInfo instance, got {0}"
                            .format(type(obj)))
        return (<FileInfo> obj).unwrap()

    def __repr__(self):
        s = (f'<FileInfo for {self.path!r}: '
             f'type={_file_type_to_string(self.type)}')
        # The size is only shown for regular files.
        if self.is_file:
            s += f', size={self.size}'
        s += '>'
        return s

    @property
    def type(self):
        """
        Type of the file.

        The returned enum values can be the following:

        - FileType.NotFound: target does not exist
        - FileType.Unknown: target exists but its type is unknown (could be a
          special file such as a Unix socket or character device, or
          Windows NUL / CON / ...)
        - FileType.File: target is a regular file
        - FileType.Directory: target is a regular directory

        Returns
        -------
        type : FileType
        """
        return _wrap_file_type(self.info.type())

    @property
    def is_file(self):
        """
        Whether the entry is a regular file.

        This is true exactly when `type` is `FileType.File`.

        Returns
        -------
        is_file : bool
        """
        return self.type == FileType.File

    @property
    def path(self):
        """
        The full file path in the filesystem.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.path
        '/.../pyarrow-fs-example.dat'
        """
        return frombytes(self.info.path())

    @property
    def base_name(self):
        """
        The file base name.

        Component after the last directory separator.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.base_name
        'pyarrow-fs-example.dat'
        """
        return frombytes(self.info.base_name())

    @property
    def size(self):
        """
        The size in bytes, if available.

        Only regular files are guaranteed to have a size.

        Returns
        -------
        size : int or None
        """
        cdef int64_t size
        size = self.info.size()
        # -1 is the marker for "no size available" and is reported as None.
        return (size if size != -1 else None)

    @property
    def extension(self):
        """
        The file extension.

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.extension
        'dat'
        """
        return frombytes(self.info.extension())

    @property
    def mtime(self):
        """
        The time of last modification, if available.

        Returns
        -------
        mtime : datetime.datetime or None

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.mtime # doctest: +SKIP
        datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc)
        """
        cdef int64_t nanoseconds
        nanoseconds = TimePoint_to_ns(self.info.mtime())
        # -1 is the marker for "no modification time" and is reported as
        # None; otherwise a timezone-aware UTC datetime is built.
        return (datetime.fromtimestamp(nanoseconds / 1.0e9, timezone.utc)
                if nanoseconds != -1 else None)

    @property
    def mtime_ns(self):
        """
        The time of last modification, if available, expressed in nanoseconds
        since the Unix epoch.

        Returns
        -------
        mtime_ns : int or None

        Examples
        --------
        >>> file_info = local.get_file_info(path)
        >>> file_info.mtime_ns # doctest: +SKIP
        1656489370873922073
        """
        cdef int64_t nanoseconds
        nanoseconds = TimePoint_to_ns(self.info.mtime())
        # -1 is the marker for "no modification time" and is reported as None.
        return (nanoseconds if nanoseconds != -1 else None)
cdef class FileSelector(_Weakrefable):
"""
File and directory selector.
It contains a set of options that describes how to search for files and
directories.
Parameters
----------
base_dir : str
The directory in which to select files. Relative paths also work, use
'.' for the current directory and '..' for the parent.
allow_not_found : bool, default False
The behavior if `base_dir` doesn't exist in the filesystem.
If false, an error is returned.
If true, an empty selection is returned.
recursive : bool, default False
Whether to recurse into subdirectories.
Examples
--------
Loading ...