Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

/ tests / test_schema.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from collections import OrderedDict
import sys
import weakref

import pytest
import numpy as np
import pyarrow as pa

import pyarrow.tests.util as test_util
from pyarrow.vendored.version import Version

try:
    import pandas as pd
except ImportError:
    pass


def test_schema_constructor_errors():
    msg = ("Do not call Schema's constructor directly, use `pyarrow.schema` "
           "instead")
    with pytest.raises(TypeError, match=msg):
        pa.Schema()


def test_type_integers():
    dtypes = ['int8', 'int16', 'int32', 'int64',
              'uint8', 'uint16', 'uint32', 'uint64']

    for name in dtypes:
        factory = getattr(pa, name)
        t = factory()
        assert str(t) == name


@pytest.mark.pandas
def test_type_to_pandas_dtype():
    M8 = np.dtype('datetime64[ms]')
    if Version(pd.__version__) < Version("2.0.0"):
        M8 = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.object_),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8),
        (pa.date64(), M8),
        (pa.timestamp('ms'), M8),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
        # (pa.list_(pa.int8(), 2), np.object_),  # TODO needs pandas conversion
        (pa.map_(pa.int64(), pa.float64()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type


@pytest.mark.pandas
def test_type_to_pandas_dtype_check_import():
    # ARROW-7980
    test_util.invoke_script('arrow_7980.py')


def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'


def test_type_comparisons():
    val = pa.int32()
    assert val == pa.int32()
    assert val == 'int32'
    assert val != 5


def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
        ('duration[s]', pa.duration('s')),
        ('duration[ms]', pa.duration('ms')),
        ('duration[us]', pa.duration('us')),
        ('duration[ns]', pa.duration('ns')),
        ('month_day_nano_interval', pa.month_day_nano_interval()),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected


def test_type_string():
    t = pa.string()
    assert str(t) == 'string'


def test_type_timestamp_with_tz():
    tz = 'America/Los_Angeles'
    t = pa.timestamp('ns', tz=tz)
    assert t.unit == 'ns'
    assert t.tz == tz


def test_time_types():
    t1 = pa.time32('s')
    t2 = pa.time32('ms')
    t3 = pa.time64('us')
    t4 = pa.time64('ns')

    assert t1.unit == 's'
    assert t2.unit == 'ms'
    assert t3.unit == 'us'
    assert t4.unit == 'ns'

    assert str(t1) == 'time32[s]'
    assert str(t4) == 'time64[ns]'

    with pytest.raises(ValueError):
        pa.time32('us')

    with pytest.raises(ValueError):
        pa.time64('s')


def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns')),
        (np.dtype('timedelta64[s]'), pa.duration('s')),
        (np.dtype('timedelta64[ms]'), pa.duration('ms')),
        (np.dtype('timedelta64[us]'), pa.duration('us')),
        (np.dtype('timedelta64[ns]'), pa.duration('ns')),
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')


def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field('foo').name == 'foo'
    assert sch.field('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])


def test_schema_weakref():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    schema = pa.schema(fields)
    wr = weakref.ref(schema)
    assert wr() is not None
    del schema
    assert wr() is None


def test_schema_to_string_with_metadata():
    lorem = """\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel
turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec
eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad
litora torquent per conubia nostra, per inceptos himenaeos. Praesent
faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur
turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero
dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa
pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem
sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
sapien. Quisque pretium vestibulum urna eu vehicula."""
    # ARROW-7063
    my_schema = pa.schema([pa.field("foo", "int32", False,
                                    metadata={"key1": "value1"}),
                           pa.field("bar", "string", True,
                                    metadata={"key3": "value3"})],
                          metadata={"lorem": lorem})

    assert my_schema.to_string() == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '""" + lorem[:65] + "' + " + str(len(lorem) - 65)

    # Metadata that exactly fits
    result = pa.schema([('f0', 'int32')],
                       metadata={'key': 'value' + 'x' * 62}).to_string()
    assert result == """\
f0: int32
-- schema metadata --
key: 'valuexxxxxxxxxxxxxxxxxxxxxxxxxxxxx\
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'"""

    assert my_schema.to_string(truncate_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '{}'""".format(lorem)

    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False) == """\
foo: int32 not null
bar: string
-- schema metadata --
lorem: '{}'""".format(lorem)

    assert my_schema.to_string(truncate_metadata=False,
                               show_schema_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'"""

    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False,
                               show_schema_metadata=False) == """\
foo: int32 not null
bar: string"""


def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
Loading ...