# Last Change: Mon Aug 20 08:00 PM 2007 J
from __future__ import division, print_function, absolute_import
import re
import datetime
from collections import OrderedDict
import numpy as np
from scipy._lib.six import next
import csv
import ctypes
"""A module to read arff files."""
__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
# An Arff file is basically two parts:
# - header
# - data
#
# A header has each of its components starting with @META, where META is one
# of the keywords (attribute or relation, for now).
# TODO:
# - both integer and reals are treated as numeric -> the integer info
# is lost!
# - Replace ValueError by ParseError or something
# We can now handle the following:
# - numeric and nominal attributes
# - missing values for numeric attributes
# Pre-compiled patterns used to recognise the different kinds of lines and
# attribute declarations. All of them are applied with ``match``-style
# anchoring in mind (most start with ``^``).

# Match a meta line: optional leading whitespace followed by '@'
r_meta = re.compile(r'^\s*@')
# Match a comment: a line whose first character is '%'
r_comment = re.compile(r'^%')
# Match an empty line: one or more whitespace characters and nothing else
# (a zero-length string does not match because of the '+')
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Match the '@data' marker, in any letter case, at the very start of the line
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
# Match '@relation' in any letter case; group 1 is the run of non-whitespace
# characters that follows it
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
# Match '@attribute' in any letter case (leading whitespace allowed);
# group 1 is the non-empty remainder of the line
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
# Match a nominal declaration; group 1 is everything between the opening
# brace and the last closing brace (the quantifier is greedy)
r_nominal = re.compile('{(.+)}')
# Match a date declaration: 'date' in any letter case, whitespace, then the
# format, optionally wrapped in single or double quotes; group 1 is the format
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
# To get attributes name enclosed with '': group 1 is the quoted name,
# group 2 is the rest of the declaration
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes: group 1 is the whitespace-free name,
# group 2 is the rest of the declaration
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
# ------------------------
# Module defined exception
# ------------------------
class ArffError(IOError):
    """Base exception type of this module; a subclass of ``IOError``."""


class ParseArffError(ArffError):
    """``ArffError`` subclass; going by its name, meant for parse failures."""
# ----------
# Attributes
# ----------
class Attribute:
    """Common base for the attribute kinds declared in an ARFF header.

    Subclasses override `parse_attribute` to recognise their own declaration
    syntax and `parse_data` to convert one raw value.

    Attributes
    ----------
    name : str
        Name given to the attribute.
    range : object
        ``None`` here; subclasses may store extra information in it.
    dtype : object
        ``np.object_`` here; subclasses replace it with their own type.
    type_name : str or None
        Label of the attribute kind; ``None`` on the base class.
    """

    type_name = None

    def __init__(self, name):
        self.dtype = np.object_
        self.range = None
        self.name = name

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Try to build an attribute from its declaration string.

        The base implementation recognises nothing and always returns None.
        """
        return None

    def parse_data(self, data_str):
        """
        Convert one raw value of this attribute.

        The base implementation performs no conversion and returns None.
        """
        return None

    def __str__(self):
        """
        Return the attribute as ``"<name>,<type_name>"``.
        """
        return ','.join((self.name, self.type_name))
class NominalAttribute(Attribute):
    """Attribute whose values are drawn from a fixed, declared set.

    Attributes
    ----------
    values : sequence of str
        The allowed values, as given to the constructor.
    range : sequence of str
        Same object as `values`.
    dtype : tuple
        ``(np.bytes_, n)`` where ``n`` is the length of the longest value.
    """

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # np.bytes_ is the type the former np.string_ alias pointed to; the
        # alias itself was removed in NumPy 2.0.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
           Nominal type definition

        Returns
        -------
        poss_vals : tuple
           possible values

        Raises
        ------
        ValueError
            If `atrv` does not match the ``{...}`` pattern.

        Examples
        --------
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if not m:
            raise ValueError("This does not look like a nominal string")
        # The text between the braces is split by the module's data-line
        # splitter; only the first element of its result is used here.
        attrs, _ = split_data_line(m.group(1))
        return tuple(attrs)

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For nominal attributes, the attribute string would be like '{<attr_1>,
         <attr2>, <attr_3>}'.
        """
        # startswith (rather than indexing [0]) makes an empty declaration
        # yield None, like the other attribute classes, instead of raising
        # IndexError.
        if not attr_string.startswith('{'):
            return None
        return cls(name, cls._get_nom_val(attr_string))

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Returns `data_str` unchanged when it is one of the declared values
        or the missing-value marker ``'?'``.

        Raises
        ------
        ValueError
            If `data_str` is neither a declared value nor ``'?'``.
        """
        if data_str in self.values or data_str == '?':
            return data_str
        raise ValueError("%s value not in %s" % (str(data_str),
                                                 str(self.values)))

    def __str__(self):
        """Return the attribute as ``"<name>,{<v1>,<v2>,...}"``."""
        return self.name + ",{" + ",".join(self.values) + "}"
class NumericAttribute(Attribute):
    """Attribute holding numeric values, stored as 64-bit floats.

    Integer and real declarations are both mapped to this class, so the
    distinction between them is not kept.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        # np.float64 is the type the former np.float_ alias pointed to; the
        # alias itself was removed in NumPy 2.0.
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """
        attr_string = attr_string.lower().strip()
        # Prefix test only: anything starting with one of these words
        # (e.g. 'integer') is accepted.
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
           string to convert

        Returns
        -------
        f : float
           where float can be nan

        Raises
        ------
        ValueError
            If `data_str` contains no '?' and is not accepted by ``float``.

        Examples
        --------
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # Any '?' in the field marks the value as missing.
        if '?' in data_str:
            return np.nan
        return float(data_str)

    def _basic_stats(self, data):
        """Return ``(min, max, mean, scaled std)`` of `data`.

        Min and max ignore NaNs (``np.nanmin``/``np.nanmax``); mean and
        standard deviation do not. The standard deviation is multiplied by
        ``size / (size - 1)``.

        NOTE(review): `data` is presumably a NumPy array (it needs a
        ``size`` attribute); with ``data.size == 1`` the scaling factor
        divides by zero -- confirm callers never pass a single element.
        """
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)
class StringAttribute(Attribute):
    """Attribute declared with the ``string`` type.

    Only the declaration is recognised here; value parsing is inherited
    unchanged from the base class.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Build a string attribute when the declaration asks for one.

        The declaration is lower-cased and stripped first; if it then begins
        with 'string' a new instance is returned, otherwise None.
        """
        declared = attr_string.lower().strip()
        if not declared.startswith('string'):
            return None
        return cls(name)
class DateAttribute(Attribute):
    """Attribute holding dates, parsed with a ``strptime`` format.

    Attributes
    ----------
    date_format : str
        ``strptime``-style format used to parse values.
    datetime_unit : str
        NumPy datetime unit code ('Y', 'M', 'D', 'h', 'm' or 's') matching
        the finest field present in the format.
    range : str
        Same as `date_format`.
    dtype : numpy.datetime64
        A ``datetime64`` scalar of value 0 in `datetime_unit`.
    """

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Translate a ``date <format>`` declaration into a strptime format.

        Parameters
        ----------
        atrv : str
            Attribute declaration, starting with ``date`` followed by a
            Java SimpleDateFormat-style pattern (optionally quoted).

        Returns
        -------
        pattern : str
            The pattern with the recognised fields replaced by their
            ``strptime`` directives.
        datetime_unit : str
            NumPy datetime unit of the finest recognised field.

        Raises
        ------
        ValueError
            If `atrv` is not a date declaration, if the pattern contains a
            time zone field ('z' or 'Z'), or if it contains none of the
            recognised fields.
        """
        m = r_date.match(atrv)
        if not m:
            raise ValueError("Invalid or no date format")

        pattern = m.group(1).strip()
        # convert time pattern from Java's SimpleDateFormat to C's format
        datetime_unit = None
        if "yyyy" in pattern:
            pattern = pattern.replace("yyyy", "%Y")
            datetime_unit = "Y"
        elif "yy" in pattern:
            # The membership test matters: a bare `elif "yy":` is always
            # true and would mark every pattern as containing a year.
            pattern = pattern.replace("yy", "%y")
            datetime_unit = "Y"

        # Ordered from coarsest to finest so that the last field found
        # determines the unit. "MM" must be handled before "mm".
        fields = (
            ("MM", "%m", "M"),
            ("dd", "%d", "D"),
            ("HH", "%H", "h"),
            ("mm", "%M", "m"),
            ("ss", "%S", "s"),
        )
        for java_field, c_directive, unit in fields:
            if java_field in pattern:
                pattern = pattern.replace(java_field, c_directive)
                datetime_unit = unit

        if "z" in pattern or "Z" in pattern:
            raise ValueError("Date type attributes with time zone not "
                             "supported, yet")
        if datetime_unit is None:
            raise ValueError("Invalid or unsupported date format")
        return pattern, datetime_unit

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """
        # Use the same stripped string for the type test and for the format
        # extraction: the date regex is anchored at the start, so leading
        # whitespace would otherwise make a valid declaration fail.
        stripped = attr_string.strip()
        if not stripped.lower().startswith('date'):
            return None
        date_format, datetime_unit = cls._get_date_format(stripped)
        return cls(name, date_format, datetime_unit)

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Surrounding whitespace and quotes are removed first. The
        missing-value marker ``'?'`` gives ``NaT``; anything else is parsed
        with `date_format` and converted to ``datetime64`` in
        `datetime_unit`.
        """
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        dt = datetime.datetime.strptime(date_str, self.date_format)
        return np.datetime64(dt).astype(
            "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        """Return ``"<name>,date,<date_format>"``."""
        return super().__str__() + ',' + self.date_format
class RelationalAttribute(Attribute):
def __init__(self, name):
super().__init__(name)
Loading ...