# dshipparser/parser.py — dshipparser package, version 0.3.3
# (header reconstructed from package-index scrape residue)
# -*- coding: utf-8 -*-
import codecs
import os.path
import pandas as pd
from dshipparser.exportinfo import Dship2ExportInfo, Dship3ExportInfo
from . import helpers
LAT_COLS = os.environ.get('LAT_COLS') or ['SYS.STR.PosLat',
'Weatherstation.PDWDC.Latitude',
'Position lat',
'lat',
'XXXX.Pos.LatStr'] # SONNE BSH DShip3
LON_COLS = os.environ.get('LON_COLS') or ['SYS.STR.PosLon',
'Weatherstation.PDWDC.Longitude',
'lon',
'Position lon',
'XXXX.Pos.LongStr'] # SONNE BSH DShip3
LAT_COL = '_POS_LAT_'
LON_COL = '_POS_LON_'
DATE_FORMAT_SPLIT_COLS = os.environ.get('DATE_FORMAT_SPLIT_COLS') or ['YYYY*MM*DD*HH*mm*ss',
'YYYY\\tMM\\tDD\\tHH\\tmm\\tSS']
def parse_data(base_dir):
    """
    Parses the dship export. This method will
    * Convert timestamp and lat/lon format to common std format
    * drop empty rows (NaN/None only)
    * rename lat/lon columns
    * parse NaN values for each channel: the NaN marker for each channel
      (column) is taken from the export metainfo and values such as
      99.999999 are replaced with NaN
    :param base_dir: base dir with export files
    :return: pandas Dataframe with parsed data
    """
    return DshipParser(base_dir).parse()
class DshipParser(object):
    """Parses a dship export directory into a standardised pandas DataFrame.

    Knows how to interpret the files of a dship export (dship2 or dship3)
    and compiles a single dataframe object with standardised timestamp and
    lat/lon format.
    """

    def __init__(self, base_dir):
        """
        :param base_dir: base directory where export files are located
        :raises ValueError: if base_dir is neither a dship2 nor a dship3 export
        """
        self.base_dir = base_dir
        # ExportInfo object (dship2 or dship3 flavour) describing the export.
        self.export_info = self.get_export_info()
        self.raw_df = None
        self.df = None

    def parse(self):
        """
        Parses the dship export. This method will
        * Convert timestamp and lat/lon format to common std format
        * drop empty rows (NaN/None only)
        * rename lat/lon columns
        * parse NaN values for each channel
        :return: pandas Dataframe with parsed data
        """
        self.df = self.parse_dat_file(replace_na_values=True)
        self.standardise_df()
        return self.df

    def get_export_info(self):
        """Determines whether the export is dship2 or dship3 and returns the
        matching ExportInfo object holding the export metainfo.

        :return: Dship3ExportInfo or Dship2ExportInfo for self.base_dir
        :raises ValueError: if base_dir matches neither export layout
        """
        if helpers.is_dship3_export(self.base_dir):
            return Dship3ExportInfo(self.base_dir)
        if helpers.is_dship2_export(self.base_dir):
            return Dship2ExportInfo(self.base_dir)
        # Previously this fell through and raised an opaque
        # UnboundLocalError; fail with an explicit message instead.
        raise ValueError(
            'Directory {} is neither a dship2 nor a dship3 export'.format(self.base_dir))

    def parse_dat_file(self, replace_na_values=True):
        """Reads the .dat file into a data frame.

        Applies the separator/line-terminator/decimal settings declared in
        the export metainfo and normalises the known datetime layouts into a
        DatetimeIndex where possible.

        :param replace_na_values: default True; if set, the per-channel NaN
            markers taken from the export metainfo (e.g. 99.999999) are
            replaced with NaN
        :return: pandas DataFrame read from the .dat file
        :raises IndexError, UnicodeDecodeError: re-raised after logging the
            offending file name
        """
        dat_file = self.export_info.get_info()['dat_file']
        file_format = self.export_info.get_info()['file_format']
        # metainfo stores escape sequences literally, e.g. '\\t' --> '\t'
        sep = codecs.decode(file_format['@separator'], 'unicode_escape')
        lineterm = codecs.decode(file_format['@endofrecordmarker'], 'unicode_escape')
        parse_dates = True
        cols = self.export_info.get_columns()
        index_col = 0
        # Merian data currently comes with a strange timestamp, corrected below.
        dtypes = {}
        if file_format['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
            # This format means year, month, day, ... are in separate columns.
            # A FutureWarning states that passing a list of column indices to
            # parse_dates will not be supported, so the date columns are read
            # as strings and combined manually after read_csv (see below).
            parse_dates = False
            cols = ['YY', 'MM', 'DD', 'hh', 'mm', 'ss'] + cols
            dtypes = {'YY': str, 'MM': str, 'DD': str, 'hh': str, 'mm': str, 'ss': str}
            index_col = None
        # Default format of dship2 exports puts date and time in separate
        # columns; they are re-combined after reading (see below).
        if file_format['@datetimeformat'] == 'YYYY/MM/DD\\tHH:mm:SS':
            parse_dates = False
            cols = ['YY/MM/DD', 'HH:mm:SS'] + cols
            index_col = None
        encoding = helpers.detect_encoding(dat_file)
        na_values = self.export_info.get_nan_values() if replace_na_values else None
        try:
            ret = pd.read_csv(dat_file,
                              sep=sep,
                              decimal=file_format['@decimalsymbol'],
                              dtype=dtypes,
                              lineterminator=lineterm,
                              parse_dates=parse_dates,
                              index_col=index_col,
                              header=None,
                              names=cols,
                              na_values=na_values,
                              encoding=encoding,
                              skiprows=helpers.guess_skiprows(dat_file, encoding),
                              skipinitialspace=True,
                              low_memory=False,
                              on_bad_lines='warn',
                              )
            # re-combine date and time cols for 'special' datetime format
            if file_format['@datetimeformat'] == 'YYYY/MM/DD\\tHH:mm:SS':
                ret.index = pd.to_datetime(
                    ret.loc[:, 'YY/MM/DD':'HH:mm:SS'].apply(lambda x: 'T'.join(x), axis=1))
            elif file_format['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
                # Combine separate year/month/day/hour/minute/second columns
                # into a single datetime index.
                try:
                    # Ensure values are strings and build a timestamp
                    # like 'YYYY-MM-DD HH:MM:SS'
                    ts_series = ret[['YY', 'MM', 'DD', 'hh', 'mm', 'ss']].astype(str).apply(
                        lambda r: f"{r['YY']}-{r['MM']}-{r['DD']} {r['hh']}:{r['mm']}:{r['ss']}",
                        axis=1)
                    ret.index = pd.to_datetime(ts_series, format='ISO8601', errors='raise')
                except Exception as e:
                    import logging
                    logging.exception(f'Could not combine split date/time columns for file {dat_file}!')
                    raise e
            return ret
        except IndexError as e:
            print('Oh no, something went wrong with file {}!'.format(dat_file))
            raise e
        except UnicodeDecodeError as e:
            print('Caught UnicodeDecodeError in file {}!'.format(dat_file))
            raise e
        except Exception as e:
            print('Unexpected Error in file {}!'.format(dat_file))
            raise e

    def standardise_df(self):
        """Cleans up and standardises the data frame parsed from the export:
        - drop columns without data (NaN, 999, ...)
        - rename lat/lon columns
        - if needed convert coordinates from degree minute.second to degree decimal
        - if needed combine YY MM DD hh mm ss into one column and parse date
        Leaves self.df as a time series (dataframe with DatetimeIndex) with
        lat/lon in known format.
        """
        # For some reason parse_dates fails for Merian timestamps even when
        # told to take the first columns together... fix below.
        if self.export_info.get_info()['file_format']['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
            if not isinstance(self.df.index, pd.DatetimeIndex):
                try:
                    self.df.index = pd.to_datetime(
                        [idx.replace(' ', '') for idx in self.df.index])  # TODO This is slow. Improve.
                except ValueError as e:
                    import logging
                    logging.exception(f'Could not parse index to datetime!')
                    raise e
        self.df = helpers.standardise_df(
            self.df, helpers.get_platform_token(self.export_info.get_info()['platform_id']))