Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
dshipparser / parser.py
Size: Mime:
# -*- coding: utf-8 -*-
import codecs
import logging
import os.path

import pandas as pd

from dshipparser.exportinfo import Dship2ExportInfo, Dship3ExportInfo
from . import helpers

# Candidate channel names for the position columns; the names differ per
# vessel / export flavour and are searched in order by the standardisation
# step.  Both lists can be overridden via the LAT_COLS / LON_COLS environment
# variables, given as comma-separated column names.
# NOTE: the previous code used the raw environment string directly, which is
# not a list of names (membership tests would degrade to substring checks).
_LAT_COLS_ENV = os.environ.get('LAT_COLS')
LAT_COLS = _LAT_COLS_ENV.split(',') if _LAT_COLS_ENV else [
    'SYS.STR.PosLat',
    'Weatherstation.PDWDC.Latitude',
    'Position lat',
    'lat',
    'XXXX.Pos.LatStr',  # SONNE BSH DShip3
]
_LON_COLS_ENV = os.environ.get('LON_COLS')
LON_COLS = _LON_COLS_ENV.split(',') if _LON_COLS_ENV else [
    'SYS.STR.PosLon',
    'Weatherstation.PDWDC.Longitude',
    'lon',
    'Position lon',
    'XXXX.Pos.LongStr',  # SONNE BSH DShip3
]

# Standardised names the detected lat/lon columns are renamed to.
LAT_COL = '_POS_LAT_'
LON_COL = '_POS_LON_'

# '@datetimeformat' values for which year/month/day/hour/minute/second arrive
# in separate columns of the .dat file (overridable, comma-separated).
_SPLIT_COLS_ENV = os.environ.get('DATE_FORMAT_SPLIT_COLS')
DATE_FORMAT_SPLIT_COLS = _SPLIT_COLS_ENV.split(',') if _SPLIT_COLS_ENV else [
    'YYYY*MM*DD*HH*mm*ss',
    'YYYY\\tMM\\tDD\\tHH\\tmm\\tSS',
]


def parse_data(base_dir):
    """
    Parses the dship export. This method will
         * Convert timestamp and lat/lon format to common std format
         * drop empty rows (NaN/None only)
         * rename lat/lon columns
         * parse NaN values for each channel: the NaN marker for each channel
           (column) is taken from the export metainfo and such values
           (e.g. 99.999999) are replaced with NaN
    :param base_dir: base dir with export files
    :return: pandas Dataframe with parsed data
    """
    # Thin convenience wrapper; NaN replacement is always enabled here
    # (DshipParser.parse hard-wires replace_na_values=True).  The old
    # docstring documented a ``replace_nan_values`` parameter that does
    # not exist in this signature.
    return DshipParser(base_dir).parse()


class DshipParser(object):
    """Parses a dship (2 or 3) export directory into a single pandas DataFrame
    with a standardised datetime index and standardised lat/lon columns.
    """

    def __init__(self, base_dir):
        """DshipParser knows how to interpret files of a dship-export and can compile
        a single dataframe object with standardised timestamp and lat/lon format.

        :param base_dir: base directory where export files are located
        :raises ValueError: if base_dir is neither a dship2 nor a dship3 export
        """
        self.base_dir = base_dir
        self.export_info = self.get_export_info()
        self.raw_df = None  # reserved for unprocessed data (not set in this module)
        self.df = None      # populated by parse()

    def parse(self):
        """
        Parses the dship export. This method will
         * Convert timestamp and lat/lon format to common std format
         * drop empty rows (NaN/None only)
         * rename lat/lon columns
         * parse NaN values for each channel (replace_na_values is hard-wired to True)
        :return: pandas Dataframe with parsed data
        """
        self.df = self.parse_dat_file(replace_na_values=True)
        self.standardise_df()

        return self.df

    def get_export_info(self):
        """Determines if the export is for dship2 or dship3 and returns the
        matching ExportInfo object for extracting export metainfo.

        :return: Dship3ExportInfo or Dship2ExportInfo for self.base_dir
        :raises ValueError: if the directory matches neither export format
            (previously this path raised an opaque UnboundLocalError)
        """
        if helpers.is_dship3_export(self.base_dir):
            return Dship3ExportInfo(self.base_dir)
        if helpers.is_dship2_export(self.base_dir):
            return Dship2ExportInfo(self.base_dir)

        raise ValueError(
            'Directory {} is neither a dship2 nor a dship3 export'.format(self.base_dir))

    def parse_dat_file(self, replace_na_values=True):
        """Reads the .dat file into a data frame with a DatetimeIndex.

        :param replace_na_values: if True (default), the per-channel NaN markers
            from the export metainfo are passed to read_csv as na_values
        :return: pandas DataFrame indexed by timestamp
        """
        dat_file = self.export_info.get_info()['dat_file']
        file_format = self.export_info.get_info()['file_format']

        # The format spec stores escape sequences literally; decode \\t --> \t etc.
        sep = codecs.decode(file_format['@separator'], 'unicode_escape')
        lineterm = codecs.decode(file_format['@endofrecordmarker'], 'unicode_escape')

        parse_dates = True
        cols = self.export_info.get_columns()

        index_col = 0

        # Merian data currently comes with a strange strange timestamp, correct that below
        dtypes = {}
        if file_format['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
            # This format means that year, month, day, ... are in separate columns.
            # A pandas FutureWarning states that passing a list of column indices to
            # parse_dates will not be supported, so dates are combined manually below.
            parse_dates = False
            cols = ['YY', 'MM', 'DD', 'hh', 'mm', 'ss'] + cols
            # Read the date parts as strings so zero-padding survives for joining.
            dtypes = {'YY': str, 'MM': str, 'DD': str, 'hh': str, 'mm': str, 'ss': str}
            index_col = None

        # This is the default format set for dship2 exports which puts date and time
        # in separate columns; this also needs manual recombination below.
        if file_format['@datetimeformat'] == 'YYYY/MM/DD\\tHH:mm:SS':
            parse_dates = False
            cols = ['YY/MM/DD', 'HH:mm:SS'] + cols
            index_col = None

        encoding = helpers.detect_encoding(dat_file)
        na_values = self.export_info.get_nan_values() if replace_na_values else None

        try:
            ret = pd.read_csv(dat_file,
                              sep=sep,
                              decimal=file_format['@decimalsymbol'],
                              dtype=dtypes,
                              lineterminator=lineterm,
                              parse_dates=parse_dates,
                              index_col=index_col,
                              header=None,
                              names=cols,
                              na_values=na_values,
                              encoding=encoding,
                              skiprows=helpers.guess_skiprows(dat_file, encoding),
                              skipinitialspace=True,
                              low_memory=False,
                              on_bad_lines='warn',
                              # engine='python'
                              )

            # Re-combine date and time cols for the 'special' datetime formats.
            if file_format['@datetimeformat'] == 'YYYY/MM/DD\\tHH:mm:SS':
                ret.index = pd.to_datetime(
                    ret.loc[:, 'YY/MM/DD':'HH:mm:SS'].apply(lambda x: 'T'.join(x), axis=1))
            elif file_format['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
                # Combine separate year/month/day/hour/minute/second columns
                # into a single datetime index.
                try:
                    # Build a timestamp like 'YYYY-MM-DD HH:MM:SS' per row.
                    ts_series = ret[['YY', 'MM', 'DD', 'hh', 'mm', 'ss']].astype(str).apply(
                        lambda r: f"{r['YY']}-{r['MM']}-{r['DD']} {r['hh']}:{r['mm']}:{r['ss']}",
                        axis=1)
                    ret.index = pd.to_datetime(ts_series, format='ISO8601', errors='raise')
                except Exception:
                    logging.exception(
                        f'Could not combine split date/time columns for file {dat_file}!')
                    raise

            return ret

        except IndexError:
            print('Oh no, something went wrong with file {}!'.format(dat_file))
            raise
        except UnicodeDecodeError:
            print('Caught UnicodeDecodeError in file {}!'.format(dat_file))
            raise
        except Exception:
            print('Unexpected Error in file {}!'.format(dat_file))
            raise

    def standardise_df(self):
        """Cleans up and standardises data frame parsed from dship export

          - drop columns without data (NaN, 999, ...)
          - rename lat/lon columns
          - if needed convert coordinates from degree minute.second to degree decimal
          - if needed combine YY MM DD hh mm ss into one column and parse date

          Returns time series (dataframe with datetimeindex) with lat/lon in known format
        """

        # For some reason parse_dates fails for Merian timestamps
        # even so we told it to take first 5 columns together... fix below
        if self.export_info.get_info()['file_format']['@datetimeformat'] in DATE_FORMAT_SPLIT_COLS:
            if not isinstance(self.df.index, pd.DatetimeIndex):
                try:
                    self.df.index = pd.to_datetime(
                        [idx.replace(' ', '') for idx in self.df.index])  # TODO This is slow. Improve.
                except ValueError:
                    logging.exception('Could not parse index to datetime!')
                    raise

        self.df = helpers.standardise_df(
            self.df,
            helpers.get_platform_token(self.export_info.get_info()['platform_id']))