# file_extractor.py (6.14 KiB)
'''
file_extractor.py
'''
import fileinput
import logging
# commit annotation: jacques.grelet_ird.fr
import toml
import sys
import argparse
import numpy as np


class FileExtractor:

    '''
    Read several ASCII files, extract the physical parameters found at the
    columns given by a ROSCOP-keyed configuration, and store them in 2D
    arrays (one row per file, one column per data line).

    Typical use is ``firstPass()`` to size the arrays, then ``secondPass()``
    to fill them, then ``disp()`` to format the result.

    Parameters
    ----------
    fname : str, path-like or list of str
        File name, or list of file names, to read.
    skip_header : int, optional
        The number of lines to skip at the beginning of each file.
    '''

    # constructor with default values
    def __init__(self, fname, skip_header=0):
        # public attributes:
        self.fname = fname
        # array shape, set by firstPass(): n files x m data lines
        self.n = 0
        self.m = 0
        # value stored where a file has no data line
        # (to be replaced by the roscop fill value)
        self.FillValue = 1e36

        # private attributes:
        self.skip_header = skip_header
        self.__header = {}
        # one (n, m) array per physical parameter key
        self.__data = {}

    # overloading operators
    def __str__(self):
        ''' overload string representation '''
        # The class defines no __len__, so report the array shape (n x m)
        # computed by firstPass() instead of calling len(self).
        return 'Class FileExtractor, file: %s, size = %d x %d' % (
            self.fname, self.n, self.m)

    def _files(self):
        ''' Return the input file names as a list, wrapping a single name. '''
        name = self.fname
        if isinstance(name, str) or hasattr(name, '__fspath__'):
            # a lone name must not be iterated character by character
            return [name]
        return list(name)

    def disp(self, keys):
        '''
        Format the arrays stored for the given keys.

        Parameters
        ----------
        keys: sequence of str, keys previously loaded by secondPass().

        Returns
        -------
        out : str
            For each key, a "key:" line followed by the printed array.

        Raises
        ------
        KeyError
            If a key has not been loaded.
        '''
        return ''.join(
            "{}:\n{}\n".format(key, self.__data[key]) for key in keys)

    # first pass on file(s)
    def firstPass(self):
        '''
        Count the files and the longest run of non-comment lines.

        Lines starting with '#' or '*' are ignored. Header and blank lines
        are still counted, so m is an upper bound on the number of data
        lines that secondPass() will store for a file.

        Returns
        -------
        out : (n, m)
            The size of the arrays: number of files and maximum number of
            counted lines in a single file.
        '''
        files_read = 0
        max_lines = 0

        for file_name in self._files():
            with fileinput.input(
                    file_name,
                    openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
                files_read += 1
                line_count = sum(
                    1 for line in f if not line.startswith(('#', '*')))
            max_lines = max(max_lines, line_count)
            logging.debug(
                " file: {} -> read: {:>{width}}".format(
                    file_name, line_count, width=6))

        # the size of arrays
        self.n = files_read
        self.m = max_lines
        return self.n, self.m

    # second pass, extract data from roscop code in fname and fill array
    def secondPass(self, keys, cfg, device):
        '''
        Read the files into the internal dict of arrays.

        firstPass() must have been called before, since the arrays are
        allocated with the (n, m) shape it computed.

        Parameters
        ----------
        keys: sequence, a list of physical parameter to read.
            ex: ['PRES', 'TEMP', 'PSAL']
        cfg: toml.load() instance, configuration file
            cfg['split'][device] maps each key to its column index; the
            optional cfg[device]['skipHeader'] overrides skip_header.
        device: str, instrument
            ex: CTD,XBT or LADCP

        Raises
        ------
        KeyError
            If cfg has no 'split' entry for the device or for a key.
        IndexError
            If a data line has fewer columns than a configured index.
        ValueError
            If an extracted field is not a number.
        '''
        # sections of the toml file are named in lower case
        section = device.lower()
        logging.debug(device)

        # skip_header may be declared in the device section of the toml file,
        # otherwise the value given to the constructor is kept
        device_cfg = cfg.get(section, {})
        if 'skipHeader' in device_cfg:
            self.skip_header = device_cfg['skipHeader']
        logging.debug(self.skip_header)

        # column index of each physical parameter
        columns = cfg['split'][section]

        # allocate the arrays, pre-filled with the fill value
        for key in keys:
            self.__data[key] = np.full(
                (self.n, self.m), self.FillValue, dtype=float)

        # only format the per-line trace when it will actually be emitted
        debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

        # n is the file indice, m the data line indice inside the file
        for n, file_name in enumerate(self._files()):
            m = 0
            with fileinput.input(
                    file_name,
                    openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
                for line in f:
                    # filelineno() starts at 1 for each file
                    if f.filelineno() <= self.skip_header:
                        continue
                    if line.startswith(('#', '*')):
                        continue
                    # split the line, a blank line carries no data
                    fields = line.split()
                    if not fields:
                        continue

                    # fill arrays with the value extracted for each key
                    for key in keys:
                        self.__data[key][n, m] = float(fields[columns[key]])

                    if debug_enabled:
                        logging.debug(' ' + ''.join(
                            "{:>{width}}".format(fields[columns[key]], width=8)
                            for key in keys))

                    # next data line
                    m += 1


# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":
    # Usage examples:
    # > python file_extractor.py data/cnv/dfr2900[1-3].cnv -d
    # > python file_extractor.py data/cnv/dfr2900[1-3].cnv -k PRES TEMP PSAL DOX2 DENS
    # > python file_extractor.py data/cnv/dfr29*.cnv -d
    import glob

    parser = argparse.ArgumentParser(
        description='This class read multiple ASCII file, extract physical parameter \
            from ROSCOP codification at the given column and fill arrays ',
        epilog='J. Grelet IRD US191 - March 2019')
    parser.add_argument('-d', '--debug', help='display debug informations',
                        action='store_true')
    parser.add_argument('-c', '--config', help="toml configuration file, (default: %(default)s)",
                        default='tests/test.toml')
    parser.add_argument('-k', '--key', nargs='+', default=['PRES', 'TEMP', 'PSAL'],
                        help='display dictionary for key(s), (default: %(default)s)')
    parser.add_argument('fname', nargs='*',
                        help='cnv file(s) to parse, (default: data/cnv/dfr29*.cnv)')

    # display extra logging info
    # see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
    # https://docs.python.org/2/howto/argparse.html
    args = parser.parse_args()
    if args.debug:
        logging.basicConfig(
            format='%(levelname)s:%(message)s', level=logging.DEBUG)

    # apply the default advertised in the help text when no file is given
    if not args.fname:
        args.fname = sorted(glob.glob('data/cnv/dfr29*.cnv'))

    fe = FileExtractor(args.fname)
    print("File(s): {}, Config: {}".format(args.fname, args.config))
    cfg = toml.load(args.config)

    # first pass sizes the arrays, second pass fills them
    n, m = fe.firstPass()
    print("Indices:", n, m)
    fe.secondPass(args.key, cfg, 'ctd')

    # disp() only builds the text (already newline-terminated), so print it
    print(fe.disp(args.key), end='')