''' file_extractor.py

Extract physical parameters from multiple ASCII files, using the column
mapping (ROSCOP code -> column index) given by a configuration dictionary.
'''
import fileinput
import logging
import os
import sys
import argparse
import numpy as np


class FileExtractor:

    '''
    This class reads multiple ASCII files, extracts physical parameters from
    ROSCOP codification at the given column and fills arrays.

    Parameters
    ----------
    fname : str, path-like, or list of str
        Filename, or list of filenames, to read.
    separator : str, default None
        Column separator passed to str.split(); None splits on whitespace.
    skip_header : int, optional
        The number of lines to skip at the beginning of each file.
    '''

    # text encoding used to decode every input file
    ENCODING = "ISO-8859-1"

    # constructor with default values
    def __init__(self, fname, separator=None, skip_header=0):
        # attributes
        # public:
        self.fname = fname
        self.n = 0      # number of files, set by firstPass()
        self.m = 0      # largest number of data lines in a file, set by firstPass()

        # private:
        self.__skip_header = skip_header
        self.__separator = separator
        self.__header = {}
        self.__data = {}
        # replace this constant with the roscop fill value
        self.__FillValue = 1e36

    # overloading operators
    def __str__(self):
        ''' overload string representation '''
        # the class defines no __len__, so the size is reported from the
        # array dimensions computed by firstPass()
        return 'Class FileExtractor, file: %s, size = %d x %d' % (
            self.fname, self.n, self.m)

    def _files(self):
        ''' Return the input files as a list, even when fname is one path. '''
        if isinstance(self.fname, (str, bytes, os.PathLike)):
            return [self.fname]
        return self.fname

    @staticmethod
    def _is_data(line):
        ''' True when line is neither blank nor a '#' / '*' header line. '''
        return bool(line.strip()) and not line.startswith(('#', '*'))

    def disp(self, keys):
        '''
        Build a printable dump of the arrays stored for the given keys.

        Parameters
        ----------
        keys : sequence of str
            Physical parameters already extracted by secondPass().

        Returns
        ------
        out : str
            For each key, a "key:" line followed by the array representation.
        '''
        return ''.join(
            "{}:\n{}\n".format(key, self.__data[key]) for key in keys)

    # first pass on file(s)
    def firstPass(self):
        '''
        Count the files and the data lines to get the size of the arrays.

        Blank lines and lines starting with '#' or '*' are not counted.
        skip_header is not applied here, so m is an upper bound of the
        number of lines stored by secondPass().

        Returns
        ------
        out : [n,m]
            The size of array: number of files, largest number of data lines.
        '''
        filesRead = 0
        indMax = 0
        for file in self._files():
            with fileinput.input(
                    file,
                    openhook=fileinput.hook_encoded(self.ENCODING)) as f:
                filesRead += 1
                lineRead = sum(1 for line in f if self._is_data(line))
            indMax = max(indMax, lineRead)
            logging.debug(
                " file: {} -> read: {:>{width}}".format(
                    file, lineRead, width=6))
        # the size of arrays
        self.n = filesRead
        self.m = indMax
        return self.n, self.m

    # second pass, extract data from roscop code in fname and fill array
    def secondPass(self, keys, cfg, device):
        '''
        Read the file(s) into the internal dict of arrays.

        firstPass() must have been called before, it gives the array size.

        Parameters
        ----------
        keys: sequence, a list of physical parameter to read.
            ex: ['PRES', 'TEMP', 'PSAL']
        cfg: dict such as the one returned by toml.load(), configuration file.
            cfg[device] may define 'separator' and 'skipHeader',
            cfg['split'][device] maps each key to its column index.
        device: str, instrument
            ex: CTD,XBT or LADCP

        Raises
        ------
        KeyError
            If the device section or a key is missing from cfg.
        IndexError
            If a data line has fewer columns than required, or if the arrays
            are too small (firstPass() not called).
        '''
        # toml sections are in lower case
        device = device.lower()
        section = cfg[device]

        # separator and skipHeader declared in the toml section override
        # the values given to the constructor
        if 'separator' in section:
            self.__separator = section['separator']
        if 'skipHeader' in section:
            self.__skip_header = section['skipHeader']
        logging.debug(self.__skip_header)

        # column index of each physical parameter
        columns = cfg['split'][device]

        # initialize arrays with the fill value
        for key in keys:
            self.__data[key] = np.full((self.n, self.m), self.__FillValue)

        # the debug line is only built when it will really be emitted
        debug = logging.getLogger().isEnabledFor(logging.DEBUG)

        for row, file in enumerate(self._files()):
            col = 0
            with fileinput.input(
                    file,
                    openhook=fileinput.hook_encoded(self.ENCODING)) as f:
                for line in f:
                    if f.filelineno() <= self.__skip_header:
                        continue
                    if not self._is_data(line):
                        continue

                    # split the line, remove leading and trailing space before
                    fields = line.strip().split(self.__separator)

                    # fill array with extracted value of line for each key
                    # (physical parameter), numpy converts the text to float
                    for key in keys:
                        self.__data[key][row, col] = fields[columns[key]]

                    if debug:
                        logging.debug(' ' + ''.join(
                            "{:>{width}}".format(fields[columns[key]], width=8)
                            for key in keys))

                    # increment the line indice
                    col += 1


def main():
    ''' Command line entry point, for testing in standalone context. '''
    # toml is only needed to read the configuration file of the script,
    # the class itself works with any dict
    import toml

    # usage:
    # > python file_extractor.py data/cnv/dfr2900[1-3].cnv -d
    # > python file_extractor.py data/cnv/dfr2900[1-3].cnv -k PRES TEMP PSAL DOX2 DENS
    # > python file_extractor.py data/cnv/dfr29*.cnv -d
    parser = argparse.ArgumentParser(
        description='This class read multiple ASCII file, extract physical '
        'parameter from ROSCOP codification at the given column and fill '
        'arrays ',
        epilog='J. Grelet IRD US191 - March 2019')
    parser.add_argument('-d', '--debug', help='display debug informations',
                        action='store_true')
    parser.add_argument('-c', '--config',
                        help="toml configuration file, (default: %(default)s)",
                        default='tests/test.toml')
    parser.add_argument('-k', '--key', nargs='+',
                        default=['PRES', 'TEMP', 'PSAL'],
                        help='display dictionary for key(s), (default: %(default)s)')
    parser.add_argument('fname', nargs='*',
                        help='cnv file(s) to parse, (default: data/cnv/dfr29*.cnv)')

    # display extra logging info
    # see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
    # https://docs.python.org/2/howto/argparse.html
    args = parser.parse_args()
    if args.debug:
        logging.basicConfig(
            format='%(levelname)s:%(message)s', level=logging.DEBUG)

    fe = FileExtractor(args.fname)
    print("File(s): {}, Config: {}".format(args.fname, args.config))
    cfg = toml.load(args.config)
    n, m = fe.firstPass()
    print("Indices:", n, m)
    fe.secondPass(args.key, cfg, 'ctd')
    # disp() returns the text, it has to be printed
    print(fe.disp(args.key))


# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":
    main()