''' file_extractor.py '''
import fileinput
import logging
import argparse
import re
from glob import glob
from datetime import datetime

import toml
import numpy as np
from notanorm import SqliteDb

import tools
from physical_parameter import Roscop
import ascii
import netcdf

# define the data table
# the ID is actually the rowid AUTOINCREMENT column.
# removal of the UNIQUE constraint on DAYD (Casino bug?), was: DAYD REAL NOT NULL UNIQUE
table_data = """
CREATE TABLE data (
    ID INTEGER PRIMARY KEY,
    DAYD REAL NOT NULL,
    LATITUDE REAL NOT NULL,
    LONGITUDE REAL NOT NULL
);
"""


class Trajectory:
    ''' This class reads multiple ASCII files, extracts the physical parameters
    (ROSCOP codification) found at the given columns and fills arrays.

    Header values and one-dimensional variables such as TIME, LATITUDE and
    LONGITUDE are automatically extracted from the toml configuration file,
    currently via the set_regex function; this may be moved into the
    constructor later.

    Parameters
    ----------
    fname : str, pathlib.Path or list of str
        File, filename, or list of files to read and process.
    roscop : file describing the physical parameters (code_roscop.csv)
    keys : list of physical parameters to extract
    dbname : sqlite3 file, default is in-memory
    separator : str, column separator, default None (whitespace)
    '''

    def __init__(self, fname, roscop, keys, dbname=":memory:", separator=None):
        ''' constructor with default values '''
        # private attributes:
        self.__dbname = dbname
        self.__separator = separator
        self.__julianOrigin = 0
        self.__header = ''
        self.__data = {}
        self.__regex = {}
        self.__year = []
        # public attributes:
        self.fname = fname
        self.keys = keys
        self.roscop = roscop
        self.n = 0
        self.m = 0
        self.lineHeader = 0
        self.db = SqliteDb(self.__dbname)

    @property
    def year(self):
        return self.__year

    @property
    def julianOrigin(self):
        return self.__julianOrigin

    @property
    def julian_from_year(self):
        return tools.dt2julian(datetime(year=self.year, day=1, month=1))

    # overloaded operators
    def __getitem__(self, key):
        ''' overload r[key] '''
        if key not in self.__data:
            logging.error(
                " file_extractor.py: invalid key: \"{}\"".format(key))
        else:
            return self.__data[key]

    def __str__(self):
        ''' overload the string representation '''
        return 'Class Trajectory, file: %s, size = %d x %d' % (self.fname, self.n, self.m)

    def update_table(self, keysList):
        ''' update table data, adding one column per pm (physical parameter) '''
        # if LATITUDE and LONGITUDE are read as variables, remove them from the list
        keys = keysList.copy()
        if 'LATITUDE' in keys:
            keys.remove('LATITUDE')
        if 'LONGITUDE' in keys:
            keys.remove('LONGITUDE')
        for pm in keys:
            logging.debug(f"\tUpdate table data with new column {pm}")
            addColumn = f"ALTER TABLE data ADD COLUMN {pm} REAL NOT NULL"
            self.db.query(addColumn)

    def create_tables(self):
        ''' create table data '''
        self.db.query("DROP TABLE IF EXISTS data")
        # create table data
        self.db.query(table_data)
        # add one column per requested physical parameter
        self.update_table(self.keys)

    def close(self):
        self.db.close()

    # get the keys list from __data
    def getlist(self):
        ''' return keys '''
        return self.__data.keys()

    def disp(self):
        buf = ''
        for key in self.keys:
            buf += "{}:\n".format(key)
            buf += "{}\n".format(self.__data[key])
        return buf

    def set_regex(self, cfg, ti, section):
        r''' prepare (compile) each regular expression declared inside the toml
        file under section [<device>.<section>], for example:

        [ctd.header]
        isHeader = '^[*#]'
        isDevice = '^\*\s+(Sea-Bird)'
        TIME = 'System UpLoad Time\s*=\s*(\w+)\s+(\d+)\s+(\d+)\s+(\d+):(\d+):(\d+)'
        LATITUDE = 'NMEA\s+Latitude\s*[:=]\s*(\d+)\s+(\d+\.\d+)\s+(\w)'
        LONGITUDE = 'NMEA\s+Longitude\s*[:=]\s*(\d+)\s+(\d+\.\d+)\s+(\w)'
        '''
        # first pass on file(s)
        d = cfg[ti.lower()][section]
        # fill the __regex dict with compiled regex
        for key in d.keys():
            self.__regex[key] = re.compile(d[key])
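    # A minimal sketch of how set_regex is typically driven, assuming a
    # config.toml containing the [ctd.header] section shown in the docstring
    # above (file and parameter names are illustrative):
    #
    #   cfg = toml.load('config.toml')
    #   traj = Trajectory(['fr29001.cnv'], Roscop('code_roscop.csv'), ['PRES', 'TEMP'])
    #   traj.set_regex(cfg, 'CTD', 'header')
    #
    # after which every entry of the section is available as a compiled
    # pattern, e.g. re.compile('^[*#]') for the isHeader key.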
    def update_arrays(self):
        ''' extract data from the sqlite database and fill the self.__data arrays '''
        # print info after reading all files
        n = self.db.count('data')
        print(f"Array size: {n}")
        # get data from table data and fill the arrays
        query = self.db.select('data')
        # initialize one-dimensional variables
        for idx, item in enumerate(query):
            # define the array size from the table column names
            for k in item:
                # k is of type <class 'notanorm.base.CIKey'>, convert it to str
                key = f"{k}"
                if idx == 0:
                    if key != 'ID':
                        if '_FillValue' in self.roscop[key]:
                            self.__data[key] = np.full(
                                n, self.roscop[key]['_FillValue'])
                        else:
                            self.__data[key] = np.empty(n)
                    else:
                        self.__data[key] = np.empty(n)
                # fill the arrays
                self.__data[key][idx] = item[key]
        self.n = n
        # save all database columns as keys
        self.keys = self.__data.keys()

    def read_files(self, cfg, device):
        logging.debug("Enter in read_files()")
        # get the column mapping from the toml [split] block, device key is lower case
        columns = cfg['split'][device.lower()]
        # set the separator field if declared in the toml section, None by default
        if 'separator' in cfg[device.lower()]:
            self.__separator = cfg[device.lower()]['separator']
        # set the julian day origin if declared in the toml section, zero by default
        if 'julianOrigin' in cfg[device.lower()]:
            self.__julianOrigin = cfg[device.lower()]['julianOrigin']
        # read each file, extract header and data and fill the sqlite tables
        for file in self.fname:
            process_data = False
            with fileinput.input(
                    file, openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
                sql = {}
                self.__header = ''
                print(f"Reading file: {file}")
                # read all lines in file
                for line in f:
                    # if header line, save it to the __header private property
                    # and go to the next line
                    if 'endHeader' in self.__regex and self.__regex['endHeader'].match(line):
                        process_data = True
                    if 'isHeader' in self.__regex and self.__regex['isHeader'].match(line):
                        self.__header += line
                        continue
                    if 'isData' in self.__regex and self.__regex['isData'].search(line):
                        process_data = True
                    if process_data:
                        sql = {}
                        if self.__regex['TIME'].search(line):
                            hour, minute, second = \
                                self.__regex['TIME'].search(line).groups()
                        if self.__regex['DATE'].search(line):
                            day, month, year = \
                                self.__regex['DATE'].search(line).groups()
                        # rebuild the date and time string, e.g. "09/05/2011 16:33:53"
                        dateTime = f"{day}/{month}/{year} {hour}:{minute}:{second}"
                        if 'LATITUDE' in self.__regex and self.__regex['LATITUDE'].search(line):
                            (lat_hemi, lat_deg, lat_min) = \
                                self.__regex['LATITUDE'].search(line).groups()
                            # transform degrees-minutes to decimal degrees,
                            # negative in the southern hemisphere
                            latitude = float(lat_deg) + (float(lat_min) / 60.) \
                                if lat_hemi == 'N' else \
                                (float(lat_deg) + (float(lat_min) / 60.)) * -1
                            sql['LATITUDE'] = latitude
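                            # Worked example for the conversion above, assuming
                            # (illustratively) that the [<device>.format] LATITUDE
                            # regex captures hemisphere, degrees and minutes in
                            # that order: lat_hemi='S', lat_deg='47',
                            # lat_min='12.46' gives
                            # latitude = -(47 + 12.46/60) = -47.2077.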
                        if 'LONGITUDE' in self.__regex and self.__regex['LONGITUDE'].search(line):
                            (lon_hemi, lon_deg, lon_min) = \
                                self.__regex['LONGITUDE'].search(line).groups()
                            # same conversion, negative in the western hemisphere
                            longitude = float(lon_deg) + (float(lon_min) / 60.) \
                                if lon_hemi == 'E' else \
                                (float(lon_deg) + (float(lon_min) / 60.)) * -1
                            sql['LONGITUDE'] = longitude
                        # build the datetime object, using the dateTimeFormat
                        # declared in the toml section or the default below
                        if 'dateTimeFormat' in cfg[device.lower()]:
                            dtf = cfg[device.lower()]['dateTimeFormat']
                        else:
                            dtf = "%d/%m/%Y %H:%M:%S"
                        date_time = datetime.strptime(dateTime, dtf)
                        sql['DAYD'] = tools.dt2julian(date_time)
                        # now extract and process all data:
                        # split the line, removing leading and trailing spaces first
                        p = line.strip().split(self.__separator)
                        logging.debug(f"line split: {p}")
                        # insert the data from list p, using the column index columns[key]
                        for key in self.keys:
                            logging.debug(f"{key}, {columns[key]}, {p[columns[key]]}")
                            sql[key] = float(p[columns[key]].replace(',', '.'))
                        self.db.insert("data", sql)
                        process_data = False
                # end of the line loop for this file
        self.update_arrays()

    def process(self, args, cfg, ti):
        ''' Extract data from the ASCII files, fill this Trajectory instance
        and write the resulting ASCII and NetCDF files.

        Parameters
        ----------
        args : argparse.Namespace
        cfg : dict
            toml instance describing the file structure to decode
        ti : str {'CNV', 'XBT', 'LADCP', 'TSG'}
            The typeInstrument key
        '''
        print('processing...')
        logging.debug("File(s): {}, config: {}, Keys: {}".format(
            args.files, args.config, args.keys))
        # if the physical parameters are not given on the command line (option -k),
        # use the toml <device>.split keys
        if args.keys is None:
            args.keys = cfg['split'][ti.lower()].keys()
        # extract header and data from the files
        self.create_tables()
        # prepare (compile) each regular expression declared in the toml file
        # under the sections [<device=ti>.header] and [<device=ti>.format]
        self.set_regex(cfg, ti, 'header')
        self.set_regex(cfg, ti, 'format')
        self.read_files(cfg, ti)
        # write the ASCII hdr and data files
        ascii.writeTrajectory(cfg, ti, self, self.roscop)
        # write the NetCDF file
        netcdf.writeTrajectory(cfg, ti, self, self.roscop)
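# A minimal programmatic usage sketch, assuming config.toml provides the
# [tsg.header], [tsg.format] and [split.tsg] sections and that the file
# and parameter names below (illustrative only) match your configuration:
#
#   args = argparse.Namespace(files=['data/TSG/cruise.colcor'],
#                             config='config.toml', keys=['SSJT', 'SSPS'])
#   cfg = toml.load(args.config)
#   traj = Trajectory(args.files, Roscop('code_roscop.csv'), args.keys)
#   traj.process(args, cfg, 'TSG')
#   traj.close()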
# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":

    # usage:
    # python file_extractor.py data/CTD/cnv/dfr2900[1-3].cnv -i CTD -d
    # python file_extractor.py data/CTD/cnv/dfr2900*.cnv -i CTD -k PRES ETDD TEMP PSAL DOX2 DENS
    # python file_extractor.py data/XBT/T7_0000*.EDF -k DEPTH TEMP SVEL -i XBT
    # python file_extractor.py data/CTD/btl/fr290*.btl -i BTL -k BOTL DEPTH ETDD TE01 PSA1 DO11

    # typeInstrument is a dictionary of key: file extensions
    typeInstrument = {'CTD': ('cnv', 'CNV'), 'XBT': ('EDF', 'edf'),
                      'LADCP': ('lad', 'LAD'), 'TSG': ('colcor', 'COLCOR'),
                      'BTL': ('btl', 'BTL')}
    ti = typeInstrument  # an alias

    parser = argparse.ArgumentParser(
        description='This program reads multiple ASCII files, extracts the physical \
parameters (ROSCOP codification) found at the given columns and fills arrays',
        epilog='J. Grelet IRD US191 - March 2019')
    parser.add_argument('-d', '--debug', help='display debug informations',
                        action='store_true')
    parser.add_argument('-c', '--config',
                        help="toml configuration file, (default: %(default)s)",
                        default='config.toml')
    parser.add_argument('-i', '--instrument', nargs='?', choices=ti.keys(),
                        help='specify the instrument that produced the files, eg CTD, XBT, TSG, LADCP')
    parser.add_argument('-k', '--keys', nargs='+', default=['PRES', 'TEMP', 'PSAL'],
                        help='physical parameter key(s) to extract, (default: %(default)s)')
    parser.add_argument('files', nargs='*',
                        help='ASCII file(s) to parse')
    parser.add_argument('--sbe35', nargs='*',
                        help='SBE35 ASCII file(s) to parse')
    # display extra logging info
    # see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
    # https://docs.python.org/2/howto/argparse.html
    args = parser.parse_args()
    if args.debug:
        logging.basicConfig(
            format='%(levelname)s:%(message)s', level=logging.DEBUG)

    # expand the globs so this works under DOS, Git bash and Linux
    files = []
    for file in args.files:
        files += glob(file)

    # call Trajectory with dbname='test.db' to create a db file instead of
    # working in memory:
    # fe = Trajectory(files, Roscop('code_roscop.csv'), args.keys, dbname='test.db')
    fe = Trajectory(files, Roscop('code_roscop.csv'), args.keys)
    fe.create_tables()
    logging.debug(f"File(s): {files}, Config: {args.config}")
    cfg = toml.load(args.config)
    fe.set_regex(cfg, args.instrument, 'header')
    fe.set_regex(cfg, args.instrument, 'format')
    fe.read_files(cfg, args.instrument)
    logging.debug(f"Indices: {fe.n} x {fe.m}\nkeys: {fe.keys}")

    # if args.sbe35 and args.instrument == 'BTL':
    #     sbe35 = []
    #     for t in args.sbe35:
    #         sbe35 += glob(t)
    #     fe.fname = sbe35
    #     fe.set_regex(cfg, args.instrument, 'header')
    #     fe.read_files(cfg, args.instrument)
    # elif args.sbe35 and args.instrument != 'BTL':
    #     print("option --sbe35 can only be used with the BTL instrument (-i BTL)")
    #     exit()

    # debug: print the first and last value of each extracted array
    logging.debug(fe.getlist())
    for k in fe.keys:
        logging.debug(f"{fe[k][0]} : {fe[k][fe.n - 1]}")
    fe.close()
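
# For reference, a sketch of the toml layout this script consumes; the section
# contents below are purely illustrative, the real regexes and column indices
# depend on each instrument's file format:
#
#   [tsg.format]
#   isData = '^\d'
#   DATE = '(\d{2})/(\d{2})/(\d{4})'    # groups: day, month, year
#   TIME = '(\d{2}):(\d{2}):(\d{2})'    # groups: hour, minute, second
#
#   [split.tsg]
#   SSJT = 5    # column index of each parameter in a split data line
#   SSPS = 6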