'''
trajectory.py
'''
import fileinput
import logging
import toml
import argparse
import numpy as np
import re
from glob import glob
from datetime import datetime
import tools
from physical_parameter import Roscop
from notanorm import SqliteDb
import ascii
import netcdf
# define the data table
# the ID is actually the rowid AUTOINCREMENT column.
# the UNIQUE constraint on DAYD was removed (Casino bug?): DAYD REAL NOT NULL UNIQUE
table_data = """
CREATE TABLE data (
ID INTEGER PRIMARY KEY,
DAYD REAL NOT NULL,
LATITUDE REAL NOT NULL,
LONGITUDE REAL NOT NULL
); """
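# note: update_table() below extends this schema at run time with one REAL column
# per extracted physical parameter, e.g. (parameter codes are illustrative)
# keys = ['SSJT', 'SSPS'] leads to:
#   ALTER TABLE data ADD COLUMN SSJT REAL NOT NULL
#   ALTER TABLE data ADD COLUMN SSPS REAL NOT NULL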
class Trajectory:
    '''
    This class reads multiple ASCII files, extracts the physical parameters given by
    their ROSCOP codes at the configured columns and fills arrays.
    Header values and one-dimensional variables such as TIME, LATITUDE and LONGITUDE are
    automatically extracted from the toml configuration file, currently by the set_regex
    function; maybe this should be moved inside the constructor?
    Parameters
    ----------
    fname : file, str, pathlib.Path or list of str
        File, filename, or list of files to read and process.
    roscop : file describing the physical parameters (code_roscop.csv)
    keys : list of physical parameter codes to extract
    dbname : sqlite3 file, default is in memory
    separator : str, column separator, default None (whitespace)
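    Example
    -------
    A minimal sketch (file names and parameter codes are illustrative), following the
    standalone usage at the bottom of this module::

        cfg = toml.load('config.toml')
        tr = Trajectory(glob('data/TSG/*.colcor'), Roscop('code_roscop.csv'),
                        ['SSJT', 'SSPS'])
        tr.create_tables()
        tr.set_regex(cfg, 'TSG', 'header')
        tr.read_files(cfg, 'TSG')
        print(tr['SSJT'])
        tr.close()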
'''
def __init__(self, fname, roscop, keys, dbname=":memory:", separator=None):
        '''constructor with default values'''
        # private attributes:
self.__dbname = dbname
self.__separator = separator
self.__julianOrigin = 0
self.__header = ''
self.__data = {}
self.__regex = {}
self.__year = []
        # public attributes:
self.fname = fname
self.keys = keys
self.roscop = roscop
self.n = 0
self.m = 0
self.lineHeader = 0
self.db = SqliteDb(self.__dbname)
@property
def year(self):
return self.__year
@property
def julianOrigin(self):
return self.__julianOrigin
@property
def julian_from_year(self):
return tools.dt2julian(datetime(year=self.year, day=1, month=1))
# overloading operators
def __getitem__(self, key):
''' overload r[key] '''
if key not in self.__data:
            logging.error(
                " trajectory.py: invalid key: \"{}\"".format(key))
else:
return self.__data[key]
def __str__(self):
''' overload string representation '''
return 'Class Trajectory, file: %s, size = %d x %d' % (self.fname, self.n, self.m)
def update_table(self, keysList):
        ''' update table data: add a new column for each physical parameter (pm)'''
        # if LATITUDE and LONGITUDE are read as variables, remove them from the list
keys = keysList.copy()
if 'LATITUDE' in keys: keys.remove('LATITUDE')
if 'LONGITUDE' in keys: keys.remove('LONGITUDE')
for pm in keys:
logging.debug(f"\tUpdate table data with new column {pm}")
addColumn = f"ALTER TABLE data ADD COLUMN {pm} REAL NOT NULL"
self.db.query(addColumn)
def create_tables(self):
''' Create table data'''
self.db.query("DROP TABLE IF EXISTS data")
# Create table data
self.db.query(table_data)
# update table
self.update_table(self.keys)
def close(self):
self.db.close()
# get the keys list from __data
def getlist(self):
''' return keys '''
return self.__data.keys()
def disp(self):
# for key in keys:
# print("{}:".format(key))
# print(self.__data[key])
buf = ''
for key in self.keys:
buf += "{}:\n".format(key)
buf += "{}\n".format(self.__data[key])
return buf
def set_regex(self, cfg, ti, section):
        ''' prepare (compile) each regular expression defined in the toml file under the section [<device>.header], e.g.:
[ctd.header]
isHeader = '^[*#]'
isDevice = '^\*\s+(Sea-Bird)'
TIME = 'System UpLoad Time\s*=\s*(\w+)\s+(\d+)\s+(\d+)\s+(\d+):(\d+):(\d+)'
LATITUDE = 'NMEA\s+Latitude\s*[:=]\s*(\d+)\s+(\d+\.\d+)\s+(\w)'
LONGITUDE = 'NMEA\s+Longitude\s*[:=]\s*(\d+)\s+(\d+.\d+)\s+(\w)'
'''
        # get the dict for the given device and section from the toml config
        d = cfg[ti.lower()][section]
        # fill the __regex dict with compiled regex
for key in d.keys():
self.__regex[key] = re.compile(d[key])
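        # illustrative result: after set_regex(cfg, 'CTD', 'header') with the toml
        # block shown in the docstring above, self.__regex maps each key, e.g.
        # 'LATITUDE', to its compiled pattern so that read_files() can call
        # self.__regex['LATITUDE'].search(line)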
def update_arrays(self):
''' extract data from sqlite database and fill self.__data arrays
'''
        # print info after reading all files
n = self.db.count('data')
#m = self.db.max('data')
# need more documentation about return dict from select
#n = int(st[0]['COUNT(ID)'])
#m = int(max_size[0][f"MAX({self.keys[0]})"])
print(f"Array sizes: {n}")
        # get data from table data and fill arrays
query = self.db.select('data')
# initialize one dimension variables
for idx, item in enumerate(query):
# define array size from table column name
for k in item:
                # k is of type <class 'notanorm.base.CIKey'>, convert it to str
key = f"{k}"
if idx == 0:
if key != 'ID':
#print(f"{key}: {self.roscop[key]}")
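                        # self.roscop maps each code to a dict of attributes; e.g.
                        # (hypothetical values) self.roscop['TEMP'] might contain
                        # {'long_name': 'Temperature', '_FillValue': 1e+36}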
if '_FillValue' in self.roscop[key]:
self.__data[key] = np.full(n, self.roscop[key]['_FillValue'])
else:
self.__data[key] = np.empty(n)
else:
self.__data[key] = np.empty(n)
# fill arrays
self.__data[key][idx] = item[key]
self.n = n
        # save all database columns as keys
self.keys = self.__data.keys()
def read_files(self, cfg, device):
logging.debug("Enter in read_files()")
# initialize datetime object
dt = datetime
        # get the dictionary from the toml block; the device key must be in lower case
hash = cfg['split'][device.lower()]
# set separator field if declared in toml section, none by default
if 'separator' in cfg[device.lower()]:
self.__separator = cfg[device.lower()]['separator']
# set julian day origin field if declared in toml section, zero by default
if 'julianOrigin' in cfg[device.lower()]:
self.__julianOrigin = cfg[device.lower()]['julianOrigin']
# read each file and extract header and data and fill sqlite tables
for file in self.fname:
process_header = False
process_data = False
station = []
with fileinput.input(
file, openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
sql = {}
self.__header = ''
print(f"Reading file: {file}")
# read all lines in file
for line in f:
# if header line, save to __header private property and go to next line
#logging.debug(f"line read: {line}")
if 'endHeader' in self.__regex and self.__regex['endHeader'].match(line):
process_data = True
if 'isHeader' in self.__regex and self.__regex['isHeader'].match(line):
self.__header += line
continue
if 'isData' in self.__regex and self.__regex['isData'].search(line):
process_data = True
if process_data:
sql = {}
if self.__regex['TIME'].search(line):
hour, minute, second = \
self.__regex['TIME'].search(line).groups()
#print(f"{hour}:{minute}:{second}")
if self.__regex['DATE'].search(line):
day, month, year = \
self.__regex['DATE'].search(line).groups()
#print(f"{day}/{month}/{year}")
                            # format date and time, e.g. "09/05/2011 16:33:53"
dateTime = f"{day}/{month}/{year} {hour}:{minute}:{second}"
#print(dateTime)
if 'LATITUDE' in self.__regex and self.__regex['LATITUDE'].search(line):
(lat_hemi, lat_deg, lat_min) = \
self.__regex['LATITUDE'].search(line).groups()
#print(f"{lat_deg} {lat_min} {lat_hemi}")
# transform to decimal using ternary operator
latitude = float(lat_deg) + (float(lat_min) / 60.) if lat_hemi == 'N' else \
(float(lat_deg) + (float(lat_min) / 60.)) * -1
sql['LATITUDE'] = latitude
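                            # worked example: lat_deg='04', lat_min='30.00', lat_hemi='S'
                            # gives -(4 + 30/60) = -4.5 decimal degrees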
#sql['lat'] = tools.Dec2dmc(float(latitude),'N')
if 'LONGITUDE' in self.__regex and self.__regex['LONGITUDE'].search(line):
(lon_hemi, lon_deg, lon_min) = \
self.__regex['LONGITUDE'].search(line).groups()
#print(f"{lon_deg} {lon_min} {lon_hemi}")
longitude = float(lon_deg) + (float(lon_min) / 60.) if lon_hemi == 'E' else \
(float(lon_deg) + (float(lon_min) / 60.)) * -1
sql['LONGITUDE'] = longitude
#sql['lon'] = tools.Dec2dmc(float(longitude),'E')
# set datetime object
if 'dateTimeFormat' in cfg[device.lower()]:
dtf = cfg[device.lower()]['dateTimeFormat']
else:
dtf = "%d/%m/%Y %H:%M:%S"
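                        # the default format parses e.g. "09/05/2011 16:33:53"; a device
                        # block in the toml file may override it with a dateTimeFormat
                        # entry, e.g. (illustrative) dateTimeFormat = "%m/%d/%Y %H:%M:%S"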
date_time = dt.strptime(dateTime, dtf)
sql['DAYD'] = tools.dt2julian(date_time)
#print(f"Line: {line} separator: {self.__separator}")
# now, extract and process all data
# split the line, remove leading and trailing space before
p = line.strip().split(self.__separator)
#print(p)
logging.debug(f"line split: {p}")
                        # insert data from list p at index hash[key]
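                        # for example (values are illustrative): with a split line
                        # p = ['25.123', '35.456'] and hash = {'SSJT': 0, 'SSPS': 1},
                        # this stores sql['SSJT'] = 25.123 and sql['SSPS'] = 35.456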
for key in self.keys:
logging.debug(f"{key}, {hash[key]}, {p[hash[key]]}")
sql[key] = float(p[hash[key]].replace(',','.'))
self.db.insert("data", sql )
process_data = False
# end of readline in file
self.update_arrays()
def process(self, args, cfg, ti):
        '''
        Extract data from the ASCII files, fill the data arrays and write the ASCII
        and NetCDF output files
        Parameters
        ----------
        args : argparse.Namespace
        cfg : dict
            toml instance describing the file structure to decode
        ti : str {'CTD', 'XBT', 'LADCP', 'TSG', 'BTL'}
            The typeInstrument key
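        Example
        -------
        Illustrative call; the attribute names follow the command line parser defined
        at the bottom of this module::

            cfg = toml.load(args.config)
            fe = Trajectory(args.files, Roscop('code_roscop.csv'), args.keys)
            fe.process(args, cfg, 'TSG')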
'''
print('processing...')
# check if no file selected or cancel button pressed
logging.debug("File(s): {}, config: {}, Keys: {}".format(
args.files, args.config, args.keys))
# if physical parameters are not given from cmd line, option -k, use the toml <device>.split values
        if args.keys is None:
args.keys = cfg['split'][ti.lower()].keys()
# extract header and data from files
# if args.database:
# fe = Profile(args.files, self.roscop, args.keys, dbname='test.db')
# else:
# fe = Profile(args.files, r, args.keys)
self.create_tables()
# prepare (compile) each regular expression inside toml file under section [<device=ti>.header]
self.set_regex(cfg, ti, 'header')
self.set_regex(cfg, ti, 'format')
self.read_files(cfg, ti)
#return fe
# write ASCII hdr and data files
ascii.writeTrajectory(cfg, ti, self, self.roscop)
# write the NetCDF file
netcdf.writeTrajectory(cfg, ti, self, self.roscop)
# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":
    # usage:
    # python trajectory.py data/CTD/cnv/dfr2900[1-3].cnv -i CTD -d
    # python trajectory.py data/CTD/cnv/dfr2900*.cnv -i CTD -k PRES ETDD TEMP PSAL DOX2 DENS
    # python trajectory.py data/XBT/T7_0000*.EDF -k DEPTH TEMP SVEL -i XBT
    # python trajectory.py data/CTD/btl/fr290*.btl -i BTL -k BOTL DEPTH ETDD TE01 PSA1 DO11
    # typeInstrument is a dictionary mapping each instrument key to its file extensions
typeInstrument = {'CTD': ('cnv', 'CNV'), 'XBT': (
'EDF', 'edf'), 'LADCP': ('lad', 'LAD'), 'TSG': ('colcor','COLCOR'),
'BTL': ('btl', 'BTL')}
#variables_1D = ['TIME', 'LATITUDE', 'LONGITUDE','BATH']
ti = typeInstrument # an alias
parser = argparse.ArgumentParser(
        description='This program reads multiple ASCII files, extracts physical parameters \
        from the ROSCOP codification at the given column and fills arrays',
epilog='J. Grelet IRD US191 - March 2019')
parser.add_argument('-d', '--debug', help='display debug informations',
action='store_true')
parser.add_argument('-c', '--config', help="toml configuration file, (default: %(default)s)",
default='config.toml')
parser.add_argument('-i', '--instrument', nargs='?', choices=ti.keys(),
                        help='specify the instrument that produced the files, eg CTD, XBT, TSG, LADCP')
parser.add_argument('-k', '--keys', nargs='+', default=['PRES', 'TEMP', 'PSAL'],
help='display dictionary for key(s), (default: %(default)s)')
parser.add_argument('files', nargs='*',
help='ASCII file(s) to parse')
    parser.add_argument('--sbe35', nargs='*',
                        help='SBE35 ASCII file(s) to parse (only with -i BTL)')
# display extra logging info
# see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
# https://docs.python.org/2/howto/argparse.html
args = parser.parse_args()
if args.debug:
logging.basicConfig(
format='%(levelname)s:%(message)s', level=logging.DEBUG)
    # works with DOS, Git bash and Linux
files = []
for file in args.files:
files += glob(file)
    # call Trajectory with dbname='test.db' to create a database file instead of in-memory
#fe = Trajectory(files, Roscop('code_roscop.csv'), args.keys, dbname='test.db')
fe = Trajectory(files, Roscop('code_roscop.csv'), args.keys)
fe.create_tables()
logging.debug(f"File(s): {files}, Config: {args.config}")
cfg = toml.load(args.config)
fe.set_regex(cfg, args.instrument, 'header')
fe.read_files(cfg, args.instrument)
logging.debug(f"Indices: {fe.n} x {fe.m}\nkeys: {fe.keys}")
# if args.sbe35 and args.instrument == 'BTL':
# sbe35 = []
# for t in args.sbe35:
# sbe35 += glob(t)
# fe.fname = sbe35
# fe.set_regex(cfg, args.instrument, 'header')
# fe.read_files(cfg, args.instrument)
# elif args.sbe35 and args.instrument != 'BTL':
# print("option --sbe35 can only be used with the BTL instrument (-i BTL)")
# exit
    # debug
    logging.debug(fe.getlist())
    for k in fe.keys:
        for i in range(fe.n):
            logging.debug(f"{k}[{i}] = {fe[k][i]}")
fe.close()