'''
file_extractor.py
'''
import fileinput
import sys
import argparse
import logging
import re
import toml
from glob import glob
from datetime import datetime as dt
import numpy as np
import tools
from physical_parameter import Roscop
from notanorm import SqliteDb
# define SQL station table; columns match the keys inserted below
table_station = """
CREATE TABLE station (
    id INTEGER PRIMARY KEY,
    header TEXT,
    station INTEGER,
    date_time TEXT,
    end_date_time TEXT,
    julian_day REAL NOT NULL UNIQUE,
    latitude REAL NOT NULL,
    lat TEXT,
    longitude REAL NOT NULL,
    lon TEXT,
    bath REAL
);"""
# define the profile table
# the id is actually the rowid AUTOINCREMENT column.
table_profile = """
CREATE TABLE profile (
id INTEGER PRIMARY KEY,
station_id INTEGER,
FOREIGN KEY (station_id)
REFERENCES station (id)
); """
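# a minimal usage sketch of these DDL strings with notanorm's SqliteDb
# (the same API used throughout this class):
#   db = SqliteDb(":memory:")
#   db.query(table_station)
#   db.query(table_profile)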
class FileExtractor:
    """
    This class reads multiple ASCII files, extracts the physical parameters
    identified by their ROSCOP codes at the given columns, and fills arrays.
    Header values and 1-dimensional variables such as TIME, LATITUDE and
    LONGITUDE are automatically extracted from the toml configuration file,
    currently by the set_regexp function; maybe move it into the constructor?

    Parameters
    ----------
    fname : file, str, pathlib.Path, or list of str
        File, filename, or list of filenames to read.
    keys : list of str
        Physical parameters to extract.
    separator : str
        Column separator, default None (blank).
    """
    # constructor with default values
def __init__(self, fname, roscop, keys, dbname=":memory:", separator=None):
        # TODO: replace this constant with the roscop fill value
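        # minimal body sketch: these attributes are assumed from their usage
        # in the methods below
        self.fname = fname
        self.roscop = roscop
        self.keys = keys
        self.__separator = separator
        self.db = SqliteDb(dbname)
        self.__data = {}
        self.__header = ''
        self.__regex = {}
        self.__year = None
        self.n = self.m = 0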
def __getitem__(self, key):
''' overload r[key] '''
if key not in self.__data:
logging.error(
" file_extractor.py: invalid key: \"{}\"".format(key))
else:
return self.__data[key]
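    # usage sketch: after reading files, fe['PRES'] returns the array
    # stored under the 'PRES' key of a FileExtractor instance fe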
    def __str__(self):
        ''' overload string representation '''
        buf = 'Class FileExtractor, file: %s, size = %d x %d\n' % (
            self.fname, self.n, self.m)
        for key in self.keys:
            buf += "{}:\n".format(key)
            buf += "{}\n".format(self.__data[key])
        return buf
    def set_regexp(self, d):
        ''' prepare (compile) each regular expression given in the toml file
        under section [<device>.header], for example:
[ctd.header]
isHeader = '^[*#]'
isDevice = '^\*\s+(Sea-Bird)'
TIME = 'System UpLoad Time\s*=\s*(\w+)\s+(\d+)\s+(\d+)\s+(\d+):(\d+):(\d+)'
LATITUDE = 'NMEA\s+Latitude\s*[:=]\s*(\d+)\s+(\d+\.\d+)\s+(\w)'
LONGITUDE = 'NMEA\s+Longitude\s*[:=]\s*(\d+)\s+(\d+.\d+)\s+(\w)'
'''
# fill the __regex dict with compiled regex
for key in d.keys():
self.__regex[key] = re.compile(d[key])
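        # example: the TIME pattern shown above captures the groups
        # ('May', '09', '2011', '16', '33', '53') from a header line like
        # '* System UpLoad Time = May 09 2011 16:33:53'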
    def create_tables(self):
        ''' create the station and profile tables (method name is assumed,
        the enclosing code is elided in the original listing) '''
        #db.query("DROP DATABASE IF EXISTS '{}'".format(fname))
        self.db.query(table_station)
        self.db.query(table_profile)
        # update table profile, adding one column per pm (physical parameter)
        for pm in self.keys:
            addColumn = f"ALTER TABLE profile ADD COLUMN {pm} REAL NOT NULL"
            self.db.query(addColumn)
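        # e.g. for key PRES the generated statement reads:
        #   ALTER TABLE profile ADD COLUMN PRES REAL NOT NULL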
    def read_files(self, cfg, device):
        ''' read each file, extract header and data, and fill the sqlite tables '''
        # get the dictionary from the toml block; device must be in lower case
        hash = cfg['split'][device.lower()]
        # set separator field if declared in toml section, None (blank) by default
        if 'separator' in cfg[device.lower()]:
            self.__separator = cfg[device.lower()]['separator']
        process_header = False
        process_data = False
        station = []
        sql = {}  # header values collected for the station insert
        with fileinput.input(
                self.fname, openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
            for line in f:
# if header line, save to __header private property and go to next line
if 'endHeader' in self.__regex:
if self.__regex['endHeader'].match(line):
process_header = True
if 'isHeader' in self.__regex:
if self.__regex['isHeader'].match(line):
self.__header += line
                        if not process_header:
                            continue
if not process_data:
                    if 'isHeader' not in self.__regex and not process_header:
self.__header += line
continue
# at the end of header, extract information from regex, insert data
# to the table station and go to next line
if process_header:
#print(f"Enter in process header : {self.__header}")
#print(f"Header with line: {line}")
# read and decode header for each entries in configuration
# toml file, section [device.header]
for k in self.__regex.keys():
# extract STATION number
if k == "station" and self.__regex[k].search(self.__header):
[station] = self.__regex[k].search(self.__header).groups()
#print("station: {}, type: {}".format(station, type(station)))
sql['station'] = int(station)
# key is DATETIME
if k == "DATETIME" and self.__regex[k].search(self.__header):
month, day, year, hour, minute, second = \
self.__regex[k].search(self.__header).groups()
if not self.__year:
self.__year = int(year)
# key is DATE
if k == "DATE" and self.__regex[k].search(self.__header):
if device.lower() == 'ladcp':
year, month, day = \
self.__regex[k].search(self.__header).groups()
                        else:
                            month, day, year = \
                                self.__regex[k].search(self.__header).groups()
# key is TIME
if k == "TIME" and self.__regex[k].search(self.__header):
hour, minute, second = \
self.__regex[k].search(self.__header).groups()
# key is LATITUDE
if k == "LATITUDE" and self.__regex[k].search(self.__header):
if device.lower() == 'ladcp':
[latitude] = self.__regex[k].search(self.__header).groups()
else:
(lat_deg, lat_min, lat_hemi) = self.__regex[k].search(self.__header).groups()
# format latitude to string
latitude_str = "%s%c%s %s" % (lat_deg, tools.DEGREE, lat_min, lat_hemi)
# transform to decimal using ternary operator
latitude = float(lat_deg) + (float(lat_min) / 60.) if lat_hemi == 'N' else \
(float(lat_deg) + (float(lat_min) / 60.)) * -1
sql['LATITUDE'] = latitude
sql['lat'] = tools.Dec2dmc(float(latitude),'N')
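                            # e.g. "12 34.56 N" -> 12 + 34.56/60 = 12.576;
                            # southern latitudes get a negative sign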
# key is LONGITUDE
if k == "LONGITUDE" and self.__regex[k].search(self.__header):
if device.lower() == 'ladcp':
[longitude] = self.__regex[k].search(self.__header).groups()
else:
(lon_deg, lon_min, lon_hemi) = self.__regex[k].search(self.__header).groups()
# format longitude to string
longitude_str = "%s%c%s %s" % (lon_deg, tools.DEGREE, lon_min, lon_hemi)
# transform to decimal using ternary operator
longitude = float(lon_deg) + (float(lon_min) / 60.) if lon_hemi == 'E' else \
(float(lon_deg) + (float(lon_min) / 60.)) * -1
sql['LONGITUDE'] = longitude
sql['lon'] = tools.Dec2dmc(float(longitude),'E')
# key is BATH
if k == "BATH" and self.__regex[k].search(self.__header):
[bath] = self.__regex[k].search(self.__header).groups()
sql['bath'] = bath
# end of matching regex inside header
process_header = False
process_data = True
                # build the date and time string, e.g. "09/05/2011 16:33:53"
dateTime = f"{day}/{month}/{year} {hour}:{minute}:{second}"
# set datetime object
if 'dateTimeFormat' in cfg[device.lower()]:
dtf = cfg[device.lower()]['dateTimeFormat']
else:
dtf = "%d/%m/%Y %H:%M:%S"
sql['date_time'] = dt.strptime(dateTime, dtf)
sql['julian_day'] = tools.dt2julian(sql['date_time'])
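                # e.g. dt.strptime("09/05/2011 16:33:53", "%d/%m/%Y %H:%M:%S")
                # yields datetime(2011, 5, 9, 16, 33, 53), which
                # tools.dt2julian then converts to a julian day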
#print(f"insert station: {sql}")
# insert or query return last cursor, get the value of the primary key
# with lastrowid
ret = self.db.insert("station", sql)
pk = ret.lastrowid
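                # sqlite rowids start at 1, so the first inserted station
                # yields pk == 1; pk links each profile row to its station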
#print(f"Return: {pk}")
# end of if process_header:
if process_data:
# now, extract and process all data
                # strip leading and trailing spaces, then split the line
p = line.strip().split(self.__separator)
#print(p)
sql = {}
                #sql = {key: p[hash[key]] for key in self.keys}
sql['station_id'] = pk
for key in self.keys:
sql[key] = p[hash[key]]
#self.db.insert("profile", station_id = 1, PRES = 1, TEMP = 20, PSAL = 35, DOX2 = 20, DENS = 30)
self.db.insert("profile", sql )
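                # e.g. with hash = {'PRES': 0, 'TEMP': 1, ...} read from the
                # [split.<device>] toml section, each key takes its value
                # from the matching column of the split line p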
# end of readline in file
            # add end_date_time in the station table if ETDD (julian day)
            # is present in the data
            if 'ETDD' in self.keys:
                jj = tools.dt2julian(dt(year=self.__year, day=1, month=1))
                end_dt = tools.julian2dt(float(sql['ETDD']) + jj - 1)
                self.db.update("station", id=pk,
                               end_date_time=end_dt.strftime("%Y-%m-%d %H:%M:%S"))
        # print info after reading all files
st = self.db.query('SELECT COUNT(id) FROM station')
#print(f"SELECT COUNT({self.keys[0]}) FROM profile")
max_press = self.db.query(f"SELECT COUNT({self.keys[0]}) FROM profile")
# need more documentation about return dict from select
#print(max_press[0].keys())
n = int(st[0]['COUNT(id)'])
m = int(max_press[0][f"COUNT({self.keys[0]})"])
print(f"get sizes: {n} x {m}")
# hdr is a list of dict, add station ?
variables_1D = ['PROFILE', 'TIME', 'LATITUDE', 'LONGITUDE','BATH']
for k in variables_1D:
#print(self.roscop[k])
if '_FillValue' in self.roscop[k]:
self.__data[k] = np.full(n, self.roscop[k]['_FillValue'])
else:
self.__data[k] = np.empty(n)
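        # e.g. if the ROSCOP sheet defines _FillValue = 1e36 (value assumed
        # here for illustration), np.full(n, 1e36) pre-fills the TIME vector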
#query = self.db.query('SELECT julian_day, latitude, longitude, bath FROM station')
query = self.db.select('station', ['id','station', 'julian_day', 'end_date_time',
'latitude', 'longitude', 'bath'])
#print(query)
profil_pk = []
for idx, item in enumerate(query):
profil_pk.append(item['id'])
self.__data['PROFILE'][idx] = item['station']
#print(item['station'])
self.__data['TIME'][idx] = item['julian_day']
#self.__data['END_TIME'][idx] = item['end_date_time']
self.__data['LATITUDE'][idx] = item['latitude']
self.__data['LONGITUDE'][idx] = item['longitude']
self.__data['BATH'][idx] = item['bath']
# initialize array
for k in self.keys:
if '_FillValue' in self.roscop[k]:
self.__data[k] = np.full([n, m], self.roscop[k]['_FillValue'])
else:
self.__data[k] = np.empty([n, m])
        # for each parameter
        for k in self.keys:
            # for each entry in the station table, using its primary key
            for i, pk in enumerate(profil_pk):
                query = self.db.select('profile', [k], station_id=pk)
                for idx, item in enumerate(query):
                    self.__data[k][i, idx] = item[k]
        self.m = m
        self.n = n
# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":
# > python file_extractor.py data/CTD/cnv/dfr2900[1-3].cnv -d -i CTD
# > python file_extractor.py data/CTD/cnv/dfr2900*.cnv -k PRES ETDD TEMP PSAL DOX2 DENS -i CTD
# > python file_extractor.py data/XBT/T7_0000*.EDF -k DEPTH TEMP SVEL -i XBT
parser = argparse.ArgumentParser(
        description='This class reads multiple ASCII files, extracts physical \
parameters from their ROSCOP codes at the given columns and fills arrays',
epilog='J. Grelet IRD US191 - March 2019')
    parser.add_argument('-d', '--debug', help='display debug information',
action='store_true')
parser.add_argument('-c', '--config', help="toml configuration file, (default: %(default)s)",
default='tests/test.toml')
    parser.add_argument('-i', '--instrument', nargs='?', choices=['CTD', 'XBT'],
                        help='specify the instrument that produced the files, e.g. CTD or XBT')
parser.add_argument('-k', '--keys', nargs='+', default=['PRES', 'TEMP', 'PSAL'],
help='display dictionary for key(s), (default: %(default)s)')
parser.add_argument('files', nargs='*',
help='ASCII file(s) to parse')
# display extra logging info
# see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
# https://docs.python.org/2/howto/argparse.html
args = parser.parse_args()
if args.debug:
logging.basicConfig(
format='%(levelname)s:%(message)s', level=logging.DEBUG)
    # expand wildcards ourselves: works under DOS, Git bash and Linux
    files = []
    for file in args.files:
        files += glob(file)
    # load the toml configuration file given with -c/--config
    cfg = toml.load(args.config)
    fe = FileExtractor(files, Roscop('code_roscop.csv'), args.keys)
    #print(f"File(s): {files}, Config: {args.config}")
    # compile the header regexes from the [<instrument>.header] section
    # (this call is assumed; the original listing elides it)
    fe.set_regexp(cfg[args.instrument.lower()]['header'])
    fe.read_files(cfg, args.instrument)
# print(f"Indices: {fe.n} x {fe.m}\nkeys: {fe.keys}")
# # debug
# print(fe['PRES'])
# print(fe['TEMP'])
# print(fe['PSAL'])