Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
'''
file_extractor.py: read columnar ASCII files produced by instruments such as
CTD, XBT or LADCP and extract physical parameters (ROSCOP codification).
'''
import fileinput
import logging
import toml
import sys
import argparse
import numpy as np
import re
from datetime import datetime
import tools
from physical_parameter import Roscop
class FileExtractor:
    '''
    Read multiple ASCII files, extract physical parameters (ROSCOP
    codification) from the configured columns and fill numpy arrays.

    Header values and 1D variables such as TIME, LATITUDE and LONGITUDE are
    automatically extracted with the regular expressions declared in the toml
    configuration file, currently via set_regex; may be moved inside the
    constructor later.

    Parameters
    ----------
    fname : file, str, pathlib.Path, or list of str
        File, filename, or list of filenames to read.
    roscop : Roscop
        Physical-parameter codification; queried for per-key '_FillValue'.
    keys : list of str
        Physical parameters to extract, e.g. ['PRES', 'TEMP', 'PSAL'].
    separator : str, optional
        Column separator, default None (split on any whitespace run).
    '''

    # constructor with default values
    def __init__(self, fname, roscop, keys, separator=None):
        # public attributes
        self.fname = fname
        self.keys = keys
        self.roscop = roscop
        self.n = 0           # number of files (profiles) read
        self.m = 0           # largest number of data lines in a single file
        self.lineHeader = 0  # header-line count (of the last file scanned)
        # private attributes
        self.__separator = separator
        self.__header = {}
        self.__data = {}     # key -> numpy array of extracted values
        self.__regex = {}    # key -> compiled regex from the toml header section

    # overloading operators
    def __getitem__(self, key):
        ''' overload r[key]; log an error and return None for an unknown key '''
        if key not in self.__data:
            logging.error(
                " file_extractor.py: invalid key: \"{}\"".format(key))
            return None
        return self.__data[key]

    def __str__(self):
        ''' overload string representation '''
        return 'Class FileExtractor, file: %s, size = %d x %d' % (self.fname, self.n, self.m)

    def disp(self):
        ''' return a printable dump of the extracted array for each key '''
        buf = ''
        for key in self.keys:
            buf += "{}:\n".format(key)
            buf += "{}\n".format(self.__data[key])
        return buf

    def set_regex(self, cfg, ti):
        r'''Prepare (compile) each regular expression found inside the toml
        file under the section [<device>.header], e.g.:

            [ctd.header]
            isHeader = '^[*#]'
            isDevice = '^\*\s+(Sea-Bird)'
            TIME = 'System UpLoad Time\s*=\s*(\w+)\s+(\d+)\s+(\d+)\s+(\d+):(\d+):(\d+)'
            LATITUDE = 'NMEA\s+Latitude\s*[:=]\s*(\d+)\s+(\d+\.\d+)\s+(\w)'
            LONGITUDE = 'NMEA\s+Longitude\s*[:=]\s*(\d+)\s+(\d+.\d+)\s+(\w)'

        Parameters
        ----------
        cfg : dict, toml.load() instance
        ti : str, instrument/device name used as the toml section key
        '''
        section = cfg[ti.lower()]['header']
        # fill the __regex dict with compiled patterns, one per header key
        for key in section:
            self.__regex[key] = re.compile(section[key])

    def first_pass(self):
        '''Scan every file and compute the array dimensions.

        Sets
        ----
        self.n : number of files read
        self.m : largest number of data lines found in a single file
        self.lineHeader : header-line count (NOTE: of the last file only)
        '''
        lineHeader = 0
        lineData = 0
        filesRead = 0
        indMax = 0
        for file in self.fname:
            with fileinput.input(
                    file, openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
                lineData = 0
                lineHeader = 0
                inHeader = True
                filesRead += 1
                for line in f:
                    # header detection: skip header lines, count them
                    if inHeader:
                        if 'isHeader' in self.__regex:
                            # every header line matches the isHeader pattern
                            if self.__regex['isHeader'].match(line):
                                lineHeader += 1
                                continue
                        elif 'endHeader' in self.__regex:
                            # header runs until a line matches endHeader
                            if self.__regex['endHeader'].match(line):
                                lineHeader += 1
                                inHeader = False
                            else:
                                lineHeader += 1
                            continue
                    # past the header: count data lines
                    lineData += 1
                    if lineData > indMax:
                        indMax = lineData
                logging.debug(
                    " {} -> header: {:>{w}} data: {:>{w2}}".format(
                        file, lineHeader, lineData, w=3, w2=6))
        # the size of the arrays
        self.n = filesRead
        self.m = indMax
        self.lineHeader = lineHeader

    # second pass, extract data from roscop code in fname and fill arrays
    def second_pass(self, cfg, device, variables_1D):
        '''
        Read each file into the internal data dict.

        Parameters
        ----------
        cfg : dict, toml.load() instance, configuration file content
        device : str, instrument name, e.g. CTD, XBT or LADCP; selects the
            [<device>] and [split.<device>] toml sections
        variables_1D : list of str, one-dimensional variables (one value per
            file) to allocate, e.g. ['TIME', 'LATITUDE', 'LONGITUDE', 'BATH']
        '''
        n = 0  # file (profile) index
        m = 0  # data-line index inside the current file
        # set separator field if declared in the toml section, None by default
        if 'separator' in cfg[device.lower()]:
            self.__separator = cfg[device.lower()]['separator']
        # column index of each physical parameter, from [split.<device>]
        col_map = cfg['split'][device.lower()]
        # initialize 1D arrays (one value per file); move to end of first_pass?
        for key in variables_1D:
            if '_FillValue' in self.roscop[key]:
                self.__data[key] = np.full(self.n, self.roscop[key]['_FillValue'])
            else:
                self.__data[key] = np.empty(self.n)
        # initialize 2D arrays (files x data lines)
        for key in self.keys:
            if '_FillValue' in self.roscop[key]:
                self.__data[key] = np.full([self.n, self.m], self.roscop[key]['_FillValue'])
            else:
                self.__data[key] = np.empty([self.n, self.m])
        for file in self.fname:
            with fileinput.input(
                    file, openhook=fileinput.hook_encoded("ISO-8859-1")) as f:
                logging.debug(file)
                day = month = year = hour = minute = second = 0
                for line in f:
                    if f.filelineno() < self.lineHeader + 1:
                        # header line: run every compiled regex on it once
                        for k, rx in self.__regex.items():
                            match = rx.search(line)
                            if match is None:
                                continue
                            if k == "DATETIME":
                                # e.g. "System UpLoad Time = May 09 2011 16:33:53"
                                (month, day, year, hour, minute, second) = \
                                    match.groups()
                                # format date and time as "09/May/2011 16:33:53"
                                dateTime = "%s/%s/%s %s:%s:%s" % (
                                    day, month, year, hour, minute, second)
                                dt = datetime.strptime(dateTime, "%d/%b/%Y %H:%M:%S")
                                self.__data['TIME'][n] = tools.dt2julian(dt)
                            if k == "DATE":
                                # LADCP files carry year first, others month first
                                if device.lower() == 'ladcp':
                                    (year, month, day) = match.groups()
                                else:
                                    (month, day, year) = match.groups()
                            if k == "TIME":
                                (hour, minute, second) = match.groups()
                                # combine with the date seen on a previous line
                                dateTime = "%s/%s/%s %s:%s:%s" % (
                                    day, month, year, hour, minute, second)
                                dt = datetime.strptime(dateTime, "%d/%m/%Y %H:%M:%S")
                                self.__data['TIME'][n] = tools.dt2julian(dt)
                            if k == "LATITUDE":
                                if device.lower() == 'ladcp':
                                    # single decimal-degrees group
                                    [latitude] = match.groups()
                                else:
                                    # degrees, decimal minutes and hemisphere
                                    (lat_deg, lat_min, lat_hemi) = match.groups()
                                    latitude = float(lat_deg) + float(lat_min) / 60.
                                    if lat_hemi != 'N':
                                        latitude = latitude * -1
                                self.__data['LATITUDE'][n] = latitude
                            if k == "LONGITUDE":
                                if device.lower() == 'ladcp':
                                    # single decimal-degrees group
                                    [longitude] = match.groups()
                                else:
                                    # degrees, decimal minutes and hemisphere
                                    (lon_deg, lon_min, lon_hemi) = match.groups()
                                    longitude = float(lon_deg) + float(lon_min) / 60.
                                    if lon_hemi != 'E':
                                        longitude = longitude * -1
                                self.__data['LONGITUDE'][n] = longitude
                            if k == "BATH":
                                [bath] = match.groups()
                                self.__data['BATH'][n] = bath
                        continue
                    # data line: split (leading/trailing blanks removed first)
                    columns = line.strip().split(self.__separator)
                    dbg = ' '
                    # fill arrays with the column value for each physical parameter
                    for key in self.keys:
                        self.__data[key][n, m] = columns[col_map[key]]
                        # debug info
                        dbg += "{:>{width}}".format(
                            columns[col_map[key]], width=8)
                    logging.debug(dbg)
                    # increment the data-line index
                    m += 1
            n += 1
            m = 0
# for testing in standalone context
# ---------------------------------
# for testing in standalone context
# ---------------------------------
if __name__ == "__main__":
    # usage Unix:
    # > python file_extractor.py data/CTD/cnv/dfr2900[1-3].cnv -d -i CTD
    # > python file_extractor.py data/CTD/cnv/dfr2900*.cnv -k PRES TEMP PSAL DOX2 DENS -i CTD
    #
    # usage DOS:
    # > python file_extractor.py data/CTD/cnv/dfr2900?.cnv -d -i CTD
    # > python file_extractor.py data/CTD/cnv/dfr2900?.cnv -k PRES TEMP PSAL DOX2 DENS -i CTD
    #
    # build the command-line parser
    parser = argparse.ArgumentParser(
        description='This class read multiple ASCII file, extract physical parameter \
        from ROSCOP codification at the given column and fill arrays ',
        epilog='J. Grelet IRD US191 - March 2019')
    parser.add_argument('-d', '--debug', help='display debug informations',
                        action='store_true')
    parser.add_argument('-c', '--config', help="toml configuration file, (default: %(default)s)",
                        default='tests/test.toml')
    parser.add_argument('-i', '--instrument', nargs='?', choices=['CTD','XBT'],
                        help='specify the instrument that produce files, eg CTD, XBT, TSG, LADCP')
    parser.add_argument('-k', '--keys', nargs='+', default=['PRES', 'TEMP', 'PSAL'],
                        help='display dictionary for key(s), (default: %(default)s)')
    parser.add_argument('fname', nargs='*',
                        help='cnv file(s) to parse, (default: data/cnv/dfr29*.cnv)')
    # display extra logging info
    # see: https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging
    # https://docs.python.org/2/howto/argparse.html
    args = parser.parse_args()
    if args.debug:
        logging.basicConfig(
            format='%(levelname)s:%(message)s', level=logging.DEBUG)
    # build the extractor over all given files with the ROSCOP codification
    fe = FileExtractor(args.fname, Roscop('code_roscop.csv'), args.keys)
    print("File(s): {}, Config: {}".format(args.fname, args.config))
    # load the toml configuration and compile its header regexes
    cfg = toml.load(args.config)
    fe.set_regex(cfg, args.instrument)
    # first pass computes the array dimensions (n files x m lines)
    fe.first_pass()
    print("Indices: {} x {}\nkeys: {}".format(fe.n, fe.m, fe.keys))
    # second pass fills the 1D metadata and 2D data arrays
    fe.second_pass(cfg, args.instrument, ['TIME', 'LATITUDE', 'LONGITUDE','BATH'])
    # debug
    print(fe['PRES'])
    print(fe['TEMP'])
    print(fe['PSAL'])