Commit 24f7e245
Authored Oct 5, 2023 by juliettemiranville
checking what is already extracted and data can be compiled
Parent: 6fac6d0a
No related tags or merge requests found.

Showing 1 changed file: sen2chain/time_series.py (+519 additions, −188 deletions)
@@ -3,7 +3,10 @@
 """
 Module for computing time series.
 """
 import os
+import csv
+import hashlib
+import zipfile
 import logging
 import pathlib
 from pathlib import Path
@@ -26,9 +29,10 @@ from typing import Sequence, List, Dict, Union
 import matplotlib.pyplot as plt
 import numpy as np
 import math
-from datetime import timedelta
+from datetime import datetime, timedelta
 import time
 import multiprocessing
 import shutil
 import itertools
 from functools import partial
@@ -40,7 +44,11 @@ from .products import L2aProduct, IndiceProduct
 logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(
+    level=logging.INFO,
+    filename=Config().get("extraction_path") + "/app.log",
+    filemode="a",
+)
 # logging.basicConfig(format='%(process)s %(asctime)s %(levelname)s:%(module)s:%(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')
 logging.getLogger().handlers[0].setFormatter(
     logging.Formatter(
@@ -52,14 +60,12 @@ logging.getLogger().handlers[0].setFormatter(
 class TimeSeries:
     """
     Class for time series extraction.

     :param vectors_file: path to the vector file.
     :param indices: list of valid indices names.
     :param date_min: starting date of the time series (YYYY-MM-DD format).
     :param date_max: ending date of the time series (YYYY-MM-DD format).
     :param cover_min: minimum cloud cover value. Default: 0.
     :param cover_max: maximum cloud cover value. Default: 100.

     Usage:
         >>> ts = TimeSeries("polygons.geojson",
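A minimal usage sketch consistent with the docstring above (the import path, indice name, dates, and cover value are illustrative assumptions, not taken from this commit):

# hypothetical usage; parameter values are placeholders
from sen2chain import TimeSeries

ts = TimeSeries(
    "polygons.geojson",        # vector file defining the polygons
    indices=["NDVI"],          # must be valid indice names
    date_min="2023-01-01",     # date_min and date_max must be given together
    date_max="2023-12-31",
    cover_max=80,              # maximum cloud cover, in percent
)
ts.extract_data()              # dispatches to _get_stats() or _get_stats_multiproc()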
@@ -79,7 +85,7 @@ class TimeSeries:
         cover_max: int = 100,
         field_names: str = None,
         multiproc: int = 0,
-        getstat: bool = True,
+        # getstat: bool = True,
         cm_version: str = "CM001",
         probability: int = 1,
         iterations: int = 5,
@@ -90,6 +96,7 @@ class TimeSeries:
     ) -> None:
         self._vectors_file = Path(vectors_file)
+        self._checksum = self.calculate_checksum()
         self._date_min = date_min
         self._date_max = date_max
         self._cover_min = cover_min
@@ -105,6 +112,8 @@ class TimeSeries:
         self.cld_hi_prob = cld_hi_prob
         self.thin_cir = thin_cir
+        if (date_min is None and date_max is not None) or (date_min is not None and date_max is None):
+            raise ValueError("If you specify datemin or datemax, you must specify both")
         if indices is None:
             self._indices_list = IndicesCollection.list
         else:
@@ -117,24 +126,93 @@ class TimeSeries:
             self._key = self._field_names
         else:
             self._key = "fid"
         self._tiles_geoms = self._get_tiles_geom_dict()
         self._df_dicts = dict()
+        if getstat is True:
+            path = Path(self._vectors_file)
+            existing_folder_name = None
+            for d in Path(self._out_path).glob('*'):
+                if d.is_dir() and self._checksum in d.name:
+                    existing_folder_name = d.name
+                    break
+            if existing_folder_name:
+                self._data_name = existing_folder_name.split("_")[0]
+                logger.info("Vous avez déjà executer ce shp")  # "You have already run this shp"
+            else:
+                self._data_name = self._vectors_file.stem
+            folder = self._data_name + "_" + self._checksum
+            logger.info("Vous trouverez vos données dans le dossier {}".format(folder))  # "You will find your data in folder {}"
+            folder_cp = Path(self._out_path) / folder / "shp_files"
+            folder_cp.mkdir(parents=True, exist_ok=True)
+            for ext in ['.shp', '.cpg', '.dbf', '.prj', '.shx']:
+                file_to_cp = path.with_suffix(ext)
+                shutil.copy2(file_to_cp, folder_cp)
+
+    def calculate_checksum(self):
+        # getting all the files:
+        temp_path = Config().get("temp_path")
+        path = Path(self._vectors_file)
+        # checksum:
+        sha256_hash = hashlib.sha256()
+        for ext in ['.shp', '.cpg', '.dbf', '.prj', '.shx']:
+            with open(path.with_suffix(ext), "rb") as f:
+                while True:
+                    data = f.read(65536)
+                    if not data:
+                        break
+                    sha256_hash.update(data)
+        checksum = sha256_hash.hexdigest()
+        print(checksum)
+        return checksum

     def extract_data(self, getstat: bool = True):
         # to test with a very large dataset; with small date ranges, it is faster with lists
         start = time.time()
         if getstat:
             if self._multiproc:
                 self._get_stats_multiproc()
             else:
                 self._get_stats()
         end = time.time()
         logger.info("Execution time: {}".format(timedelta(seconds=end - start)))

     @property
     def data(self) -> Dict[str, pd.DataFrame]:
         """
         Returns a dictionary with indices as keys and pandas.DataFrame as values.
         """
-        return self._df_dicts
+        df = dict()
+        if self.cm_version == "CM001":
+            cm = "CM001"
+        elif self.cm_version == "CM002":
+            cm = "CM002-B11"
+        elif self.cm_version == "CM003":
+            cm = str("CM003" + "-PRB" + str(self.probability) + "-ITER" + str(self.iterations))
+        elif self.cm_version == "CM004":
+            cm = str("CM004" + "-CSH" + str(1 * self.cld_shad) + "-CMP" + str(1 * self.cld_med_prob)
+                     + "-CHP" + str(1 * self.cld_hi_prob) + "-TCI" + str(1 * self.thin_cir)
+                     + "-ITER" + str(1 * self.iterations))
+        name = self._data_name + "_" + self._checksum
+        compiled_data = Path(self._out_path) / name / "compiled_data"
+        if not os.path.isdir(compiled_data):
+            self.result_csv(alldata=True)
+        for indice in os.listdir(compiled_data):
+            path_file = Path(compiled_data) / indice / cm / "all_data" / f'{self._data_name}_CC{self._cover_max}_all_data.csv'
+            if os.path.isfile(path_file):
+                df[indice] = pd.read_csv(str(path_file), dtype=object, index_col=0)
+                df[indice][['count', 'nodata', 'nbcld', 'cldprb', 'min', 'max', 'mean', 'std',
+                            'median', 'percentile_25', 'percentile_75', 'id']] = \
+                    df[indice][['count', 'nodata', 'nbcld', 'cldprb', 'min', 'max', 'mean', 'std',
+                                'median', 'percentile_25', 'percentile_75', 'id']].apply(pd.to_numeric, errors='coerce')
+            else:
+                logger.info("All data file doesn't exist for {},{},{}, please check info or try extract_data".format(indice, cm, self._cover_max))
+        return df

     @staticmethod
     def _is_indice_in_tile(indice: str, tile: Tile) -> bool:
         """
         Does the tile have a folder for this indice.

         :param indice: indice name.
         :param tile: Tile object.
         """
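The __init__ changes above fingerprint the vector dataset so that re-runs of the same shapefile can be detected from the output folder name (<data_name>_<checksum>). A standalone sketch of that idea, using only what the committed code shows:

# Sketch: one SHA-256 over all shapefile sidecar files, then an output-folder scan.
import hashlib
from pathlib import Path

def shapefile_checksum(shp: Path) -> str:
    h = hashlib.sha256()
    for ext in ('.shp', '.cpg', '.dbf', '.prj', '.shx'):
        with open(shp.with_suffix(ext), "rb") as f:
            # hash in 64 KiB chunks so large .dbf files are never loaded whole
            for chunk in iter(lambda: f.read(65536), b""):
                h.update(chunk)
    return h.hexdigest()

def find_existing_extraction(out_path: Path, checksum: str):
    # mirrors the __init__ scan: any directory whose name embeds the checksum
    for d in out_path.glob("*"):
        if d.is_dir() and checksum in d.name:
            return d.name
    return None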
@@ -181,7 +259,6 @@ class TimeSeries:
         :param cover_min: minimum cloud coverage.
         :param cover_max: maximum cloud coverage.
         """
-        # test filtering with cm
         products = getattr(
             vars(tile)[indice.lower()].masks, cm_version
         ).params(
@@ -205,7 +282,6 @@ class TimeSeries:
         cloud_proba_path: str = None,
     ) -> Dict:
         """
         Extracts statistics from a raster in a geometry.

         :param feature: GeoJSON like object.
         :param raster_path: path to the raster.
         """
@@ -216,11 +292,6 @@ class TimeSeries:
                 feature["geometry"],
             )
         )
-        # with rasterio.open(str(raster_path)) as raster_src:
-        #     raster_profile = raster_src.profile
-        # logger.info(raster_profile)
         stats = zonal_stats(
             geom,
             str(raster_path),
@@ -236,34 +307,14 @@ class TimeSeries:
                 "percentile_75",
             ],
         )[0]
-        # logger.info(stats)
         if cloud_path:
-            # def mycount(x):
-            #     return np.count_nonzero(x == 1)
-            # with rasterio.open(str(cloud_path)) as cld_src:
-            #     cld_profile = cld_src.profile
-            #     cld_array = cld_src.read(1, out_shape = (10980, 10980), resampling=Resampling.nearest)
-            #     cld_array = cld_src.read(1)
-            # logger.info("shape: {}".format(cld_array.shape))
-            # stats_cld = zonal_stats(geom, cld_array, affine=raster_profile["transform"], stats=["count", "nodata"], add_stats={'cldcount':mycount})
-            # stats_cld = zonal_stats(geom, cld_array, affine=raster_profile["transform"], categorical=True)[0]
-            # stats_cld = zonal_stats(geom, str(cloud_path), nodata = 16383.0, categorical=True)[0]
-            # stats_cld = zonal_stats(geom, str(cloud_path), stats=["count", "nodata"], add_stats={'cldcount':mycount})
-            # stats_cld = zonal_stats(geom, str(cloud_path), categorical=True, nodata = 16383.0, category_map= {-999: 'tttttttttttt', 1.0: 'clds', 0.0: 'noclds', 16383.0: 'nnnnnnn'})[0]
             stats_cld = zonal_stats(
                 geom, str(cloud_path), stats=["count", "nodata"]
             )[0]
-            # logger.info(stats_cld)
             try:
                 nbcld = stats["nodata"] - stats_cld["nodata"]
             except:
                 nbcld = 0
-            # logger.info(stats_cld)
-            # logger.info(cld_pct)
             stats["nbcld"] = nbcld
         if cloud_proba_path:
@@ -272,14 +323,27 @@ class TimeSeries:
             )[0]
             stats["cldprb"] = stats_cld_prb["mean"]
         logger.info(stats)
         # put the cloud mask here!
         return stats
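The method above derives the cloud-pixel count indirectly: in the cloud-masked indice raster the cloudy pixels appear as nodata, while the cloud mask raster contributes only the true nodata footprint, so the difference of the two nodata counts is used as the cloud count. A reduced sketch (the file names and polygon are illustrative):

# Sketch of the nbcld computation with rasterstats.zonal_stats
from rasterstats import zonal_stats

geom = {"type": "Polygon",
        "coordinates": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]]}  # placeholder polygon
stats = zonal_stats(geom, "NDVI_CM001.jp2", stats=["count", "nodata", "mean"])[0]
stats_cld = zonal_stats(geom, "CLOUD_MASK.jp2", stats=["count", "nodata"])[0]
try:
    nbcld = stats["nodata"] - stats_cld["nodata"]
except Exception:  # the committed code uses a bare except and falls back to 0
    nbcld = 0
stats["nbcld"] = nbcld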

     def _get_stats(self) -> None:
+        start = time.time()
         """
         Compute stats in polygons.
         """
+        folder = self._data_name + "_" + self._checksum
+        if self.cm_version == "CM001":
+            cm = "CM001"
+        elif self.cm_version == "CM002":
+            cm = "CM002-B11"
+        elif self.cm_version == "CM003":
+            cm = str("CM003" + "-PRB" + str(self.probability) + "-ITER" + str(self.iterations))
+        elif self.cm_version == "CM004":
+            cm = str("CM004" + "-CSH" + str(1 * self.cld_shad) + "-CMP" + str(1 * self.cld_med_prob)
+                     + "-CHP" + str(1 * self.cld_hi_prob) + "-TCI" + str(1 * self.thin_cir)
+                     + "-ITER" + str(1 * self.iterations))
         with fiona.open(str(self._vectors_file), "r") as vectors:
             features = {feat["id"]: feat for feat in vectors}
         for index2, indice in enumerate(self._indices_list):
@@ -289,8 +353,7 @@ class TimeSeries:
                 )
             )
             rows_list = []
-            # for tile, fid_list in self._tiles_geoms.items():
+            products_data = set(['_'.join(f.stem.split('_')[1:]) + ".jp2" for f in (Path(self._out_path) / folder / "raw_data" / indice / cm).glob('*') if f.is_file()])
             for index3, tile in enumerate(self._tiles_geoms):
                 fid_list = self._tiles_geoms[tile]
                 tile_obj = Tile(tile)
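The products_data set above is the core of the resume logic named in the commit message: raw_data CSVs are written as "<data_name>_<product_stem>.csv", so dropping the first underscore-separated token of each stem and restoring the ".jp2" suffix recovers the identifiers of the products already extracted; those keys are then filtered out of products._dict. As an isolated sketch:

# Sketch: rebuild product identifiers from previously written CSV names
from pathlib import Path

def already_extracted(raw_dir: Path) -> set:
    return {
        "_".join(f.stem.split("_")[1:]) + ".jp2"
        for f in raw_dir.glob("*")
        if f.is_file()
    }

# then, as in the committed code:
# products._dict = {k: v for k, v in products._dict.items()
#                   if k not in already_extracted(raw_dir)}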
@@ -300,7 +363,6 @@ class TimeSeries:
                         index3 + 1, len(self._tiles_geoms), tile
                     )
                 )
                 if TimeSeries._is_indice_in_tile(indice, tile_obj):
                     tile_indice_path = tile_obj.paths["indices"][indice.lower()]
@@ -320,24 +382,9 @@ class TimeSeries:
                         self.cld_hi_prob,
                         self.thin_cir,
                     )
+                    products._dict = {cle: valeur for cle, valeur in products._dict.items() if cle not in products_data}
                     for index1, prod in enumerate(products):
                         indice_product = IndiceProduct(prod.identifier)
-                        # prod_path = str(indice_product.path)
-                        # prod_path_unmasked = prod_path.replace(
-                        #     "_CM001", ""
-                        # )
-                        # l2a = L2aProduct(indice_product.l2a)
-                        # if l2a.path.exists():
-                        #     prod_path_cloud_proba = l2a.msk_cldprb_20m
-                        # else:
-                        #     prod_path_cloud_proba = None
-                        # prod_path = tile_indice_path / prod.identifier[:(-12 - len(indice))] / prod.identifier
-                        # cloud_path = tile_obj.paths["l2a"] / (prod.identifier[:(-12 - len(indice))] + "_CLOUD_MASK.jp2")
-                        # prod_path_unmasked = tile_indice_path / prod.identifier[:(-12 - len(indice))] / (prod.identifier[:-11] + '.jp2')
-                        # prod_path_cloud_proba = L2aProduct(prod.identifier[:(-12 - len(indice))]).msk_cldprb_20m
-                        # logger.info(prod_path_cloud_proba)
                         logger.info(
                             "Product {}/{}: {}".format(
                                 index1 + 1, len(products), prod.identifier
@@ -348,10 +395,6 @@ class TimeSeries:
                         for index, fid in enumerate(fid_list):
                             df_dict = OrderedDict()
                             df_dict["fid"] = fid
-                            # feat_properties = features[fid]["properties"]
-                            # if feat_properties:
-                            #     df_dict.update(feat_properties)
                             df_dict.update(
                                 TimeSeries._get_raster_stats_in_geom(
                                     features[fid],

@@ -360,8 +403,6 @@ class TimeSeries:
                                     indice_product.msk_cldprb_20m,
                                 )
                             )
-                            # df_properties = features[fid]["properties"]
                             df_dict["date"] = prod.date
                             df_dict["tile"] = tile
                             df_dict["filename"] = prod.identifier

@@ -370,10 +411,12 @@ class TimeSeries:
                                 df_dict[prop] = features[fid]["properties"][prop]
                             rows_list.append(df_dict)
                 if rows_list:
                     self._df_dicts[indice] = TimeSeries._list_to_df(rows_list)
+                    self.to_csv()
+                    rows_list.clear()
+                    self._df_dicts.clear()
         end = time.time()
         logger.info("Execution time: {}".format(timedelta(seconds=end - start)))
@@ -381,24 +424,6 @@ class TimeSeries:
     def _raster_stats_multi(self, features, shared_list, proc_item):
         indice_product = IndiceProduct(proc_item[0].identifier)
-        # logger.info(
-        #     "masque nuages path: {} - {}".format(
-        #         indice_product.identifier,
-        #         str(indice_product.msk_cldprb_20m)
-        #     )
-        # )
-        # prod_path = str(indice_product.path)
-        # prod_path_unmasked = prod_path.replace("_CM001", "")
-        # l2a = L2aProduct(indice_product.l2a)
-        # if l2a.path.exists():
-        #     prod_path_cloud_proba = l2a.msk_cldprb_20m
-        # else:
-        #     prod_path_cloud_proba = None
-        # prod_path = proc_item[2] / proc_item[0].identifier[:(-12 - len(proc_item[3]))] / proc_item[0].identifier
-        # prod_path_unmasked = proc_item[2] / proc_item[0].identifier[:(-12 - len(proc_item[3]))] / (proc_item[0].identifier[:-11] + '.jp2')
-        # prod_path_cloud_proba = L2aProduct(proc_item[0].identifier[:(-12 - len(proc_item[3]))]).msk_cldprb_20m
-        # logger.info(prod_path_cloud_proba)
         fid = proc_item[1]
         result_dict = OrderedDict()
         result_dict["fid"] = fid
@@ -414,17 +439,28 @@ class TimeSeries:
         result_dict["tile"] = proc_item[4]
         result_dict["filename"] = proc_item[0].identifier
         for prop in features[fid]["properties"]:
-            # if type(features[fid]["properties"][prop]) == float:
-            #     result_dict[prop] = "{:.6f}".format(features[fid]["properties"][prop])
-            #     logger.info("toto {}".format(result_dict[prop]))
-            # else:
-            #     result_dict[prop] = features[fid]["properties"][prop]
-            #     logger.info("tata {}".format(result_dict[prop]))
             result_dict[prop] = features[fid]["properties"][prop]
         shared_list.append(result_dict)

     def _get_stats_multiproc(self,) -> None:
         start = time.time()
+        folder = self._data_name + "_" + self._checksum
+        if self.cm_version == "CM001":
+            cm = "CM001"
+        elif self.cm_version == "CM002":
+            cm = "CM002-B11"
+        elif self.cm_version == "CM003":
+            cm = str("CM003" + "-PRB" + str(self.probability) + "-ITER" + str(self.iterations))
+        elif self.cm_version == "CM004":
+            cm = str("CM004" + "-CSH" + str(1 * self.cld_shad) + "-CMP" + str(1 * self.cld_med_prob)
+                     + "-CHP" + str(1 * self.cld_hi_prob) + "-TCI" + str(1 * self.thin_cir)
+                     + "-ITER" + str(1 * self.iterations))
         """
         Compute stats in polygons.
         """
         with fiona.open(str(self._vectors_file), "r") as vectors:
             features = {feat["id"]: feat for feat in vectors}
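The same CM001..CM004 suffix chain now appears in data, _get_stats, _get_stats_multiproc, and result_csv; a possible refactor (not part of this commit) would be a single helper, sketched here with the attribute names used above:

# Hypothetical helper; not in the committed code
def _cm_name(self) -> str:
    if self.cm_version == "CM001":
        return "CM001"
    if self.cm_version == "CM002":
        return "CM002-B11"
    if self.cm_version == "CM003":
        return "CM003-PRB{}-ITER{}".format(self.probability, self.iterations)
    if self.cm_version == "CM004":
        # the 1 * coercions turn booleans into 1/0, as in the chains above
        return "CM004-CSH{}-CMP{}-CHP{}-TCI{}-ITER{}".format(
            1 * self.cld_shad, 1 * self.cld_med_prob,
            1 * self.cld_hi_prob, 1 * self.thin_cir, 1 * self.iterations)
    raise ValueError("unknown cm_version: {}".format(self.cm_version))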
@@ -434,21 +470,18 @@ class TimeSeries:
                     index2 + 1, len(self._indices_list), indice
                 )
             )
-            # rows_list = []
             manager = multiprocessing.Manager()
             shared_list = manager.list()
-            # for tile, fid_list in self._tiles_geoms.items():
+            # retrieve from raw_data the names of the products already processed
+            products_data = set(['_'.join(f.stem.split('_')[1:]) + ".jp2" for f in (Path(self._out_path) / folder / "raw_data" / indice / cm).glob('*') if f.is_file()])
             for index3, tile in enumerate(self._tiles_geoms):
                 shared_list = manager.list()
                 fid_list = self._tiles_geoms[tile]
                 tile_obj = Tile(tile)
                 logger.info(
                     "Tile {}/{}: {}".format(
                         index3 + 1, len(self._tiles_geoms), tile
                     )
                 )
                 if TimeSeries._is_indice_in_tile(indice, tile_obj):
                     tile_indice_path = tile_obj.paths["indices"][indice.lower()]
@@ -468,7 +501,7 @@ class TimeSeries:
                         self.cld_hi_prob,
                         self.thin_cir,
                     )
                     products._dict = {cle: valeur for cle, valeur in products._dict.items() if cle not in products_data}
                     proc_list = list(
                         itertools.product(
                             products,
@@ -479,7 +512,6 @@ class TimeSeries:
                         )
                     )
                     logger.info("{} extractions".format(len(proc_list)))
                     pool = multiprocessing.Pool(self._multiproc)
                     results = [
                         pool.map(
@@ -494,40 +526,18 @@ class TimeSeries:
                     pool.close()
                     pool.join()
-                    # logger.info("{}".format(shared_list))
-                    # for index1, prod in enumerate(products):
                     rows_list = list(shared_list)
-                    # logger.info("rows_list {}".format(rows_list))
                     if rows_list:
                         self._df_dicts[indice] = TimeSeries._list_to_df(rows_list)
-                        # prod_path = tile_indice_path / prod.identifier[:(-12 - len(indice))] / prod.identifier
-                        # logger.info("Product {}/{}: {}".format(index1 + 1, len(products), prod.identifier))
-                        # logger.info("{} features".format(len(fid_list)))
-                        # for index, fid in enumerate(fid_list):
-                        #     df_dict = OrderedDict()
-                        #     df_dict["fid"] = fid
+                        self.to_csv()
+                        rows_list.clear()
+                        self._df_dicts.clear()
-                        # # feat_properties = features[fid]["properties"]
-                        # # if feat_properties:
-                        # #     df_dict.update(feat_properties)
-                        # df_dict.update(TimeSeries._get_raster_stats_in_geom(features[fid], prod_path))
-                        # # df_properties = features[fid]["properties"]
-                        # df_dict["date"] = prod.date
-                        # df_dict["tile"] = tile
-                        # df_dict["filename"] = prod.identifier
-                        # for prop in features[fid]["properties"]:
-                        #     df_dict[prop] = features[fid]["properties"][prop]
-                        # rows_list.append(df_dict)
-                        # if rows_list:
-                        #     self._df_dicts[indice] = TimeSeries._list_to_df(rows_list)
         end = time.time()
         logger.info("Execution time: {}".format(timedelta(seconds=end - start)))
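Shape of the multiprocessing flow above, reduced to a runnable sketch: a Manager list gathers one stats dict per (product, feature) task across worker processes and is materialized into rows_list after join(). The worker body here is a stand-in for _raster_stats_multi:

import multiprocessing
from functools import partial

def work(shared_list, item):
    # stand-in for _raster_stats_multi(features, shared_list, proc_item)
    shared_list.append({"item": item, "value": item * item})

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    shared = manager.list()
    pool = multiprocessing.Pool(4)
    pool.map(partial(work, shared), range(10))
    pool.close()
    pool.join()
    rows_list = list(shared)  # copy out before the manager goes away
    print(rows_list)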
@@ -535,7 +545,6 @@ class TimeSeries:
     @staticmethod
     def _list_to_df(rows_list: List[Dict]) -> pd.DataFrame:
         """
         Converts a list of dictionaries to a pandas DataFrame.

         :param rows_list: list of dictionaries.
         """
         df = pd.DataFrame.from_dict(rows_list)
@@ -547,13 +556,13 @@ class TimeSeries:
     def to_csv(self, out_path: Union[str, pathlib.PosixPath] = None) -> None:
         """
         Exports the time series to CSV format.

         :param out_path: output folder. Default is DATA/EXTRACTION.
         """
         if out_path is None:
             out_path = self._out_path
         out_path_folder = Path(out_path) / self._vectors_file.stem
         out_path_folder.mkdir(parents=True, exist_ok=True)
+        folder = self._data_name + "_" + self._checksum
+        base_path_folder = Path(out_path) / folder / "raw_data"
+        base_path_folder.mkdir(parents=True, exist_ok=True)
         list_order = [
             "fid",
@@ -571,26 +580,27 @@ class TimeSeries:
             "percentile_25",
             "percentile_75",
         ]
         # b = [a for a in df.columns if a not in liste]
         for df_name, df in self._df_dicts.items():
-            csv_path = out_path_folder / "{0}_{1}_{2}_{3}.csv".format(
-                self._vectors_file.stem,
-                df_name,
-                self._date_min,
-                self._date_max,
-            )
-            # df.sort_values(by=['fid', 'date', 'count', 'tile']).reindex(columns=(list_order + [a for a in df.columns if a not in list_order]), copy=False).to_csv(str(csv_path))
             df["fid"] = pd.to_numeric(df["fid"])
             parts = os.path.splitext(df["filename"][0])[0].split("_")
             out_path_folder = Path(base_path_folder) / df_name / parts[8]
             out_path_folder.mkdir(parents=True, exist_ok=True)
             dates = df.index.unique()
             for date in dates:
                 df_filtered = df[df.index == date]
                 csv_path = out_path_folder / "{0}_{1}.csv".format(
                     self._data_name,
                     os.path.splitext(df_filtered["filename"][0])[0],
                 )
                 df.loc[df.index == date, "fid"] = pd.to_numeric(df_filtered["fid"])
                 dg = (
-                    df.sort_values(
-                        by=["fid", "date", "count", "std"],
+                    df_filtered.sort_values(
+                        by=["date", "fid", "count", "std"],
                         ascending=[True, True, True, False],
                     )
                     .reindex(
                         columns=(
                             list_order
-                            + [a for a in df.columns if a not in list_order]
+                            + [a for a in df_filtered.columns if a not in list_order]
                         ),
                         copy=False,
                     )
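The reindex call above fixes the column order of each exported CSV: the known statistic columns (list_order) come first, followed by whatever attribute columns the vector file contributed, in their original order. In isolation:

import pandas as pd

df = pd.DataFrame({"owner": ["a"], "fid": [1], "mean": [0.5], "count": [42]})
list_order = ["fid", "count", "mean"]
out = df.reindex(columns=list_order + [c for c in df.columns if c not in list_order])
print(list(out.columns))  # ['fid', 'count', 'mean', 'owner']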
@@ -606,25 +616,24 @@ class TimeSeries:
                     inplace=True,
                 )
                 dg.to_csv(str(csv_path))
-                logger.info("exported to csv: {}".format(csv_path))
+                logger.info("exported to csv: ({})".format(csv_path))

     def plot_global(self, out_path: Union[str, pathlib.PosixPath] = None) -> None:
         """
         Plots the global time series.

         :param out_path: output folder. Default is DATA/EXTRACTION.
         """
         if out_path is None:
             out_path = self._out_path
-        out_path_folder = Path(out_path) / self._vectors_file.stem
+        folder = self._data_name + "_" + self._checksum
+        out_path_folder = Path(out_path) / folder
         out_path_folder.mkdir(parents=True, exist_ok=True)
-        for df_name, df in self._df_dicts.items():
+        for df_name, df in self.data.items():
             png_path = (
                 out_path_folder
                 / "{0}_plot-global_{1}_{2}_{3}.png".format(
-                    self._vectors_file.stem,
+                    self._data_name,
                     df_name,
                     self._date_min,
                     self._date_max,
@@ -644,6 +653,9 @@ class TimeSeries:
             plt.close()
             logger.info("Plot saved to png: {}".format(png_path))

     def plot_details(self, out_path: Union[str, pathlib.PosixPath] = None) -> None:
@@ -653,13 +665,14 @@ class TimeSeries:
         """
         if out_path is None:
             out_path = self._out_path
-        out_path_folder = Path(out_path) / self._vectors_file.stem
+        folder = self._data_name + "_" + self._checksum
+        out_path_folder = Path(out_path) / folder
         out_path_folder.mkdir(parents=True, exist_ok=True)
-        for df_name, df in self._df_dicts.items():
+        for df_name, df in self.data.items():
             png_path = (
                 out_path_folder
                 / "{0}_plot-details_{1}_{2}_{3}.png".format(
-                    self._vectors_file.stem,
+                    self._data_name,
                     df_name,
                     self._date_min,
                     self._date_max,
@@ -668,7 +681,7 @@ class TimeSeries:
             png_path_nonan = (
                 out_path_folder
                 / "{0}_plot-details_{1}-nonan_{2}_{3}.png".format(
-                    self._vectors_file.stem,
+                    self._data_name,
                     df_name,
                     self._date_min,
                     self._date_max,
@@ -717,9 +730,7 @@ class TimeSeries:
                 linestyle="",
                 legend=False,
             )
-            # dfe.dropna().plot.bar(y=['na_ratio'], ax=ax2, color='red', legend=False)
             ax.plot(np.nan, "-r", label="NaN ratio")
-            # ax.legend(loc=0, prop={'size': 6})
             ax.legend(loc=0, labelspacing=0.05)
             fig.tight_layout()
             fig.suptitle(df_name, fontsize=16)
@@ -765,7 +776,8 @@ class TimeSeries:
         """
         Extract ql images around vectors.
         """
         if out_path is None:
             out_path = Config().get("extraction_path")
-        out_path_folder = Path(out_path) / self._vectors_file.stem
+        folder = self._data_name + "_" + self._checksum
+        out_path_folder = Path(out_path) / folder
         out_path_folder.mkdir(parents=True, exist_ok=True)
         for df_name, df in self.data.items():
@@ -799,7 +811,7 @@ class TimeSeries:
                 indice_png_path = (
                     out_path_fid_folder
                     / "{0}_MOZ-QL_{1}_{2}_{3}_{4}.jpg".format(
-                        self._vectors_file.stem,
+                        self._data_name,
                         fidname,
                         df_name,
                         self._date_min,
@@ -809,7 +821,7 @@ class TimeSeries:
                 l2a_png_path = (
                     out_path_fid_folder
                     / "{0}_MOZ-QL_{1}_{2}_{3}_{4}.jpg".format(
-                        self._vectors_file.stem,
+                        self._data_name,
                         fidname,
                         "L2A",
                         self._date_min,
@@ -841,13 +853,11 @@ class TimeSeries:
                     np.array(axs).flatten(),
                     np.array(bxs).flatten(),
                 ):
-                    # logger.info("row[filename]: {}".format(row['filename']))
                     prod_id = IndiceProduct(row["filename"]).l2a
-                    # prod_id = row['filename'][:(-12 - len(df_name))]
                     indice_png_tile_path = (
                         out_path_fid_folder
                         / "{0}_QL_{1}_{2}_{3}_{4}_{5}.jpg".format(
-                            self._vectors_file.stem,
+                            self._data_name,
                             fidname,
                             df_name,
                             prod_id[11:19],
@@ -858,7 +868,7 @@ class TimeSeries:
                     l2a_png_tile_path = (
                         out_path_fid_folder
                         / "{0}_QL_{1}_{2}_{3}_{4}_{5}.jpg".format(
-                            self._vectors_file.stem,
+                            self._data_name,
                             fidname,
                             "L2A",
                             prod_id[11:19],
@@ -873,9 +883,6 @@ class TimeSeries:
                     tile_l2a_path = tile_obj.paths["l2a"]
                     prod_path = tile_indice_path / prod_id / row["filename"]
                     tci_path = L2aProduct(prod_id).tci_10m
-                    # tci_path = list(Path(str(tile_l2a_path / row['filename'][:(-12 - len(df_name))]) + '.SAFE/')\
-                    #     .glob('GRANULE/*/IMG_DATA/R10m/*_TCI_10m.jp2'))
                     crop_extent = gpd.read_file(str(self._vectors_file))
                     logger.info(tci_path)
                     raster_tci = rasterio.open(tci_path)
@@ -978,3 +985,327 @@ class TimeSeries:
             figb.suptitle(fidname + " - " + df_name)
             figb.savefig(str(indice_png_path), bbox_inches="tight")
             plt.close(fig=figb)
+
+    def result_csv(self, alldata: bool = False, daily: bool = False, weekly: bool = False,
+                   monthly: bool = False, yearly: bool = False, datemin: str = None,
+                   datemax: str = None, covermax: int = 100):
+        if (datemin is None and datemax is not None) or (datemin is not None and datemax is None):
+            raise ValueError("If you specify datemin or datemax, you must specify both")
+        # Define 'cm' according to 'self.cm_version'
+        if self.cm_version == "CM001":
+            cm = "CM001"
+        elif self.cm_version == "CM002":
+            cm = "CM002-B11"
+        elif self.cm_version == "CM003":
+            cm = str("CM003" + "-PRB" + str(self.probability) + "-ITER" + str(self.iterations))
+        elif self.cm_version == "CM004":
+            cm = str("CM004" + "-CSH" + str(1 * self.cld_shad) + "-CMP" + str(1 * self.cld_med_prob)
+                     + "-CHP" + str(1 * self.cld_hi_prob) + "-TCI" + str(1 * self.thin_cir)
+                     + "-ITER" + str(1 * self.iterations))
+        name = self._data_name + "_" + self._checksum
+        raw_data = Path(self._out_path) / name / "raw_data"
+        compiled_data = Path(self._out_path) / name / "compiled_data"
+        list_order = ['date', 'fid', 'tile', 'filename', 'count', 'nodata', 'nbcld', 'cldprb',
+                      'min', 'max', 'mean', 'std', 'median', 'percentile_25', 'percentile_75']
+        if os.path.isdir(raw_data):
+            compiled_data.mkdir(parents=True, exist_ok=True)
+            for indice in os.listdir(raw_data):
+                print("\nIndice ", indice, " :\n")
+                ind = Path(compiled_data) / indice
+                ind.mkdir(parents=True, exist_ok=True)
+                if cm in os.listdir(Path(raw_data) / indice):
+                    cm_path = Path(ind) / cm
+                    cm_path.mkdir(parents=True, exist_ok=True)
+                    all_data_folder = Path(cm_path) / "all_data"
+                    if daily:
+                        daily_folder = Path(ind) / cm / "daily"
+                    if weekly:
+                        weekly_folder = Path(ind) / cm / "weekly"
+                    if monthly:
+                        monthly_folder = Path(ind) / cm / "monthly"
+                    if yearly:
+                        yearly_folder = Path(ind) / cm / "yearly"
+                    if datemin is not None and datemax is not None:
+                        custom_folder = Path(ind) / cm / "custom"
+                    path_file = Path(all_data_folder) / f'{self._data_name}_CC{covermax}_all_data.csv'
+                    all_data = pd.DataFrame()
+                    if os.path.isdir(all_data_folder) and os.path.isfile(path_file):
+                        all_data = pd.concat([all_data, pd.read_csv(str(path_file), dtype=object)])
+                        if covermax < 100:
+                            data = data[data['cldprb'] < covermax]
+                    else:
+                        all_data_folder.mkdir(parents=True, exist_ok=True)
+                    raw = Path(raw_data) / indice / cm
+                    new_data = pd.DataFrame()
+                    if daily:
+                        daily_new_data = pd.DataFrame()
+                    # getting all the data available and not already compiled
+                    dates_by_week = {}
+                    dates_by_month = {}
+                    dates_by_year = {}
+                    for f in os.listdir(raw):
+                        parties = f.split('_')
+                        tile = parties[6][1:]
+                        date = f"{parties[3][:4]}-{parties[3][4:6]}-{parties[3][6:8]}"
+                        correspondence = False
+                        if alldata:
+                            if not all_data.empty:
+                                # an all_data csv already exists
+                                for index, row in all_data.iterrows():
+                                    if tile == row['tile'] and date == row['date'][:10]:
+                                        correspondence = True
+                                if not correspondence:
+                                    new_data = pd.concat([new_data, pd.read_csv(str(Path(raw) / f), dtype=object)])
+                            else:
+                                # there was no all_data csv yet
+                                df = pd.read_csv(str(Path(raw) / f), dtype=object)
+                                new_data = pd.concat([new_data, df])
+                        if daily:
+                            daily_folder.mkdir(parents=True, exist_ok=True)
+                            daily_correspondence = False
+                            daily_csv = daily_folder / f'{self._data_name}_CC{covermax}_daily_{date}.csv'
+                            if os.path.exists(daily_csv):
+                                # if a file with the date exists
+                                data = pd.read_csv(daily_csv, dtype=object)
+                                for index, row in data.iterrows():
+                                    if tile == row['tile']:
+                                        logger.info(f"A file with this day ({date}) already exists, updating...")
+                                        daily_correspondence = True
+                                        break
+                            else:
+                                logger.info(f"A file with this day ({date}) doesn't exists, creating...")
+                            if not daily_correspondence:
+                                daily_new_data = pd.concat([daily_new_data, pd.read_csv(str(Path(raw) / f), dtype=object)])
+                            else:
+                                logger.info(f"No new data for this day ({date})")
+                        if weekly:
+                            weekly_folder.mkdir(parents=True, exist_ok=True)
+                            week = datetime.strptime(date, "%Y-%m-%d").strftime('%Y-%U')
+                            if week not in dates_by_week:
+                                dates_by_week[week] = []
+                            dates_by_week[week].append((date, tile, f))
+                            for week in dates_by_week:
+                                dates_by_week[week].sort(key=lambda x: x[0])  # sort by date (x[0])
+                        if monthly:
+                            month = f"{parties[3][:4]}-{parties[3][4:6]}"
+                            if month not in dates_by_month:
+                                dates_by_month[month] = []
+                            dates_by_month[month].append((date, tile, f))
+                            for month in dates_by_month:
+                                dates_by_month[month].sort(key=lambda x: x[0])  # sort by date (x[0])
+                        if yearly:
+                            year = f"{parties[3][:4]}"
+                            if year not in dates_by_year:
+                                dates_by_year[year] = []
+                            dates_by_year[year].append((date, tile, f))
+                            for year in dates_by_year:
+                                dates_by_year[year].sort(key=lambda x: x[0])  # sort by date (x[0])
+                    if alldata:
+                        if not new_data.empty:
+                            csv = Path(all_data_folder) / f'{self._data_name}_CC{covermax}_all_data.csv'
+                            new_data['fid'] = pd.to_numeric(new_data['fid'])
+                            if not all_data.empty:
+                                combined_data = all_data._append(new_data, ignore_index=True)
+                                combined_data['fid'] = pd.to_numeric(combined_data['fid'])
+                                dg = (combined_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                                ascending=[True, True, True, False]).\
+                                    reindex(columns=(list_order + [a for a in combined_data.columns
+                                                                   if a not in list_order]), copy=False).\
+                                    reset_index(drop=True).\
+                                    drop_duplicates(['date', 'fid'], keep='last'))
+                                if datemin is not None and datemax is not None:
+                                    dgcustom = dg[(dg['date'] >= datemin) & (dg['date'] <= datemax)]
+                            else:
+                                dg = (new_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                           ascending=[True, True, True, False]).\
+                                    reindex(columns=(list_order + [a for a in new_data.columns
+                                                                   if a not in list_order]), copy=False).\
+                                    reset_index(drop=True).\
+                                    drop_duplicates(['date', 'fid'], keep='last'))
+                                if datemin is not None and datemax is not None:
+                                    dgcustom = dg[(dg['date'] >= datemin) & (dg['date'] <= datemax)]
+                            dg['date'] = pd.to_datetime(dg['date'])
+                            if covermax < 100:
+                                dg = dg[dg['cldprb'] < covermax]
+                            dg.to_csv(csv, index=False)
+                            logger.info(f"CSV created : ({csv})")
+                        else:
+                            logger.info("No data to compile in the all_data folder : {}, {}".format(cm, indice))
+                            if datemin is not None and datemax is not None:
+                                dgcustom = all_data[(all_data['date'] >= datemin) & (all_data['date'] <= datemax)]
+                        if datemin is not None and datemax is not None:
+                            custom_folder.mkdir(parents=True, exist_ok=True)
+                            csv = Path(custom_folder) / f'{self._data_name}_CC{covermax}_{datemin}_{datemax}.csv'
+                            if not dgcustom.empty:
+                                dgcustom['date'] = pd.to_datetime(dgcustom['date'])
+                                dgcustom.to_csv(csv, index=False)
+                                logger.info(f"CSV created : ({csv})")
+                    if daily:
+                        daily_folder.mkdir(parents=True, exist_ok=True)
+                        if not daily_new_data.empty:
+                            daily_new_data['fid'] = pd.to_numeric(daily_new_data['fid'])
+                            df = (daily_new_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                             ascending=[True, True, True, False]).\
+                                reindex(columns=(list_order + [a for a in daily_new_data.columns
+                                                               if a not in list_order]), copy=False).\
+                                reset_index(drop=True).\
+                                drop_duplicates(['date', 'fid'], keep='last'))
+                            df['date'] = pd.to_datetime(df['date'])
+                            groups_by_day = df.groupby('date')
+                            for date, group in groups_by_day:
+                                csv = daily_folder / f'{self._data_name}_CC{covermax}_daily_{date.strftime("%Y-%m-%d")}.csv'
+                                group.to_csv(csv, index=False)
+                                logger.info(f"CSV created : ({csv})")
+                        else:
+                            # no update
+                            logger.info("No new data to compile in daily folder : {}, {}".format(cm, indice))
+                    if weekly:
+                        if not weekly_folder.exists():
+                            weekly_folder.mkdir(parents=True, exist_ok=True)
+                        weekly_new_data = pd.DataFrame()
+                        for week, date_list in dates_by_week.items():
+                            read = Path(weekly_folder) / f'{self._data_name}_CC{covermax}_weekly_{week}.csv'
+                            if os.path.isfile(read):
+                                logger.info(f"A file with this week ({week}) already exists, updating...")
+                                data = pd.read_csv(str(read), dtype=object)
+                                date_tile_set = set([(date[0][:10], date[1]) for date in date_list])
+                                new_w_data = pd.concat([pd.read_csv(str(Path(raw) / date[2]), dtype=object)
+                                                        for date in date_list])
+                                new_w_data = new_w_data[~(new_w_data.apply(
+                                    lambda row: (row['date'][:10], row['tile']) in date_tile_set, axis=1))]
+                                if not new_w_data.empty:
+                                    weekly_new_data = pd.concat([data, new_w_data])
+                                else:
+                                    logger.info(f"No new data for this week ({week})")
+                            else:
+                                logger.info(f"A file with this week ({week}) doesn't exists, creating...")
+                                weekly_new_data = pd.concat([weekly_new_data,
+                                                             pd.concat([pd.read_csv(str(Path(raw) / date[2]), dtype=object)
+                                                                        for date in date_list])])
+                        if not weekly_new_data.empty:
+                            weekly_new_data['fid'] = pd.to_numeric(weekly_new_data['fid'])
+                            df = (weekly_new_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                              ascending=[True, True, True, False]).\
+                                reindex(columns=(list_order + [a for a in weekly_new_data.columns
+                                                               if a not in list_order]), copy=False).\
+                                reset_index(drop=True).\
+                                drop_duplicates(['date', 'fid'], keep='last'))
+                            df['date'] = pd.to_datetime(df['date'])
+                            groups_by_week = df.groupby(df['date'].dt.strftime('%Y-%U'))
+                            for week, group in groups_by_week:
+                                csv = weekly_folder / f'{self._data_name}_CC{covermax}_weekly_{week}.csv'
+                                group.to_csv(csv, index=False)
+                                logger.info(f"CSV created : ({csv})")
+                    if monthly:
+                        # doing the processing here avoids opening the same file several times
+                        if not monthly_folder.exists():
+                            monthly_folder.mkdir(parents=True, exist_ok=True)
+                        monthly_new_data = pd.DataFrame()
+                        for month, date_list in dates_by_month.items():
+                            read = Path(monthly_folder) / f'{self._data_name}_CC{covermax}_monthly_{month}.csv'
+                            if os.path.isfile(read):
+                                logger.info(f"A file with this month ({month}) already exists, updating...")
+                                data = pd.read_csv(str(read), dtype=object)
+                                data['datewouthtime'] = data['date'].str[0:10]
+                                data_wntd = pd.DataFrame(date_list, columns=['datewouthtime', 'tile', 'file'])
+                                merged = data_wntd.merge(data, on=['datewouthtime', 'tile'], how='left', indicator=True)
+                                filtered_data_wntd = merged[merged['_merge'] != 'both'].drop(columns=['_merge'])
+                                data.drop(columns=["datewouthtime"], inplace=True)
+                                if not filtered_data_wntd.empty:
+                                    new_data_list = []
+                                    for _, row in filtered_data_wntd.iterrows():
+                                        file_path = Path(raw) / row['file']
+                                        if os.path.isfile(file_path):
+                                            csv_data = pd.read_csv(file_path, dtype=object)
+                                            new_data_list.append(csv_data)
+                                    if new_data_list:
+                                        new_data_list.append(data)
+                                        monthly_new_data = pd.concat(new_data_list, ignore_index=True)
+                                else:
+                                    logger.info("No new data to compile in monthly folder for this month : {}, {}, {}".format(cm, indice, month))
+                            else:
+                                logger.info(f"A file with this month ({month}) doesn't exists, creating...")
+                                data_to_concatenate = [pd.read_csv(str(Path(raw) / date[2]), dtype=object)
+                                                       for date in date_list
+                                                       if os.path.isfile(str(Path(raw) / date[2]))]
+                                if data_to_concatenate:
+                                    monthly_new_data = pd.concat([monthly_new_data] + data_to_concatenate, ignore_index=True)
+                        if not monthly_new_data.empty:
+                            # "not monthly_new_data.empty !"
+                            monthly_new_data['fid'] = pd.to_numeric(monthly_new_data['fid'])
+                            df = (monthly_new_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                               ascending=[True, True, True, False]).\
+                                reindex(columns=(list_order + [a for a in monthly_new_data.columns
+                                                               if a not in list_order]), copy=False).\
+                                reset_index(drop=True).\
+                                drop_duplicates(['date', 'fid'], keep='last'))
+                            df['date'] = pd.to_datetime(df['date'])
+                            groups_by_month = df.groupby(df['date'].dt.to_period('M'))
+                            for period, group in groups_by_month:
+                                csv = monthly_folder / f'{self._data_name}_CC{covermax}_monthly_{period.start_time.strftime("%Y-%m")}.csv'
+                                group.to_csv(csv, index=False)
+                                logger.info(f"CSV created : ({csv})")
+                    if yearly:
+                        if not yearly_folder.exists():
+                            yearly_folder.mkdir(parents=True, exist_ok=True)
+                        yearly_new_data = pd.DataFrame()
+                        for year, date_list in dates_by_year.items():
+                            read = Path(yearly_folder) / f'{self._data_name}_CC{covermax}_yearly_{year}.csv'
+                            if os.path.isfile(read):
+                                # A file with this year already exists, so it needs to be updated
+                                logger.info(f"A file with this year ({year}) already exists, updating...")
+                                data = pd.read_csv(str(read), dtype=object)
+                                data['datewouthtime'] = data['date'].str[:10]
+                                data_wntd = pd.DataFrame(date_list, columns=['datewouthtime', 'tile', 'file'])
+                                merged = data_wntd.merge(data, on=['datewouthtime', 'tile'], how='left', indicator=True)
+                                filtered_data_wntd = merged[merged['_merge'] != 'both'].drop(columns=['_merge'])
+                                data.drop(columns=["datewouthtime"], inplace=True)
+                                if not filtered_data_wntd.empty:
+                                    new_data_list = []  # List to store new data
+                                    for _, row in filtered_data_wntd.iterrows():
+                                        file_path = Path(raw) / row['file']
+                                        if os.path.isfile(file_path):
+                                            csv_data = pd.read_csv(file_path, dtype=object)
+                                            new_data_list.append(csv_data)
+                                    if new_data_list:
+                                        new_data_list.append(data)  # Add existing data
+                                        yearly_new_data = pd.concat(new_data_list, ignore_index=True)
+                                else:
+                                    logger.info("No new data to compile in yearly folder for this year : {}, {}, {}".format(cm, indice, year))
+                            else:
+                                logger.info(f"The entire year ({year}) doesn't exist, creating...")
+                                yearly_new_data = pd.concat([yearly_new_data,
+                                                             pd.concat([pd.read_csv(str(Path(raw) / date[2]), dtype=object)
+                                                                        for date in date_list])])
+                        if not yearly_new_data.empty:
+                            yearly_new_data['fid'] = pd.to_numeric(yearly_new_data['fid'])
+                            df = (yearly_new_data.sort_values(by=['date', 'fid', 'count', 'std'],
+                                                              ascending=[True, True, True, False]).\
+                                reindex(columns=(list_order + [a for a in yearly_new_data.columns
+                                                               if a not in list_order]), copy=False).\
+                                reset_index(drop=True).\
+                                drop_duplicates(['date', 'fid'], keep='last'))
+                            df['date'] = pd.to_datetime(df['date'])
+                            groups_by_year = df.groupby(df['date'].dt.strftime('%Y'))
+                            for year, group in groups_by_year:
+                                csv = yearly_folder / f'{self._data_name}_CC{covermax}_yearly_{year}.csv'
+                                group.to_csv(csv, index=False)
+                                logger.info(f"CSV created : ({csv})")