From 9a4d7dd6271b6816a6ce546aac37d81ebd783184 Mon Sep 17 00:00:00 2001
From: Impact <pascal.mouquet@ird.fr>
Date: Tue, 17 Sep 2019 14:46:00 +0400
Subject: [PATCH] library cleaning and time series improvements

---
 sen2chain/library.py     | 16 +++++++++-
 sen2chain/tiles.py       | 23 ++++++++++----
 sen2chain/time_series.py | 38 ++++++++++++++++++-----
 3 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/sen2chain/library.py b/sen2chain/library.py
index 22ec776..84f59f1 100644
--- a/sen2chain/library.py
+++ b/sen2chain/library.py
@@ -57,7 +57,7 @@ class Library:
         return self._indices

     def clean(self,
-              clean_list : list = [],
+              clean_list: list = [],
               remove: bool = False):
         if not clean_list:
             clean_list = self.l1c
@@ -67,6 +67,20 @@ class Library:
                 til.clean_lib(remove=remove)
             except:
                 pass
+
+    def archive_l1c(self,
+                    archive_list: list = [],
+                    ):
+        if archive_list:
+            for t in archive_list:
+                try:
+                    til = Tile(t)
+                    til.archive_l1c()
+                except Exception:
+                    pass
+        else:
+            logger.info("Please specify a tile list to archive")
+

 class TempContainer:

diff --git a/sen2chain/tiles.py b/sen2chain/tiles.py
index 04cae7a..509f7b6 100644
--- a/sen2chain/tiles.py
+++ b/sen2chain/tiles.py
@@ -343,6 +343,21 @@ class Tile:
         - indices error (0kb)
         """
         #~ logger.info("Cleaning {} library".format(self.name))
+
+        # identify corrupted (0-byte) jp2 files in the L1C folder
+        erase_set = set()
+        for f in self._paths["l1c"].glob("*/GRANULE/*/IMG_DATA/*.jp2"):
+            if f.stat().st_size == 0:
+                logger.info("Identified 0-byte corrupted {} in L1C folder".format(f.name))
+                if remove:
+                    # mark the whole .SAFE product for removal
+                    erase_set.add(f.parents[3])
+        for e in erase_set:
+            try:
+                logger.info("Removing {} from L1C folder".format(e))
+                shutil.rmtree(str(e))
+            except OSError:
+                logger.error("Can't remove {} from L1C folder".format(e))
         # identify residual l2a from l1c folder
         for f in self._paths["l1c"].glob("*L2A*.SAFE"):
             logger.info("Identified {} in L1C folder".format(f.name))
@@ -378,8 +393,9 @@
                     f.unlink()
         # identify 0B or absent indices QL
         for f in self._paths["indices"]:
-            #~ print(f, self._paths["indices"][f])
+            #~ logger.info(f, self._paths["indices"][f])
             for p in self._paths["indices"][f].glob("*_MSIL2A_*/"):
+                #~ logger.info(p)
                 if p.is_file():
                     logger.info("Identified old indice format {}".format(p.name))
                     if remove:
@@ -395,8 +411,7 @@
                             logger.info("Removing indice QL {}".format(q.name))
                             q.unlink()

-    def archive_l1c(self,
-                    move: bool = False):
+    def archive_l1c(self):
         """
         Check and move l1c products to archive folder

@@ -406,8 +421,6 @@
         """
         logger.info("Scanning L1C ready to move")
         prod_list = ProductsList()
-        #~ archive_l1c_set = set(self.l1c.products) - {identifier.replace("L2A_", "L1C_").replace("_USER_", "__OPER__")
-        #~ for identifier in self.l2a.products}
         archive_l1c_set = {a for a in {identifier.replace("L2A_", "L1C_").replace("_USER_", "__OPER__")
                                        for identifier in self.l2a.products} if a in set(self.l1c.products)}
diff --git a/sen2chain/time_series.py b/sen2chain/time_series.py
index 56bdc57..5f5e14f 100644
--- a/sen2chain/time_series.py
+++ b/sen2chain/time_series.py
@@ -146,7 +146,8 @@ class TimeSeries:
     @staticmethod
     def _get_raster_stats_in_geom(
             feature: Dict,
-            raster_path: Union[str, pathlib.PosixPath]
+            raster_path: Union[str, pathlib.PosixPath],
+            cloud_path: Union[str, pathlib.PosixPath] = None
     ) -> Dict:
         """Extracts statistics from a raster in a geometry.

@@ -164,6 +165,20 @@
                                                            "median",
                                                            "mean",
                                                            "std"])[0]
+        #~ logger.info(stats)
+
+        if cloud_path:
+            # callers pass the unmasked indice raster here: pixels that are
+            # nodata in the cloud-masked raster but valid in the unmasked
+            # one were masked out as clouds
+            stats_cld = zonal_stats(geom, str(cloud_path), stats=["count", "nodata"])[0]
+            try:
+                nbcld = stats["nodata"] - stats_cld["nodata"]
+            except (KeyError, TypeError):
+                nbcld = 0
+            stats['nbcld'] = nbcld
+
+        logger.info(stats)
         return stats

     def _get_stats(self) -> None:
@@ -190,6 +205,9 @@

         for index1, prod in enumerate(products):
             prod_path = tile_indice_path / prod.identifier[:(-12 - len(indice))] / prod.identifier
+            #~ cloud_path = tile_obj.paths["l2a"] / (prod.identifier[:(-12 - len(indice))] + "_CLOUD_MASK.jp2")
+            prod_path_unmasked = tile_indice_path / prod.identifier[:(-12 - len(indice))] / (prod.identifier[:-11] + '.jp2')
+
             logger.info("Product {}/{}: {}".format(index1 + 1, len(products), prod.identifier))
             logger.info("{} features".format(len(fid_list)))

@@ -200,7 +218,7 @@
             # feat_properties = features[fid]["properties"]
             # if feat_properties:
             #     df_dict.update(feat_properties)
-            df_dict.update(TimeSeries._get_raster_stats_in_geom(features[fid], prod_path))
+            df_dict.update(TimeSeries._get_raster_stats_in_geom(features[fid], prod_path, prod_path_unmasked))

             # df_properties = features[fid]["properties"]
             df_dict["date"] = prod.date
@@ -218,10 +236,12 @@

     def _raster_stats_multi(self, features, shared_list, proc_item):
         prod_path = proc_item[2] / proc_item[0].identifier[:(-12 - len(proc_item[3]))] / proc_item[0].identifier
+        prod_path_unmasked = proc_item[2] / proc_item[0].identifier[:(-12 - len(proc_item[3]))] / (proc_item[0].identifier[:-11] + '.jp2')
+
         fid = proc_item[1]
         result_dict = OrderedDict()
         result_dict["fid"] = fid
-        result_dict.update(TimeSeries._get_raster_stats_in_geom(features[fid], prod_path))
+        result_dict.update(TimeSeries._get_raster_stats_in_geom(features[fid], prod_path, prod_path_unmasked))
         result_dict["date"] = proc_item[0].date
         result_dict["tile"] = proc_item[4]
         result_dict["filename"] = proc_item[0].identifier
@@ -323,7 +343,7 @@
         out_path_folder = Path(out_path) / self._vectors_file.stem
         out_path_folder.mkdir(parents=True, exist_ok=True)

-        list_order = ['fid', 'count', 'nodata', 'min', 'max', 'mean', 'median', 'std', 'tile', 'filename']
+        list_order = ['fid', 'count', 'nodata', 'nbcld', 'min', 'max', 'mean', 'median', 'std', 'tile', 'filename']
         #~ b=[a for a in df.columns if a not in liste]

         for df_name, df in self._df_dicts.items():
@@ -332,7 +352,15 @@
                 df_name,
                 self._date_min,
                 self._date_max)
-            df.sort_values(by=['fid']).reindex(columns=(list_order + [a for a in df.columns if a not in list_order]), copy=False).to_csv(str(csv_path))
+            # keep one row per (date, fid) pair: prefer the acquisition with
+            # the highest valid pixel count, breaking ties on the lowest std
+            dg = (df.sort_values(by=['fid', 'date', 'count', 'std'],
+                                 ascending=[True, True, True, False])
+                  .reindex(columns=(list_order + [a for a in df.columns if a not in list_order]), copy=False)
+                  .reset_index()
+                  .drop_duplicates(['date', 'fid'], keep='last'))
+            dg.set_index("date", inplace=True, drop=True)
+            dg.to_csv(str(csv_path))
             logger.info("exported to csv: {}".format(csv_path))

     def plot_global(self, out_path: Union[str, pathlib.PosixPath] = None) -> None:
-- 
GitLab
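
A minimal usage sketch of the entry points this patch adds or touches, assuming `Library` and `Tile` are importable from the sen2chain package root (the tile names are illustrative, not taken from the patch):

    from sen2chain import Library, Tile

    lib = Library()
    # library-wide clean: 0-byte jp2 files and residual L2A products in the
    # L1C folders are logged, and removed when remove=True
    lib.clean(remove=True)
    # move L1C products that already have an L2A counterpart to the archive
    lib.archive_l1c(archive_list=["40KCB", "38LQK"])

    # per-tile equivalents of the two calls above
    tile = Tile("40KCB")
    tile.clean_lib(remove=True)
    tile.archive_l1c()

With the time series changes, CSVs exported by TimeSeries carry an nbcld column (the number of cloud-masked pixels per feature, derived from the nodata difference between the masked and unmasked indice rasters) and keep a single row per (date, fid) pair.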