From 549d7f38ac4056f118da9ab09baabeb13d4eb288 Mon Sep 17 00:00:00 2001
From: Olivier Jossoud <olivier.jossoud@lsce.ipsl.fr>
Date: Wed, 4 Dec 2019 14:13:17 +0100
Subject: [PATCH] Explo: use cfatools DatasetReader in place of the local Dataset class

---
 src/dataprovider/exploprovider.py | 208 ++++--------------------------
 src/main.py                       |   4 +-
 src/uim/explouim.py               |  73 +++++------
 3 files changed, 60 insertions(+), 225 deletions(-)

diff --git a/src/dataprovider/exploprovider.py b/src/dataprovider/exploprovider.py
index 43aea64..22b550d 100644
--- a/src/dataprovider/exploprovider.py
+++ b/src/dataprovider/exploprovider.py
@@ -1,26 +1,21 @@
 import pandas as pd
-import datetime
 import os
 import re
-import xmltodict
 import xml.etree.cElementTree as ET
 from io import StringIO
-from cfatools.logreader.instrument import InstrumentReader
-
-import utils
-from dataprovider.picarroprovider import PicarroProvider
-
+from cfatools.logreader.dataset import DatasetReader
 
 
 class ExploProvider:
 
-    def __init__(self, picarro_prvd: PicarroProvider):
+    saved_setup_subdir = "saved_setups/"
+    saved_setup_ext = ".xml"
+
+    def __init__(self):
         self.datasets_root_directory = ""
-        self.datasets = {}
-        self.picarro_prvd = picarro_prvd
-        self.instrument_reader = None
+        self.dataset_readers = {}
 
-    def explore_root_directory(self, root_directory: str) -> list:
+    def explore_root_directory(self, root_directory: str) -> dict:
         """Get the names of the datasets directories.
 
         Parameters
@@ -34,7 +29,6 @@ class ExploProvider:
             List of dataset directories name (without full path)
 
         """
-        self.instrument_reader = InstrumentReader(base_path=root_directory)
         directories = []
 
         # Find all directories in datasets root directory (not recursive)
@@ -50,93 +44,19 @@ class ExploProvider:
         dataset_directories.sort()
 
         self.datasets_root_directory = root_directory
-        self.datasets_dirs = dataset_directories
         for directory in dataset_directories:
-            dataset = Dataset(root_directory, directory, self.picarro_prvd)
-            self.datasets[directory] = dataset
-
-        return dataset_directories
-
-
-class Dataset:
-
-    def __init__(self, root_directory: str, directory_name: str, picarro_prvd: PicarroProvider):
-        self.root_directory = root_directory
-        self.directory_name = directory_name
-        self.full_directory_name = root_directory + "/" + directory_name
-
-        self.picarro_prvd = picarro_prvd
-
-        # Get dataset name
-        self.dataset_text = directory_name[-9:]
-        self.first_data_datetime = datetime.datetime.now(tz=datetime.timezone.utc)
-        self.last_data_datetime = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
-
-        self.instlogs = {}
-        self.manual_event_log = None
-
-        # Setup save/load
-        self.saved_setup_dir = self.full_directory_name + "/saved_setups/"
-        self.saved_setup_ext = ".xml"
-
-        self.explore_dataset()
-
-    def explore_dataset(self) -> None:
-        filenames = os.listdir(self.full_directory_name)
-
-        for filename in filenames:
-            try:
-                inst_and_type = re.search("^" + self.directory_name + '_(.+?).log$', filename).group(1)
-            except AttributeError:
-                # The found file does not match normal instrument's log file pattern
-                continue
-
-            instrument_name = inst_and_type.split("_")[0]
-
-            if len(inst_and_type.split("_")) == 2:
-                log_type = inst_and_type.split("_")[1]
-                if log_type == "instant":
-                    if instrument_name == "ICBKCTRL":
-                        instrument_log = IceblockInstantLog(self.full_directory_name, filename, instrument_name)
-                    else:
-                        instrument_log = InstrumentInstantLog(self.full_directory_name, filename, instrument_name)
-                elif log_type == "periodic":
-                    instrument_log = InstrumentPeriodicLog(self.full_directory_name, filename, instrument_name)
-                    self.first_data_datetime = min(self.first_data_datetime, instrument_log.df["datetime"].min())
-                    self.last_data_datetime = max(self.last_data_datetime, instrument_log.df["datetime"].max())
-                else:
-                    raise ValueError("Unknown log type: [" + log_type + "]")
-                self.instlogs[inst_and_type] = instrument_log
-            elif instrument_name == "manual-event":
-                self.manual_event_log = ManualEventLog(self.full_directory_name, filename, instrument_name)
-
-        # Picarro data are not logged the same way as the others, it is logged directly in the Picarro instrument.
-        # In order to have comparable data files, create "artificial" PICARRO_periodic log file from the Picarro log
-        # files.
-        picarro_filename = self.directory_name + "_PICARRO_periodic.log"
-        if picarro_filename not in filenames:
-            try:
-                picarro_df = self.picarro_prvd.get_df(self.first_data_datetime,
-                                                      self.last_data_datetime,
-                                                      ["H2O", "Delta_D_H", "Delta_18_16"])
-            except ValueError as e:
-                print("Failed to get Picarro data: " + str(e))
-                return
-
-            picarro_df.to_csv(path_or_buf=self.full_directory_name + "/" + picarro_filename,
-                              sep="\t",
-                              index=False,
-                              mode='w',  # Always override file content
-                              date_format=utils.datetime_format
-                              )
-            picarro_log = InstrumentPeriodicLog(self.full_directory_name, picarro_filename, "PICARRO")
-            self.instlogs["PICARRO_periodic"] = picarro_log
-
-    def save_setup(self, setup_name: str, variable_df: pd.DataFrame, view_range: list) -> None:
+            self.dataset_readers[directory] = DatasetReader(base_path=root_directory, dataset=directory)
+
+        return self.dataset_readers
+
+    def save_setup(self, dataset_reader: DatasetReader,
+                   setup_name: str, variable_df: pd.DataFrame, view_range: list) -> None:
+        saved_setup_dir = dataset_reader.dataset_path + self.saved_setup_subdir
+
         # Build 'saved setup' full file name
-        if not os.path.exists(self.saved_setup_dir):
-            os.mkdir(self.saved_setup_dir)
-        filename = self.saved_setup_dir + setup_name + self.saved_setup_ext
+        if not os.path.exists(saved_setup_dir):
+            os.mkdir(saved_setup_dir)
+        filename = saved_setup_dir + setup_name + self.saved_setup_ext
 
         # Variables table
         variables_str = variable_df.to_csv(sep=";",
@@ -155,8 +75,8 @@ class Dataset:
         tree = ET.ElementTree(root_elmt)
         tree.write(filename)
 
-    def load_setup(self, filename: str) -> tuple:
-        full_filename = self.saved_setup_dir + filename + self.saved_setup_ext
+    def load_setup(self, dataset_reader: DatasetReader, filename: str) -> tuple:
+        full_filename = dataset_reader.dataset_path + self.saved_setup_subdir + filename + self.saved_setup_ext
 
         # Open XML file
         tree = ET.parse(full_filename)
@@ -176,7 +96,7 @@ class Dataset:
 
         return variable_df, view_range_dict
 
-    def setup_filename_is_valid(self, filename: str) -> tuple:
+    def setup_filename_is_valid(self, dataset_reader: DatasetReader, filename: str) -> tuple:
         """Check if the file name is valid: no special characters, file does not already exists.
 
         Parameters
@@ -194,18 +114,19 @@ class Dataset:
         if not re.match("^[A-Za-z0-9_-]*$", filename):
             error_msg = "File name can only contain letters, digits and '-' or '_'. File extension is automatically set."
             return False, error_msg
-        elif filename in self.get_setup_saved_files():
+        elif filename in self.get_setup_saved_files(dataset_reader):
             error_msg = "File already exists."
             return False, error_msg
         else:
             return True, ""
 
-    def get_setup_saved_files(self) -> list:
+    def get_setup_saved_files(self, dataset_reader: DatasetReader) -> list:
         """Get a list of the 'setup' file names (without extension) existing in the 'saved_setups' directory."""
-        if not os.path.exists(self.saved_setup_dir):
+        saved_setups_dir = dataset_reader.dataset_path + self.saved_setup_subdir
+        if not os.path.exists(saved_setups_dir):
             return []
 
-        filenames = os.listdir(self.saved_setup_dir)
+        filenames = os.listdir(saved_setups_dir)
 
         files_without_ext = [os.path.splitext(filename)[0] for filename in filenames]
         return files_without_ext
@@ -230,31 +151,8 @@ class InstrumentLog:
     def __get_df__(self) -> pd.DataFrame:
         raise NotImplementedError("Subclasses should implement this.")
 
-
-class InstrumentInstantLog(InstrumentLog):
-
-    def __init__(self, full_directory_name: str, filename: str, instrument_name: str):
-        InstrumentLog.__init__(self, full_directory_name, filename, instrument_name)
-
-    def __get_df__(self) -> pd.DataFrame:
-        df = pd.read_csv(self.full_file_name, sep=",", parse_dates=["datetime"])
-        return df
-
-    def get_variables(self):
-        return self.df.name.unique()
-
-    def get_timeseries(self, variable: str) -> pd.DataFrame:
-        timeseries_df = self.df[self.df["name"] == variable]
-        timeseries_df = timeseries_df.drop(columns=['name'])
-
-        try:
-            timeseries_df["value"] = timeseries_df["value"].astype(float)
-        except ValueError:
-            timeseries_df["value_int"] = timeseries_df["value"].astype("category").cat.codes
-        return timeseries_df
-
-
 class IceblockInstantLog(InstrumentLog):
+    # TODO: Transfer these functions to cfatools package
 
     def __init__(self, full_directory_name: str, filename: str, instrument_name: str):
         InstrumentLog.__init__(self, full_directory_name, filename, instrument_name)
@@ -298,55 +196,3 @@ class IceblockInstantLog(InstrumentLog):
 
         return melting_df
 
-
-class ManualEventLog(InstrumentLog):
-
-    def __init__(self, full_directory_name: str, filename: str, instrument_name: str):
-        InstrumentLog.__init__(self, full_directory_name, filename, instrument_name)
-
-    def __get_df__(self) -> pd.DataFrame:
-        # The manual-event log file is not a valid XML file: the root tage is missing. So open the content of the file,
-        # and add the root tags
-        with open(self.full_file_name) as f:
-            xml_str = f.read()
-        xml_str = "<root>" + xml_str + "</root>"
-
-        # Convert the XML to dict, then convert the dict to pd.Dataframe
-        xml_dict = xmltodict.parse(xml_str)
-        if "datetime" in xml_dict["root"]["event"]:  # Only 1 event -> one less level in dict tree
-            df = pd.DataFrame([xml_dict["root"]["event"]])
-        else:
-            df = pd.DataFrame.from_dict(xml_dict["root"]["event"])
-
-        # Rename "description" column
-        df.rename(columns={"description": 'event'}, inplace=True)
-
-        return df
-
-    def get_variables(self):
-        return ["event"]
-
-    def get_timeseries(self, variable: str) -> pd.DataFrame:
-        timeseries_df = self.df[["datetime", variable]]
-        timeseries_df.rename(columns={variable: 'value'}, inplace=True)
-        return timeseries_df
-
-
-class InstrumentPeriodicLog(InstrumentLog):
-
-    def __init__(self, full_directory_name: str, filename: str, instrument_name: str):
-        InstrumentLog.__init__(self, full_directory_name, filename, instrument_name)
-
-    def __get_df__(self) -> pd.DataFrame:
-        df = pd.read_csv(self.full_file_name, sep="\t", parse_dates=["datetime"])
-        return df
-
-    def get_variables(self):
-        all_cols = list(self.df)
-        variable_cols = [colname for colname in all_cols if colname != "datetime"]
-        return variable_cols
-
-    def get_timeseries(self, variable: str) -> pd.DataFrame:
-        timeseries_df = self.df[["datetime", variable]]
-        timeseries_df.rename(columns={variable: 'value'}, inplace=True)
-        return timeseries_df
diff --git a/src/main.py b/src/main.py
index 89bc58f..2b39740 100755
--- a/src/main.py
+++ b/src/main.py
@@ -8,7 +8,6 @@ from gui.mainwindow import MainWindow
 
 from dataprovider.conductcalibprovider import ConductCalibProvider
 from dataprovider.exploprovider import ExploProvider
-from dataprovider.picarroprovider import PicarroProvider
 
 from uim.conductcalibuim import ConductCalibUim
 from uim.explouim import ExploUim
@@ -31,8 +30,7 @@ main_window_ui = main_window.main_ui
 # DATA PROVIDERS
 ########################################################################################################################
 conduct_prvd = ConductCalibProvider()
-picarro_prvd = PicarroProvider(config)
-explo_prvd = ExploProvider(picarro_prvd)
+explo_prvd = ExploProvider()
 
 ########################################################################################################################
 # GUI MANAGERS
diff --git a/src/uim/explouim.py b/src/uim/explouim.py
index 74a8403..5728928 100644
--- a/src/uim/explouim.py
+++ b/src/uim/explouim.py
@@ -1,5 +1,4 @@
 import datetime
-import re
 import math
 import pyqtgraph as pg
 from PyQt5.QtWidgets import *
@@ -7,7 +6,6 @@ from PyQt5.QtGui import QColor
 from PyQt5.QtCore import *
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
-import numpy as np
 from pyqtgraph.GraphicsScene.mouseEvents import MouseClickEvent
 
 import utils
@@ -17,6 +15,7 @@ from gui.uimainwindow import Ui_MainWindow
 from gui.stabwindow import StabWindow
 
 
+
 class ExploUim:
 
     # "Variables" tableWidget columns identifiers.
@@ -45,6 +44,7 @@ class ExploUim:
         self.config = config
         self.stab_window = stab_window
 
+        self.dataset_readers = dict()
         self.current_dataset = None
 
         # The "var_id" is used to identify and match table's lines and plot's items (curves, etc.). Table's row id can
@@ -79,24 +79,27 @@ class ExploUim:
     def __initialize_dataset_combobox__(self):
         """Populate the "datasets" combobox with the existing dataset directory names."""
         data_root_dir = self.config.read("DATA_SOURCE", "absolute_root_dir")
-        dataset_dirs = self.explo_prvd.explore_root_directory(data_root_dir)
-        dataset_dirs.sort(reverse=True)
-        for directory in dataset_dirs:
-            self.main_ui.explo_combobox_dataset.addItem(directory)
+        self.dataset_readers = self.explo_prvd.explore_root_directory(data_root_dir)
+        dataset_names = list(self.dataset_readers.keys())
+        dataset_names.sort()  # Most recent dataset has last index
+        for dataset_name in dataset_names:
+            self.main_ui.explo_combobox_dataset.addItem(dataset_name)
+        self.main_ui.explo_combobox_dataset.setCurrentIndex(len(dataset_names)-1)  # Select the most recent dataset
 
-        self.__update_current_dataset__(dataset_dirs[0])
+        self.__update_current_dataset__(dataset_names[-1])
 
-    def __update_current_dataset__(self, dataset_dir: str, ask_confirmation: bool = True):
+    def __update_current_dataset__(self, dataset_name):
         # Clear table and plot
         self.main_ui.explo_tablewidget_variables.setRowCount(0)
         self.__initialize_plot__()
 
         # Change current dataset and add a new row
-        self.current_dataset = self.explo_prvd.datasets[dataset_dir]
+        self.current_dataset = self.dataset_readers[dataset_name]
         self.__add_new_row_in_variable_table__()
 
         # Enable/disable the possibility to display manual events, depending on the events log file availability.
-        self.main_ui.explo_checkbox_manualevent.setEnabled(self.current_dataset.manual_event_log is not None)
+        has_manual_events = "manual-event" in self.current_dataset.get_instruments_names()
+        self.main_ui.explo_checkbox_manualevent.setEnabled(has_manual_events)
         self.main_ui.explo_checkbox_manualevent.setChecked(False)
 
         # Update 'existing setups'
@@ -140,7 +143,8 @@ class ExploUim:
 
         # Connect Instrument change to variables display
         table.cellWidget(row_id, self.INSTRUMENT_COL).currentTextChanged.connect(
-            lambda text, row_id=row_id: self.__update_variables_combobox__(combobox_text=text, combobox=variable_item))
+            lambda text, row_id=row_id: self.__update_variables_combobox__(combobox_text=text,
+                                                                           variables_combobox=variable_item))
 
         # Color
         color_item = QTableWidgetItem()
@@ -201,23 +205,15 @@ class ExploUim:
         dataset_dir = self.main_ui.explo_combobox_dataset.currentText()
         if dataset_dir == "":
             return
-
-        dataset = self.explo_prvd.datasets[dataset_dir]
-        instrument_logs = dataset.instlogs
-        instrument_logs_names = [log_name for log_name in instrument_logs]
-        instrument_logs_names.sort()
-        for instrument_name in instrument_logs_names:
+        for instrument_name in self.current_dataset.get_instruments_names():
             combobox.addItem(instrument_name)
 
-    def __update_variables_combobox__(self, combobox_text: str, combobox: QComboBox):
+    def __update_variables_combobox__(self, combobox_text: str, variables_combobox: QComboBox):
         if combobox_text == "":
             return
-        # variables_combobox = self.main_ui.explo_tablewidget_variables.cellWidget(row, self.VARIABLE_COL)
-        variables_combobox = combobox
         variables_combobox.clear()
-        instrument_log_name = combobox_text
-        instrument_log = self.current_dataset.instlogs[instrument_log_name]
-        variable_names = instrument_log.get_variables()
+        instrument_name = combobox_text
+        variable_names = self.current_dataset.get_instrument_variables(instrument_name)
         for variable_name in variable_names:
             variables_combobox.addItem(variable_name)
 
@@ -233,13 +229,10 @@ class ExploUim:
         table = self.main_ui.explo_tablewidget_variables
 
         # Get instrument log
-        instrument_log_name = table.cellWidget(row_id, self.INSTRUMENT_COL).currentText()
-        instrument_log = self.current_dataset.instlogs[instrument_log_name]
-
-        # Get variable name
+        instrument_name = table.cellWidget(row_id, self.INSTRUMENT_COL).currentText()
         variable_name = table.cellWidget(row_id, self.VARIABLE_COL).currentText()
 
-        timeseries = instrument_log.get_timeseries(variable_name)
+        timeseries = self.current_dataset.get_timeseries(instrument_name, variable_name)
 
         return timeseries
 
@@ -287,7 +280,7 @@ class ExploUim:
 
             # Get Y orig (original non-shifted variable value)
             df = self.__get_row_dataframe__(row_id).copy()
-            df = df[df["datetime"] <= instant]
+            df = df[df.index <= instant]
             if len(df.index) == 0:
                 y_orig = "Out of range"
             else:
@@ -349,7 +342,7 @@ class ExploUim:
 
     def __update_manual_event__(self, checked_state: int):
         if checked_state == 2:
-            events_df = self.current_dataset.manual_event_log.get_timeseries("event")
+            events_df = self.current_dataset.get_timeseries("manual-event", "event")
             for event in events_df.iterrows():
                 x_pos = utils.pd_time_to_epoch_ms([event[1]["datetime"]])[0]
 
@@ -413,8 +406,9 @@ class ExploUim:
     def __get_timeseries_x_values__(self, timeseries: pd.DataFrame, timeshift_sec: float = 0) -> list:
         # As it is a _step_ curve, add a last datetime point to determine the end of the last step. This is the datetime
         # of the last available data of the dataset, plus one second.
-        last_datetime = self.current_dataset.last_data_datetime + datetime.timedelta(seconds=1)
-        x_values = timeseries['datetime'].copy()
+        first, last = self.current_dataset.get_data_timeframe()
+        last_datetime = last + datetime.timedelta(seconds=1)
+        x_values = timeseries.index.copy().to_series(keep_tz=True)
         x_values = x_values.append(pd.Series([last_datetime]))
 
         # Apply time shift
@@ -457,9 +451,6 @@ class ExploUim:
             self.main_ui.explo_lineedit_setup_name.setStyleSheet("color: 'black';")
 
     def __save_setup__(self):
-        # Dataset
-        dataset = self.current_dataset
-
         # File name
         filename = self.main_ui.explo_lineedit_setup_name.text()
 
@@ -479,7 +470,7 @@ class ExploUim:
         # View range
         view_range = self.plot_item.getViewBox().viewRange()
 
-        dataset.save_setup(filename, variable_df, view_range)
+        self.explo_prvd.save_setup(self.current_dataset, filename, variable_df, view_range)
         self.__refresh_existing_setups__()
 
         # Reset filename input widgets
@@ -493,7 +484,7 @@ class ExploUim:
 
         # Get a dataframe containing the variables data
         filename = self.main_ui.explo_listwidget_setup_list.selectedItems()[0].text()
-        variable_df, view_range_dict = self.current_dataset.load_setup(filename)
+        variable_df, view_range_dict = self.explo_prvd.load_setup(self.current_dataset, filename)
 
         # Variables: table (and automatically: plot)
         table = self.main_ui.explo_tablewidget_variables
@@ -536,7 +527,7 @@ class ExploUim:
                                              padding=0)
 
     def __refresh_existing_setups__(self):
-        files = self.current_dataset.get_setup_saved_files()
+        files = self.explo_prvd.get_setup_saved_files(self.current_dataset)
         self.main_ui.explo_listwidget_setup_list.clear()
         for file in files:
             self.main_ui.explo_listwidget_setup_list.addItem(file)
@@ -607,9 +598,9 @@ class ExploUim:
             return
 
         # Get data
-        instrument_log = self.current_dataset.instlogs[self.stab_window.ui.stab_combobox_instrument.currentText()]
+        instrument_name = self.stab_window.ui.stab_combobox_instrument.currentText()
         variable_name = self.stab_window.ui.stab_combobox_variable.currentText()
-        self.stab_timeseries = instrument_log.get_timeseries(variable_name).copy()
+        self.stab_timeseries = self.current_dataset.get_timeseries(instrument_name, variable_name).copy()
 
         # Set data to step curve
         x_values = self.__get_timeseries_x_values__(self.stab_timeseries)
@@ -618,7 +609,7 @@ class ExploUim:
                                      stepMode=True)
 
         # Convert data in a form more convenient for plot
-        self.stab_timeseries["datetime"] = utils.pd_time_to_epoch_ms(self.stab_timeseries["datetime"])
+        self.stab_timeseries["datetime"] = utils.pd_time_to_epoch_ms(self.stab_timeseries.index)
         if not is_numeric_dtype(self.stab_timeseries["value"]):
             self.stab_timeseries["value"] = self.stab_timeseries["value_int"]
 
-- 
GitLab