From 549d7f38ac4056f118da9ab09baabeb13d4eb288 Mon Sep 17 00:00:00 2001 From: Olivier Jossoud <olivier.jossoud@lsce.ipsl.fr> Date: Wed, 4 Dec 2019 14:13:17 +0100 Subject: [PATCH] Explo. Use cfatools. --- src/dataprovider/exploprovider.py | 208 ++++-------------------------- src/main.py | 4 +- src/uim/explouim.py | 73 +++++------ 3 files changed, 60 insertions(+), 225 deletions(-) diff --git a/src/dataprovider/exploprovider.py b/src/dataprovider/exploprovider.py index 43aea64..22b550d 100644 --- a/src/dataprovider/exploprovider.py +++ b/src/dataprovider/exploprovider.py @@ -1,26 +1,21 @@ import pandas as pd -import datetime import os import re -import xmltodict import xml.etree.cElementTree as ET from io import StringIO -from cfatools.logreader.instrument import InstrumentReader - -import utils -from dataprovider.picarroprovider import PicarroProvider - +from cfatools.logreader.dataset import DatasetReader class ExploProvider: - def __init__(self, picarro_prvd: PicarroProvider): + saved_setup_subdir = "saved_setups/" + saved_setup_ext = ".xml" + + def __init__(self): self.datasets_root_directory = "" - self.datasets = {} - self.picarro_prvd = picarro_prvd - self.instrument_reader = None + self.dataset_readers = {} - def explore_root_directory(self, root_directory: str) -> list: + def explore_root_directory(self, root_directory: str) -> dict: """Get the names of the datasets directories. Parameters @@ -34,7 +29,6 @@ class ExploProvider: List of dataset directories name (without full path) """ - self.instrument_reader = InstrumentReader(base_path=root_directory) directories = [] # Find all directories in datasets root directory (not recursive) @@ -50,93 +44,19 @@ class ExploProvider: dataset_directories.sort() self.datasets_root_directory = root_directory - self.datasets_dirs = dataset_directories for directory in dataset_directories: - dataset = Dataset(root_directory, directory, self.picarro_prvd) - self.datasets[directory] = dataset - - return dataset_directories - - -class Dataset: - - def __init__(self, root_directory: str, directory_name: str, picarro_prvd: PicarroProvider): - self.root_directory = root_directory - self.directory_name = directory_name - self.full_directory_name = root_directory + "/" + directory_name - - self.picarro_prvd = picarro_prvd - - # Get dataset name - self.dataset_text = directory_name[-9:] - self.first_data_datetime = datetime.datetime.now(tz=datetime.timezone.utc) - self.last_data_datetime = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) - - self.instlogs = {} - self.manual_event_log = None - - # Setup save/load - self.saved_setup_dir = self.full_directory_name + "/saved_setups/" - self.saved_setup_ext = ".xml" - - self.explore_dataset() - - def explore_dataset(self) -> None: - filenames = os.listdir(self.full_directory_name) - - for filename in filenames: - try: - inst_and_type = re.search("^" + self.directory_name + '_(.+?).log$', filename).group(1) - except AttributeError: - # The found file does not match normal instrument's log file pattern - continue - - instrument_name = inst_and_type.split("_")[0] - - if len(inst_and_type.split("_")) == 2: - log_type = inst_and_type.split("_")[1] - if log_type == "instant": - if instrument_name == "ICBKCTRL": - instrument_log = IceblockInstantLog(self.full_directory_name, filename, instrument_name) - else: - instrument_log = InstrumentInstantLog(self.full_directory_name, filename, instrument_name) - elif log_type == "periodic": - instrument_log = InstrumentPeriodicLog(self.full_directory_name, filename, instrument_name) - self.first_data_datetime = min(self.first_data_datetime, instrument_log.df["datetime"].min()) - self.last_data_datetime = max(self.last_data_datetime, instrument_log.df["datetime"].max()) - else: - raise ValueError("Unknown log type: [" + log_type + "]") - self.instlogs[inst_and_type] = instrument_log - elif instrument_name == "manual-event": - self.manual_event_log = ManualEventLog(self.full_directory_name, filename, instrument_name) - - # Picarro data are not logged the same way as the others, it is logged directly in the Picarro instrument. - # In order to have comparable data files, create "artificial" PICARRO_periodic log file from the Picarro log - # files. - picarro_filename = self.directory_name + "_PICARRO_periodic.log" - if picarro_filename not in filenames: - try: - picarro_df = self.picarro_prvd.get_df(self.first_data_datetime, - self.last_data_datetime, - ["H2O", "Delta_D_H", "Delta_18_16"]) - except ValueError as e: - print("Failed to get Picarro data: " + str(e)) - return - - picarro_df.to_csv(path_or_buf=self.full_directory_name + "/" + picarro_filename, - sep="\t", - index=False, - mode='w', # Always override file content - date_format=utils.datetime_format - ) - picarro_log = InstrumentPeriodicLog(self.full_directory_name, picarro_filename, "PICARRO") - self.instlogs["PICARRO_periodic"] = picarro_log - - def save_setup(self, setup_name: str, variable_df: pd.DataFrame, view_range: list) -> None: + self.dataset_readers[directory] = DatasetReader(base_path=root_directory, dataset=directory) + + return self.dataset_readers + + def save_setup(self, dataset_reader: DatasetReader, + setup_name: str, variable_df: pd.DataFrame, view_range: list) -> None: + saved_setup_dir = dataset_reader.dataset_path + self.saved_setup_subdir + # Build 'saved setup' full file name - if not os.path.exists(self.saved_setup_dir): - os.mkdir(self.saved_setup_dir) - filename = self.saved_setup_dir + setup_name + self.saved_setup_ext + if not os.path.exists(saved_setup_dir): + os.mkdir(saved_setup_dir) + filename = saved_setup_dir + setup_name + self.saved_setup_ext # Variables table variables_str = variable_df.to_csv(sep=";", @@ -155,8 +75,8 @@ class Dataset: tree = ET.ElementTree(root_elmt) tree.write(filename) - def load_setup(self, filename: str) -> tuple: - full_filename = self.saved_setup_dir + filename + self.saved_setup_ext + def load_setup(self, dataset_reader: DatasetReader, filename: str) -> tuple: + full_filename = dataset_reader.dataset_path + self.saved_setup_subdir + filename + self.saved_setup_ext # Open XML file tree = ET.parse(full_filename) @@ -176,7 +96,7 @@ class Dataset: return variable_df, view_range_dict - def setup_filename_is_valid(self, filename: str) -> tuple: + def setup_filename_is_valid(self, dataset_reader: DatasetReader, filename: str) -> tuple: """Check if the file name is valid: no special characters, file does not already exists. Parameters @@ -194,18 +114,19 @@ class Dataset: if not re.match("^[A-Za-z0-9_-]*$", filename): error_msg = "File name can only contain letters, digits and '-' or '_'. File extension is automatically set." return False, error_msg - elif filename in self.get_setup_saved_files(): + elif filename in self.get_setup_saved_files(dataset_reader): error_msg = "File already exists." return False, error_msg else: return True, "" - def get_setup_saved_files(self) -> list: + def get_setup_saved_files(self, dataset_reader: DatasetReader) -> list: """Get a list of the 'setup' file names (without extension) existing in the 'saved_setups' directory.""" - if not os.path.exists(self.saved_setup_dir): + saved_setups_dir = dataset_reader.dataset_path + self.saved_setup_subdir + if not os.path.exists(saved_setups_dir): return [] - filenames = os.listdir(self.saved_setup_dir) + filenames = os.listdir(saved_setups_dir) files_without_ext = [os.path.splitext(filename)[0] for filename in filenames] return files_without_ext @@ -230,31 +151,8 @@ class InstrumentLog: def __get_df__(self) -> pd.DataFrame: raise NotImplementedError("Subclasses should implement this.") - -class InstrumentInstantLog(InstrumentLog): - - def __init__(self, full_directory_name: str, filename: str, instrument_name: str): - InstrumentLog.__init__(self, full_directory_name, filename, instrument_name) - - def __get_df__(self) -> pd.DataFrame: - df = pd.read_csv(self.full_file_name, sep=",", parse_dates=["datetime"]) - return df - - def get_variables(self): - return self.df.name.unique() - - def get_timeseries(self, variable: str) -> pd.DataFrame: - timeseries_df = self.df[self.df["name"] == variable] - timeseries_df = timeseries_df.drop(columns=['name']) - - try: - timeseries_df["value"] = timeseries_df["value"].astype(float) - except ValueError: - timeseries_df["value_int"] = timeseries_df["value"].astype("category").cat.codes - return timeseries_df - - class IceblockInstantLog(InstrumentLog): + # TODO: Transfer these functions to cfatools package def __init__(self, full_directory_name: str, filename: str, instrument_name: str): InstrumentLog.__init__(self, full_directory_name, filename, instrument_name) @@ -298,55 +196,3 @@ class IceblockInstantLog(InstrumentLog): return melting_df - -class ManualEventLog(InstrumentLog): - - def __init__(self, full_directory_name: str, filename: str, instrument_name: str): - InstrumentLog.__init__(self, full_directory_name, filename, instrument_name) - - def __get_df__(self) -> pd.DataFrame: - # The manual-event log file is not a valid XML file: the root tage is missing. So open the content of the file, - # and add the root tags - with open(self.full_file_name) as f: - xml_str = f.read() - xml_str = "<root>" + xml_str + "</root>" - - # Convert the XML to dict, then convert the dict to pd.Dataframe - xml_dict = xmltodict.parse(xml_str) - if "datetime" in xml_dict["root"]["event"]: # Only 1 event -> one less level in dict tree - df = pd.DataFrame([xml_dict["root"]["event"]]) - else: - df = pd.DataFrame.from_dict(xml_dict["root"]["event"]) - - # Rename "description" column - df.rename(columns={"description": 'event'}, inplace=True) - - return df - - def get_variables(self): - return ["event"] - - def get_timeseries(self, variable: str) -> pd.DataFrame: - timeseries_df = self.df[["datetime", variable]] - timeseries_df.rename(columns={variable: 'value'}, inplace=True) - return timeseries_df - - -class InstrumentPeriodicLog(InstrumentLog): - - def __init__(self, full_directory_name: str, filename: str, instrument_name: str): - InstrumentLog.__init__(self, full_directory_name, filename, instrument_name) - - def __get_df__(self) -> pd.DataFrame: - df = pd.read_csv(self.full_file_name, sep="\t", parse_dates=["datetime"]) - return df - - def get_variables(self): - all_cols = list(self.df) - variable_cols = [colname for colname in all_cols if colname != "datetime"] - return variable_cols - - def get_timeseries(self, variable: str) -> pd.DataFrame: - timeseries_df = self.df[["datetime", variable]] - timeseries_df.rename(columns={variable: 'value'}, inplace=True) - return timeseries_df diff --git a/src/main.py b/src/main.py index 89bc58f..2b39740 100755 --- a/src/main.py +++ b/src/main.py @@ -8,7 +8,6 @@ from gui.mainwindow import MainWindow from dataprovider.conductcalibprovider import ConductCalibProvider from dataprovider.exploprovider import ExploProvider -from dataprovider.picarroprovider import PicarroProvider from uim.conductcalibuim import ConductCalibUim from uim.explouim import ExploUim @@ -31,8 +30,7 @@ main_window_ui = main_window.main_ui # DATA PROVIDERS ######################################################################################################################## conduct_prvd = ConductCalibProvider() -picarro_prvd = PicarroProvider(config) -explo_prvd = ExploProvider(picarro_prvd) +explo_prvd = ExploProvider() ######################################################################################################################## # GUI MANAGERS diff --git a/src/uim/explouim.py b/src/uim/explouim.py index 74a8403..5728928 100644 --- a/src/uim/explouim.py +++ b/src/uim/explouim.py @@ -1,5 +1,4 @@ import datetime -import re import math import pyqtgraph as pg from PyQt5.QtWidgets import * @@ -7,7 +6,6 @@ from PyQt5.QtGui import QColor from PyQt5.QtCore import * import pandas as pd from pandas.api.types import is_numeric_dtype -import numpy as np from pyqtgraph.GraphicsScene.mouseEvents import MouseClickEvent import utils @@ -17,6 +15,7 @@ from gui.uimainwindow import Ui_MainWindow from gui.stabwindow import StabWindow + class ExploUim: # "Variables" tableWidget columns identifiers. @@ -45,6 +44,7 @@ class ExploUim: self.config = config self.stab_window = stab_window + self.dataset_readers = dict() self.current_dataset = None # The "var_id" is used to identify and match table's lines and plot's items (curves, etc.). Table's row id can @@ -79,24 +79,27 @@ class ExploUim: def __initialize_dataset_combobox__(self): """Populate the "datasets" combobox with the existing dataset directory names.""" data_root_dir = self.config.read("DATA_SOURCE", "absolute_root_dir") - dataset_dirs = self.explo_prvd.explore_root_directory(data_root_dir) - dataset_dirs.sort(reverse=True) - for directory in dataset_dirs: - self.main_ui.explo_combobox_dataset.addItem(directory) + self.dataset_readers = self.explo_prvd.explore_root_directory(data_root_dir) + dataset_names = list(self.dataset_readers.keys()) + dataset_names.sort() # Most recent dataset has last index + for dataset_name in dataset_names: + self.main_ui.explo_combobox_dataset.addItem(dataset_name) + self.main_ui.explo_combobox_dataset.setCurrentIndex(len(dataset_names)-1) # Select the most recent dataset - self.__update_current_dataset__(dataset_dirs[0]) + self.__update_current_dataset__(dataset_names[-1]) - def __update_current_dataset__(self, dataset_dir: str, ask_confirmation: bool = True): + def __update_current_dataset__(self, dataset_name): # Clear table and plot self.main_ui.explo_tablewidget_variables.setRowCount(0) self.__initialize_plot__() # Change current dataset and add a new row - self.current_dataset = self.explo_prvd.datasets[dataset_dir] + self.current_dataset = self.dataset_readers[dataset_name] self.__add_new_row_in_variable_table__() # Enable/disable the possibility to display manual events, depending on the events log file availability. - self.main_ui.explo_checkbox_manualevent.setEnabled(self.current_dataset.manual_event_log is not None) + has_manual_events = "manual-event" in self.current_dataset.get_instruments_names() + self.main_ui.explo_checkbox_manualevent.setEnabled(has_manual_events) self.main_ui.explo_checkbox_manualevent.setChecked(False) # Update 'existing setups' @@ -140,7 +143,8 @@ class ExploUim: # Connect Instrument change to variables display table.cellWidget(row_id, self.INSTRUMENT_COL).currentTextChanged.connect( - lambda text, row_id=row_id: self.__update_variables_combobox__(combobox_text=text, combobox=variable_item)) + lambda text, row_id=row_id: self.__update_variables_combobox__(combobox_text=text, + variables_combobox=variable_item)) # Color color_item = QTableWidgetItem() @@ -201,23 +205,15 @@ class ExploUim: dataset_dir = self.main_ui.explo_combobox_dataset.currentText() if dataset_dir == "": return - - dataset = self.explo_prvd.datasets[dataset_dir] - instrument_logs = dataset.instlogs - instrument_logs_names = [log_name for log_name in instrument_logs] - instrument_logs_names.sort() - for instrument_name in instrument_logs_names: + for instrument_name in self.current_dataset.get_instruments_names(): combobox.addItem(instrument_name) - def __update_variables_combobox__(self, combobox_text: str, combobox: QComboBox): + def __update_variables_combobox__(self, combobox_text: str, variables_combobox: QComboBox): if combobox_text == "": return - # variables_combobox = self.main_ui.explo_tablewidget_variables.cellWidget(row, self.VARIABLE_COL) - variables_combobox = combobox variables_combobox.clear() - instrument_log_name = combobox_text - instrument_log = self.current_dataset.instlogs[instrument_log_name] - variable_names = instrument_log.get_variables() + instrument_name = combobox_text + variable_names = self.current_dataset.get_instrument_variables(instrument_name) for variable_name in variable_names: variables_combobox.addItem(variable_name) @@ -233,13 +229,10 @@ class ExploUim: table = self.main_ui.explo_tablewidget_variables # Get instrument log - instrument_log_name = table.cellWidget(row_id, self.INSTRUMENT_COL).currentText() - instrument_log = self.current_dataset.instlogs[instrument_log_name] - - # Get variable name + instrument_name = table.cellWidget(row_id, self.INSTRUMENT_COL).currentText() variable_name = table.cellWidget(row_id, self.VARIABLE_COL).currentText() - timeseries = instrument_log.get_timeseries(variable_name) + timeseries = self.current_dataset.get_timeseries(instrument_name, variable_name) return timeseries @@ -287,7 +280,7 @@ class ExploUim: # Get Y orig (original non-shifted variable value) df = self.__get_row_dataframe__(row_id).copy() - df = df[df["datetime"] <= instant] + df = df[df.index <= instant] if len(df.index) == 0: y_orig = "Out of range" else: @@ -349,7 +342,7 @@ class ExploUim: def __update_manual_event__(self, checked_state: int): if checked_state == 2: - events_df = self.current_dataset.manual_event_log.get_timeseries("event") + events_df = self.current_dataset.get_timeseries("manual-event", "event") for event in events_df.iterrows(): x_pos = utils.pd_time_to_epoch_ms([event[1]["datetime"]])[0] @@ -413,8 +406,9 @@ class ExploUim: def __get_timeseries_x_values__(self, timeseries: pd.DataFrame, timeshift_sec: float = 0) -> list: # As it is a _step_ curve, add a last datetime point to determine the end of the last step. This is the datetime # of the last available data of the dataset, plus one second. - last_datetime = self.current_dataset.last_data_datetime + datetime.timedelta(seconds=1) - x_values = timeseries['datetime'].copy() + first, last = self.current_dataset.get_data_timeframe() + last_datetime = last + datetime.timedelta(seconds=1) + x_values = timeseries.index.copy().to_series(keep_tz=True) x_values = x_values.append(pd.Series([last_datetime])) # Apply time shift @@ -457,9 +451,6 @@ class ExploUim: self.main_ui.explo_lineedit_setup_name.setStyleSheet("color: 'black';") def __save_setup__(self): - # Dataset - dataset = self.current_dataset - # File name filename = self.main_ui.explo_lineedit_setup_name.text() @@ -479,7 +470,7 @@ class ExploUim: # View range view_range = self.plot_item.getViewBox().viewRange() - dataset.save_setup(filename, variable_df, view_range) + self.explo_prvd.save_setup(self.current_dataset, filename, variable_df, view_range) self.__refresh_existing_setups__() # Reset filename input widgets @@ -493,7 +484,7 @@ class ExploUim: # Get a dataframe containing the variables data filename = self.main_ui.explo_listwidget_setup_list.selectedItems()[0].text() - variable_df, view_range_dict = self.current_dataset.load_setup(filename) + variable_df, view_range_dict = self.explo_prvd.load_setup(self.current_dataset, filename) # Variables: table (and automatically: plot) table = self.main_ui.explo_tablewidget_variables @@ -536,7 +527,7 @@ class ExploUim: padding=0) def __refresh_existing_setups__(self): - files = self.current_dataset.get_setup_saved_files() + files = self.explo_prvd.get_setup_saved_files(self.current_dataset) self.main_ui.explo_listwidget_setup_list.clear() for file in files: self.main_ui.explo_listwidget_setup_list.addItem(file) @@ -607,9 +598,9 @@ class ExploUim: return # Get data - instrument_log = self.current_dataset.instlogs[self.stab_window.ui.stab_combobox_instrument.currentText()] + instrument_name = self.stab_window.ui.stab_combobox_instrument.currentText() variable_name = self.stab_window.ui.stab_combobox_variable.currentText() - self.stab_timeseries = instrument_log.get_timeseries(variable_name).copy() + self.stab_timeseries = self.current_dataset.get_timeseries(instrument_name, variable_name).copy() # Set data to step curve x_values = self.__get_timeseries_x_values__(self.stab_timeseries) @@ -618,7 +609,7 @@ class ExploUim: stepMode=True) # Convert data in a form more convenient for plot - self.stab_timeseries["datetime"] = utils.pd_time_to_epoch_ms(self.stab_timeseries["datetime"]) + self.stab_timeseries["datetime"] = utils.pd_time_to_epoch_ms(self.stab_timeseries.index) if not is_numeric_dtype(self.stab_timeseries["value"]): self.stab_timeseries["value"] = self.stab_timeseries["value_int"] -- GitLab