Implement an event sampler that can also filter on table conditions.

ab95027a · POLLET Vincent · 8b96034c · ab95027a
Commit ab95027a authored 1 year ago by POLLET Vincent
--- a/cta_data2hdf5/programs/events_sampler.py
+++ b/cta_data2hdf5/programs/events_sampler.py
+from pathlib import Path
+import argparse
+from collections import defaultdict
+import numpy as np
+import tables
+import warnings
+def copy_node_if_not_existing(node, output_file):
+    """Copy the hdf5 `node` into `output_file`, doing nothing if its path is already there.
+
+    The destination parent group is looked up in `output_file`; when it is missing it is
+    created first (together with any missing ancestors) before the node is copied under it.
+    """
+    if node._v_pathname in output_file:
+        # the node is already present in the output: nothing to copy
+        return
+    source_parent = node._v_parent
+    if source_parent._v_pathname in output_file:
+        destination = output_file.get_node(source_parent._v_pathname)
+    else:
+        # re-create the missing parent group under the grandparent's path
+        destination = output_file.create_group(
+            source_parent._v_parent._v_pathname,
+            source_parent._v_name,
+            createparents=True
+        )
+    output_file.copy_node(node, destination)
+def create_empty_table(output_file, node):
+    """Create in `output_file` an empty table with the same path and description as `node`.
+
+    Nothing is done when `output_file` already contains a node at that path.
+    """
+    if node._v_pathname in output_file:
+        return
+    parent_path = node._v_parent._v_pathname
+    table_name = node.name
+    # missing intermediate groups are created on the fly
+    output_file.create_table(
+        parent_path,
+        table_name,
+        description=node.description,
+        createparents=True
+    )
+def fill_file_without_events(output_file, copying_file):
+    """Build in `output_file` an event-free template of `copying_file`.
+
+    Every node of `copying_file` is visited. Tables that have an "event_id" column are
+    re-created empty (same path and description) so they can be filled with events later;
+    every other node is copied as is, unless it already exists in `output_file`.
+    """
+    with tables.open_file(copying_file, 'r') as template_f:
+        for node in template_f.walk_nodes():
+            is_event_table = isinstance(node, tables.Table) and "event_id" in node.colnames
+            if not is_event_table:
+                copy_node_if_not_existing(node, output_file)
+                continue
+            create_empty_table(output_file, node)
+def sample_events(sampling_filepaths, n_events, condition, condition_node_path):
+    """Sample `n_events` in `sampling_filepaths` files that satisfy `condition` and return their filepath and event_id.
+
+        Parameters
+        ----------
+        sampling_filepaths: list of Path
+            List of the files paths to use for sampling.
+        n_events: int
+            Number of events to sample. A warning is issued if there are fewer events that verify `condition`
+            than `n_events`, in which case all of them are returned.
+        condition: str
+            Pytables query condition in string format. eg: "colname >= 3".
+            An empty condition selects every row of the table.
+            No fallback is implemented for conditions that pytables cannot evaluate: any error raised by
+            the query propagates to the caller.
+        condition_node_path: str
+            Path, inside each sampling file, of the Table node on which `condition` is evaluated.
+            The table must have an "event_id" column.
+
+        Returns
+        -------
+        defaultdict
+            Dictionary with key the sampled events filepath index (position in `sampling_filepaths`),
+            and values the list of sampled "event_id" in that file, in table order.
+
+        Raises
+        ------
+        ValueError
+            If no event satisfies the condition in any of the files.
+    """
+    filtered_events = []
+    for filepath_idx, filepath in enumerate(sampling_filepaths):
+        with tables.open_file(filepath, 'r') as sampling_f:
+            table = sampling_f.get_node(condition_node_path)
+            # an empty string is not a valid pytables query: treat it as "no filtering"
+            rows = table.where(condition) if condition else table.iterrows()
+            filtered_events.extend((filepath_idx, row["event_id"]) for row in rows)
+    if not filtered_events:
+        raise ValueError("There aren't any event satisfying the condition !")
+    if len(filtered_events) < n_events:
+        warnings.warn("Only {} events satisfy the condition and will be copied to output file !".format(len(filtered_events)))
+        n_events = len(filtered_events)
+    sampled_indices = np.random.choice(len(filtered_events), size=n_events, replace=False)
+    event_info = defaultdict(list)
+    # walk the sampled indices in increasing order: this keeps the events in file/table order
+    # without testing every candidate event against the whole array of sampled indices
+    for idx in np.sort(sampled_indices):
+        filepath_idx, event_id = filtered_events[idx]
+        event_info[filepath_idx].append(event_id)
+    return event_info
+def copy_event(event, table):
+    """Append to `table` a new row holding the values of `event`.
+
+    Each column listed in `table.colnames` is read from `event` under the same name,
+    so `event` must provide all of the destination table's columns.
+    """
+    new_row = table.row
+    for column_name in table.colnames:
+        new_row[column_name] = event[column_name]
+    new_row.append()
+def copy_sampled_events(output_file, copying_filepaths, sampled_events_info):
+    """Copy the sampled events from the copying files to `output_file`.
+
+        Parameters
+        ----------
+        output_file: tables.File
+            Opened, writable output file. It must already contain a table at the path of every
+            table of the copying files that has an "event_id" column.
+        copying_filepaths: list of Path
+            Files to copy the events from, indexed by the keys of `sampled_events_info`.
+        sampled_events_info: dict
+            Mapping from a file index in `copying_filepaths` to the "event_id" values to copy from that file.
+    """
+    for filepath_idx, event_ids in sampled_events_info.items():
+        # build the lookup once per file: set membership is constant time, a list is scanned for every row
+        wanted_event_ids = set(event_ids)
+        with tables.open_file(copying_filepaths[filepath_idx], 'r') as copying_f:
+            for node in copying_f.walk_nodes():
+                if not (isinstance(node, tables.Table) and "event_id" in node.colnames):
+                    continue
+                output_file_table = output_file.get_node(node._v_pathname)
+                # would be nice to use pytables condition but they don't support uint64 columns -_- ...
+                for event in node:
+                    if event["event_id"] in wanted_event_ids:
+                        copy_event(event, output_file_table)
+                # write the rows buffered by Row.append() before moving to the next table
+                output_file_table.flush()
+def main():
+    """Command line entry point: sample events on a condition and copy them to a new file.
+
+    Parses the command line, lists and sorts the .h5 files of the sampling and copying folders
+    (the two sorted lists are matched by position), samples the events with `sample_events`,
+    then writes the output file: a template built from the first copying file, filled with
+    the sampled events of all the copying files.
+
+    Raises
+    ------
+    ValueError
+        If a folder does not exist, contains no .h5 file, or if both folders don't contain
+        the same number of .h5 files.
+    """
+    parser = argparse.ArgumentParser(
+        description="Samples n events among the events in the files in the input folder that "
+                    "satisfy a pytables condition.\n"
+                    'eg: n=50 and condition is "gammaness>0.7" will sample 50 events among those gamaness > 0.7\n'
+                    "Copies the sampled events from the files in the COPYING folder to the output file.\n"
+                    "This is usefull to sample events in lower data level files based on higher data levels conditions, "
+                    "but it requires that all events in the sampling folder files are also present in the copying folder files."
+    )
+    parser.add_argument(
+        "-i",
+        "--sampling_folder",
+        help="Folder containing the .h5 files to use to sample events based on condition. "
+             "Any events in this files must also be in the copying folder files.",
+        required=True
+    )
+    parser.add_argument(
+        "-j",
+        "--copying_folder",
+        help="Folder containing the .h5 files to copy according to the sampling. "
+             "The sorted files list MUST match the sampling folder.",
+        required=True
+    )
+    parser.add_argument(
+        "-o",
+        "--output_file",
+        help="Name of the output file.",
+        required=True
+    )
+    parser.add_argument(
+        "-n",
+        "--n_events",
+        help="Number of events to sample.",
+        required=True,
+        type=int
+    )
+    parser.add_argument(
+        "-c",
+        "--condition",
+        help="Condition that the sampled events must fullfill. eg event_quality == 0 or gamaness >= 0.7",
+        default="",
+        type=str
+    )
+    parser.add_argument(
+        "-p",
+        "--node_path",
+        help="Path to the Table node to use to apply the condition. "
+             "Eg: /dl1/event/telescope/parameters/tel_001 for a condition on event_quality.",
+        required=True,
+        type=str
+    )
+    args = parser.parse_args()
+    path_sampling = Path(args.sampling_folder)
+    path_copying = Path(args.copying_folder)
+    # name the folders after the command line options so the user knows which argument to fix
+    if not path_sampling.exists():
+        raise ValueError("Sampling folder does not exist.")
+    if not path_copying.exists():
+        raise ValueError("Copying folder does not exist.")
+    # files of both folders are paired by their position in the sorted lists
+    sampling_filepaths = sorted(path_sampling.glob("*.h5"))
+    copying_filepaths = sorted(path_copying.glob("*.h5"))
+    if not sampling_filepaths:
+        raise ValueError("Sampling folder does not contain any .h5 files.")
+    if not copying_filepaths:
+        raise ValueError("Copying folder does not contain any .h5 files.")
+    if len(copying_filepaths) != len(sampling_filepaths):
+        raise ValueError("Sampling and copying folder don't have the same number of files. The files must match exactly !")
+    path_output_file = Path(args.output_file)
+    # To only keep good events one could filter on event_quality == GOOD
+    # rejected events category are based on the field EventQuality in DL1s
+    # https://cta-lapp.pages.in2p3.fr/HiPeRTA/d9/dfd/namespaceEventQuality.html
+    sampled_events_info = sample_events(sampling_filepaths, args.n_events, args.condition, args.node_path)
+    with tables.open_file(path_output_file, 'w') as output_file:
+        # pass the first copying file to be used as a template to create the output file.
+        fill_file_without_events(output_file, copying_filepaths[0])
+        copy_sampled_events(output_file, copying_filepaths, sampled_events_info)
+# Run the command line interface only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
\ No newline at end of file