Skip to content
Snippets Groups Projects
Commit ab95027a authored by POLLET Vincent's avatar POLLET Vincent
Browse files

Implement an event sampler that can also filter on tables conditions.

parent 8b96034c
No related branches found
No related tags found
1 merge request: !5 Implement an event sampler that can also filter on tables conditions.
from pathlib import Path
import argparse
from collections import defaultdict
import numpy as np
import tables
import warnings
def copy_node_if_not_existing(node, output_file):
    """Copy the HDF5 `node` into `output_file`, doing nothing if its path is already present.

    The node is copied under the output group that has the same path as the node's
    parent in its source file. When `output_file` does not contain that group yet,
    it is created first (together with any missing ancestors).
    """
    # Guard clause: a node whose path already exists in the output is left untouched.
    if node._v_pathname in output_file:
        return

    source_parent = node._v_parent
    if source_parent._v_pathname in output_file:
        destination = output_file.get_node(source_parent._v_pathname)
    else:
        # Recreate the missing parent group at the same location as in the source file.
        destination = output_file.create_group(
            source_parent._v_parent._v_pathname,
            source_parent._v_name,
            createparents=True,
        )
    output_file.copy_node(node, destination)
def create_empty_table(output_file, node):
    """Create in `output_file` an empty table at the path of `node`, using the description of `node`.

    Nothing is done when `output_file` already contains a node at that path.
    Missing parent groups are created along the way.
    """
    if node._v_pathname in output_file:
        # Already present: keep the existing table as it is.
        return

    parent_path = node._v_parent._v_pathname
    output_file.create_table(
        parent_path,
        node.name,
        description=node.description,
        createparents=True,
    )
def fill_file_without_events(output_file, copying_file):
    """Build in `output_file` an event-less template of `copying_file`.

    Every node of `copying_file` is visited. Tables that have an "event_id" column
    are re-created empty in `output_file` so that events can be appended to them
    later; every other node is copied as is (unless it already exists in the output).
    """
    with tables.open_file(copying_file, 'r') as template_f:
        for node in template_f.walk_nodes():
            holds_events = isinstance(node, tables.Table) and "event_id" in node.colnames
            if not holds_events:
                copy_node_if_not_existing(node, output_file)
                continue
            # Event-indexed table: only its structure is reproduced, not its rows.
            create_empty_table(output_file, node)
def sample_events(sampling_filepaths, n_events, condition, condition_node_path):
    """Randomly pick up to `n_events` events that satisfy `condition` across `sampling_filepaths`.

    Parameters
    ----------
    sampling_filepaths: list of Path
        Paths of the files to use for sampling.
    n_events: int
        Number of events to sample. A warning is issued if fewer events satisfy
        `condition` than `n_events`; all the matching events are then returned.
    condition: str
        PyTables query condition in string format, e.g. "colname >= 3". It is passed
        unchanged to `Table.where`. An empty string selects every row of the table.
        No fallback is implemented for conditions that `Table.where` cannot evaluate
        (reportedly the case for uint64 columns): such a condition raises.
    condition_node_path: str
        Path, inside each sampling file, of the Table node on which `condition` is
        evaluated. That table must have an "event_id" column.

    Returns
    -------
    defaultdict
        Dictionary whose keys are the indices (in `sampling_filepaths`) of the files
        containing sampled events, and whose values are the lists of sampled
        "event_id" in each file, in the order in which they were read.

    Raises
    ------
    ValueError
        If no event satisfies `condition` in any of the files.
    """
    filtered_events = []
    for filepath_idx, filepath in enumerate(sampling_filepaths):
        with tables.open_file(filepath, 'r') as sampling_f:
            table = sampling_f.get_node(condition_node_path)
            # An empty condition cannot be evaluated as a query: treat it as "keep every row".
            rows = table.where(condition) if condition else table.iterrows()
            filtered_events.extend((filepath_idx, event["event_id"]) for event in rows)

    if not filtered_events:
        raise ValueError("There aren't any event satisfying the condition !")
    if len(filtered_events) < n_events:
        warnings.warn("Only {} events satisfy the condition and will be copied to output file !".format(len(filtered_events)))
        n_events = len(filtered_events)

    sampled_indices = np.random.choice(len(filtered_events), size=n_events, replace=False)

    event_info = defaultdict(list)
    # Walk the sampled indices in ascending order: this keeps the events in reading
    # order while avoiding a linear membership scan of the array for every event.
    for idx in np.sort(sampled_indices):
        filepath_idx, event_id = filtered_events[idx]
        event_info[filepath_idx].append(event_id)
    return event_info
def copy_event(event, table):
    """Append to `table` a new row filled with the values of `event` for each column of `table`."""
    new_row = table.row
    for column_name in table.colnames:
        new_row[column_name] = event[column_name]
    # Queue the filled row for writing into the table.
    new_row.append()
def copy_sampled_events(output_file, copying_filepaths, sampled_events_info):
    """Copy the sampled events from the copying files to `output_file`.

    Parameters
    ----------
    output_file: tables.File
        Output file opened for writing. It must already contain, at the same path,
        every table of the copying files that has an "event_id" column.
    copying_filepaths: list of Path
        Paths of the files to copy the events from.
    sampled_events_info: dict
        Maps an index in `copying_filepaths` to the "event_id" values to copy
        from that file.
    """
    for filepath_idx, event_ids in sampled_events_info.items():
        # Built once per file: the membership test below runs for every row of every
        # event table, so a hash lookup replaces a scan of the whole list.
        wanted_ids = set(event_ids)
        with tables.open_file(copying_filepaths[filepath_idx], 'r') as copying_f:
            for node in copying_f.walk_nodes():
                if not (isinstance(node, tables.Table) and "event_id" in node.colnames):
                    continue
                output_file_table = output_file.get_node(node._v_pathname)
                # would be nice to use pytables condition but they don't support uint64 columns -_- ...
                for event in node:
                    if event["event_id"] in wanted_ids:
                        copy_event(event, output_file_table)
                # Write out the rows still sitting in the table's append buffer.
                output_file_table.flush()
def main(argv=None):
    """Command-line entry point: sample events on a condition and copy them to a new file.

    Events are sampled in the files of the sampling folder among those satisfying the
    condition, then the events with the same "event_id" are copied from the matching
    files of the copying folder into the output file. The two sorted file lists are
    paired by position, which is why they must have the same length.

    Parameters
    ----------
    argv: list of str, optional
        Command-line arguments to parse. When None (the default), argparse reads
        them from `sys.argv`.

    Raises
    ------
    ValueError
        If a folder does not exist, contains no .h5 file, or if the two folders
        do not contain the same number of .h5 files.
    """
    parser = argparse.ArgumentParser(
        description="Samples n events among the events in the files in the input folder that "
                    "satisfy a pytables condition.\n"
                    'eg: n=50 and condition is "gammaness>0.7" will sample 50 events among those gamaness > 0.7.\n'
                    "Copies the sampled events from the files in the COPYING folder to the output file.\n"
                    "This is usefull to sample events in lower data level files based on higher data levels conditions, "
                    "but it requires that all events in the sampling folder files are also present in the copying folder files."
    )
    parser.add_argument(
        "-i",
        "--sampling_folder",
        help="Folder containing the .h5 files to use to sample events based on condition. "
             "Any events in this files must also be in the copying folder files.",
        required=True
    )
    parser.add_argument(
        "-j",
        "--copying_folder",
        help="Folder containing the .h5 files to copy according to the sampling. "
             "The sorted files list MUST match the sampling folder.",
        required=True
    )
    parser.add_argument(
        "-o",
        "--output_file",
        help="Name of the output file.",
        required=True
    )
    parser.add_argument(
        "-n",
        "--n_events",
        help="Number of events to sample.",
        required=True,
        type=int
    )
    parser.add_argument(
        "-c",
        "--condition",
        help="Condition that the sampled events must fullfill. eg event_quality == 0 or gamaness >= 0.7",
        default="",
        type=str
    )
    parser.add_argument(
        "-p",
        "--node_path",
        help="Path to the Table node to use to apply the condition. "
             "Eg: /dl1/event/telescope/parameters/tel_001 for a condition on event_quality.",
        required=True,
        type=str
    )
    args = parser.parse_args(argv)

    path_sampling = Path(args.sampling_folder)
    path_copying = Path(args.copying_folder)
    if not path_sampling.exists():
        raise ValueError("Sampling folder does not exist.")
    if not path_copying.exists():
        raise ValueError("Copying folder does not exist.")

    # Sorting makes the i-th sampling file correspond to the i-th copying file.
    sampling_filepaths = sorted(path_sampling.glob("*.h5"))
    copying_filepaths = sorted(path_copying.glob("*.h5"))
    if not sampling_filepaths:
        raise ValueError("Sampling folder does not contain any .h5 files.")
    if not copying_filepaths:
        raise ValueError("Copying folder does not contain any .h5 files.")
    if len(copying_filepaths) != len(sampling_filepaths):
        raise ValueError("Sampling and copying folder don't have the same number of files. The files must match exactly !")

    path_output_file = Path(args.output_file)

    # To only keep good events one could filter on event_quality == GOOD
    # rejected events category are based on the field EventQuality in DL1s
    # https://cta-lapp.pages.in2p3.fr/HiPeRTA/d9/dfd/namespaceEventQuality.html
    sampled_events_info = sample_events(sampling_filepaths, args.n_events, args.condition, args.node_path)

    with tables.open_file(path_output_file, 'w') as output_file:
        # pass the first copying file to be used as a template to create the output file.
        fill_file_without_events(output_file, copying_filepaths[0])
        copy_sampled_events(output_file, copying_filepaths, sampled_events_info)


if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment