Commit 754ee6cb authored by Guillaume's avatar Guillaume
Browse files

Guillaume improvements on datasets parsing.

parent 54530335
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (base)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
<option name="myValues">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="align" />
</list>
</value>
</option>
<option name="myCustomValuesEnabled" value="true" />
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="smtplib" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (base)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/catalog.iml" filepath="$PROJECT_DIR$/.idea/catalog.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pyessv
from pyessv import TemplateParsingError
from glob import iglob
from pathlib import Path
from TimeRange import TimeRange
from vocabulary import VOCAB
def main():
    """
    Scan a project tree for NetCDF files and deserialize their DRS facets.

    For every ``*.nc`` file found under ``root/project`` the directory part
    and the filename are checked against the project vocabulary with pyessv
    template parsers, then turned into a ``facets`` dictionary (directory
    facets, filename facets, ``period_start`` and ``period_end``).

    Parsing errors are printed and the scan carries on with the next step;
    files without a time range get ``None`` as period bounds.
    """
    # TODO: From CLI args.
    root = '/Users/glipsl/Documents/work/catalog/bdd/'
    project = 'CMIP6'

    # Get DRS collections.
    dir_drs = VOCAB[project]['directory_format']
    file_drs = VOCAB[project]['filename_format']

    # Add time range collection.
    # NOTE(review): the returned collection is not used directly; presumably
    # it is registered on the loaded scope so that the pyessv.load() calls
    # below can resolve it -- confirm against pyessv.
    pyessv.create_collection(
        pyessv.load('wcrp:{}'.format(project)),
        "time_range",
        description="Time Range",
        term_regex=r'[0-9]+\-[0-9]+'
    )

    # Override version collection with "latest" pattern.
    # The alternation is grouped so that both anchors apply to both branches
    # (the ungrouped form only anchored "v<8 digits>" at the start and
    # "latest" at the end).
    pyessv.create_collection(
        pyessv.load('wcrp:{}'.format(project)),
        "version",
        description="Version",
        term_regex=r'^(v[0-9]{8}|latest)$'
    )

    # DRS keys.
    dir_keys = [pyessv.load(i).raw_name for i in dir_drs]
    file_keys = [pyessv.load(i).raw_name for i in file_drs]

    # Set path template for vocabulary check.
    # NOTE(review): the "seperator" keyword spelling is kept exactly as in
    # the original calls -- confirm against the pyessv signature.
    dir_template = os.path.join(project, '/'.join(['{}'] * len(dir_drs)))
    dir_parser = pyessv.create_template_parser(dir_template, dir_drs, strictness=1, seperator='/')

    # Set file template for vocabulary check (all filename facets).
    file_template = '_'.join(['{}'] * len(file_drs))
    file_parser = pyessv.create_template_parser(file_template, file_drs, strictness=1, seperator='_')

    # Set file template for vocabulary check of fixed-frequency files
    # (same facets without the trailing time range).
    fx_file_template = '_'.join(['{}'] * (len(file_drs) - 1))
    fx_file_parser = pyessv.create_template_parser(fx_file_template, file_drs[:-1], strictness=1, seperator='_')

    # Globbing pattern: one wildcard directory level per directory facet.
    pattern = os.path.join(root, project, '**/' * len(dir_drs), '*.nc')

    for path in iglob(pattern):

        # Get Path object and remove root directory.
        p = Path(path).relative_to(root)

        # Initialize final dictionary of DRS facets.
        facets = dict()

        # Reset the period for each file so that a failed or fixed-frequency
        # parse never reuses the values of the previous file.
        tstart, tend = None, None

        # Deserialize path.
        try:
            # Check vocabulary.
            dir_parser.parse(p.parent.as_posix())
            # Deserialize p.parent in dict excluding project.
            facets = dict(zip(dir_keys, p.parent.parts[1:]))
        # Vocabulary error, or key error due to a wrong number of facets in
        # comparison with the path parts.
        except (TemplateParsingError, KeyError) as e:
            print(e)

        # Deserialize filename.
        try:
            # Check vocabulary.
            file_parser.parse(p.stem)
        except TemplateParsingError:
            # Try checking vocabulary with the fixed-frequency template.
            # On success there is no time range: the period stays None.
            try:
                fx_file_parser.parse(p.stem)
            except (TemplateParsingError, KeyError) as e:
                print(e)
        # Key error handling. Due to wrong number of facets in comparison
        # with the filename parts.
        except KeyError as e:
            print(e)
        else:
            # Deserialize time range in date format. A malformed time range
            # is reported like the other parsing errors instead of aborting
            # the whole scan.
            try:
                timerange = TimeRange(p.stem.split('_')[-1])
                tstart, tend = timerange.start, timerange.end
            except ValueError as e:
                print(e)

        # Deserialize filename and update dict. The stem is used (no ".nc")
        # and zip() stops at the shortest sequence, so the time range part is
        # ignored when present and the last facet is kept when it is absent.
        facets.update(dict(zip(file_keys[:-1], p.stem.split('_'))))
        facets['period_start'] = tstart
        facets['period_end'] = tend

        # Write in CSV.
        # TODO


if __name__ == "__main__":
    main()
\ No newline at end of file
......@@ -60,6 +60,8 @@ class ESMCatFabric():
def SaveJSON(self):
--> Template Jinja2
dicToSave={}
dicToSave["esmcat_version"]="0.1.0"
dicToSave["id"]=self.name
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
class TimeRange(object):
    """
    Start and end of a filename time range, converted to ISO-like strings.

    ``timerange`` is a dash-separated string of digits such as
    ``"185001-201412"``. The first two fields are converted with
    :meth:`iso_format` and exposed as ``self.start`` and ``self.end``.
    Any further dash-separated field (for instance a ``-clim`` suffix) is
    ignored instead of breaking the unpacking.

    Raises ``ValueError`` if ``timerange`` does not contain at least two
    dash-separated fields.
    """

    def __init__(self, timerange):
        # Split time range into start and end digits.
        fields = timerange.split('-')
        if len(fields) < 2:
            raise ValueError(
                "Invalid time range '{}': expected '<start>-<end>'".format(timerange)
            )
        self.start, self.end = map(self.iso_format, fields[:2])

    @staticmethod
    def iso_format(timestamp):
        """
        Converts string digits into iso format: %Y-%m-%dT%H:%M:%S.

        For yearly and monthly truncated timestamps the dates from filename
        are completed to reach 14 digits. Consequently, yearly dates start
        at January 1st and monthly dates start at the first day of the month.
        Other lengths are right-padded with the 0 digit.
        """
        if len(timestamp) == 4:
            # Start year at January 1st.
            timestamp = timestamp + '0101'
        elif len(timestamp) == 6:
            # Start month at first day.
            timestamp = timestamp + '01'
        # Fill the missing time digits with zeros.
        timestamp = timestamp.ljust(14, '0')
        return '{years}-{months}-{days}T{hours}:{minutes}:{seconds}'.format(
            years=timestamp[0:4],
            months=timestamp[4:6],
            days=timestamp[6:8],
            hours=timestamp[8:10],
            minutes=timestamp[10:12],
            seconds=timestamp[12:14],
        )
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 5 11:40:05 2021
@author: ltrousse
"""
from multiprocessing import Pool
import time  # Although not used in the function run by the workers, this import is required: otherwise the callback is never called (something to do with timeout handling).
import os
import os,re
import pyessv
from glob import iglob
from FileCatcher import FileCatcher
from pCMIP6 import pCMIP6
from ESMFabric import ESMCatFabric
from pathlib import Path
from TimeRange import TimeRange
from vocabulary import VOCAB
def get_dir_format(project):
    """
    Read the directory structure of a project from the PYESSV archive.

    Returns a ``(names, collections)`` pair: the list of facet names found
    in the ``directory_format`` string (without its first element) and the
    set of pyessv collections whose ``raw_name`` equals one of those names.
    """
    # Load project vocabulary.
    vocab = pyessv.load('wcrp:{}'.format(project))

    # Rename "activity_drs" to "activity_id" in the directory structure.
    structure = vocab.data['directory_format']
    structure = structure.replace('activity_drs', 'activity_id')

    # One chunk per facet; the first chunk ("root") is left out.
    chunks = structure.split('s/')[1:]
    names = [re.search(r'\w+', chunk).group() for chunk in chunks]

    # Keep every collection of the vocabulary matching one of the names.
    collections = {
        candidate
        for name in names
        for candidate in vocab.collections
        if name == candidate.raw_name
    }

    return names, collections
def get_file_format(project):
    """
    Load project filename format from PYESSV Archive.

    Returns a ``(names, collections)`` pair: the list of facet names found
    in the ``filename_format`` string, in order of appearance, and the set
    of pyessv collections whose ``raw_name`` equals one of those names.
    """
    # Load project vocabulary.
    vocabulary = pyessv.load('wcrp:{}'.format(project))

    # Extract names of collections. Only what stands between "(" and ")s" is
    # captured, so separators (e.g. "_", which also matches \w) and any text
    # after the last placeholder are never mistaken for a facet name.
    names = re.findall(r'\((\w+)\)s', vocabulary.data['filename_format'])

    # Get set of pyessv collections.
    collections = {
        collection
        for name in names
        for collection in vocabulary.collections
        if name == collection.raw_name
    }

    return names, collections
#rootCMIP6CatPath = "/home/ltroussellier/Catalogue/TestScriptCat/CMIP6_cat.yaml" # ici il faudra mettre "/thredds/ipsl/catalog/CMIP6_cat.yaml"
......@@ -40,6 +83,9 @@ def GetListeAllDirPathToCat(path,granularite):
return listeRes
def createCMIP6ESMCat(path, where=None):
# le path est le path de départ .. du type /bdd/CMIP6/PMIP/IPSL/IPSL-C6MR-RS/lig127k
# le where ... c'est où on veut sauvergarder le catalogue mais au vu de ce qu'on a dit ... ça sera dans path/.catalog/...ici... Du coup est ce que le where est utile ?
......@@ -49,6 +95,7 @@ def createCMIP6ESMCat(path, where=None):
# on crée le cat ESM ..
#print("ESM : "+path)
fC = FileCatcher(path)
#là on a tous les sous fichiers
mHeader = list(pCMIP6(fC.lFile[0]).p.keys()) #grace à l'objet pCMIP6 ... on décode le path .. pour avoir chaque facet et sa valeur .. les facets feront les noms des colonnes du csv
mData=[]
......@@ -79,7 +126,17 @@ def JobDone(name): # quand le job est fini .. on met a jour le root cat avec le
def main():
#TODO: inclure ici un parser d'argument CLI avec pour paramètre:
# le projet
# le root path à scanner
# le path de dest du catalog.
project = 'CMIP6'
root = '/bdd/CMIP6'
out_cat = '/Users/glipsl/Documents/work/catalog'
# Load project directory structure.
drs = get_drs(project)
CMIP6Path = "/bdd/CMIP6" # ici ? je sais pas, faut voir sur l'idris/tgcc où est le rep CMIP
Granularite = "institution_id"
nbInPool = 2
......
......@@ -10,11 +10,15 @@ from collections import OrderedDict
import itertools
class pCMIP6():
def __init__(self,completefilepath): #complete path from /bdd...
def __init__(self,completefilepath):
pattern = os.path.join(rpath, '**/' * len(drs), '*.nc')
#complete path from /bdd...
#exemple : /bdd/CMIP6/PMIP/NCAR/CESM2/lig127k/r1i1p1f1/Amon/co2/gn/latest/co2_Amon_CESM2_lig127k_r1i1p1f1_gn_010101-015012.nc
lInfos = completefilepath.split("/")
dst_DicPath= OrderedDict()
dst_DicPath["path"]=completefilepath
self.p = dst_DicPath.copy()
......
pyessv
netcdftime
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# DRS description per project: for each project name, the ordered pyessv
# collection identifiers that make up the directory structure and the
# filename.
VOCAB = {
    # CMIP6
    'CMIP6': {
        'directory_format': (
            'wcrp:cmip6:activity_id',
            'wcrp:cmip6:institution_id',
            'wcrp:cmip6:source_id',
            'wcrp:cmip6:experiment_id',
            'wcrp:cmip6:member_id',
            'wcrp:cmip6:table_id',
            'wcrp:cmip6:variable_id',
            'wcrp:cmip6:grid_label',
            'wcrp:cmip6:version',
        ),
        'filename_format': (
            'wcrp:cmip6:variable_id',
            'wcrp:cmip6:table_id',
            'wcrp:cmip6:source_id',
            'wcrp:cmip6:experiment_id',
            'wcrp:cmip6:member_id',
            'wcrp:cmip6:grid_label',
            'wcrp:cmip6:time_range',
        ),
    },
    # CMIP5
    'CMIP5': {
        'directory_format': (
            'wcrp:cmip5:product',
            'wcrp:cmip5:institute',
            'wcrp:cmip5:model',
            'wcrp:cmip5:experiment',
            'wcrp:cmip5:time_frequency',
            'wcrp:cmip5:realm',
            'wcrp:cmip5:cmor_table',
            'wcrp:cmip5:ensemble',
            'wcrp:cmip5:version',
            'wcrp:cmip5:variable',
        ),
        'filename_format': (
            'wcrp:cmip5:variable',
            'wcrp:cmip5:cmor_table',
            'wcrp:cmip5:model',
            'wcrp:cmip5:experiment',
            'wcrp:cmip5:ensemble',
            'wcrp:cmip5:time_range',
        ),
    },
    # CORDEX
    'CORDEX': {
        'directory_format': (
            'wcrp:cordex:product',
            'wcrp:cordex:domain',
            'wcrp:cordex:institute',
            'wcrp:cordex:driving_model',
            'wcrp:cordex:experiment',
            'wcrp:cordex:ensemble',
            'wcrp:cordex:rcm_model',
            'wcrp:cordex:rcm_version',
            'wcrp:cordex:time_frequency',
            'wcrp:cordex:variable',
            'wcrp:cordex:version',
        ),
        'filename_format': (
            'wcrp:cordex:variable',
            'wcrp:cordex:domain',
            'wcrp:cordex:driving_model',
            'wcrp:cordex:experiment',
            'wcrp:cordex:ensemble',
            'wcrp:cordex:rcm_model',
            'wcrp:cordex:rcm_version',
            'wcrp:cordex:time_frequency',
            'wcrp:cordex:time_range',
        ),
    },
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment