Commit ca4ddf4a authored by TROUSSELLIER Laurent's avatar TROUSSELLIER Laurent
Browse files

DRSParser en global

parents 225e0f75 9d2083fe
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (base)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
<option name="myValues">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="align" />
</list>
</value>
</option>
<option name="myCustomValuesEnabled" value="true" />
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="smtplib" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (base)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/catalog.iml" filepath="$PROJECT_DIR$/.idea/catalog.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pyessv
from pyessv import TemplateParsingError
from glob import iglob
from pathlib import Path
from TimeRange import TimeRange
from vocabulary import VOCAB
def main():
    """Walk a CMIP6 DRS tree and print the DRS facets of every netCDF file.

    Directory and filename templates are built from the project vocabulary
    (pyessv) and each path is validated against them before being
    deserialized into a facet dictionary.
    """
    # TODO: take these from CLI arguments.
    # root = '/Users/glipsl/Documents/work/catalog/bdd/'
    root = '/home/ltrousse/Bureau/ModCatAdmin/catalog/bdd'
    project = 'CMIP6'
    # Get DRS collections (directory and filename facet lists).
    dir_drs = VOCAB[project]['directory_format']
    file_drs = VOCAB[project]['filename_format']
    # Add a time range collection so the trailing date facet can be validated.
    pyessv.create_collection(
        pyessv.load('wcrp:{}'.format(project)),
        "time_range",
        description="Time Range",
        term_regex=r'[0-9]+\-[0-9]+'
    )
    # Override the version collection with a pattern that also accepts "latest".
    pyessv.create_collection(
        pyessv.load('wcrp:{}'.format(project)),
        "version",
        description="Version",
        term_regex=r'^v[0-9]{8}|latest$'
    )
    # DRS keys (facet names) in template order.
    dir_keys = [pyessv.load(i).raw_name for i in dir_drs]
    file_keys = [pyessv.load(i).raw_name for i in file_drs]
    # Set path template for vocabulary check.
    # NOTE: "seperator" is the (misspelled) keyword expected by pyessv.
    dir_template = os.path.join(project, '/'.join(['{}'] * len(dir_drs)))
    dir_parser = pyessv.create_template_parser(dir_template, dir_drs, strictness=1, seperator='/')
    # Set filename template for vocabulary check (time-varying files).
    file_template = '_'.join(['{}'] * len(file_drs))
    file_parser = pyessv.create_template_parser(file_template, file_drs, strictness=1, seperator='_')
    # Set filename template for fixed-frequency files (no time range facet).
    file_template = '_'.join(['{}'] * (len(file_drs) - 1))
    fx_file_parser = pyessv.create_template_parser(file_template, file_drs[:-1], strictness=1, seperator='_')
    # Globbing pattern matching every netCDF file at the DRS leaf level.
    pattern = os.path.join(root, project, '**/' * len(dir_drs), '*.nc')
    for path in iglob(pattern):
        # Get Path object.
        path = Path(path)
        # Remove root directory.
        p = path.relative_to(root)
        # Final dictionary of DRS facets.
        facets = dict()
        # Deserialize path.
        try:
            # Check vocabulary.
            dir_parser.parse(p.parent.as_posix())
            # Deserialize p.parent in dict, excluding the project part.
            facets = dict(zip(dir_keys, p.parent.parts[1:]))
        except TemplateParsingError as e:
            print(e)
        except KeyError as e:
            # Wrong number of facets in comparison with the path parts.
            print(e)
        # Time range boundaries; stay None for fixed files or on parse failure.
        # (The original `tstart, tend = None` raised TypeError on unpacking and
        # left both names unbound after a filename parse failure.)
        tstart, tend = None, None
        # Deserialize filename.
        try:
            try:
                # Check vocabulary.
                file_parser.parse(p.stem)
                # Deserialize time range in date format.
                timerange = TimeRange(p.stem.split('_')[-1])
                tstart, tend = timerange.start, timerange.end
            except TemplateParsingError:
                # Retry with the fixed-frequency template (no time range).
                try:
                    fx_file_parser.parse(p.stem)
                except TemplateParsingError as e:
                    print(e)
                except KeyError as e:
                    print(e)
            except KeyError as e:
                # Wrong number of facets in comparison with the filename parts.
                print(e)
            # Deserialize p.name and update dict.
            facets.update(dict(zip(file_keys[:-1], p.name.split('_')[:-1])))
            facets['period_start'] = tstart
            facets['period_end'] = tend
            print(facets)
            # TODO: write the record to CSV.
        except Exception as e:
            # Best effort: report the failure and keep scanning remaining files.
            print(e)


if __name__ == "__main__":
    main()
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
:platform: Unix
:synopsis: DRS parser used in this module.
"""
import os
import pyessv
from pyessv import TemplateParsingError
from TimeRange import TimeRange
from constants import VOCAB
class DRSParser(object):
    """
    Class handling DRS parsing of the path and filename depending on the project.

    Builds pyessv template parsers for the project's directory structure and
    filename format, then deserializes paths/filenames into facet dictionaries.
    """

    def __init__(self, project):
        # Add a time range collection so the trailing date facet can be validated.
        pyessv.create_collection(
            pyessv.load('wcrp:{}'.format(project)),
            "time_range",
            description="Time Range",
            term_regex=r'[0-9]+\-[0-9]+'
        )
        # Get DRS collections (directory and filename facet lists).
        self.dir_drs = VOCAB[project]['directory_format']
        self.file_drs = VOCAB[project]['filename_format']
        # DRS keys (facet names) in template order.
        self.dir_keys = [pyessv.load(i).raw_name for i in self.dir_drs]
        self.file_keys = [pyessv.load(i).raw_name for i in self.file_drs]
        # Set path template for vocabulary check.
        # NOTE: "seperator" is the (misspelled) keyword expected by pyessv.
        dir_template = os.path.join(project, '/'.join(['{}'] * len(self.dir_drs)))
        self.dir_parser = pyessv.create_template_parser(dir_template, self.dir_drs, strictness=1, seperator='/')
        # Set filename template for vocabulary check (time-varying files).
        file_template = '_'.join(['{}'] * len(self.file_drs))
        self.file_parser = pyessv.create_template_parser(file_template, self.file_drs, strictness=1, seperator='_')
        # Set filename template for fixed-frequency files (no time range facet).
        file_template = '_'.join(['{}'] * (len(self.file_drs) - 1))
        self.fx_file_parser = pyessv.create_template_parser(file_template, self.file_drs[:-1], strictness=1,
                                                            seperator='_')

    def get_facets_from_path(self, path):
        """
        Deserialize pathlib.Path object against a DRS.

        Returns a dict mapping facet names to path parts (project excluded),
        or None (implicitly) when vocabulary validation fails — the error is
        printed, not raised.
        """
        # Check vocabulary.
        try:
            self.dir_parser.parse(path.parent.as_posix())
            # Deserialize p.parent in dict excluding project.
            facets = dict(zip(self.dir_keys, path.parent.parts[1:]))
            return facets
        # Vocabulary error handling.
        except TemplateParsingError as e:
            print(e)
        # Key error handling. Due to wrong number of facets in comparison with the path parts.
        except KeyError as e:
            print(e)

    def get_facets_from_filename(self, basename):
        """
        Deserialize a filename string against a DRS.

        Returns a dict of filename facets plus 'period_start'/'period_end';
        those two are ISO timestamps, "" for fixed-frequency files, or None
        when validation failed entirely.
        """
        # Initialize tstart & tend.
        tstart, tend = None, None
        # Check vocabulary.
        try:
            self.file_parser.parse(basename)
            # Deserialize time range in date format.
            timerange = TimeRange(basename.split('_')[-1])
            tstart, tend = timerange.start, timerange.end
        # Vocabulary error handling.
        except TemplateParsingError:
            # Try checking vocabulary with fixed variable template.
            try:
                self.fx_file_parser.parse(basename)
                # No timerange.
                tstart, tend = "", ""
            # Vocabulary error handling.
            except TemplateParsingError as e:
                print(e)
            # Key error handling. Due to wrong number of facets in comparison with the filename parts.
            except KeyError as e:
                print(e)
        # Key error handling. Due to wrong number of facets in comparison with the filename parts.
        except KeyError as e:
            print(e)
        # Deserialize filename and add time range facets.
        facets = dict(zip(self.file_keys[:-1], basename.split('_')[:-1]))
        facets['period_start'] = tstart
        facets['period_end'] = tend
        return facets
{
"esmcat_version":"{{ esmcat_version }}",
"id":"{{ id }}",
"description": "{{ description }}",
"catalog_file": "{{ catalog_file }}",
"attributes": [{% for attr in attributes %}
{
"column_name": "{{ attr.column_name }}",
"vocabulary": "{{ attr.vocabulary }}"
}{% if not loop.last %},{% endif %}
{% endfor %}],
"assets": {
"column_name": "{{ assets.column_name }}",
"format": "{{ assets.format }}"
}
}
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 29 17:05:56 2021
@author: ltrousse
"""
import json # pour écrire le json du catalogue
import csv # pour écrire le csv du catalogue
class ESMCatFabric():
    """Build and persist an ESM catalog as a pair of files: a CSV holding the
    data rows and a JSON descriptor pointing at it (intake-esm layout)."""

    def __init__(self, name, header, data, path=None, description=None):
        # Catalog column names.
        self.header = header
        # Catalog rows: list of lists, one per asset.
        self.data = data
        # Catalog identifier, also used as the file basename.
        self.name = name
        # Directory where the CSV/JSON pair is written.
        self.path = path
        self.description = description

    @classmethod
    def from_esmCat(cls, catalog):
        """Alternate constructor from an already-loaded intake-esm catalog."""
        return cls(catalog.esmcol_data["id"],
                   list(catalog.df.columns),
                   catalog.df.values.tolist(),
                   catalog.esmcol_path[:catalog.esmcol_path.rfind("/")],
                   catalog.esmcol_data["description"])

    def Add(self, data):
        """Append a row and rewrite both catalog files."""
        self.data.append(data)
        self.Update()

    def Remove(self, data):
        """Remove a row and rewrite both catalog files."""
        self.data.remove(data)
        self.Update()

    def Update(self):
        """Rewrite the CSV and JSON files from the in-memory state."""
        self.SaveCSV()
        self.SaveJSON()

    def CreateCat(self, pathToSave, description):
        """Create the catalog files under *pathToSave*.

        Almost the same as catalog.serialize(name=..., catalog_type="file"),
        except the CSV is written uncompressed (esm_datastore gzips it, which
        makes the file impossible to inspect by eye).
        """
        self.path = pathToSave
        self.description = description
        self.SaveCSV()
        self.SaveJSON()

    def SaveCSV(self):
        """Write the header row then the data rows to <path>/<name>.csv."""
        # newline='' is required by the csv module; the file is also closed
        # reliably now (the original used open/close without try/finally).
        with open(self.path + "/" + self.name + ".csv", 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.header)
            writer.writerows(self.data)

    def SaveJSON(self):
        """Write the ESM JSON descriptor to <path>/<name>.json."""
        # TODO: render the Jinja2 template (esm JSON template in this repo)
        # instead of building the dict by hand.
        dicToSave = {}
        dicToSave["esmcat_version"] = "0.1.0"
        dicToSave["id"] = self.name
        dicToSave["description"] = self.description
        dicToSave["catalog_file"] = self.name + ".csv"
        # Attribute list is deliberately left empty for now.
        dicToSave["attributes"] = []
        # NOTE(review): "column_name" is set to the whole header list here,
        # while the ESM catalog spec suggests a single column name — confirm
        # this is intended before relying on it.
        dicAsset = {}
        dicAsset["column_name"] = self.header
        dicAsset["format"] = "netcdf"
        dicToSave["assets"] = dicAsset
        with open(self.path + "/" + self.name + ".json", 'w') as fp:
            json.dump(dicToSave, fp, indent=4)

    def __repr__(self):
        res = ""
        res += str(self.header) + "\n"
        res += str(self.name) + "\n"
        res += str(self.path) + "\n"
        res += str(self.description) + "\n"
        res += str(self.data) + "\n"
        return res
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 29 17:11:56 2021
@author: ltrousse
"""
import os
class FileCatcher():
    """Recursively collect every file path found under a root directory."""

    def __init__(self, path):
        # Flat list of file paths discovered under *path*.
        self.lFile = []
        self.getAll(path)

    def getAll(self, path):
        """Append the files of *path* to self.lFile, descending into subdirectories."""
        for entry in os.scandir(path):
            if entry.is_file():
                self.lFile.append(entry.path)
            elif entry.is_dir():
                self.getAll(entry.path)
Test
TODO :
un script multithreaded pour parse les CMIP6, CMIP5, CORDEX .. recup le path de chaque fichier .. le split .. creer un fihcier json+csv .. (granularité : Experiment).. + un cat root
l'idée c'est de faire le catalogue en auto ...
la fin de l'histoire :
Un rootcat là : /thredds/ipsl/catalog/CMIP6_cat.yaml (éventuellement le root : ModCat.yaml qui pointera vers CMIP6_cat;CMIP5_cat : CORDEX_cat qu'on pourrait placer au même endroit ^^ )
Et ensuite 2 façon de faire .. à tester :
1/ un cat ESM (json+csv) à "l'experiment_id" et c'est tout (le root pointe vers tous les cat ESM placés là : /bdd/CMIP6/PIMP/IPSL/IPSL6Rbla/lgm/.catalog/cat.json
2/ des yaml qui pointe dans les rep suivant ... jusqu'au cat ESM ( on les place où ceux là ? )
Du coup But de l'histoire :
En entrée
on a /bdd .. qui a toutes les infos nécessaires pour faire ces cat
En sortie
Les cat (une fois choisi la tech choisie)
Bonus : Le tout avec possibilité de multithread
=> du coup l'entrée ... juste un subset de bdd/ pour qu'ensuite on puisse le lancer n fois sur plusieurs subset en meme temps
Prob reperés :
Comment Split le path de chaque projet pour trouver les colonnes de l'ESM ?
------------------------------------------------------------------
l'idée :
1/
- créer le root catalog ici : /modf/catalog/FakeDir
- créer les cat IPSL ici : /modf/catalog/FakeDir/TGCC/....( + l'arborescence ex : IPSL-CM6A-LR/lgm/.catalog/bla.json)
- créer les autres Cat issus des ESGF ici : /modf/catalog/FakeDir/IDRIS/...(+l'arborescence )
2/
- check que tout va bien dans ces catalogs => sinon retour 1/
3/
- Faire le script de copy des fichiers au bon endroit ... sans le lancer ..
Pour faire le 1/ ...
Need : choisir la tech .. soit lien direct sur les ESM .. soit lien via 4 ou 5 yalm qui se ballade l'arborescence
Pour faire le 2/ ...
3 uses cases ...
- le gars qui veut une expermient_id particuliere ... comment il fait .
- le gars qui veut la même experiment_id mais pour plusieurs institution_id
- le gars qui veut pour une variable donnée .. plusieurs experiment_id
-----------------------------------------------------------------
- Quel chemin mettre dans les ESM => celui vu par ciclad
# Solution proposée
- 3 scripts différents
- 1 sur l'idris
- 1 sur le tgcc (le même qu'au dessus)
- 1 sur ciclad
-----------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
class TimeRange(object):
    """Start/end boundaries of a DRS filename time range (e.g. "195001-201412"),
    exposed as ISO 8601 timestamp strings via .start and .end."""

    def __init__(self, timerange):
        # Split time range into start and end digit strings and convert both.
        self.start, self.end = map(self.iso_format, timerange.split('-'))

    @staticmethod
    def iso_format(timestamp):
        """
        Convert a digit string into ISO 8601 format: %Y-%m-%dT%H:%M:%S.

        (The previous docstring claimed '%Y%m%dT%H:%M:%s', which is not what
        the code produces.) Yearly and monthly truncated timestamps are
        right-padded with '0' digits up to 14 digits; consequently, yearly
        dates start at January 1st and monthly dates start at the first day
        of the month.
        """
        if len(timestamp) == 4:
            # Year only: start at January 1st.
            timestamp = (timestamp + '0101').ljust(14, '0')
        elif len(timestamp) == 6:
            # Year and month: start at the first day of the month.
            timestamp = (timestamp + '01').ljust(14, '0')
        else:
            timestamp = timestamp.ljust(14, '0')
        # Slice the 14-digit string into date and time components.
        return '{}-{}-{}T{}:{}:{}'.format(timestamp[0:4],
                                          timestamp[4:6],
                                          timestamp[6:8],
                                          timestamp[8:10],
                                          timestamp[10:12],
                                          timestamp[12:14])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from multiprocessing import Pool
import TimeRange # bien que non utilisé dans la fonction utilisé par les workers .. il le faut .. sinon le callcak n'est pas appélé !!! (histoire de gestion des timeout)
import os,re
import pyessv
from glob import iglob
from FileCatcher import FileCatcher
from pCMIP6 import pCMIP6
from ESMFabric import ESMCatFabric
from pathlib import Path
from TimeRange import TimeRange
from vocabulary import VOCAB
def get_dir_format(project):
    """
    Load project directory structure from PYESSV Archive.

    Returns a tuple (names, collections): the ordered facet names extracted
    from the directory format string (excluding "root"), and the set of
    pyessv collections whose raw_name matches one of those names.
    """
    # Load project vocabulary.
    vocabulary = pyessv.load('wcrp:{}'.format(project))
    # Correct the directory structure if needed.
    structure = vocabulary.data['directory_format'].replace('activity_drs', 'activity_id')
    # Extract names of collections (excluding "root").
    names = [re.search(r'\w+', token).group() for token in structure.split('s/')[1:]]
    # Get set of pyessv collections matching those names.
    wanted = set(names)
    collections = {c for c in vocabulary.collections if c.raw_name in wanted}
    return names, collections
def get_file_format(project):
"""
Load project filename format from PYESSV Archive.
"""
# Load project vocabulary.
vocabulary = pyessv.load('wcrp:{}'.format(project))
# Extract names of collections.
names = [re.search(r'\w+', part).group() for part in vocabulary.data['filename_format'].split(')s')]