Commit 1d8373e6 authored by POLCHER Jan


First version of the station data management software, together with a sample file for the station metadata. This is the first step towards positioning the stations onto the HTUs generated by RoutingPreProc.py.
parent 9c004dde
#
import numpy as np
import datetime
import sys
from netCDF4 import Dataset
from netCDF4 import stringtochar, chartostring, stringtoarr
#
import Station as ST
import ConversionCalculations as CC
#
class AllStations :
    def __init__(self, Source="") :
        self.stations = []
        self.nbstations = 0
        self.hist = None
        self.nbmonth = 0
        self.source = Source
        self.mergersource = None
        return
#
    def add(self, stn) :
        #
        # Only add the station if it contains data (Length > 0)
        #
        if stn.Length > 0 :
            self.stations.append(stn)
            # Manage the source of merged data by building the union
            if self.mergersource is not None :
                self.mergersource = list(set(self.mergersource).union(stn.mergersource))
            else :
                self.mergersource = stn.mergersource
            #
            self.nbstations += 1
        return
#
# Get some time information from the list of stations
#
    def FirstDate(self) :
        return min([stn.Time[0] for stn in self.stations])
    def LastDate(self) :
        return max([stn.Time[-1] for stn in self.stations])
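#
# Example (hypothetical values), assuming the station Time axes hold datetime
# objects :
#   allstn.FirstDate()  # -> e.g. datetime.datetime(1901, 1, 15, 0, 0)
#   allstn.LastDate()   # -> e.g. datetime.datetime(2019, 12, 15, 0, 0)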
#
# Function to dump all stations into a NetCDF file
#
    def dumpnetcdf(self, dates, filename, hist) :
        #
        self.hist = hist
        #
        FillValue = 1.0e+20
        self.nbmonth = len(dates)
        # Maximum number of alternative sources for the original data
        if self.mergersource is not None :
            self.maxalt = len(self.mergersource)-1
        else :
            self.maxalt = 0
        #
        # Compute total number of alternative time series
        #
        altloc = np.empty((self.maxalt,self.nbstations), int)
        altloc[:,:] = -1
        self.altsz = 0
        for i,stn in enumerate(self.stations) :
            if stn.mergersource is not None :
                # The first entry of mergersource is the station's own source
                for src in stn.mergersource[1:] :
                    ia = self.mergersource.index(src)
                    altloc[ia-1,i] = self.altsz
                    self.altsz += 1
#
# Transfer data to matrices along the new time axis.
#
        original = np.empty((self.nbmonth,self.nbstations), float)
        original[:,:] = np.nan
        alth = np.empty((self.nbmonth,self.altsz), float)
        alth[:,:] = np.nan
        for i,stn in enumerate(self.stations) :
            if isinstance(stn.Original,np.ndarray) :
                original[:,i] = CC.tonewtime(stn.Time, stn.Original, dates)
            else :
                original[:,i] = CC.tonewtime(stn.Time, stn.Original[0], dates)
                icnt = 1
                for ia in range(len(stn.mergersource)-1) :
                    if altloc[ia,i] >= 0 :
                        alth[:,altloc[ia,i]] = CC.tonewtime(stn.Time, stn.Original[icnt], dates)
                        icnt += 1
#
        flags = np.empty((self.nbmonth,self.nbstations), float)
        flags[:,:] = np.nan
        for i,stn in enumerate(self.stations) :
            if stn.Flag is not None :
                flags[:,i] = CC.tonewtime(stn.Time, stn.Flag, dates)
#
        merged2d = np.empty((self.nbmonth,self.nbstations), float)
        merged2d[:,:] = np.nan
        for i,stn in enumerate(self.stations) :
            if stn.Merged is not None :
                merged2d[:,i] = CC.tonewtime(stn.Time, stn.Merged, dates)
#
#
# Open NetCDF file to be filled
#
        out = Dataset(filename, 'w')
        out.history = hist
        out.metadata = "Missing metadata are replaced by XXX or -999"
        out.Source = self.source
        if self.mergersource is not None :
            for i,src in enumerate(self.mergersource) :
                att = "MergeSource"+str(i+1)
                setattr(out, att, src)
        out.ConversionDate = datetime.date.today().isoformat()
#
# Create dimensions
#
        strlen = 60
        out.createDimension('time', self.nbmonth)
        out.createDimension('stations', self.nbstations)
        if self.maxalt > 0 :
            out.createDimension('maxalt', self.maxalt)
            out.createDimension('z', self.altsz)
        out.createDimension('str', strlen)
        #
        time = out.createVariable('time', 'd', ('time',), fill_value=FillValue)
        time.units = "seconds since "+self.FirstDate().isoformat().replace("T"," ")
        time.long_name = "Time axis"
        for i in range(self.nbmonth):
            time[i] = ((dates[i]-self.FirstDate()).days)*86400
#
# Set netCDF variables for station information
#
        nb = out.createVariable('number', 'i', ('stations',))
        nb.long_name = "GRDC Station number"
        nex = out.createVariable('next', 'i', ('stations',))
        nex.long_name = "Downstream GRDC Station"
        wmoreg = out.createVariable('WMOreg', 'i', ('stations',))
        wmoreg.long_name = "WMO region"
        wmosubreg = out.createVariable('WMOsubreg', 'i', ('stations',))
        wmosubreg.long_name = "WMO sub-region"
        stname = out.createVariable('name', 'c', ('stations','str',))
        stname.long_name = "Station Name"
        riv = out.createVariable('river', 'c', ('stations','str',))
        riv.long_name = "River Name"
        ctry = out.createVariable('country', 'c', ('stations','str',))
        ctry.long_name = "Country of station"
        filedate = out.createVariable('FileDate', 'c', ('stations','str',))
        filedate.long_name = "File generation date"
        lastupdate = out.createVariable('LastUpdate', 'c', ('stations','str',))
        lastupdate.long_name = "Last update of data"
        lon = out.createVariable('lon', 'f', ('stations',))
        lon.units = "degrees_east"
        lon.long_name = "Longitude"
        lat = out.createVariable('lat', 'f', ('stations',))
        lat.units = "degrees_north"
        lat.long_name = "Latitude"
        area = out.createVariable('area', 'f', ('stations',))
        area.units = "km^2"
        area.long_name = "Catchment Area"
        altitude = out.createVariable('altitude', 'f', ('stations',))
        altitude.units = "m"
        altitude.long_name = "Altitude of Station"
#
# Add the data to the variables
#
        for i,stn in enumerate(self.stations) :
            nb[i] = stn.Number
            wmoreg[i] = stn.WMOregion
            wmosubreg[i] = stn.WMOsubregion
            nex[i] = stn.NextStation
            if isinstance(stn.Name, str) :
                stname[i,:] = stringtochar(np.array(stn.Name.replace("ö","o"), dtype="S60"))
            else :
                stname[i,:] = stringtoarr(stn.Name.tolist(), strlen)
            if isinstance(stn.River, str) :
                riv[i,:] = stringtochar(np.array(stn.River, dtype="S60"))
            else :
                riv[i,:] = stringtoarr(stn.River.tolist(), strlen)
            if isinstance(stn.Country, str) :
                ctry[i,:] = stringtochar(np.array(stn.Country, dtype="S60"))
            else :
                ctry[i,:] = stringtoarr(stn.Country.tolist(), strlen)
            if isinstance(stn.FileDate, str) :
                filedate[i,:] = stringtochar(np.array(stn.FileDate, dtype="S60"))
            else :
                filedate[i,:] = stringtoarr(stn.FileDate.tolist(), strlen)
            if isinstance(stn.LastUpdate, str) :
                lastupdate[i,:] = stringtochar(np.array(stn.LastUpdate, dtype="S60"))
            else :
                lastupdate[i,:] = stringtoarr(stn.LastUpdate.tolist(), strlen)
            lon[i] = stn.Location[0]
            lat[i] = stn.Location[1]
            area[i] = stn.Area
            altitude[i] = stn.Altitude
#
#
# Original Data
#
        original[np.isnan(original)] = FillValue
        hydro = out.createVariable('hydrographs', 'f', ('time','stations',), fill_value=FillValue)
        hydro.units = "m^3/s"
        hydro.long_name = "Original Hydrographs"
        hydro.coordinates = "time stations"
        hydro[:,:] = original[:,:]
#
# Hydrographs from other sources
#
        if self.maxalt > 0 :
            altindex = out.createVariable('alternative_index', 'i', ('maxalt','stations',))
            altindex.long_name = "Index of alternative hydrographs in the alt_hydrograph field"
            altindex[:] = altloc
            althydro = out.createVariable('alt_hydrograph', 'f', ('time','z',), fill_value=FillValue)
            althydro.units = "m^3/s"
            althydro.long_name = "Hydrographs from other sources used to build the merged product"
            althydro.coordinates = "time stations"
            alth[np.isnan(alth)] = FillValue
            althydro[:,:] = alth[:,:]
#
# Flags
#
        if np.count_nonzero(np.isnan(flags)) < flags.size :
            flag = out.createVariable('flags', 'f', ('time','stations',), fill_value=FillValue)
            flag.units = "-"
            flag.long_name = "GRDC Flag : Percentage of valid values used for calculation from daily data"
            flag.coding1 = "80 < flag <= 100 : merge = corr (Average of daily values used)"
            flag.coding2 = "0 < flag <= 80 : merge = hydro (Monthly values used)"
            flag.coordinates = "time stations"
            flags[np.isnan(flags)] = FillValue
            flag[:,:] = flags[:,:]
#
# Merged Data
#
        if np.count_nonzero(np.isnan(merged2d)) < merged2d.size :
            merge = out.createVariable('mergedhydro', 'f', ('time','stations',), fill_value=FillValue)
            merge.units = "m^3/s"
            merge.long_name = "Hydrographs - Best estimate"
            merge.coordinates = "time stations"
            merged2d[np.isnan(merged2d)] = FillValue
            merge[:,:] = merged2d[:,:]
        #
        out.close()
        #
        return
#
#
#
    def fetchnetcdf(self, filename) :
        ncin = Dataset(filename, 'r')
        #
        self.nbstations = ncin.variables["number"].shape[0]
        self.nbmonth = ncin.variables["time"].shape[0]
        tmpmerge = []
        for (a,v) in ncin.__dict__.items():
            if a == "history" :
                self.hist = v
            if a == "Source" :
                self.source = v
            if a.find("MergeSource") == 0 :
                tmpmerge.append(v)
        if len(tmpmerge) > 0 :
            self.mergersource = tmpmerge
        #
        for i in range(self.nbstations) :
            self.stations.append(ST.Station("netcdf", ncin, index=i))
        #
        ncin.close()
        #
        return
#
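#
# Example round trip (hypothetical file names), sketching the intended use of
# the class :
#   stns = AllStations(Source="GRDC")
#   stns.fetchnetcdf("GRDC_Monthly.nc")
#   dates = ...  # list of datetime objects defining the new monthly time axis
#   stns.dumpnetcdf(dates, "GRDC_Monthly_rewritten.nc", "Rewritten by AllStations")
#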
# Functions to get all stations which are close to each other
# Assume we have Location for all stations (else they make little sense !).
#
def closeto(A, B, distance=5000.) :
    AclosetoB = []
    BclosetoA = []
    for Ai,Astn in enumerate(A.stations) :
        d = np.array([CC.distance_haversine(Astn.Location, Bstn.Location) for Bstn in B.stations])
        if np.count_nonzero(d < distance) > 0 :
            AclosetoB.append(Ai)
            BclosetoA.append(list(np.where(d < distance)[0]))
    return AclosetoB, BclosetoA
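#
# Example (hypothetical archives A and B) : stations within 5 km of each other
#   AclosetoB, BclosetoA = closeto(A, B, distance=5000.)
#   AclosetoB[k] is an index into A.stations and BclosetoA[k] the list of
#   indices of B.stations within the chosen radius.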
#
# Identify pairs in A and B which could be the same station.
# The choice tells which source is to be used for the metadata.
#
def PairAtoB(A, B, AclosetoB, BclosetoA, mindist, maxerr, interactive=False) :
    pairs = []
    choice = []
    for i,Ai in enumerate(AclosetoB) :
        Allscore = []
        Alldist = []
        Alluperr = []
        for Bi in BclosetoA[i] :
            #
            score = []
            #
            dist = np.abs(CC.distance_haversine(A.stations[Ai].Location, B.stations[Bi].Location))
            if dist > 0 :
                score.append(min(mindist/dist, 1.0))
            else :
                score.append(1.0)
            #
            if A.stations[Ai].Area > ST.defint and B.stations[Bi].Area > ST.defint :
                uperr = np.abs((A.stations[Ai].Area-B.stations[Bi].Area)/A.stations[Ai].Area*100.)
                if uperr > 0 :
                    score.append(min(maxerr/uperr, 1.0))
                else :
                    score.append(1.0)
            else :
                uperr = np.nan
            # Simplify river names. "river"/"rio" in other languages still need to be added.
            Ariver = A.stations[Ai].River.tolist().lower().replace("river","").replace("rio","")
            Briver = B.stations[Bi].River.tolist().lower().replace("river","").replace("rio","")
            #
            score.append(CC.scorestring(A.stations[Ai].Name.tolist(), B.stations[Bi].Name.tolist()))
            score.append(CC.scorestring(Ariver, Briver))
            #
            if np.mean(np.array(score)) >= 0.9 :
                print("Merge +++> ", A.stations[Ai].Name, "---", B.stations[Bi].Name, " Distance [m] ", dist, \
                      "UpStr. area [%] ", uperr, " Mean score : ", np.mean(np.array(score)))
                pairs.append([Ai,Bi])
                choice.append("A")
            else :
                Allscore.append(score)
                Alldist.append(dist)
                Alluperr.append(uperr)
        #
        # In interactive mode the user can choose between the options.
        #
        if len(Allscore) == len(BclosetoA[i]) and interactive :
            print("========================================================================")
            print("=                        Find a matching B                             =")
            print("========================================================================")
            print("Stations A : ", A.stations[Ai].Name, " on ", A.stations[Ai].River)
            print("Stations A : Location = ", A.stations[Ai].Location)
            print("Stations A : Upstream area [km^2] = ", A.stations[Ai].Area)
            print("==> Options are : ")
            opt = ""
            for j,Bi in enumerate(BclosetoA[i]) :
                print("Stations B"+str(j)+" : Distance to A [m] ", Alldist[j], "UpStr. area [%] ", Alluperr[j], \
                      " Mean score : ", np.mean(np.array(Allscore[j])))
                print("Stations B"+str(j)+" : ", B.stations[Bi].Name, " on ", B.stations[Bi].River)
                print("Stations B"+str(j)+" : Location = ", B.stations[Bi].Location)
                print("Stations B"+str(j)+" : Upstream area [km^2] = ", B.stations[Bi].Area)
                print(" ")
                opt = opt+" B"+str(j)
            print("Select the best matching station in B")
            Bopt = input(opt+" or No (nothing) : ")
            if len(Bopt) > 0 and Bopt[0].capitalize() == "B" :
                try :
                    intsel = int(Bopt[1:])
                except ValueError :
                    intsel = -1
                #
                if 0 <= intsel < len(BclosetoA[i]) :
                    Bi = BclosetoA[i][intsel]
                    meta = input("Which metadata should be kept [A]/B : ")
                    if meta.capitalize() == "A" :
                        print("==> "+Bopt+" and metadata of A are chosen")
                        pairs.append([Ai,Bi])
                        choice.append("A")
                    elif meta.capitalize() == "B" :
                        print("==> "+Bopt+" and metadata of B are chosen")
                        pairs.append([Ai,Bi])
                        choice.append("B")
                    else :
                        print("==> "+Bopt+" and default metadata (A) chosen")
                        pairs.append([Ai,Bi])
                        choice.append("A")
                else :
                    print("Station does not exist. Choice dropped !")
            else :
                print("Stations do not match")
    return pairs, choice
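#
# Example (hypothetical thresholds) : full proximity score below 1 km and full
# area score below 10 % upstream-area error, asking the user when the
# automatic score is inconclusive :
#   pairs, choice = PairAtoB(A, B, AclosetoB, BclosetoA, 1000., 10., interactive=True)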
#
# Concatenate all non-paired stations of the two archives into a new AllStations object
#
def concatenate(A, B, pairs, Sources="") :
    C = AllStations(Source=Sources)
    # Guard for the case where no pairs were found
    paired = np.array(pairs) if len(pairs) > 0 else np.empty((0,2), int)
    #
    for Ai,Astn in enumerate(A.stations) :
        if not any(paired[:,0] == Ai) :
            C.add(Astn)
    #
    for Bi,Bstn in enumerate(B.stations) :
        if not any(paired[:,1] == Bi) :
            C.add(Bstn)
    #
    return C
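#
# Example (hypothetical sources), after pairing :
#   C = concatenate(A, B, pairs, Sources="GRDC + national archive")
#   C now holds all unpaired stations ; the paired ones are added by mergedata.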
#
# Merge the data of the paired stations
#
def mergedata(A, B, C, pairs, choice, verbose=False) :
    #
    for i,X in enumerate(pairs) :
        Ai,Bi = X
        if choice[i] == "A" :
            NewStn = A.stations[Ai]
            AltTime = B.stations[Bi].Time
            if isinstance(B.stations[Bi].Original,np.ndarray) :
                AltOrig = [B.stations[Bi].Original]
            else :
                AltOrig = B.stations[Bi].Original
            AltSource = B.stations[Bi].mergersource
            AltMerged = B.stations[Bi].Merged
        else :
            NewStn = B.stations[Bi]
            AltTime = A.stations[Ai].Time
            if isinstance(A.stations[Ai].Original,np.ndarray) :
                AltOrig = [A.stations[Ai].Original]
            else :
                AltOrig = A.stations[Ai].Original
            AltSource = A.stations[Ai].mergersource
            AltMerged = A.stations[Ai].Merged
        #
        for ia,alt in enumerate(AltSource[:]) :
            NewStn.mergersource.append(alt)
            if isinstance(NewStn.Original,np.ndarray) :
                NewStn.Original = [NewStn.Original, CC.tonewtime(AltTime, AltOrig[ia], NewStn.Time)]
            else :
                NewStn.Original.append(CC.tonewtime(AltTime, AltOrig[ia], NewStn.Time))
        #
        if verbose :
            print("For station ", A.stations[Ai].Name, " the following sources exist :", \
                  NewStn.mergersource, " Number of Originals :", len(NewStn.Original))
        #
        # Generate a new merged product.
        #
        NewMerged = CC.mergedata(NewStn.Original, NewStn.Merged, verbose)
        NewStn.Merged = NewMerged
        C.add(NewStn)
    #
    return C
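#
# Example (hypothetical), continuing from concatenate above :
#   C = mergedata(A, B, C, pairs, choice, verbose=True)
#   Each paired station now carries the alternative time series and a new
#   merged product computed by ConversionCalculations.mergedata.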
#
# Check Station number and generate new ones if needed.
#
def stnnbchecker(C) :
    stnnb = np.array([C.stations[i].Number for i in range(C.nbstations)], int)
    reg = np.array([C.stations[i].WMOregion for i in range(C.nbstations)], int)
    subreg = np.array([C.stations[i].WMOsubregion for i in range(C.nbstations)], int)
    u, c = np.unique(stnnb[stnnb > 0], return_counts=True)
    #
    # Solve duplicates first
    #
    while np.max(c) > 1 :
        dup = u[np.argmax(c)]
        duploc = np.argmin(np.abs(stnnb-dup))
        dreg = int(stnnb[duploc]/100000)
        dsubreg = int((stnnb[duploc]-dreg*100000)/1000)
        regstnnb = stnnb[(stnnb >= dup) & (stnnb < dreg*100000+(dsubreg+1)*1000)]
        if regstnnb.shape[0] < 1 :
            t = dreg*100000+(dsubreg+1)*1000-1
        else :
            t = np.max(regstnnb)+1
        if not any(stnnb == t) :
            stnnb[duploc] = t
            C.stations[duploc].Number = t
            print("stnnbchecker 1 : replacing ", u[np.argmax(c)], " by ", t)
        else :
            print("stnnbchecker 1 : Generated a number which already exists !")
            sys.exit()
        u, c = np.unique(stnnb[stnnb > 0], return_counts=True)
    #
    # Generate new numbers if needed. If more than 999 numbers are missing, shift
    # to the previous sub-region so that a full block of 1000 numbers is free.
    #
    if np.count_nonzero(stnnb < 0) > 999 :
        subreg[:] = subreg[:]-1
    if np.count_nonzero(stnnb < 0) > 0 :
        mask = np.where(stnnb < 0)
        for i in np.nditer(mask) :
            regstnnb = stnnb[(stnnb >= reg[i]*100000+subreg[i]*1000) & (stnnb < (reg[i]+1)*100000)]
            if regstnnb.shape[0] < 1 :
                t = reg[i]*100000+subreg[i]*1000
            else :
                t = np.max(regstnnb)+1
            if not any(stnnb == t) :
                stnnb[i] = t
                C.stations[i].Number = stnnb[i]
                C.stations[i].WMOsubregion = subreg[i]
            else :
                print("stnnbchecker 2 : Generated a number which already exists !")
                sys.exit()
    #
    return C
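#
# End-to-end sketch (hypothetical file names, sources and thresholds) showing
# how the pieces above are meant to be chained. Station and
# ConversionCalculations are the companion modules imported at the top.
#
def example_pipeline() :
    A = AllStations(Source="GRDC")
    A.fetchnetcdf("GRDC_Monthly.nc")
    B = AllStations(Source="OtherArchive")
    B.fetchnetcdf("Other_Monthly.nc")
    # Candidate stations within 5 km of each other
    AclosetoB, BclosetoA = closeto(A, B, distance=5000.)
    # Score candidates : full score below 1 km distance and 10 % area error
    pairs, choice = PairAtoB(A, B, AclosetoB, BclosetoA, 1000., 10., interactive=False)
    C = concatenate(A, B, pairs, Sources=A.source+" + "+B.source)
    C = mergedata(A, B, C, pairs, choice)
    C = stnnbchecker(C)
    # Common monthly time axis (15th of each month) spanning both records
    dates = [datetime.datetime(yr, mo, 15) for yr in range(C.FirstDate().year, C.LastDate().year+1) for mo in range(1, 13)]
    C.dumpnetcdf(dates, "Merged_Monthly.nc", "Merged by example_pipeline")
#
#
# ----------------------------------------------------------------------------
# The functions below form the companion module imported above as
# ConversionCalculations ; in the commit it is a separate file.
# ----------------------------------------------------------------------------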
import numpy as np
import datetime
import math
import sys
#
from difflib import SequenceMatcher
#
# For the JSON format : get the dates (keys encoded as YYYYMMDD)
#
def key2date(obj, key) :
    dates = []
    for dd in obj[key] :
        dates.append(datetime.datetime(int(dd[0:4]), int(dd[4:6]), int(dd[6:8])))
    return dates
#
# For the JSON format : get the data
#
def key2data(obj, key) :
    nval = len(obj[key].keys())
    data = np.zeros((nval))
    for i,dd in enumerate(obj[key]) :
        data[i] = obj[key][dd]
    return data
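#
# Example (hypothetical object, as produced by json.load) :
#   obj = {"flow" : {"19010115" : 1523.0, "19010215" : 1761.0}}
#   key2date(obj, "flow")  # -> [datetime(1901,1,15), datetime(1901,2,15)]
#   key2data(obj, "flow")  # -> array([1523., 1761.])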
#
# Compute monthly means when the original data is daily
#
def monthlymean(d, x) :
    inyrs = np.array([di.year for di in d])
    inym = np.array([(di.year*100)+di.month for di in d])
    # Careful : the data can have gaps, so we recreate the full list of years
    nbinyrs = inyrs[-1]-inyrs[0]+1
    nd = np.arange(nbinyrs)+inyrs[0]
    # New data
    od = []
    ox = []
    #
    for iy in range(nbinyrs) :
        for im in range(12) :
            ind = (inym == nd[iy]*100+(im+1))
            mday = datetime.datetime(nd[iy], im+1, 15)
            # Length of the month : first day of the next month minus one day
            mlen = (mday.replace(month = mday.month % 12 +1, day = 1)-datetime.timedelta(days=1)).day
            od.append(mday)
            if len(x[ind]) < mlen :
                # Incomplete month : flag as missing. See if we can fix that.
                ox.append(np.nan)
            else :
                ox.append(np.mean(x[ind]))
    #
    return od, np.array(ox)
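#
# Example (hypothetical daily series covering the leap year 2000) :
#   d = [datetime.datetime(2000,1,1)+datetime.timedelta(days=k) for k in range(366)]
#   od, ox = monthlymean(d, np.ones(366))
#   od[0], ox[0]  # -> (datetime.datetime(2000, 1, 15, 0, 0), 1.0)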
#
# Get the time axis from a NetCDF file
#
def gettimeaxis(nf):
    dims = nf.dimensions.keys()
    for d in dims:
        if d.find('time') >= 0 :
            tname = d
    t = np.copy(nf.variables[tname][:])
    tunits = getattr(nf.variables[tname], "units")
    if tunits.find("seconds") >= 0 :
        epoch = datetime.datetime.strptime(tunits[tunits.find("since ")+6:], "%Y-%m-%d %H:%M:%S")
        date_list = [epoch+datetime.timedelta(seconds=x.astype(np.float64)) for x in t]
    elif tunits.find("days") >= 0 :
        epoch = datetime.datetime.strptime(tunits[tunits.find("since ")+6:], "%Y-%m-%d %H:%M:%S")
        date_list = [epoch+datetime.timedelta(days=x.astype(np.float64)) for x in t]
    elif tunits.find("month") >= 0 :
        epoch = datetime.datetime.strptime(tunits[tunits.find("since ")+6:], "%Y-%m-%d %H:%M:%S")
        date_list = []
        for x in t :
            # Fold the months elapsed since the epoch into a year and a month
            tot = epoch.month-1+int(x)
            date_list.append(datetime.date(epoch.year+tot//12, tot%12+1, epoch.day))
    else :
        print("The time axis is neither in seconds, days nor months and I cannot yet deal with it")
        sys.exit()
    return date_list
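#
# Example (hypothetical file ; Dataset comes from the netCDF4 package) :
#   nf = Dataset("GRDC_Monthly.nc", 'r')
#   dates = gettimeaxis(nf)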
#
#
#
def distance_haversine(A, B, radius=6371000.):
    """ Haversine distance between points A and B, given as (lon,lat) pairs
        in degrees. Note that with the default Earth radius the distance is
        in meters. """
    dLat = math.radians(B[1]-A[1])
    dLon = math.radians(B[0]-A[0])
    lat1 = math.radians(A[1])
    lat2 = math.radians(B[1])
    a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * math.sin(dLon/2) * math.cos(lat1) * math.cos(lat2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    return c * radius
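#
# Example : Paris (2.35°E, 48.86°N) to London (0.13°W, 51.51°N), roughly 340 km
#   distance_haversine((2.35, 48.86), (-0.13, 51.51))  # -> approx. 3.4e5 [m]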
#
# Function to score the similarity of two strings
#
def scorestring(stra, strb) :
    # Test whether one string is contained in the other
    match = SequenceMatcher(None, stra, strb).find_longest_match(0, len(stra), 0, len(strb))
    #
    if match.size == min(len(stra), len(strb)) :
        # Sub-string case
        score = 1.0