hists.py 8.14 KB
Newer Older
1

2
from functools import wraps
3

4
import dask_histogram as dh
Clément Haëck's avatar
Clément Haëck committed
5
import numpy as np
6
import pandas as pd
7 8
import xarray as xr
from scipy import stats
9

10 11 12
import lib
import lib.data

Clément Haëck's avatar
Clément Haëck committed
13

14 15
ARGS = {'region', 'days', 'kind', 'nomask'}
defaults = dict(kind='2thr', nomask=False)
Clément Haëck's avatar
Clément Haëck committed
16

17

Clément Haëck's avatar
Clément Haëck committed
18 19 20 21
def PREGEX(args):
    pregex = ["number_%(number:fmt=d:rgx=%I)/",
              "scale_%(scale:fmt=.1f)/",
              "coef_%(coef:fmt=d)/"]
22

Clément Haëck's avatar
Clément Haëck committed
23
    if args['kind'] == '2thr':
Clément Haëck's avatar
Clément Haëck committed
24 25
        pregex += ["hist_GS3",
                   "_thr_%(thr_lo:fmt=.2f)_%(thr_hi:fmt=.2f)"]
Clément Haëck's avatar
Clément Haëck committed
26 27 28 29 30 31 32
    elif args['kind'] == '2d':
        pregex.append("hist_2D_GS3")
    elif args['kind'] == 'thr':
        pregex += ["hist_GS3",
                   "_thr_%(threshold:fmt=.2f)"]
    else:
        raise ValueError("Unknown hist pregex type '{}'".format(args['dim']))
Clément Haëck's avatar
Clément Haëck committed
33

Clément Haëck's avatar
Clément Haëck committed
34
    pregex += ["_%(time:Y)",
Clément Haëck's avatar
Clément Haëck committed
35
               "_nomask" if args['nomask'] else "",
Clément Haëck's avatar
Clément Haëck committed
36
                ".nc"]
Clément Haëck's avatar
Clément Haëck committed
37

Clément Haëck's avatar
Clément Haëck committed
38
    return ''.join(pregex)
Clément Haëck's avatar
Clément Haëck committed
39 40 41 42 43 44


def ROOT(args):
    return lib.data.get_default_directory(args, 'Hists')


Clément Haëck's avatar
Clément Haëck committed
45
auto_attr = lib.data.create_dataset(__name__, PREGEX, ROOT, ARGS)
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65


def get_data(args=None, remove_dims=None, **kwargs):
    import xarray as xr

    def preprocess(ds):
        for d in remove_dims:
            if d in ds.coords and ds.coords[d].size <= 1:
                if d in ds.dims:
                    ds = ds.isel({d: 0})
            ds = ds.reset_coords(remove_dims, drop=True)
        return ds

    prepro_func = None
    if remove_dims is not None:
        prepro_func = preprocess

    finder = auto_attr['get_finder'](args, **kwargs)
    return xr.open_mfdataset(finder.get_files(), preprocess=prepro_func,
                             parallel=False, data_vars='different')
66 67


Clément Haëck's avatar
Clément Haëck committed
68
def var_name(name, var):
69 70 71
    return '{}_{}'.format(name, var)


72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
def histogram_da(da: xr.DataArray, bins, density=True, **kwargs):
    """Compute histogram over a DataArray.

    Uses the flattened array.
    Uses dask_histogram.
    Returns DataArray and bins edges.
    """
    h = dh.factory(da.data.ravel(), axes=[bins], **kwargs)
    hist, _ = h.to_dask_array()
    hist = xr.DataArray(hist, coords=dict(bins=bins.edges[:-1]))

    if density:
        hist = hist / bins.widths
        hist = hist / (hist*bins.widths).sum('bins')

    hist.name = da.name + '_hist'
    return hist, bins.edges


91
def get_full_bins(da, bins=None):
Clément Haëck's avatar
Clément Haëck committed
92
    """Get bins with the rightmost edge."""
Clément Haëck's avatar
Clément Haëck committed
93
    if bins is None:
Clément Haëck's avatar
Clément Haëck committed
94
        bins = da[var_name('bins', da.VAR)]
95 96
    if isinstance(bins, str):
        bins = da[bins]
Clément Haëck's avatar
Clément Haëck committed
97
    return np.append(bins.values, bins.attrs['right_edge'])
98 99


100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
def variables(name_in='hist', allowconcat=True):
    """Create a function acting on histogram of different variables.

    Histograms are computed for different variables. Histograms have different
    binning, so that some computation should be done individually.
    This makes a function able to act on all (or some) of the variables of a
    dataset.
    A supplied dataset should have a list of its variable in the `VARS`
    attribute. A supplied dataArray should have its variable name in the `VAR`
    attribute.

    The result can be a dataArray (for a single variable or concatenated results
    for all variables), or a dataset with a dataArray for each variable result.

    Hopefully the examples of decorated functions below can help understand.

    Arguments
    ---------
    concatdef: bool
        If True, allow the concatenation of the results for different variables.
    name_in:
        Name of the input variable. For exemple 'pdf' for the PDF of a variable.
        Look then for the dataArrays `pdf_var1`, `pdf_var2`,...
    """
    def decorator(func):
        @wraps(func)
126
        def decorated(ds, concat=allowconcat, variables=None, **kwargs):
127 128
            if not allowconcat and concat:
                raise ValueError("Cannot concatenate with this function.")
Clément Haëck's avatar
Clément Haëck committed
129 130 131 132 133 134 135
            single_out = False  # Return a single DataArray

            # DataArray with no variables indication
            if variables is None and isinstance(ds, xr.DataArray):
                if 'VAR' not in ds.attrs:
                    return func(ds, **kwargs)
                variables = ds.attrs['VAR']
136 137

            if isinstance(variables, str):
Clément Haëck's avatar
Clément Haëck committed
138 139
                variables = [variables]
                single_out = True
140 141 142 143 144

            # List of available variables
            if variables is None:
                variables = ds.VARS
            # DataArrays names
Clément Haëck's avatar
Clément Haëck committed
145
            variables_ext = [var_name(name_in, v) for v in variables]
146 147 148

            out_vars = []
            for var, var_ext in zip(variables, variables_ext):
Clément Haëck's avatar
Clément Haëck committed
149 150
                da = ds if isinstance(ds, xr.DataArray) else ds[var_ext]
                da.attrs['VAR'] = var  # normally already set
151
                out = func(da, **kwargs)
Clément Haëck's avatar
Clément Haëck committed
152 153
                out.attrs['VAR'] = var
                if concat and not single_out:
154 155
                    out = out.expand_dims(var=[var])
                else:
Clément Haëck's avatar
Clément Haëck committed
156
                    out = out.rename(var_name(out.name, var))
157 158
                out_vars.append(out)

Clément Haëck's avatar
Clément Haëck committed
159 160
            if single_out:
                return out_vars[0]
161 162 163 164 165 166 167 168 169 170 171 172 173
            if concat:
                return xr.concat(out_vars, 'var')
            out = xr.merge(out_vars)
            out.attrs['VARS'] = variables
            return out

        return decorated
    return decorator


@variables()
def get_N(hist):
    """Return total number of hits per image."""
Clément Haëck's avatar
Clément Haëck committed
174
    return hist.sum(var_name('bins', hist.VAR)).rename('N')
175 176 177


@variables('N')
178
def get_surface_fraction(N, mask='frt'):
179
    """Return fraction of surface impacted by fronts."""
180
    frac = N.sel(mask=mask) / N.sum('mask')
181 182 183 184 185
    return frac.rename('frac')


@variables(allowconcat=False)
def get_pdf(hist):
186
    """Return histogram normalized to a probability density function."""
Clément Haëck's avatar
Clément Haëck committed
187
    bins_name = var_name('bins', hist.VAR)
188
    # Get frequency (probability to hit a given bin)
189
    freq = hist / hist.sum(bins_name)
190 191

    # Get frequency density (probablity to hit a given value range)
Clément Haëck's avatar
Clément Haëck committed
192
    widths = xr.DataArray(np.diff(get_full_bins(hist)), dims=bins_name)
193 194 195
    pdf = freq / widths

    # Normalise (so the integral is 1)
196 197
    pdf = pdf / (widths*pdf).sum(bins_name)
    return pdf.rename('pdf')
198 199


200
@variables('pdf')
201 202
def get_quantile(pdf, q=0.5):
    """Returns quantile computed from pdf."""
Clément Haëck's avatar
Clément Haëck committed
203
    bins_name = var_name('bins', pdf.VAR)
204
    bins = get_full_bins(pdf, bins_name)
Clément Haëck's avatar
Clément Haëck committed
205

206 207
    def uf_quant(arr):
        return stats.rv_histogram((arr, bins)).ppf(q)
208

209 210
    quant = xr.apply_ufunc(
        uf_quant, pdf,
211
        input_core_dims=[[bins_name]],
212 213 214 215 216
        output_core_dims=[[]],
        output_dtypes=[float],
        dask='parallelized',
        vectorize=True
    )
Clément Haëck's avatar
Clément Haëck committed
217

218 219 220 221 222 223
    return quant.rename('q{:g}'.format(q))


def get_median(pdf):
    """Return median computed from pdf."""
    return get_quantile(pdf, q=0.5)
224

Clément Haëck's avatar
Clément Haëck committed
225

226 227
@variables()
def get_total(hist):
228
    """Return total amount of quantity counted in histograms."""
Clément Haëck's avatar
Clément Haëck committed
229
    bins_name = var_name('bins', hist.VAR)
Clément Haëck's avatar
Clément Haëck committed
230
    return (hist * hist[bins_name]).sum(bins_name).rename('total')
Clément Haëck's avatar
Clément Haëck committed
231 232


233
def resample_8D_multiple_years(ds, days=8):
234 235 236 237 238
    """Resample data to 8D (or other) by summing histograms.

    New time interval are the same year to year. The last interval in each
    year can thus be smaller.
    """
239
    ts = pd.DatetimeIndex([])
Clément Haëck's avatar
Clément Haëck committed
240
    for year in sorted(list(set(ds.time.dt.year.values))):
241
        ts = ts.append(pd.date_range('{}/01/01'.format(year),
242 243
                                     freq='{}D'.format(days),
                                     periods=365//days))
244 245
    ts = ts.append(pd.to_datetime(['{}/12/31'.format(year)]))

246
    ds = ds.groupby_bins('time', ts, labels=ts[:-1]).sum(keep_attrs=True)
247 248 249 250 251
    ds = ds.rename(time_bins='time')
    return ds


def restore_time(ds):
252
    """Pass from dayofyear to classic time."""
253 254 255 256 257 258 259
    ds = ds.rename(dayofyear='time')
    ds = ds.assign_coords(time=pd.to_datetime(2000*1000+ds.time.values,
                                              format='%Y%j'))
    return ds


def get_climatology(ds):
260 261 262 263
    """Return climatological mean and standard deviations.

    Tuple of median, first decile, last decile.
    """
264 265 266 267 268
    grp = ds.groupby("time.dayofyear")
    cli = grp.median()
    cli_d1 = grp.quantile(0.1)
    cli_d9 = grp.quantile(0.9)

Clément Haëck's avatar
Clément Haëck committed
269 270 271
    cli = restore_time(cli)
    cli_d1 = restore_time(cli_d1)
    cli_d9 = restore_time(cli_d9)
272
    return cli, cli_d1, cli_d9