Source code for xscen.reduce

"""Functions to reduce an ensemble of simulations."""

import logging
from typing import Optional, Union

import numpy as np
import xarray as xr
import xclim.ensembles as xce

from .config import parse_config

logger = logging.getLogger(__name__)

__all__ = ["build_reduction_data", "reduce_ensemble"]



[docs]
@parse_config
def build_reduction_data(
    datasets: Union[dict, list[xr.Dataset]],
    *,
    xrfreqs: Optional[list[str]] = None,
    horizons: Optional[list[str]] = None,
) -> xr.DataArray:
    """Construct the input required for ensemble reduction.

    This will combine all variables into a single DataArray and stack all dimensions except "realization".

    Parameters
    ----------
    datasets : Union[dict, list]
        Dictionary of datasets in the format {"id": dataset}, or list of datasets. This can be generated by calling .to_dataset_dict() on a catalog.
    xrfreqs : list of str, optional
        List of unique frequencies across the datasets.
        If None, the script will attempt to guess the frequencies from the datasets' metadata or with xr.infer_freq().
    horizons : list of str, optional
        Subset of horizons on which to create the data.

    Returns
    -------
    xr.DataArray
        2D DataArray of dimensions "realization" and "criteria", to be used as input for ensemble reduction.
    """
    # Use metadata to identify the simulation attributes
    info = {}
    keys = datasets.keys() if isinstance(datasets, dict) else range(len(datasets))
    for key in keys:
        info[key] = {}
        info[key]["id"] = datasets[key].attrs.get("cat:id", None) or key
        info[key]["xrfreq"] = datasets[key].attrs.get("cat:xrfreq") or xr.infer_freq(
            datasets[key].time
        )

    xrfreqs = xrfreqs or np.unique(info[key]["xrfreq"] for key in info.keys())

    criteria = None
    # Loop through each xrfreq
    for xrfreq in xrfreqs:
        # Subset on the datasets that have the right xrfreq and change the dictionary key to only the ID
        ds_dict = {
            info[k]["id"]: v for k, v in datasets.items() if info[k]["xrfreq"] == xrfreq
        }

        # Create the ensemble
        ens = xce.create_ensemble(datasets=ds_dict)

        if horizons:
            ens = ens.where(ens.horizon.isin(horizons), drop=True)

        criteria = _concat_criteria(criteria, ens)

    # drop columns that are all NaN
    criteria = criteria.dropna(dim="criteria", how="all")
    if criteria.isnull().sum().values != 0:
        raise ValueError("criteria dataset contains NaNs")

    # Attributes
    criteria.attrs = {"long_name": "criteria for ensemble selection"}

    return criteria




[docs]
@parse_config
def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict):
    """Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' and 'criteria'.
        These are the values used for clustering. Realizations represent the individual original
        ensemble members and criteria the variables/indicators used in the grouping algorithm.
        This data can be generated using build_reduction_data().
    method : str
      ['kkz', 'kmeans']. Clustering method.
    kwargs : dict
        Arguments to send to either xclim.ensembles.kkz_reduce_ensemble or xclim.ensembles.kmeans_reduce_ensemble

    Returns
    -------
    selected : xr.DataArray
        DataArray of dimension 'realization' with the selected simulations.
    clusters : dict
        If using kmeans clustering, realizations grouped by cluster.
    fig_data : dict
        If using kmeans clustering, data necessary to call xclim.ensembles.plot_rsqprofile()
    """
    selected = getattr(xce, f"{method}_reduce_ensemble")(data=data, **kwargs)

    clusters = {}
    fig_data = {}
    if method == "kmeans":
        fig_data = selected[2]
        clusters_tmp = selected[1]
        selected = selected[0]
        realization = np.arange(len(clusters_tmp))

        clusters = {
            g: data.realization.isel(realization=realization[clusters_tmp == g])
            for g in np.unique(clusters_tmp)
        }
    selected = data.realization.isel(realization=selected)

    return selected, clusters, fig_data



def _concat_criteria(criteria: Optional[xr.DataArray], ens: xr.Dataset):
    """Combine all variables and dimensions excepting 'realization'."""
    if criteria is None:
        i = 0
    else:
        i = int(criteria.criteria[-1] + 1)

    for vv in ens.data_vars:
        da = ens[vv]
        da.name = "values"
        # Stack all dimensions that are not 'realization'
        da = da.stack(
            {"criteria": list({d for d in da.dims}.difference(["realization"]))}
        )
        da = da.assign_coords({"criteria": np.arange(i, i + len(da.criteria))})
        if "horizon" in da.coords:
            da = da.drop_vars("horizon")

        if criteria is None:
            criteria = da
        else:
            criteria = xr.concat([criteria, da], dim="criteria")
        i = i + len(da.criteria)

    return criteria