Source code for xscen.reduce

"""Functions to reduce an ensemble of simulations."""

import warnings
from typing import Optional, Union

import numpy as np
import xarray as xr
import xclim.ensembles as xce

from .config import parse_config



[docs]
@parse_config
def build_reduction_data(
    datasets: Union[dict, list[xr.Dataset]],
    *,
    xrfreqs: Optional[list[str]] = None,
    horizons: Optional[list[str]] = None,
) -> xr.DataArray:
    """Construct the input required for ensemble reduction.

    This will combine all variables into a single DataArray and stack all dimensions except "realization".

    Parameters
    ----------
    datasets : Union[dict, list]
        Dictionary of datasets in the format {"id": dataset}, or list of datasets. This can be generated by calling .to_dataset_dict() on a catalog.
    xrfreqs : list of str, optional
        List of unique frequencies across the datasets.
        If None, the script will attempt to guess the frequencies from the datasets' metadata or with xr.infer_freq().
    horizons : list of str, optional
        Subset of horizons on which to create the data.

    Returns
    -------
    xr.DataArray
        2D DataArray of dimensions "realization" and "criteria", to be used as input for ensemble reduction.
    """
    warnings.warn(
        "This function will be dropped in a future version, as it is now redundant with xclim.ensembles.make_criteria."
        "Either use xclim.ensembles.make_criteria directly (preceded by xclim.ensembles.create_ensemble if needed) or "
        "use xscen's reduce_ensemble function to build the criteria and reduce the ensemble in one step.",
        FutureWarning,
    )

    # Use metadata to identify the simulation attributes
    info = {}
    keys = datasets.keys() if isinstance(datasets, dict) else range(len(datasets))
    for key in keys:
        info[key] = {}
        info[key]["id"] = datasets[key].attrs.get("cat:id", None) or key
        info[key]["xrfreq"] = datasets[key].attrs.get("cat:xrfreq") or xr.infer_freq(
            datasets[key].time
        )

    xrfreqs = xrfreqs or np.unique(info[key]["xrfreq"] for key in info.keys())

    criteria = None
    # Loop through each xrfreq
    for xrfreq in xrfreqs:
        # Subset on the datasets that have the right xrfreq and change the dictionary key to only the ID
        ds_dict = {
            info[k]["id"]: v for k, v in datasets.items() if info[k]["xrfreq"] == xrfreq
        }

        # Create the ensemble
        ens = xce.create_ensemble(datasets=ds_dict)

        if horizons:
            ens = ens.where(ens.horizon.isin(horizons), drop=True)

        criteria = _concat_criteria(criteria, ens)

    # drop columns that are all NaN
    criteria = criteria.dropna(dim="criteria", how="all")
    if criteria.isnull().sum().values != 0:
        raise ValueError("criteria dataset contains NaNs")

    # Attributes
    criteria.attrs = {"long_name": "criteria for ensemble selection"}

    return criteria




[docs]
@parse_config
def reduce_ensemble(
    data: Union[xr.DataArray, dict, list, xr.Dataset],
    method: str,
    *,
    horizons: Optional[list[str]] = None,
    create_kwargs: Optional[dict] = None,
    **kwargs,
):
    r"""Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' and 'criteria'.
        These are the values used for clustering. Realizations represent the individual original
        ensemble members and criteria the variables/indicators used in the grouping algorithm.
        This data can be generated using py:func:`xclim.ensembles.make_criteria`.
        Alternatively, either a xr.Dataset, a list of xr.Dataset or a dictionary of xr.Dataset can be passed,
        in which case the data will be built using py:func:`xclim.ensembles.create_ensemble` and py:func:`xclim.ensembles.make_criteria`.
    method : str
      ['kkz', 'kmeans']. Clustering method.
    horizons : list of str, optional
        Subset of horizons on which to create the data. Only used if `data` needs to be built.
    create_kwargs : dict, optional
        Arguments to pass to py:func:`xclim.ensembles.create_ensemble` if `data` is not an xr.DataArray.
    \*\*kwargs : dict
        Arguments to send to either py:func:`xclim.ensembles.kkz_reduce_ensemble` or py:func:`xclim.ensembles.kmeans_reduce_ensemble`.

    Returns
    -------
    selected : xr.DataArray
        DataArray of dimension 'realization' with the selected simulations.
    clusters : dict
        If using kmeans clustering, realizations grouped by cluster.
    fig_data : dict
        If using kmeans clustering, data necessary to call py:func:`xclim.ensembles.plot_rsqprofile`.

    Notes
    -----
    If building `data` to be constructed by this function, the datasets should already have a climatology computed on them, such that the data
    has no temporal dimension aside from the "horizon" coordinate (which is optional and might be used to subset the data).
    If the indicators are a mix of yearly, seasonal, and monthly, they should be stacked on the same time/horizon axis and put in the same dataset.
    You can use py:func:`xscen.utils.unstack_dates` on seasonal or monthly indicators to this end.
    """
    warnings.warn(
        "This function has been moved to xscen.ensembles.reduce_ensemble. This version will be dropped in a future release.",
        FutureWarning,
    )
    return reduce_ensemble(
        data=data,
        method=method,
        horizons=horizons,
        create_kwargs=create_kwargs,
        **kwargs,
    )



def _concat_criteria(criteria: Optional[xr.DataArray], ens: xr.Dataset):
    """Combine all variables and dimensions excepting 'realization'."""
    if criteria is None:
        i = 0
    else:
        i = int(criteria.criteria[-1] + 1)

    for vv in ens.data_vars:
        da = ens[vv]
        da.name = "values"
        # Stack all dimensions that are not 'realization'
        da = da.stack(
            {"criteria": list({d for d in da.dims}.difference(["realization"]))}
        )
        da = da.assign_coords({"criteria": np.arange(i, i + len(da.criteria))})
        if "horizon" in da.coords:
            da = da.drop_vars("horizon")

        if criteria is None:
            criteria = da
        else:
            criteria = xr.concat([criteria, da], dim="criteria")
        i = i + len(da.criteria)

    return criteria