{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {
    "nbsphinx": "hidden"
   },
   "outputs": [],
   "source": [
    "# Remove flox spam\n",
    "\n",
    "import logging\n",
    "\n",
    "# Get the logger for the 'flox' package\n",
    "logger = logging.getLogger(\"flox\")\n",
    "# Set the logging level to WARNING\n",
    "logger.setLevel(logging.WARNING)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
    "\n",
    "import xscen as xs\n",
    "\n",
    "output_folder = Path().absolute() / \"_data\"\n",
    "\n",
    "# Create a project Catalog\n",
    "project = {\n",
    "    \"title\": \"example-diagnostics\",\n",
    "    \"description\": \"This is an example catalog for xscen's documentation.\",\n",
    "}\n",
    "\n",
    "pcat = xs.ProjectCatalog(\n",
    "    str(output_folder / \"example-diagnostics.json\"),\n",
    "    create=True,\n",
    "    project=project,\n",
    "    overwrite=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "# Diagnostics\n",
    "\n",
    "It can be useful to perform a various diagnostic tests in order to check that the data that was produced is as expected. Diagnostics can also help us assess bias adjustment methods.\n",
    "\n",
    "Make sure you run GettingStarted.ipynb before this one, the GettingStarted outputs will be used a inputs in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load catalog from the GettingStarted notebook\n",
    "gettingStarted_cat = xs.ProjectCatalog(\n",
    "    str(output_folder / \"example-gettingstarted.json\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "## Health checks\n",
    "\n",
    "<div class=\"alert alert-info\"> <b>NOTE</b>\n",
    "\n",
    "For more information on the available `cfchecks`, `missing`, and `data_flags` methods, [please consult the xclim documentation](https://xclim.readthedocs.io/en/stable/checks.html).\n",
    "</div>\n",
    "\n",
    "Health checks located under `xscen.diagnostics.health_checks` are a series of checkups that can be performed on a Dataset to make sure that it has the expected structure, frequency, calendar, etc., with the ability to call `xclim.core.cfchecks`, `xclim.core.missing`, and `xclim.core.dataflags`. The function gives full control on which checkups should raise an Exception and which should only give a UserWarning.\n",
    "\n",
    "The arguments are:\n",
    "\n",
    "- `structure`: Dictionary with keys \"dims\" and \"coords\" containing the expected dimensions and coordinates.\n",
    "- `calendar`:  Expected calendar. Synonyms should be detected correctly (e.g. \"standard\" and \"gregorian\").\n",
    "- `start_date` & `end_date`:  To check if the dataset contains those.\n",
    "- `variables_and_units`: Dictionary containing the expected variables and units.\n",
    "- `cfchecks`: Dictionary of `xclim.core.cfchecks` to perform, per variable.\n",
    "- `freq`: Expected frequency, written as the result of xr.infer_freq(ds.time).\n",
    "- `missing`: String, list of strings, or dictionary of `xclim.core.missing` checks to perform.\n",
    "- `flags`: Dictionary of `xclim.core.dataflags.data_flags` to perform, per variable.\n",
    "\n",
    "Additionally, `flags_kwargs` is used to pass additional arguments to the data_flags (\"dims\" and \"freq\"), while `return_flags` can be used to return the Dataset created by `xclim.core.dataflags.data_flags`;\n",
    "\n",
    "Use the argument `raise_on` to list to list which test should raise an error if it fails. Use [\"all\"] to raise on all checks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# load input\n",
    "# 'CMIP6_ScenarioMIP_NCC_NorESM2-MM_ssp245_r2i1p1f1_example-region' will be used for this example\n",
    "ds = gettingStarted_cat.search(\n",
    "    id=\"CMIP6_ScenarioMIP_NCC_NorESM2-MM_ssp245_r2i1p1f1_example-region\",\n",
    "    processing_level=\"regridded\",\n",
    ").to_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The checks that we want to perform. Note that all checks are optional.\n",
    "structure = {\"coords\": [\"lat\", \"lon\", \"time\"]}\n",
    "calendar = \"365_day\"  # We have a standard calendar, so we'll be warned.\n",
    "start_date = \"1971-01-01\"  # The dataset starts on 1981-01-01, so this should fail\n",
    "end_date = \"2045-01-01\"  # The dataset ends later, but we check that it contains at least 2045-01-01.\n",
    "variables_and_units = {\"tas\": \"degC\"}  # The dataset is in Kelvin, so we'll get warned.\n",
    "cfchecks = {\"tas\": {\"cfcheck_from_name\": {}}}\n",
    "freq = \"MS\"  # We actually have daily data, so we should get a warning.\n",
    "missing = {\"missing_any\": {\"freq\": \"D\"}}\n",
    "flags = {\"tas\": {\"temperature_extremely_low\": {}}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xs.diagnostics.health_checks(\n",
    "    ds,\n",
    "    structure=structure,\n",
    "    calendar=calendar,\n",
    "    start_date=start_date,\n",
    "    end_date=end_date,\n",
    "    variables_and_units=variables_and_units,\n",
    "    cfchecks=cfchecks,\n",
    "    freq=freq,\n",
    "    missing=missing,\n",
    "    flags=flags,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {},
   "source": [
    "## Properties and measures\n",
    "\n",
    "This framework for the diagnostic tests was inspired by the [VALUE project](http://www.value-cost.eu/).\n",
    "Statistical Properties is the xclim term for 'indices' in the VALUE project.\n",
    "\n",
    "The `xscen.properties_and_measures` function is a wrapper for [xsbda.properties](https://xsdba.readthedocs.io/en/stable/_modules/xsdba/properties.html) and [xsbda.measures](https://xsdba.readthedocs.io/en/stable/_modules/xsdba/measures.html).\n",
    "\n",
    "- `xsbda.properties` are statistical properties of a climate dataset. They allow for a better understanding of the climate by collapsing the time dimension. A few examples: mean, variance, mean spell length, annual cycle, etc.\n",
    "\n",
    "- `xsbda.measures` assess the difference between two datasets of properties. A few examples: bias, ratio, circular bias, etc."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "Let's start by calculating the properties on the reference dataset.\n",
    "You have to provide the path to a YAML file `properties` describing the properties you want to compute.\n",
    "You can also specify a period to select and a unit conversion to apply before computing the properties.\n",
    "\n",
    "This example will use a YAML file structured like this:\n",
    "\n",
    "```\n",
    "realm: generic\n",
    "indicators:\n",
    "  quantile_98_tas:\n",
    "    base: xsdba.properties.quantile\n",
    "    cf_attrs:\n",
    "      long_name: 98th quantile of the mean temperature\n",
    "    input:\n",
    "      da: tas\n",
    "    parameters:\n",
    "      q: 0.98\n",
    "      group: time.season\n",
    "  maximum_length_of_warm_spell:\n",
    "    base: xsdba.properties.spell_length_distribution\n",
    "    cf_attrs:\n",
    "      long_name: Maximum spell length distribution when the mean temperature is larger or equal to the 90th quantile.\n",
    "    input:\n",
    "      da: tas\n",
    "    parameters:\n",
    "      method: quantile\n",
    "      op: '>='\n",
    "      thresh: 0.9\n",
    "      stat: max\n",
    "  mean-tas:\n",
    "    base: xsdba.properties.mean\n",
    "    cf_attrs:\n",
    "      long_name: Ratio of the mean temperature\n",
    "    input:\n",
    "      da: tas\n",
    "    measure: xsdba.measures.BIAS\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "properties = \"samples/properties.yml\"\n",
    "period = [1981, 2010]\n",
    "change_units_arg = {\"tas\": \"degC\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {},
   "source": [
    "The properties can be given an argument `group` ('time', 'time.season' or 'time.month'). For 'time', the time collapsing will be performed over the whole period. For 'time.season'/'time.month', the time collapsing will be performed over each season/month. See `quantile_98_tas` as an example for season."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load input\n",
    "dref = gettingStarted_cat.search(source=\"ERA5-Land\").to_dataset()\n",
    "\n",
    "# calculate properties and measures\n",
    "prop_ref, _ = xs.properties_and_measures(\n",
    "    ds=dref,\n",
    "    properties=properties,\n",
    "    period=period,\n",
    "    change_units_arg=change_units_arg,\n",
    "    to_level_prop=\"diag-properties-ref\",\n",
    ")\n",
    "\n",
    "# save and update catalog\n",
    "filename = str(\n",
    "    output_folder\n",
    "    / f\"{prop_ref.attrs['cat:id']}.{prop_ref.attrs['cat:domain']}.{prop_ref.attrs['cat:processing_level']}.zarr\"\n",
    ")\n",
    "xs.save_to_zarr(ds=prop_ref, filename=filename, mode=\"o\")\n",
    "pcat.update_from_ds(ds=prop_ref, path=filename, format=\"zarr\")\n",
    "prop_ref"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {},
   "source": [
    "To compute a measure as well as a property, add the `dref_for_measure` argument with the reference properties calculated above. This will mesure the difference between the reference properties and the scenario properties.\n",
    "A default measure is associated with each properties, but it is possible to define a new one in the YAML (see `mean-tas` for example where the default (bias) was changed for ratio.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load input\n",
    "dscen = gettingStarted_cat.search(\n",
    "    source=\"NorESM2-MM\",\n",
    "    experiment=\"ssp245\",\n",
    "    member=\"r1.*\",\n",
    "    processing_level=\"biasadjusted\",\n",
    ").to_dataset()\n",
    "\n",
    "# calculate properties and measures\n",
    "prop_scen, meas_scen = xs.properties_and_measures(\n",
    "    ds=dscen,\n",
    "    properties=properties,\n",
    "    period=period,\n",
    "    dref_for_measure=prop_ref,\n",
    "    change_units_arg={\"tas\": \"degC\"},\n",
    "    to_level_prop=\"diag-properties-scen\",\n",
    "    to_level_meas=\"diag-measures-scen\",\n",
    ")\n",
    "\n",
    "\n",
    "display(prop_scen)\n",
    "display(meas_scen)\n",
    "\n",
    "# save and update catalog\n",
    "for ds in [prop_scen, meas_scen]:\n",
    "    filename = str(\n",
    "        output_folder\n",
    "        / f\"{ds.attrs['cat:id']}.{ds.attrs['cat:domain']}.{ds.attrs['cat:processing_level']}.zarr\"\n",
    "    )\n",
    "    xs.save_to_zarr(ds=ds, filename=filename, mode=\"o\")\n",
    "    pcat.update_from_ds(ds=ds, path=filename, format=\"zarr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "var = \"mean-tas\"\n",
    "\n",
    "# plot\n",
    "fig, axs = plt.subplots(1, 3, figsize=(15, 5))\n",
    "prop_ref[var].transpose(\"lat\", ...).plot(ax=axs[0], cmap=\"plasma\", vmin=3, vmax=6)\n",
    "prop_scen[var].transpose(\"lat\", ...).plot(ax=axs[1], cmap=\"plasma\", vmin=3, vmax=6)\n",
    "meas_scen[var].transpose(\"lat\", ...).plot(ax=axs[2], cmap=\"RdBu_r\", vmin=-3, vmax=3)\n",
    "axs[0].set_title(\"Reference\")\n",
    "axs[1].set_title(\"Scenario\")\n",
    "axs[2].set_title(\"Bias between Reference and Scenario\")\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "If you have different methods of bias adjustment, you might want to compare them and see for each property which method performs best (bias close to 0, ratio close to 1) with a `measures_heatmap`.\n",
    "\n",
    "Below is an example comparing properties of a simulation (no bias adjustment) and a scenario (with quantile mapping bias adjustment). Both the simulation and the scenario use the same reference for the measures.\n",
    "\n",
    "Note that it is possible to add many rows to `measures_heatmap`.\n",
    "\n",
    "<div class=\"alert alert-info\"> <b>NOTE</b>\n",
    "\n",
    "The bias correction performed in the Getting Started tutorial was adjusted for speed rather than performance, using only a few quantiles. The performance results below are thus quite poor, but that was expected.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# repeat the step above for the simulation (no bias adjustment)\n",
    "dsim = gettingStarted_cat.search(\n",
    "    source=\"NorESM2-MM\",\n",
    "    experiment=\"ssp245\",\n",
    "    member=\"r1.*\",\n",
    "    processing_level=\"regridded\",\n",
    ").to_dataset()\n",
    "prop_sim, meas_sim = xs.properties_and_measures(\n",
    "    ds=dsim,\n",
    "    properties=properties,\n",
    "    period=period,\n",
    "    dref_for_measure=prop_ref,\n",
    "    change_units_arg=change_units_arg,\n",
    "    to_level_prop=\"diag-properties-sim\",\n",
    "    to_level_meas=\"diag-measures-sim\",\n",
    ")\n",
    "\n",
    "# save and update catalog\n",
    "for ds in [prop_sim, meas_sim]:\n",
    "    filename = str(\n",
    "        output_folder\n",
    "        / f\"{ds.attrs['cat:id']}.{ds.attrs['cat:domain']}.{ds.attrs['cat:processing_level']}.zarr\"\n",
    "    )\n",
    "    xs.save_to_zarr(ds=ds, filename=filename, mode=\"o\")\n",
    "    pcat.update_from_ds(ds=ds, path=filename, format=\"zarr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the measures for both kinds of data (sim and scen)\n",
    "meas_datasets = pcat.search(\n",
    "    processing_level=[\"diag-measures-sim\", \"diag-measures-scen\"]\n",
    ").to_dataset_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import colors\n",
    "\n",
    "# calculate the heatmap\n",
    "hm = xs.diagnostics.measures_heatmap(meas_datasets=meas_datasets)\n",
    "\n",
    "# plot the heat map\n",
    "fig_hmap, ax = plt.subplots(figsize=(10, 2))\n",
    "cmap = plt.cm.RdYlGn_r\n",
    "norm = colors.BoundaryNorm(np.linspace(0, 1, 4), cmap.N)\n",
    "im = ax.imshow(hm.heatmap.values, cmap=cmap, norm=norm)\n",
    "ax.set_xticks(ticks=np.arange(3), labels=hm.properties.values, rotation=45, ha=\"right\")\n",
    "ax.set_yticks(ticks=np.arange(2), labels=hm.realization.values)\n",
    "divider = make_axes_locatable(ax)\n",
    "cax = divider.new_vertical(size=\"15%\", pad=0.4)\n",
    "fig_hmap.add_axes(cax)\n",
    "cbar = fig_hmap.colorbar(im, cax=cax, ticks=[0, 1], orientation=\"horizontal\")\n",
    "cbar.ax.set_xticklabels([\"best\", \"worst\"])\n",
    "plt.title(\"Normalised mean measure of properties\")\n",
    "fig_hmap.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20",
   "metadata": {},
   "source": [
    "`measure_improved` is another way to compare two datasets. It returns the fraction of the grid points that performed better in the second dataset than in the first dataset. It is useful to see which of properties are best corrected for by the bias adjustment method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# change the order of meas_dataset to have sim first, because we want to see how scen improved compared to sim.\n",
    "ordered_keys = [\n",
    "    \"CMIP6_ScenarioMIP_NCC_NorESM2-MM_ssp245_r1i1p1f1_example-region.finer-grid.diag-measures-sim.fx\",\n",
    "    \"CMIP6_ScenarioMIP_NCC_NorESM2-MM_ssp245_r1i1p1f1_example-region.finer-grid.diag-measures-scen.fx\",\n",
    "]\n",
    "meas_datasets = {k: meas_datasets[k] for k in ordered_keys}\n",
    "\n",
    "pb = xs.diagnostics.measures_improvement(meas_datasets=meas_datasets)\n",
    "\n",
    "# plot\n",
    "percent_better = pb.improved_grid_points.values\n",
    "percent_better = np.reshape(np.array(percent_better), (1, 3))\n",
    "fig_per, ax = plt.subplots(figsize=(10, 2))\n",
    "cmap = plt.cm.RdYlGn\n",
    "norm = colors.BoundaryNorm(np.linspace(0, 1, 100), cmap.N)\n",
    "im = ax.imshow(percent_better, cmap=cmap, norm=norm)\n",
    "ax.set_xticks(ticks=np.arange(3), labels=pb.properties.values, rotation=45, ha=\"right\")\n",
    "ax.set_yticks(ticks=np.arange(1), labels=[\"\"])\n",
    "\n",
    "divider = make_axes_locatable(ax)\n",
    "cax = divider.new_vertical(size=\"15%\", pad=0.4)\n",
    "fig_per.add_axes(cax)\n",
    "cbar = fig_per.colorbar(\n",
    "    im, cax=cax, ticks=np.arange(0, 1.1, 0.1), orientation=\"horizontal\"\n",
    ")\n",
    "plt.title(\n",
    "    \"Fraction of grid cells of scen that improved or stayed the same compared to sim\"\n",
    ")\n",
    "fig_per.tight_layout()"
   ]
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": null,
   "lastKernelId": null
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}