An Exception was encountered at ‘In [14]’.

ROF monthly, annual, seasonal discharge at ocean outlets #

Use the following datasets

  1. reach-D19 gauge link ascii

  2. D19 flow site geopackage

  3. D19 discharge netCDF

  4. monthly and yearly flow netCDF (history file)

1. Setup

2. Loading discharge data

  • Read monthly history files from archive.

  • Reference data: monthly discharge estimates at 922 big river mouths from Dai et al. 2019 data (D19)

3. Read river, catchment, gauge information

  • catchment polygon (geopackage)

  • gauge point (geopackage)

  • gauge-catchment link (csv)

  • outlet reach information (netCDF) including discharging ocean names

4. Ocean discharge line plots

  • total seasonal flow for oceans.

Hide code cell source

%matplotlib inline

import os, sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import cartopy.feature as cfeature

# Project helpers (CUPiD ROF utilities)
from scripts.utility import load_yaml  # read a YAML file into a dict
from scripts.utility import no_time_variable  # variables lacking a time dimension
from scripts.utility import read_shps  # read shapefiles/geopackages into a dataframe
from scripts.utility import get_index_array  # indices of id values in a reference array

# Natural Earth map layers used as plot background features
rivers_50m = cfeature.NaturalEarthFeature("physical", "rivers_lake_centerlines", "50m")
land = cfeature.LAND

# Record the software stack for reproducibility
print("\nThe Python version: %s.%s.%s" % sys.version_info[:3])
print(xr.__name__, xr.__version__)
print(pd.__name__, pd.__version__)
print(gpd.__name__, gpd.__version__)
The Python version: 3.11.4
xarray 2025.4.0
pandas 2.2.3
geopandas 1.0.1
ERROR 1: PROJ: proj_create_from_database: Open of /glade/work/hannay/miniconda3/envs/cupid-analysis/share/proj failed

1. Setup #

Hide code cell source

# Parameter Defaults
# parameters are set in CUPiD's config.yml file
# when running interactively, manually set the parameters below

CESM_output_dir = ""  # root directory of the case archives
case_name = None  # case name: e.g., "b.e30_beta02.BLT1850.ne30_t232.104"
base_case_name = None  # base case name: e.g., "b.e23_alpha17f.BLT1850.ne30_t232.092"
start_date = ""
end_date = ""
base_start_date = ""  # base simulation starting date: "0001-01-01"
base_end_date = ""  # base simulation ending date: "0100-01-01"
serial = True  # use dask LocalCluster
lc_kwargs = {}

analysis_name = ""  # Used for Figure png names
climo_nyears = 10  # number of years to compute the climatology
grid_name = "f09_f09_mosart"  # ROF grid name used in case
base_grid_name = (
    grid_name  # specify ROF grid name for base_case in config.yml if different than case
)
figureSave = False
# Parameters
# (injected by papermill/CUPiD; these override the defaults above)
case_name = "b.e30_beta06.B1850C_LTso.ne30_t232_wgx3.192"
base_case_name = "b.e30_beta06.B1850C_LTso.ne30_t232_wgx3.188"
CESM_output_dir = "/glade/derecho/scratch/hannay/archive"
# NOTE(review): base case archive root — verify the history-loading cell
# actually reads the base case from here rather than CESM_output_dir
base_case_output_dir = "/glade/derecho/scratch/gmarques/archive"
start_date = "0002-01-01"
end_date = "0021-12-01"
base_start_date = "0002-01-01"
base_end_date = "0021-12-01"
obs_data_dir = (
    "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data"
)
ts_dir = None
lc_kwargs = {"threads_per_worker": 1}
serial = True
analysis_name = ""
grid_name = "f09_f09_mosart"
climo_nyears = 10
figureSave = False
subset_kwargs = {}
product = "/glade/work/hannay/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_ocean_compare_obs.ipynb"

Hide code cell source

# ROF additional setup: read ancillary paths and per-grid metadata from YAML.
setup = load_yaml("./setup/setup.yaml")

domain_dir = setup[
    "ancillary_dir"
]  # ancillary directory including such as ROF domain, river network data
geospatial_dir = setup["geospatial_dir"]  # including shapefiles or geopackages
ref_flow_dir = setup["ref_flow_dir"]  # including observed or reference flow data
case_meta = setup["case_meta"]  # ROF grid meta
catch_gpkg = setup["catch_gpkg"]  # catchment geopackage meta
reach_gpkg = setup["reach_gpkg"]  # reach geopackage meta
network_nc = setup["river_network"]  # river network meta

# Default the figure label to the case name when not provided.
if not analysis_name:
    analysis_name = case_name
# BUGFIX: was `if base_grid_name:`, which clobbered any user-specified
# base_grid_name with grid_name; only fall back when it is unset/empty.
if not base_grid_name:
    base_grid_name = grid_name

# Per-case metadata: ROF grid, analysis period, and the number of
# climatology years actually available within that period.
case_dic = {
    case_name: {
        "grid": grid_name,
        "sim_period": slice(f"{start_date}", f"{end_date}"),
        "climo_nyrs": min(climo_nyears, int(end_date[:4]) - int(start_date[:4]) + 1),
    },
    base_case_name: {
        # BUGFIX: use base_grid_name (not grid_name) so a base case run on a
        # different ROF grid is read with its own metadata.
        "grid": base_grid_name,
        "sim_period": slice(f"{base_start_date}", f"{base_end_date}"),
        "climo_nyrs": min(
            climo_nyears, int(base_end_date[:4]) - int(base_start_date[:4]) + 1
        ),
    },
}
# Ocean basins plotted in the discharge section; "global" aggregates all outlets.
oceans_list = [
    "arctic",
    "atlantic",
    "indian",
    "mediterranean",
    "pacific",
    "south_china",
    "global",
]

Dask (optional)#

Hide code cell source

# When running interactively, cupid_run should be set to False
cupid_run = True

client = None
# Spin up a Dask cluster only for parallel runs; otherwise client stays None.
if not serial:
    if cupid_run:
        # CUPiD-driven run: lightweight in-process cluster
        from dask.distributed import Client, LocalCluster

        cluster = LocalCluster(**lc_kwargs)
        client = Client(cluster)
    else:
        # Interactive run: request workers through the PBS queue
        from dask.distributed import Client
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(
            cores=1,
            processes=1,
            memory="50GB",
            queue="casper",
            walltime="01:00:00",
        )
        cluster.scale(jobs=10)
        client = Client(cluster)
client

2. Loading discharge data #

2.1. Monthly/annual flow netCDFs#

  • month_data (xr dataset)

  • year_data (xr dataset)

  • seas_data (xr dataset)

Hide code cell source

%%time

reachID = {}
month_data = {}
year_data = {}
seas_data = {}
for case, meta in case_dic.items():
    in_dire = os.path.join(CESM_output_dir, case, "rof/hist")
    model = case_meta[meta["grid"]]["model"]
    domain = case_meta[meta["grid"]]["domain_nc"]
    var_list = case_meta[meta["grid"]]["vars_read"]

    def preprocess(ds):
        return ds[var_list]

    year_list = [
        "{:04d}".format(yr)
        for yr in np.arange(
            int(meta["sim_period"].start[0:4]), int(meta["sim_period"].stop[0:4]) + 1
        )
    ]

    nc_list = []
    for nc_path in sorted(glob.glob(f"{in_dire}/{case}.{model}.h*.????-*.nc")):
        for yr in year_list:
            if yr in os.path.basename(nc_path):
                nc_list.append(nc_path)

    # load data
    ds = xr.open_mfdataset(
        nc_list,
        data_vars="minimal",
        parallel=True,
        preprocess=preprocess,
    ).sel(time=meta["sim_period"])

    # monthly
    month_data[case] = ds.isel(time=slice(-meta["climo_nyrs"] * 12, None))
    # annual
    year_data[case] = (
        ds.isel(time=slice(-meta["climo_nyrs"] * 12, None))
        .resample(time="YS")
        .mean(dim="time")
        .load()
    )
    # seasonal (compute here instead of reading for conisistent analysis period)
    seas_data[case] = (
        ds.isel(time=slice(-meta["climo_nyrs"] * 12, None))
        .groupby("time.month")
        .mean("time")
        .load()
    )
    vars_no_time = no_time_variable(month_data[case])
    if vars_no_time:
        seas_data[case][vars_no_time] = seas_data[case][vars_no_time].isel(
            month=0, drop=True
        )
    mon_time = month_data[case].time.values
    if domain == "None":
        reachID[case] = month_data[case]["reachID"].values
    else:
        reachID[case] = (
            xr.open_dataset(f"{domain_dir}/{domain}")["reachID"]
            .stack(seg=("lat", "lon"))
            .values
        )
    print(f"Finished loading {case}")
Finished loading b.e30_beta06.B1850C_LTso.ne30_t232_wgx3.192
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File <timed exec>:28

File /glade/work/hannay/miniconda3/envs/cupid-analysis/lib/python3.11/site-packages/xarray/backends/api.py:1597, in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs)
   1594 paths = _find_absolute_paths(paths, engine=engine, **kwargs)
   1596 if not paths:
-> 1597     raise OSError("no files to open")
   1599 paths1d: list[str | ReadBuffer]
   1600 if combine == "nested":

OSError: no files to open

2.2 D19 discharge data#

  • ds_q_obs_mon (xr datasets)

  • ds_q_obs_yr (xr datasets)

  • dr_q_obs_seasonal (xr datasets)

Hide code cell source

%%time

# read monthly data
ds_q = xr.open_dataset(
    "%s/D09/coastal-stns-Vol-monthly.updated-May2019.mod.nc" % (ref_flow_dir),
    decode_times=False,
)
ds_q["time"] = xr.cftime_range(
    start="1900-01-01", end="2018-12-01", freq="MS", calendar="standard"
)

# monthly- if time_period is outside observation period, use the entire obs period
obs_available = True
if ds_q["time"].sel(time=slice(str(mon_time[0]), str(mon_time[-1]))).values.size == 0:
    obs_available = False
    ds_q_obs_mon = ds_q["FLOW"]
else:
    ds_q_obs_mon = ds_q["FLOW"].sel(time=slice(str(mon_time[0]), str(mon_time[-1])))
# compute annual flow from monthly
ds_q_obs_yr = ds_q_obs_mon.resample(time="YE").mean(dim="time")
# compute annual cycle at monthly scale
dr_q_obs_seasonal = ds_q_obs_mon.groupby("time.month").mean("time")
CPU times: user 155 ms, sys: 3.63 ms, total: 158 ms
Wall time: 160 ms
<timed exec>:6: DeprecationWarning: cftime_range() is deprecated, please use xarray.date_range(..., use_cftime=True) instead.

3. Reading river, catchment, gauge information #

  • gauge-catchment (or grid box) link (csv)

  • gauge point (geopackage)

  • ocean polygon (geopackage)

  • catchment polygon (geopackage)

  • outlet reach information (netCDF)

3.2 D19 flow site geopackage#

  • gauge_shp (dataframe)

Hide code cell source

%%time

gauge_shp = gpd.read_file(
    os.path.join(ref_flow_dir, "D09", "geospatial", "D09_925.gpkg")
)
gauge_shp = gauge_shp[gauge_shp["id"] != 9999999]
CPU times: user 13 ms, sys: 4.53 ms, total: 17.6 ms
Wall time: 41.2 ms

3.3 Ocean polygon geopackage#

  • ocean_shp (dataframe)

Hide code cell source

%%time

ocean_shp = gpd.read_file(os.path.join(geospatial_dir, "oceans.gpkg"))
CPU times: user 182 ms, sys: 10.3 s, total: 10.5 s
Wall time: 11 s

3.3 Read river network information#

  • gdf_cat (dataframe)

%%time

## read catchment geopackage
gdf_cat = {}
for case, meta in case_dic.items():
    cat_gpkg = os.path.join(
        geospatial_dir, catch_gpkg[meta["grid"]]["file_name"]
    )  # geopackage name
    id_name_cat = catch_gpkg[meta["grid"]]["id_name"]  # reach ID in geopackage
    var_list = [id_name_cat]
    if "lk" in grid_name:
        var_list.append("lake")
    gdf_cat[case] = read_shps([cat_gpkg], var_list)
Finished reading /glade/campaign/cesm/development/cross-wg/diagnostic_framework/rof_data/geospatial/MOSART_routing_Global_0.5x0.5_c170601_hru.gpkg
Finished reading /glade/campaign/cesm/development/cross-wg/diagnostic_framework/rof_data/geospatial/MOSART_routing_Global_0.5x0.5_c170601_hru.gpkg
CPU times: user 574 ms, sys: 7.76 ms, total: 581 ms
Wall time: 660 ms

3.4 Read river outlet information#

  • Append into gdf_cat (dataframe)

Hide code cell source

%%time

# read river outlet netcdf
riv_ocean = {}
for case, meta in case_dic.items():
    riv_ocean_file = os.path.join(
        domain_dir,
        network_nc[meta["grid"]]["file_name"].replace(".aug.nc", ".outlet.nc"),
    )  # network netcdf name
    ds_rn_ocean = xr.open_dataset(riv_ocean_file).set_index(seg="seg_id")
    df_tmp = ds_rn_ocean.to_dataframe()
    riv_ocean[case] = pd.merge(
        gdf_cat[case],
        df_tmp,
        left_on=catch_gpkg[meta["grid"]]["id_name"],
        right_index=True,
    )
CPU times: user 204 ms, sys: 558 ms, total: 762 ms
Wall time: 833 ms

3.5 Merge gauge, outlet catchment dataframe#

  • gauge_shp1 (dataframe)

Hide code cell source

%%time

# Merge gauge_reach lnk (dataframe) into gauge shapefile
gauge_shp1 = {}
for case, meta in case_dic.items():
    df = gauge_reach_lnk[case]

    # df = df.loc[(df['flag'] == 0)]
    df1 = df.drop(columns=["riv_name"])
    df2 = pd.merge(gauge_shp, df1, how="inner", left_on="id", right_on="gauge_id")
    gauge_shp1[case] = pd.merge(
        df2,
        riv_ocean[case],
        how="inner",
        left_on="route_id",
        right_on=catch_gpkg[meta["grid"]]["id_name"],
    )
CPU times: user 10.6 ms, sys: 1.87 ms, total: 12.5 ms
Wall time: 11.9 ms

4. Plot annual cycle for global oceans #

Execution using papermill encountered an exception here and stopped:

Hide code cell source

%time

nrows = 4
ncols = 2
fig, axes = plt.subplots(nrows, ncols, figsize=(7.25, 6.5))
plt.subplots_adjust(
    top=0.95, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250
)  # create some space below the plots by increasing the bottom-value

for ix, ocean_name in enumerate(oceans_list):
    row = ix // 2
    col = ix % 2
    for case, meta in case_dic.items():

        q_name = case_meta[meta["grid"]]["flow_name"]

        if case_meta[meta["grid"]]["network_type"] == "vector":
            if ocean_name == "global":
                id_list = gauge_shp1[case]["route_id"].values
            else:
                id_list = gauge_shp1[case][gauge_shp1[case]["ocean"] == ocean_name][
                    "route_id"
                ].values
            reach_index = get_index_array(reachID[case], id_list)
            dr_flow = seas_data[case][q_name].isel(seg=reach_index).sum(dim="seg")
            dr_flow.plot(ax=axes[row, col], linestyle="-", lw=0.75, label=case)

        elif case_meta[grid_name]["network_type"] == "grid":  # means 2d grid
            if ocean_name == "global":
                id_list = gauge_shp1[case]["route_id"].values
            else:
                id_list = gauge_shp1[case][gauge_shp1[case]["ocean"] == ocean_name][
                    "route_id"
                ].values

            reach_index = get_index_array(reachID[case], id_list)
            seas_data_vector = seas_data[case][q_name].stack(seg=("lat", "lon"))
            dr_flow = seas_data_vector.isel(seg=reach_index).sum(dim="seg")
            dr_flow.plot(ax=axes[row, col], linestyle="-", lw=0.75, label=case)

    # reference data
    if obs_available:
        if ocean_name == "global":
            id_list = gauge_shp1[case]["id"].values
        else:
            id_list = gauge_shp1[case][gauge_shp1[case]["ocean"] == ocean_name][
                "id"
            ].values
        gauge_index = get_index_array(ds_q["id"].values, id_list)
        dr_obs = dr_q_obs_seasonal.isel(station=gauge_index).sum(dim="station")
        dr_obs.plot(
            ax=axes[row, col],
            linestyle="None",
            marker="o",
            markersize=2,
            c="k",
            label="D19",
        )

    axes[row, col].set_title("%d %s" % (ix + 1, ocean_name), fontsize=9)
    axes[row, col].set_xlabel("")
    if row < 7:
        axes[row, col].set_xticklabels("")
    if col == 0:
        axes[row, col].set_ylabel("Mon. flow [m$^3$/s]", fontsize=9)
    else:
        axes[row, col].set_ylabel("")
    axes[row, col].tick_params("both", labelsize="x-small")

# Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.
axes[row, col].legend(
    loc="center left", bbox_to_anchor=(1.10, 0.40, 0.75, 0.1), ncol=1, fontsize="small"
)

for jx in range(ix + 1, nrows * ncols):
    row = jx // 2
    col = jx % 2
    fig.delaxes(axes[row][col])

if figureSave:
    plt.savefig(f"./NB2_Fig1_ocean_discharge_season_{analysis_name}.png", dpi=200)
CPU times: user 2 μs, sys: 2 μs, total: 4 μs
Wall time: 8.11 μs
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[14], line 36
     31 else:
     32     id_list = gauge_shp1[case][gauge_shp1[case]["ocean"] == ocean_name][
     33         "route_id"
     34     ].values
---> 36 reach_index = get_index_array(reachID[case], id_list)
     37 seas_data_vector = seas_data[case][q_name].stack(seg=("lat", "lon"))
     38 dr_flow = seas_data_vector.isel(seg=reach_index).sum(dim="seg")

KeyError: 'b.e30_beta06.B1850C_LTso.ne30_t232_wgx3.188'
../_images/c694f8f5a80a2cd395ba9747ae897d75f016a8d6fa850b568c13bbbd2b4a66c5.png
# Shut down the Dask client (if one was started) to release its workers.
if client is not None:
    client.shutdown()