Summary of Rainfall Runoff datasets

This notebook gives a summary of all rainfall-runoff datasets available in the package and shows how to access them through the unified interface RainfallRunoff.

At the time of running this script, the datasets had previously been downloaded. Therefore, if you run this script for the first time, it may take days to run, or it may even fail before reaching the end because of internet connection issues.

[1]:
import os
import site

# Walk four directory levels up from the notebook's working directory and
# register that folder as a site directory, so that the local checkout of
# `aqua_fetch` is the one that gets imported below.
# NOTE: '__file__' is a plain string here (notebooks do not define __file__),
# so realpath() resolves it relative to the current working directory.
wd_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath('__file__')))))
#wd_dir = os.path.dirname(os.path.dirname(os.path.realpath('__file__')))
#wd_dir = os.path.dirname(os.path.realpath('__file__'))
print(wd_dir)
site.addsitedir(wd_dir)

import textwrap

import matplotlib
# global font settings applied to every figure in this notebook
nice_fonts = {
    #"text.usetex": True,
    "font.family": "sans-serif",  #sans -serif
    #"font.serif" : "Times New Roman",
}
matplotlib.rcParams.update(nice_fonts)

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

from easy_mpl.utils import despine_axes

# these imports must come after site.addsitedir(wd_dir) above
from aqua_fetch.utils import print_info
from aqua_fetch import RainfallRunoff

# record package versions and machine information alongside the results
print_info()
# path where the data will be downloaded or has previously been downloaded.
# Set the AQUAFETCH_DATA_PATH environment variable to use another location;
# when it is unset (or empty) the original default below is used.
DATA_PATH = os.environ.get('AQUAFETCH_DATA_PATH') or '/mnt/datawaha/hyex/atr/gscad_database/raw'
/home/abbaa0a/AquaFetch
numpy 1.26.4
pandas 2.2.3
water_quality 0.1.0
python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]
os posix
matplotlib 3.8.4
shapefile 2.3.1
xarray 2024.7.0
netCDF4 1.6.2
scipy 1.13.0
Script Executed on:  19 January 2025 12:00:21
tot_cpus 112
avail_cpus 112
mem_gib 251.52817153930664
[2]:
# Map every dataset name to the directory that holds (or will receive) its
# files. The insertion order matters: it decides the order in which the
# scatter layers are drawn and which colour each dataset gets.
datasets = {
    "Arcticnet" : DATA_PATH,
    "Bull" : DATA_PATH,
    "CABra" : DATA_PATH,
    # GRDC Caravan overshadows the other datasets on the map, so keep it
    # near the start so that the datasets listed after it are plotted later
    "GRDCCaravan": DATA_PATH,
    #"CAMELS_AUS" : os.path.join(DATA_PATH, 'CAMELS_AUS_V1'),
    "CAMELS_AUS": os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_GB" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_BR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_US" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CH" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_IND" : os.path.join(DATA_PATH, 'CAMELS'),
    "Caravan_DK": DATA_PATH,
    "LamaHCE" : DATA_PATH,
    "LamaHIce" : os.path.join(DATA_PATH, 'LamaHIce_daily'),
    "HYSETS": os.path.join(DATA_PATH, 'HYSETS'),
    "CCAM": DATA_PATH,
    "Japan": DATA_PATH,
    "Ireland": DATA_PATH,
    "Finland": DATA_PATH,
    "Italy": DATA_PATH,
    "Poland": DATA_PATH,
    "Portugal": DATA_PATH,
    "Simbi": DATA_PATH,
    "Spain": DATA_PATH,
    "Thailand": DATA_PATH,
    "USGS": DATA_PATH,
}

# one colour per dataset, taken in dict order from tab20 followed by tab20b
colors = plt.cm.tab20.colors + plt.cm.tab20b.colors
if len(datasets) > len(colors):
    # fail before the (slow) data loading rather than at plotting time
    raise ValueError(
        f"{len(datasets)} datasets but only {len(colors)} colors available")

rets = {}   # dataset name -> scatter artist (used as legend handle)
items = {}  # dataset name -> number of stations

# The legend is split into three boxes; each block lists the datasets of one box.
block1 = ['HYSETS', 'Italy', 'GRDCCaravan', 'LamaHCE', 'LamaHIce', "CABra", "CAMELS_US",
          "CAMELS_CL", 'Ireland', 'Spain', 'Poland', 'CAMELS_SE', 'USGS', "Bull", "CAMELS_BR"]

block2 = ['CAMELS_DK', 'CAMELS_FR', 'CAMELS_DE', 'Portugal',
          "CAMELS_GB", "CAMELS_CH", "Caravan_DK"]

block3 = ['Arcticnet', 'Thailand', 'CCAM', 'Japan', 'Finland', 'CAMELS_AUS',
          'CAMELS_IND', "Simbi"]

# collect the coords data
coords_data = {}
for src, path in datasets.items():

    # LamaHCE is the only dataset that is given extra keyword arguments
    kws = {}
    if src == 'LamaHCE':
        kws = dict(timestep='D', data_type='total_upstrm')

    ds = RainfallRunoff(src, path=path, verbosity=0, **kws)

    coords_data[src] = ds.stn_coords()

# draw the figure
_, ax = plt.subplots(figsize=(10, 12))

# named `basemap` (not `map`) so that the builtin map() is not shadowed
basemap = Basemap(ax=ax, resolution='l')
basemap.drawcoastlines(linewidth=0.3, ax=ax, color="gray", zorder=0)
for idx, src in enumerate(datasets):

    coords = coords_data[src]

    ret = basemap.scatter(coords['long'].values, coords['lat'].values,
                marker=".",
                s=2,
                linewidths=0.0,
                color = colors[idx],
                alpha=1.0,
                label=f"{src} (n={coords.shape[0]})")

    rets[src] = ret
    items[src] = coords.shape[0]


def _block_legend(names, anchor, **extra):
    """Draw one legend box on ``ax`` for the datasets in ``names``.

    names  : dataset names, in the order they should be listed
    anchor : (x, y) passed as ``bbox_to_anchor`` (lower-left corner of the box)
    extra  : additional keyword arguments forwarded to ``ax.legend``

    Returns the legend object created by ``ax.legend``.
    """
    return ax.legend(
        [rets[name] for name in names],
        [f"{name} (n={items[name]})" for name in names],
        markerscale=12,
        fontsize=8,
        borderpad=0.2,
        labelspacing=0.5,
        title_fontproperties={'weight': 'bold', 'size': 8+2},
        bbox_to_anchor=anchor,
        loc="lower left",
        **extra,
    )


# NOTE(review): block1 and block2 are listed alphabetically while block3 keeps
# its hand-written order; kept as in the original figure.
leg1 = _block_legend(sorted(block1), (0.001, 0.001), framealpha=0.6)
leg2 = _block_legend(sorted(block2), (0.34, 0.001))
leg3 = _block_legend(block3, (0.60, 0.001))
# Each ax.legend() call replaces the legend attached to the axes, so only the
# last one (leg3) is still attached; the first two must be added back by hand.
ax.add_artist(leg1)
ax.add_artist(leg2)

despine_axes(ax)
#plt.savefig("rr_stations.png", dpi=600, bbox_inches="tight")
plt.show()
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2541: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3223: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(os.path.join(fpath),
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3234: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(fpath,
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
../_images/_notebooks_rr_summary_2_1.svg

Arcticnet

[3]:
# Load Arcticnet through the unified interface; printing the object reports
# its number of stations and of dynamic/static features.
dataset = RainfallRunoff('Arcticnet', path=DATA_PATH, verbosity=0)
print(dataset)
Arcticnet with 106 stations, 27 dynamic and 35 static features

The static features of Arcticnet are the same as those of GSHA.

[4]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() works in place on the list returned by the property;
# whether that list is shared with the dataset object is not visible here.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[5]:
# Fetch the static features of all stations as one table and print its shape
# (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(106, 35)
[6]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
22
[6]:
EVP_uncertainty(%)      9
HYRIV_ID                0
LRAD_uncertainty(%)     2
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area                    0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     11
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slp_dg_uav              0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

Find the columns that have at least one NaN value.

[7]:
# Show the columns that contain at least one NaN value, if there are any.
# A bare expression nested inside an ``if`` block is not rendered by the
# notebook (only a cell's last top-level expression is), so the table is
# passed to display(), which the IPython kernel provides as a builtin.
nan_columns = df.loc[:, (df.isna().sum()>0)]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
[8]:
# NaN count per column, restricted to the columns that have at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[8]:
EVP_uncertainty(%)      9
LRAD_uncertainty(%)     2
pet_uncertainty(%)     11
dtype: int64
[9]:
# Print the dynamic (time-varying) feature names in sorted order,
# wrapped at 100 characters. The sort is done in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Bull

[10]:
# Load the Bull dataset; printing the object reports its number of stations
# and of dynamic/static features.
dataset = RainfallRunoff('Bull', path=DATA_PATH, verbosity=0)
print(dataset)
Bull with 484 stations, 55 dynamic and 214 static features
[11]:
# Print the static feature names of Bull in sorted order, wrapped at
# 100 characters. The sort is done in place.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
NSE, aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, area_hydroatlas, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav,
clz_cl_smj, cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07,
cmi_ix_s08, cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse,
dis_m3_pmn, dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav,
fec_cl_smj, fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav,
gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05,
glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13,
glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21,
glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq,
inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu,
low_prec_dur, low_prec_freq, moisture_index, nli_ix_sav, non-altered, p_mean, pac_pc_sse, pet_mean,
pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08,
pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02,
pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10,
pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01,
pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09,
pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu,
ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav,
snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08,
snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01,
swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09,
swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02,
tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10,
tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01,
wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09,
wet_pc_sg1, wet_pc_sg2
[12]:
# Fetch the static features of all Bull stations as one table and print its
# shape (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(484, 214)
[13]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[13]:
NSE           0
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 214, dtype: int64

Find the columns that have at least one NaN value.

[14]:
# Show the columns that contain at least one NaN value, if there are any.
# A bare expression nested inside an ``if`` block is not rendered by the
# notebook (only a cell's last top-level expression is), so the table is
# passed to display(), which the IPython kernel provides as a builtin.
nan_columns = df.loc[:, (df.isna().sum()>0)]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
No NaN values
[15]:
# NaN count per column, restricted to the columns that have at least one NaN
# (an empty Series when no column has missing values)
df.loc[:, df.isna().any()].isna().sum()
[15]:
Series([], dtype: float64)
[16]:
# Print the dynamic feature names of Bull in sorted order, wrapped at
# 100 characters. The sort is done in place.
# NOTE(review): the recorded output lists 'solrad_wm2_max' twice and has no
# 'solrad_wm2_min' -- possibly a naming slip in the dataset class; confirm.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_AEMET_max, airtemp_C_AEMET_min, airtemp_C_EMO1arc_max,
airtemp_C_EMO1arc_min, airtemp_C_ERA5Land_max, airtemp_C_ERA5Land_min, airtemp_C_mean_2m,
airtemp_C_mean_AEMET, airtemp_C_mean_EMO1arc, airtemp_C_mean_ERA5Land, dptemp_C_max, dptemp_C_mean,
dptemp_C_min, pcp_mm_AEMET, pcp_mm_BULL, pcp_mm_EMO1arc, pcp_mm_ERA5Land, pet_mm_AEMET,
pet_mm_EMO1arc, pet_mm_ERA5Land, pevap_mm, q_cms_obs, solrad_wm2, solrad_wm2_max, solrad_wm2_max,
streamflow_BULL, surface_pressure_max_BULL, surface_pressure_mean_BULL, surface_pressure_min_BULL,
swe_mm, swe_mm_max, swe_mm_min, thermrad_wm2, thermrad_wm2_max, thermrad_wm2_min,
volumetric_soil_water_layer_1_max_BULL, volumetric_soil_water_layer_1_mean_BULL,
volumetric_soil_water_layer_1_min_BULL, volumetric_soil_water_layer_2_max_BULL,
volumetric_soil_water_layer_2_mean_BULL, volumetric_soil_water_layer_2_min_BULL,
volumetric_soil_water_layer_3_max_BULL, volumetric_soil_water_layer_3_mean_BULL,
volumetric_soil_water_layer_3_min_BULL, volumetric_soil_water_layer_4_max_BULL,
volumetric_soil_water_layer_4_mean_BULL, volumetric_soil_water_layer_4_min_BULL,
windspeedu_mps_max_10m, windspeedu_mps_mean_10m, windspeedu_mps_min_10m, windspeedv_mps_max_10m,
windspeedv_mps_mean_10m, windspeedv_mps_min_10m

CABra

[17]:
# Load the CABra dataset; printing the object reports its number of stations
# and of dynamic/static features.
dataset = RainfallRunoff('CABra', path=DATA_PATH, verbosity=0)
print(dataset)
CABra with 735 stations, 12 dynamic and 97 static features
[18]:
# Print the static feature names of CABra in sorted order, wrapped at
# 100 characters. The sort is done in place.
# NOTE(review): the recorded output repeats 'ANA_ID' nine times and lists
# 'cover_crops' and 'cover_urban' twice, i.e. the feature names are not
# unique -- confirm against the dataset class.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, aquif_name, aquif_type,
aridity_index, baseflow_index, catch_area, catch_hand, catch_lith, catch_order, catch_slope,
catch_wtd, clim_et, clim_p, clim_pet, clim_quality, clim_rh, clim_srad, clim_tmax, clim_tmin,
clim_wind, cover_bare, cover_crops, cover_crops, cover_forest, cover_grass, cover_main, cover_moss,
cover_shrub, cover_snow, cover_urban, cover_urban, cover_waterp, cover_waters, dist_urban,
elev_gauge, elev_max, elev_mean, elev_min, fdc_slope, gauge_biome, gauge_hreg, gauge_state,
hand_class, hdisturb_index, latitude, longitude, missing_data, ndvi_djf, ndvi_jja, ndvi_mam,
ndvi_son, p_seasonality, q_1, q_5, q_95, q_99, q_cv, q_elasticity, q_hcv, q_hd, q_hf, q_hfd, q_lcv,
q_ld, q_lf, q_mean, q_zero, quality_index, res_area, res_number, res_regulation, res_volume,
runoff_coef, series_length, soil_bulk, soil_carbon, soil_clay, soil_depth, soil_sand, soil_silt,
soil_textclass, soil_type, sub_hconduc, sub_permeability, sub_porosity, water_demand, well_dynamic,
well_number, well_static
[19]:
# Fetch the static features of all CABra stations as one table and print its
# shape (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(735, 97)
[20]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[20]:
ANA_ID          0
ANA_ID          0
ANA_ID          0
ANA_ID          0
ANA_ID          0
               ..
sub_porosity    0
water_demand    0
well_dynamic    0
well_number     0
well_static     0
Length: 97, dtype: int64

Find the columns that have at least one NaN value.

[21]:
# Show the columns that contain at least one NaN value, if there are any.
# A bare expression nested inside an ``if`` block is not rendered by the
# notebook (only a cell's last top-level expression is), so the table is
# passed to display(), which the IPython kernel provides as a builtin.
nan_columns = df.loc[:, (df.isna().sum()>0)]
if nan_columns.shape[1] > 0:
    display(nan_columns)
else:
    print('No NaN values')
No NaN values
[22]:
# NaN count per column, restricted to the columns that have at least one NaN
# (an empty Series when no column has missing values)
df.loc[:, df.isna().any()].isna().sum()
[22]:
Series([], dtype: float64)
[23]:
# Print the dynamic feature names of CABra in sorted order, wrapped at
# 100 characters. The sort is done in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
Quality, aet_mm_ens, airtemp_C_ens_max, airtemp_C_ens_min, pcp_mm_ens, pet_mm_hg, pet_mm_pm,
pet_mm_pt, q_cms_obs, rh_%_ens, solrad_wm2_ens, windspeed_mps_ens

CAMELS_AUS

[24]:
# Load version 1 of CAMELS_AUS, which is read from its own 'CAMELS_AUS_V1'
# sub-directory; printing the object reports station and feature counts.
dataset = RainfallRunoff('CAMELS_AUS', path=os.path.join(DATA_PATH, 'CAMELS_AUS_V1'), version=1, verbosity=0)
print(dataset)
CAMELS_AUS with 222 stations, 26 dynamic and 166 static features
[25]:
# Print the static feature names of CAMELS_AUS (version 1) in sorted order,
# wrapped at 100 characters. The sort is done in place.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, anngro_mega, anngro_meso, anngro_micro, aridity, baseflow_index, carbnatesed,
catchment_area, catchment_di, claya, clayb, confinement, daystart, daystart_P, daystart_Q,
distupdamw, drainage_division, elev_max, elev_mean, elev_min, elev_range, elongratio, end_date,
erosivity, extract_ind_fac, flow_div_fac, flow_regime_di, frac_snow, geol_prim, geol_prim_prop,
geol_sec, geol_sec_prop, gromega_seas, gromeso_seas, gromicro_seas, hdf_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, igneous, impound_fac, infrastruc_fac,
ksat, landuse_fac, lat_centroid, lat_outlet, lc01_extracti, lc03_waterbo, lc04_saltlak,
lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands,
lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope,
lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long_centroid,
long_outlet, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, map_zone,
mean_slope_pct, metamorph, mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4,
mrvbf_prop_5, mrvbf_prop_6, mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status,
next_station_ds, notes, npp_1, npp_10, npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7,
npp_8, npp_9, npp_ann, num_nested_within, nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n,
nvis_grasses_e, nvis_grasses_n, nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n,
nvis_woodlands_e, nvis_woodlands_n, oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1,
pop_gt_10, pop_max, pop_mean, prop_forested, prop_missing_data, q_mean, q_uncert_n,
q_uncert_num_curves, q_uncert_q10, q_uncert_q10_lower, q_uncert_q10_upper, q_uncert_q50,
q_uncert_q50_lower, q_uncert_q50_upper, q_uncert_q90, q_uncert_q90_lower, q_uncert_q90_upper,
relief, reliefratio, river_di, river_region, runoff_ratio, sanda, sedvolc, settlement_fac, silicsed,
slope_fdc, solpawhc, solum_thickness, start_date, state_alt, state_outlet, station_name, strahler,
strdensity, stream_elas, unconsoldted, upsdist, zero_q_freq
[26]:
# Fetch the static features of all CAMELS_AUS (version 1) stations as one
# table and print its shape (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(222, 166)
[27]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
1175
[27]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat_outlet           0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 166, dtype: int64

Find the columns that have at least one NaN value.

[28]:
# Keep only the columns that contain at least one NaN value
df.loc[:, df.isna().any()]
[28]:
state_alt next_station_ds q_uncert_num_curves q_uncert_n q_uncert_q10 q_uncert_q10_upper q_uncert_q10_lower q_uncert_q50 q_uncert_q50_upper q_uncert_q50_lower q_uncert_q90 q_uncert_q90_upper q_uncert_q90_lower
station_id
912101A NT NaN 3.0 15226.0 0.015122 25.07% -21.06% 0.027200 20.06% -17.82% 0.121670 18.46% -15.13%
912105A NT 912101A 1.0 15232.0 0.016572 196.84% -93.24% 0.031969 129.72% -77.38% 0.161384 49.79% -40.02%
915011A NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
917107A NaN NaN 2.0 15772.0 0.001552 143.47% -66.93% 0.036077 51.70% -37.00% 0.371124 26.85% -22.30%
919003A NaN NaN 1.0 14933.0 0.004731 21.65% -18.16% 0.053229 15.45% -13.59% 1.273285 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
312061 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314207 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
314213 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
315450 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
318076 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

222 rows × 13 columns

[29]:
# NaN count per column, restricted to the columns that have at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[29]:
state_alt              212
next_station_ds        192
q_uncert_num_curves     56
q_uncert_n              56
q_uncert_q10            56
q_uncert_q10_upper     118
q_uncert_q10_lower     118
q_uncert_q50            56
q_uncert_q50_upper      66
q_uncert_q50_lower      67
q_uncert_q90            56
q_uncert_q90_upper      61
q_uncert_q90_lower      61
dtype: int64
[30]:
# Print the dynamic feature names of CAMELS_AUS (version 1) in sorted order,
# wrapped at 100 characters. The sort is done in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_awap_max, airtemp_C_awap_min, airtemp_C_silo_max, airtemp_C_silo_min, awap_vp_hpa,
et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO, evap_syn_SILO, mslp_SILO, pcp_mm_awap,
pcp_mm_silo, precipitation_var_AWAP, q_cms_obs, q_mmd_obs, rh_%_silo_tmax, rh_%_silo_tmin,
silo_vp_hpa, solrad_wm2_awap, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO
[31]:
# Load version 2 of CAMELS_AUS from the shared 'CAMELS' directory; this
# replaces the version-1 `dataset` object used in the cells above.
dataset = RainfallRunoff('CAMELS_AUS', path=os.path.join(DATA_PATH, 'CAMELS'), version=2, verbosity=0)
print(dataset)
CAMELS_AUS with 561 stations, 26 dynamic and 187 static features
[32]:
# Print the static feature names of CAMELS_AUS (version 2) in sorted order,
# wrapped at 100 characters. The sort is done in place.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
anngro_mega, anngro_meso, anngro_micro, aridity, carbnatesed, catchment_area, catchment_di, claya,
clayb, confinement, daystart, daystart_P, daystart_Q, distupdamw, drainage_division, elev_max,
elev_mean, elev_min, elev_range, elongratio, end_date, erosivity, extract_ind_fac, flow_div_fac,
flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec, geol_sec_prop, gromega_seas,
gromeso_seas, gromicro_seas, high_prec_dur, high_prec_freq, high_prec_timing, igneous, impound_fac,
infrastruc_fac, ksat, landuse_fac, lat_centroid, lat_outlet, lc01_extracti, lc03_waterbo,
lc04_saltlak, lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar,
lc11_wetlands, lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden,
lc25_shrbope, lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac,
long_centroid, long_outlet, low_prec_dur, low_prec_freq, low_prec_timing, map_zone, mean_slope_pct,
metamorph, mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5,
mrvbf_prop_6, mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes,
npp_1, npp_10, npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann,
num_nested_within, nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e,
nvis_grasses_n, nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e,
nvis_woodlands_n, oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max,
pop_mean, prop_forested, prop_missing_data, q_uncert_Q_above, q_uncert_days_above,
q_uncert_rmse_all, q_uncert_rmse_lower, q_uncert_rmse_upper, q_uncert_unique_curves, relief,
reliefratio, river_di, river_region, sanda, sedvolc, settlement_fac, sig_dur_RespTime,
sig_dur_high_Q_dur, sig_dur_low_Q_dur, sig_dur_zero_Q_dur, sig_freq_high_Q_freq,
sig_freq_low_Q_freq, sig_freq_zero_Q_freq, sig_mag_BFI, sig_mag_BaseMag, sig_mag_Q5, sig_mag_Q95,
sig_mag_Q_7_day_max, sig_mag_Q_7_day_min, sig_mag_Q_CoV, sig_mag_Q_mean, sig_mag_Q_skew,
sig_mag_Q_var, sig_mag_VarIdx, sig_other_EventRR, sig_other_PeakDistribution,
sig_other_PeakDistribution_low, sig_other_QP_elasticity, sig_other_RR_seasonality,
sig_other_SnowDayRatio, sig_other_SnowStorage, sig_other_Spearmans_rho, sig_other_StorageFromBase,
sig_other_TotalRR, sig_other_ratio_Event_TotalRR, sig_roc_AC1, sig_roc_AC1_low, sig_roc_BaseRecesK,
sig_roc_FDC_slope, sig_roc_FlashIdx, sig_roc_RLD, sig_roc_RecesK_early, sig_roc_RecesVarSeasonality,
sig_timing_HFD_mean, sig_timing_HFI_mean, silicsed, solpawhc, solum_thickness, start_date,
state_alt, state_outlet, station_name, strahler, strdensity, unconsoldted, upsdist
[33]:
# Fetch the static features of all CAMELS_AUS (version 2) stations as one
# table and print its shape (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(561, 187)
[34]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
1643
[34]:
station_name         0
drainage_division    0
river_region         0
notes                0
lat_outlet           0
                    ..
npp_8                0
npp_9                0
npp_10               0
npp_11               0
npp_12               0
Length: 187, dtype: int64

Find the columns that have at least one NaN value.

[35]:
# Keep only the columns that contain at least one NaN value
df.loc[:, df.isna().any()]
[35]:
state_alt next_station_ds q_uncert_unique_curves q_uncert_rmse_all q_uncert_rmse_lower q_uncert_rmse_upper q_uncert_days_above q_uncert_Q_above sig_mag_VarIdx sig_roc_FDC_slope sig_other_PeakDistribution_low
station_id
912101A NT NaN NaN NaN NaN NaN NaN NaN 0.292867 -1.916733 -2.180623
912105A NT 912101A NaN NaN NaN NaN NaN NaN 0.304694 -1.795139 -1.254491
915011A NaN NaN NaN NaN NaN NaN NaN NaN 1.083646 NaN -6.090788
915206A NaN NaN 25.0 25.172244 6.506520 20.955888 0.078362 19.011459 1.009843 NaN -8.491230
917107A NaN NaN 16.0 53.380009 1168.007627 21.192680 0.132802 12.283859 0.641856 -3.957062 -3.631162
... ... ... ... ... ... ... ... ... ... ... ...
318150 NaN 318181 8.0 13.679565 13.569136 10.168856 0.000000 0.000000 0.459891 -3.489307 -5.351701
318181 NaN NaN 24.0 8.209045 23.363542 5.920785 0.004200 0.450893 0.507649 -3.661257 -5.290249
318191 NaN 318150 11.0 8.226708 12.538870 6.093167 0.000000 0.000000 0.514683 -3.525028 -8.555535
318311 NaN 318150 10.0 19.588965 34.652832 14.517310 0.121428 11.333069 0.678704 -4.723863 -7.717046
319204 NaN NaN 5.0 6.379150 20.465465 4.794664 0.005493 0.536084 0.683732 -5.213989 -6.477004

561 rows × 11 columns

[36]:
# NaN count per column, restricted to the columns that have at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[36]:
state_alt                         544
next_station_ds                   391
q_uncert_unique_curves            102
q_uncert_rmse_all                 102
q_uncert_rmse_lower               102
q_uncert_rmse_upper               102
q_uncert_days_above               102
q_uncert_Q_above                  102
sig_mag_VarIdx                      2
sig_roc_FDC_slope                  91
sig_other_PeakDistribution_low      3
dtype: int64
[37]:
# Print the dynamic feature names of CAMELS_AUS (version 2) in sorted order,
# wrapped at 100 characters. The sort is done in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
agcd_h09_vp_hpa, agcd_h15_vp_hpa, airtemp_C_agcd_max, airtemp_C_agcd_min, airtemp_C_silo_max,
airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO, evap_syn_SILO,
mslp_SILO, pcp_mm_agcd, pcp_mm_silo, precipitation_var_AGCD, q_cms_obs, q_mmd_obs, rh_%_silo_tmax,
rh_%_silo_tmin, silo_vp_hpa, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO

CAMELS_GB

[38]:
# Load CAMELS_GB from the shared 'CAMELS' directory; printing the object
# reports its number of stations and of dynamic/static features.
dataset = RainfallRunoff('CAMELS_GB', path=os.path.join(DATA_PATH, 'CAMELS'), verbosity=0)
print(dataset)
CAMELS_GB with 671 stations, 10 dynamic and 145 static features
[39]:
# Print the static feature names of CAMELS_GB in sorted order, wrapped at
# 100 characters. The sort is done in place.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, abs_agriculture_perc, abs_amenities_perc, abs_energy_perc, abs_environmental_perc,
abs_industry_perc, abs_watersupply_perc, area, aridity, bankfull_flow, bares_perc, baseflow_index,
baseflow_index_ceh, benchmark_catch, bulkdens, bulkdens_5, bulkdens_50, bulkdens_95,
bulkdens_missing, clay_perc, clay_perc_missing, conductivity_cosby, conductivity_cosby_5,
conductivity_cosby_50, conductivity_cosby_95, conductivity_cosby_missing, conductivity_hypres,
conductivity_hypres_5, conductivity_hypres_50, conductivity_hypres_95, conductivity_hypres_missing,
crop_perc, discharges, dom_land_cover, dpsbar, dwood_perc, elev_10, elev_50, elev_90, elev_max,
elev_mean, elev_min, ewood_perc, flow_perc_complete, flow_period_end, flow_period_start,
frac_high_perc, frac_low_perc, frac_mod_perc, frac_snow, gauge_easting, gauge_elev, gauge_lat,
gauge_lon, gauge_name, gauge_northing, grass_perc, groundwater_abs, hfd_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, inter_high_perc, inter_low_perc,
inter_mod_perc, inwater_perc, low_nsig_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, no_gw_perc, nsig_low_perc, num_reservoir, organic_perc, organic_perc_missing,
p_mean, p_seasonality, pet_mean, porosity_cosby, porosity_cosby_5, porosity_cosby_50,
porosity_cosby_95, porosity_cosby_missing, porosity_hypres, porosity_hypres_5, porosity_hypres_50,
porosity_hypres_95, porosity_hypres_missing, q25_uncert_lower, q25_uncert_upper, q50_uncert_lower,
q50_uncert_upper, q5_uncert_lower, q5_uncert_upper, q75_uncert_lower, q75_uncert_upper,
q95_uncert_lower, q95_uncert_upper, q99_uncert_lower, q99_uncert_upper, q_mean, quncert_meta,
reservoir_cap, reservoir_drain, reservoir_env, reservoir_fs, reservoir_he, reservoir_nav,
reservoir_nousedata, reservoir_wr, reservoir_year_first, reservoir_year_last, root_depth,
root_depth_5, root_depth_50, root_depth_95, root_depth_missing, runoff_ratio, sand_perc,
sand_perc_missing, shrub_perc, silt_perc, silt_perc_missing, slope_fdc, soil_depth_pelletier,
soil_depth_pelletier_5, soil_depth_pelletier_50, soil_depth_pelletier_95,
soil_depth_pelletier_missing, station_type, stream_elas, structurefull_flow, surfacewater_abs, tawc,
tawc_5, tawc_50, tawc_95, tawc_missing, urban_perc, zero_q_freq
[40]:
# Fetch the static features of all CAMELS_GB stations as one table and print
# its shape (rows = stations, columns = static features).
df = dataset.fetch_static_features()
print(df.shape)
(671, 145)
[41]:
# Count the missing values per column once, print the grand total and
# show the per-column breakdown as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
10316
[41]:
q_mean                          0
runoff_ratio                    0
stream_elas                     0
slope_fdc                       3
baseflow_index                  0
                               ..
soil_depth_pelletier            0
soil_depth_pelletier_missing    0
soil_depth_pelletier_5          0
soil_depth_pelletier_50         0
soil_depth_pelletier_95         0
Length: 145, dtype: int64

Find the columns that have at least one NaN value.

[42]:
# Keep only the columns that contain at least one NaN value
df.loc[:, df.isna().any()]
[42]:
slope_fdc high_prec_timing low_prec_timing surfacewater_abs groundwater_abs discharges abs_agriculture_perc abs_amenities_perc abs_energy_perc abs_environmental_perc ... q25_uncert_upper q25_uncert_lower q50_uncert_upper q50_uncert_lower q75_uncert_upper q75_uncert_lower q95_uncert_upper q95_uncert_lower q99_uncert_upper q99_uncert_lower
gauge_id
38017 1.50 son jja 0.000 0.054 0.005 28.43 0.00 0.00 0.00 ... 21.59 -22.91 17.81 -17.37 13.07 -11.96 12.88 -12.67 10.38 -10.58
42001 3.80 son jja 0.004 0.149 0.003 2.79 0.00 0.00 0.00 ... 26.76 -25.97 15.90 -17.53 25.30 -22.87 24.43 -23.47 14.23 -12.56
55014 2.78 djf jja 0.001 0.000 0.000 47.16 0.00 0.00 0.00 ... 16.99 -16.77 15.09 -15.04 12.25 -12.34 14.04 -14.03 11.15 -11.15
27041 2.04 son mam 0.047 0.053 0.014 58.32 0.12 14.58 0.00 ... 14.67 -15.21 14.57 -14.52 14.56 -14.43 12.83 -13.23 9.28 -10.18
39078 2.02 son jja 0.000 0.090 0.049 0.65 0.00 0.00 0.00 ... 25.60 -26.23 16.68 -16.65 19.92 -19.76 16.59 -16.18 9.20 -8.61
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66006 3.62 son jja NaN NaN NaN NaN NaN NaN NaN ... 17.83 -17.47 12.86 -12.66 10.78 -10.53 13.71 -13.75 10.86 -10.95
39014 1.72 son jja 0.000 0.217 0.019 0.94 0.00 0.00 0.03 ... 14.67 -13.54 14.00 -13.29 12.99 -12.63 9.83 -9.67 NaN NaN
42010 1.06 son jja 0.714 0.560 0.046 66.65 0.00 0.46 9.92 ... 14.07 -14.02 11.30 -11.45 10.76 -10.73 9.78 -9.59 10.56 -10.19
42011 2.06 son jja 0.000 0.068 0.070 0.69 0.00 0.00 0.00 ... 11.80 -11.47 12.42 -12.60 8.67 -8.51 10.04 -9.41 10.72 -10.64
43009 3.63 son jja 0.050 0.008 0.026 54.02 1.73 36.56 0.00 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

671 rows × 38 columns

[43]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[43]:
slope_fdc                   3
high_prec_timing           15
low_prec_timing             3
surfacewater_abs          229
groundwater_abs           229
discharges                231
abs_agriculture_perc      313
abs_amenities_perc        313
abs_energy_perc           313
abs_environmental_perc    313
abs_industry_perc         313
abs_watersupply_perc      313
reservoir_he              509
reservoir_nav             509
reservoir_drain           509
reservoir_wr              509
reservoir_fs              509
reservoir_env             509
reservoir_nousedata       509
reservoir_year_first      530
reservoir_year_last       530
dpsbar                      2
elev_mean                   2
station_type                1
bankfull_flow             310
structurefull_flow        408
q5_uncert_upper           235
q5_uncert_lower           235
q25_uncert_upper          173
q25_uncert_lower          173
q50_uncert_upper          168
q50_uncert_lower          168
q75_uncert_upper          170
q75_uncert_lower          170
q95_uncert_upper          195
q95_uncert_lower          195
q99_uncert_upper          250
q99_uncert_lower          250
dtype: int64
[44]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_intercep, q_cms_obs, q_mmd_obs, rh_%,
solrad_wm2, windspeed_mps

CAMELS_BR

[45]:
# Open CAMELS_BR through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_BR', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_BR with 897 stations, 11 dynamic and 67 static features
[46]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area, area_ana, area_gsim, area_gsim_quality, aridity, asynchronicity, barren_perc,
baseflow_index, bedrock_depth, carb_rocks_perc, clay_perc, consumptive_use, consumptive_use_perc,
crop_mosaic_perc, crop_perc, dom_land_cover, dom_land_cover_perc, elev_gauge, elev_mean, et_mean,
forest_perc, frac_snow, gauge_lat, gauge_lon, gauge_name, gauge_region, geol_class_1st,
geol_class_1st_perc, geol_class_2nd, geol_class_2nd_perc, geol_permeability, geol_porosity,
grass_perc, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq,
imperv_perc, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq,
org_carbon_content, p_mean, p_seasonality, pet_mean, q_mean, q_quality_control_perc,
q_stream_stage_perc, regulation_degree, reservoirs_vol, runoff_ratio, sand_perc, shrub_perc,
silt_perc, slope_fdc, slope_mean, snow_perc, stream_elas, water_table_depth, wet_perc, zero_q_freq
[47]:
# Pull the full static-attribute table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(897, 67)
[48]:
# Count missing values per column once; print the grand total, then display
# the per-column counts as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
133
[48]:
p_mean               0
pet_mean             0
et_mean              0
aridity              0
p_seasonality        0
                    ..
water_table_depth    0
elev_gauge           0
elev_mean            0
slope_mean           0
area                 0
Length: 67, dtype: int64

find those columns which have at least one NaN value

[49]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[49]:
frac_snow high_prec_timing geol_class_2nd slope_fdc baseflow_index area_ana
gauge_id
58030000 0.0 djf acid_plutonic_rocks 1.08954 0.79986 796.0
57170000 0.0 djf acid_plutonic_rocks 1.35609 0.76520 980.0
39580000 0.0 mam siliciclastic_sedimentary_rocks 1.68712 0.66356 756.0
41818000 0.0 djf metamorphics 1.98782 0.64151 16600.0
58870000 0.0 djf metamorphics 1.39838 0.71359 1120.0
... ... ... ... ... ... ...
26720000 0.0 djf siliciclastic_sedimentary_rocks 6.72326 0.57567 6610.0
65925000 0.0 son NaN 2.24256 0.54385 1660.0
39560000 0.0 mam metamorphics 2.14107 0.65966 4910.0
71550000 0.0 son siliciclastic_sedimentary_rocks 2.51894 0.58372 NaN
41539998 0.0 djf metamorphics 1.74270 0.69456 NaN

897 rows × 6 columns

[50]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[50]:
frac_snow            5
high_prec_timing     4
geol_class_2nd      47
slope_fdc           16
baseflow_index      18
area_ana            43
dtype: int64
[51]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_mgb, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cpc,
pcp_mm_mswep, pet_mm_gleam, q_cms_obs, q_mmd_obs

CAMELS_US

[52]:
# Open CAMELS_US through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_US', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_US with 671 stations, 8 dynamic and 59 static features
[53]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_gages2, area_geospa_fabric, aridity, baseflow_index, carbonate_rocks_frac, clay_frac,
dom_land_cover, dom_land_cover_frac, elev_mean, frac_forest, frac_snow, gauge_lat, gauge_lon,
gauge_name, geol_1st_class, geol_2nd_class, geol_permeability, geol_porostiy, glim_1st_class_frac,
glim_2nd_class_frac, gvf_diff, gvf_max, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing,
high_q_dur, high_q_freq, huc_02, lai_diff, lai_max, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, max_water_content, organic_frac, other_frac, p_mean, p_seasonality, pet_mean,
q5, q95, q_mean, root_depth_50, root_depth_99, runoff_ratio, sand_frac, silt_frac, slope_fdc,
slope_mean, soil_conductivity, soil_depth_pelletier, soil_depth_statsgo, soil_porosity, stream_elas,
water_frac, zero_q_freq
[54]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_min, dayl(s), pcp_mm, q_cms_obs, solrad_wm2, swe_mm, vp_hpa

CAMELS_CL

[55]:
# Open CAMELS_CL through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CL', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_CL with 516 stations, 12 dynamic and 104 static features
[56]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area, aridity_chirps, aridity_cr2met, aridity_mswep, aridity_tmpa, baseflow_index, big_dam,
carb_rocks_frac, crop_frac, dom_land_cover, dom_land_cover_frac, elev_gauge, elev_max, elev_mean,
elev_med, elev_min, forest_frac, fp_frac, fp_nf_index, frac_snow_chirps, frac_snow_cr2met,
frac_snow_mswep, frac_snow_tmpa, gauge_lat, gauge_lon, gauge_name, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, grass_frac, gw_rights_flow, gw_rights_n,
hfd_mean, high_prec_dur_chirps, high_prec_dur_cr2met, high_prec_dur_mswep, high_prec_dur_tmpa,
high_prec_freq_chirps, high_prec_freq_cr2met, high_prec_freq_mswep, high_prec_freq_tmpa,
high_prec_timing_chirps, high_prec_timing_cr2met, high_prec_timing_mswep, high_prec_timing_tmpa,
high_q_dur, high_q_freq, imp_frac, interv_degree, land_cover_missing, lc_barren, lc_glacier,
location_type, low_prec_dur_chirps, low_prec_dur_cr2met, low_prec_dur_mswep, low_prec_dur_tmpa,
low_prec_freq_chirps, low_prec_freq_cr2met, low_prec_freq_mswep, low_prec_freq_tmpa,
low_prec_timing_chirps, low_prec_timing_cr2met, low_prec_timing_mswep, low_prec_timing_tmpa,
low_q_dur, low_q_freq, n_obs, nested_inner, nested_outer, nf_frac, p_mean_chirps, p_mean_cr2met,
p_mean_mswep, p_mean_spread, p_mean_tmpa, p_seasonality_chirps, p_seasonality_cr2met,
p_seasonality_mswep, p_seasonality_tmpa, pet_mean, q_mean, record_period_end, record_period_start,
runoff_ratio_chirps, runoff_ratio_cr2met, runoff_ratio_mswep, runoff_ratio_tmpa, shrub_frac,
slope_fdc, slope_mean, snow_frac, stream_elas_chirps, stream_elas_cr2met, stream_elas_mswep,
stream_elas_tmpa, sur_rights_flow, sur_rights_n, swe_ratio, wet_frac, zero_q_freq
[57]:
# Pull the full static-attribute table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(516, 104)
[58]:
# Count missing values per column once; print the grand total, then display
# the per-column counts as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12185
[58]:
gauge_id
gauge_name             0
gauge_lat              0
gauge_lon              0
record_period_start    0
record_period_end      0
                      ..
sur_rights_flow        0
interv_degree          0
gw_rights_n            0
gw_rights_flow         0
big_dam                0
Length: 104, dtype: int64

find those columns which have at least one NaN value

[59]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[59]:
gauge_id location_type geol_class_2nd p_mean_chirps p_mean_tmpa aridity_chirps aridity_tmpa p_seasonality_chirps p_seasonality_tmpa frac_snow_chirps frac_snow_tmpa ... baseflow_index hfd_mean Q95 Q5 high_q_freq high_q_dur low_q_freq low_q_dur zero_q_freq swe_ratio
8220009 NaN Intermediate plutonic rocks 3.69311266 NaN 0.8586003 NaN -1.074889369 NaN 0.0000000000000 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10362001 coastal Pyroclastics 4.34012320 NaN 0.6072567 NaN -0.622200375 NaN 0.0002103480252 NaN ... 0.5624186 122.0500 6.015702219 0.10735714728 13.44168511 2.714286 117.13468456 29.341772 0.0000000000 NaN
7317005 NaN Basic volcanic rocks 4.72452895 NaN 0.5293816 NaN -0.960896391 NaN 0.1469334578557 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2112005 NaN Unconsolidated sediments 0.21673525 NaN 18.4153050 NaN 1.336308822 NaN 0.0002204325151 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5746001 NaN Unconsolidated sediments 1.16770048 NaN 3.1408824 NaN -1.403499734 NaN 0.0000000000000 NaN ... 0.7183103 124.6000 1.580334352 0.22086357569 2.43736966 2.086957 8.17534408 13.416667 0.0000000000 0.003368185140
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5101001 NaN Acid plutonic rocks 0.72182012 NaN 4.7505564 NaN -1.440400284 NaN 0.0038392564541 NaN ... 0.6971627 162.1176 2.386918187 0.01539947218 43.72596796 18.340909 148.08311081 44.803279 0.0000000000 0.344224603708
10401001 NaN Metamorphics 4.61487255 NaN 0.5567343 NaN -0.598618823 NaN 0.0007067884319 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2110002 NaN Siliciclastic sedimentary rocks 0.28986612 NaN 11.4299221 NaN 1.555320147 NaN 0.0021539836182 NaN ... 0.9021013 190.0625 0.014444286 0.00737807572 0.00000000 0.000000 0.00000000 0.000000 0.0000000000 NaN
8350001 NaN Acid volcanic rocks 8.45346995 NaN 0.3105805 NaN -0.836864115 NaN 0.0829756992572 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11315001 NaN Unconsolidated sediments 2.79655852 NaN 0.7749488 NaN -0.306493095 NaN 0.1256086732201 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

516 rows × 42 columns

[60]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[60]:
gauge_id
location_type              386
geol_class_2nd              16
p_mean_chirps               43
p_mean_tmpa                516
aridity_chirps              43
aridity_tmpa               516
p_seasonality_chirps        43
p_seasonality_tmpa         516
frac_snow_chirps            43
frac_snow_tmpa             516
high_prec_freq_chirps       43
high_prec_freq_tmpa        516
high_prec_dur_chirps        43
high_prec_dur_tmpa         516
high_prec_timing_chirps     43
high_prec_timing_tmpa      516
low_prec_freq_chirps        43
low_prec_freq_tmpa         516
low_prec_dur_chirps         43
low_prec_dur_tmpa          516
low_prec_timing_chirps      43
low_prec_timing_tmpa       516
q_mean                     278
runoff_ratio_cr2met        278
runoff_ratio_chirps        297
runoff_ratio_mswep         278
runoff_ratio_tmpa          516
stream_elas_cr2met         278
stream_elas_chirps         297
stream_elas_mswep          278
stream_elas_tmpa           516
slope_fdc                  278
baseflow_index             278
hfd_mean                   278
Q95                        278
Q5                         278
high_q_freq                278
high_q_dur                 278
low_q_freq                 278
low_q_dur                  278
zero_q_freq                278
swe_ratio                  397
dtype: int64
[61]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cr2met, pcp_mm_mswep,
pcp_mm_tmpa, pet_mm_hargreaves, pet_mm_modis, q_cms_obs, q_mmd_obs, swe

CAMELS_DK

[62]:
# Open CAMELS_DK through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DK', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DK with 304 stations, 13 dynamic and 119 static features
[63]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
FC, HCC, KS, MRC, THS, WP, aridity, bulk_density, catch_accum_number, catch_area, catch_flow_dir,
catch_outlet_lat, catch_outlet_lon, chalk_d, dem_max, dem_mean, dem_median, dem_min,
frac_snow_daily, gauge_record_pct, gauged_type, high_prec_dur, high_prec_freq, high_prec_timing,
low_prec_dur, low_prec_freq, low_prec_timing, p_mean, p_seasonality, pct_aeolain_sand,
pct_agriculture_corine_1990, pct_agriculture_corine_2000, pct_agriculture_corine_2006,
pct_agriculture_corine_2012, pct_agriculture_corine_2018, pct_agriculture_levin_2011,
pct_agriculture_levin_2016, pct_agriculture_levin_2018, pct_agriculture_levin_2021, pct_beach,
pct_clay, pct_claynor_100, pct_claynor_200, pct_claynor_30, pct_claynor_60, pct_down_sand,
pct_flat_area, pct_forest_corine_1990, pct_forest_corine_2000, pct_forest_corine_2006,
pct_forest_corine_2012, pct_forest_corine_2018, pct_forest_levin_2011, pct_forest_levin_2016,
pct_forest_levin_2018, pct_forest_levin_2021, pct_fsandno_100, pct_fsandno_200, pct_fsandno_30,
pct_fsandno_60, pct_glaf_sand, pct_glal_clay, pct_glam_clay, pct_gravel, pct_gsandno_100,
pct_gsandno_200, pct_gsandno_30, pct_gsandno_60, pct_marine_sand, pct_marsh,
pct_naturedry_levin_2011, pct_naturedry_levin_2016, pct_naturedry_levin_2018,
pct_naturedry_levin_2021, pct_naturewet_levin_2011, pct_naturewet_levin_2016,
pct_naturewet_levin_2018, pct_naturewet_levin_2021, pct_organic, pct_sand, pct_sandy_till, pct_silt,
pct_till, pct_urban_corine_1990, pct_urban_corine_2000, pct_urban_corine_2006,
pct_urban_corine_2012, pct_urban_corine_2018, pct_urban_levin_2011, pct_urban_levin_2016,
pct_urban_levin_2018, pct_urban_levin_2021, pct_water_corine_1990, pct_water_corine_2000,
pct_water_corine_2006, pct_water_corine_2012, pct_water_corine_2018, pct_water_deposit,
pct_water_levin_2011, pct_water_levin_2016, pct_water_levin_2018, pct_water_levin_2021,
pct_wetlands_corine_1990, pct_wetlands_corine_2000, pct_wetlands_corine_2006,
pct_wetlands_corine_2012, pct_wetlands_corine_2018, pet_mean, root_depth, slope_max, slope_mean,
slope_median, slope_min, t_mean, tawc, uaquifer_d, uaquifer_t, uclay_t, usand_t
[64]:
# Pull the full static-attribute table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(304, 119)
[65]:
# Count missing values per column once; print the grand total, then display
# the per-column counts as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
23
[65]:
FC            0
HCC           0
KS            0
MRC           0
THS           0
             ..
tawc          0
uaquifer_d    3
uaquifer_t    3
uclay_t       3
usand_t       3
Length: 119, dtype: int64

find those columns which have at least one NaN value

[66]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[66]:
chalk_d gauge_record_pct uaquifer_d uaquifer_t uclay_t usand_t
16200607 348.941440 100.000000 6.166402 16.930742 5.468584 5.671323
37470466 451.491863 100.000000 0.630618 46.024319 0.550030 31.309439
67221267 82.682739 100.000000 30.316439 10.650221 28.674266 0.580311
35321353 425.690145 100.000000 7.186167 39.533252 6.985663 0.198207
53411137 287.510620 100.000000 15.445959 8.634338 15.021151 0.210060
... ... ... ... ... ... ...
32211121 68.721796 54.840134 7.125480 8.509604 6.253781 1.478004
42320708 155.292211 100.000000 24.663791 9.827164 21.955691 1.744912
71270476 10.931980 100.000000 9.366122 45.329497 9.176942 0.364699
32240800 19.045147 100.000000 9.834996 28.712421 9.284840 0.198684
42600042 124.778317 90.673026 12.625913 17.442115 12.289986 2.165011

304 rows × 6 columns

[67]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[67]:
chalk_d             3
gauge_record_pct    8
uaquifer_d          3
uaquifer_t          3
uclay_t             3
usand_t             3
dtype: int64
[68]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Abstraction, DKM_dtp, DKM_gwh, DKM_irr, DKM_sdr, DKM_sre, DKM_wcr, Qdkm, aet_mm, airtemp_C_mean,
pcp_mm, pet_mm, q_cms_obs

CAMELS_CH

[69]:
# Open CAMELS_CH through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CH', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_CH with 331 stations, 9 dynamic and 209 static features
[70]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, aap, acid_plutonic, acid_volcanic, amk, api, area, aridity, baseflow_index_landson,
basic_plutonic, basic_volcanic, bulk_dens, bulk_dens_25, bulk_dens_5, bulk_dens_50, bulk_dens_75,
bulk_dens_90, bulk_dens_missing, bulk_dens_skewness, carbonate_sedimentary, clay_perc, clay_perc_25,
clay_perc_5, clay_perc_50, clay_perc_75, clay_perc_90, clay_perc_missing, clay_perc_skewness,
coarse_fragm_perc, coarse_fragm_perc_25, coarse_fragm_perc_5, coarse_fragm_perc_50,
coarse_fragm_perc_75, coarse_fragm_perc_90, coarse_fragm_perc_missing, coarse_fragm_perc_skewness,
conductivity, conductivity_25, conductivity_5, conductivity_50, conductivity_75, conductivity_90,
conductivity_missing, conductivity_skewness, country, crop_perc, dens_inhabitants, dom_land_cover,
dup, dwood_perc, elev_max, elev_mean, elev_min, elev_percentile10, elev_percentile25,
elev_percentile50, elev_percentile75, elev_percentile90, ewood_perc, ext_area_perc, fju,
flat_area_perc, frac_snow, gauge_easting, gauge_elevation, gauge_lat, gauge_lon, gauge_name,
gauge_northing, geo_log10_permeability, geo_porosity, glac_area, glac_area_neighbours, glac_mass,
glac_vol, grass_perc, hardrock_imperm_perc, hardrock_perc, hes, hfd_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, hp_count, hp_inst_turb, hp_max_power,
hp_qturb, ice_geo, ice_perc, id6, ind_end_date, ind_number_of_years, ind_start_date,
intermediate_plutonic, inwater_perc, karst_perc, loose_rock_perc, low_prec_dur, low_prec_freq,
low_prec_timing, low_q_dur, low_q_freq, metamorphics, mixed_sedimentary, mixed_wood_perc, mpk, mps,
n_inhabitants, null_perc, num_reservoir, omm, ood, oos, ops, organic_perc, organic_perc_25,
organic_perc_5, organic_perc_50, organic_perc_75, organic_perc_90, organic_perc_missing,
organic_perc_skewness, osm, p_mean, p_seasonality, pet_mean, porosity, porosity_25, porosity_5,
porosity_50, porosity_75, porosity_90, porosity_missing, porosity_skewness, pyroclastic, q_mean,
qua, reservoir_cap, reservoir_fs, reservoir_he, reservoir_irr, reservoir_nousedata,
reservoir_year_first, reservoir_year_last, rock_perc, root_depth, root_depth_25, root_depth_5,
root_depth_50, root_depth_75, root_depth_90, root_depth_missing, root_depth_skewness, runoff_ratio,
sal, sand_perc, sand_perc_25, sand_perc_5, sand_perc_50, sand_perc_75, sand_perc_90,
sand_perc_missing, sand_perc_skewness, scrub_perc, sign_end_date, sign_number_of_years,
sign_start_date, siliciclastic_sedimentary, silt_perc, silt_perc_25, silt_perc_5, silt_perc_50,
silt_perc_75, silt_perc_90, silt_perc_missing, silt_perc_skewness, slope_fdc, slope_mean,
steep_area_perc, stream_elas, sus, tie, tot_avail_water, tot_avail_water_25, tot_avail_water_5,
tot_avail_water_50, tot_avail_water_75, tot_avail_water_90, tot_avail_water_missing,
tot_avail_water_skewness, ukd, unconsol_coarse_perc, unconsol_fine_perc, unconsol_imperm_perc,
unconsol_medium_perc, unconsol_sediments, uod, ups, urban_perc, usm, water_body_name,
water_body_type, water_geo, water_perc, wetlands_perc, zero_q_freq
[71]:
# Pull the full static-attribute table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(331, 209)
[72]:
# Count missing values per column once; print the grand total, then display
# the per-column counts as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2097
[72]:
ind_start_date         0
ind_end_date           0
ind_number_of_years    0
p_mean                 0
pet_mean               0
                      ..
elev_percentile90      0
elev_max               0
slope_mean             0
flat_area_perc         0
steep_area_perc        0
Length: 209, dtype: int64

find those columns which have at least one NaN value

[73]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[73]:
p_seasonality frac_snow high_prec_timing low_prec_timing reservoir_he reservoir_fs reservoir_irr reservoir_nousedata reservoir_year_first reservoir_year_last ... baseflow_index_landson hfd_mean Q5 Q95 high_q_freq high_q_dur low_q_freq low_q_dur zero_q_freq silt_perc_skewness
gauge_id
2004 0.159 0.039 jja son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN -0.252
2007 -0.118 0.170 djf son NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.635
2009 0.078 0.436 jja son 0.999 0.0 0.001 0.0 1914.0 1989.0 ... 0.787 243.282 1.279 6.207 0.000 0.000 0.051 2.000 0.0 0.285
2011 0.106 0.474 son son 0.998 0.0 0.002 0.0 1927.0 1989.0 ... 0.751 263.667 0.821 6.681 0.051 1.000 0.436 1.000 0.0 0.267
2014 0.279 0.223 jja son 1.000 0.0 0.000 0.0 1910.0 2015.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.421
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6007 0.228 0.379 son djf 1.000 0.0 0.000 0.0 2010.0 2010.0 ... 0.715 211.333 1.298 8.789 2.005 1.714 2.451 7.333 0.0 0.393
6008 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.602 188.875 0.632 10.751 4.385 2.593 29.315 8.069 0.0 -0.603
6009 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.318 191.714 0.127 15.376 26.897 2.667 155.228 12.056 0.0 0.310
6010 NaN NaN son djf NaN NaN NaN NaN NaN NaN ... 0.494 198.400 1.002 12.617 12.195 2.103 4.998 3.571 0.0 -0.744
6011 0.272 0.110 NaN djf 1.000 0.0 0.000 0.0 1918.0 2010.0 ... 0.697 204.250 1.165 10.371 0.000 0.000 0.000 0.000 0.0 0.272

331 rows × 26 columns

[74]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[74]:
p_seasonality              54
frac_snow                  54
high_prec_timing           13
low_prec_timing             5
reservoir_he              223
reservoir_fs              223
reservoir_irr             223
reservoir_nousedata       223
reservoir_year_first      223
reservoir_year_last       223
sign_start_date            42
sign_end_date              42
q_mean                     42
runoff_ratio               42
stream_elas                44
slope_fdc                  42
baseflow_index_landson     42
hfd_mean                   42
Q5                         42
Q95                        42
high_q_freq                42
high_q_dur                 42
low_q_freq                 42
low_q_dur                  42
zero_q_freq                42
silt_perc_skewness          1
dtype: int64
[75]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, q_cms_obs, q_mmd_obs, rel_sun_dur(%), swe_mm,
waterlevel(m)

CAMELS_DE

[76]:
# Open CAMELS_DE through the unified RainfallRunoff interface, reading from the
# CAMELS sub-folder of DATA_PATH, then print its one-line summary.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DE', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DE with 1555 stations, 21 dynamic and 111 static features
[77]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
NSE_conceptual, NSE_lstm, Q5, Q95, agricultural_areas_perc, aquifer_aquitard_mixed_perc,
aquifer_perc, aquitard_perc, area, area_metadata, artificial_surfaces_perc,
bulk_density_0_30cm_mean, bulk_density_100_200cm_mean, bulk_density_30_100cm_mean,
cavity_fissure_karst_perc, cavity_fissure_perc, cavity_fissure_pores_perc, cavity_pores_perc,
clay_0_30cm_mean, clay_100_200cm_mean, clay_30_100cm_mean, coarse_fragments_0_30cm_mean,
coarse_fragments_100_200cm_mean, coarse_fragments_30_100cm_mean, consolidation_solid_rock_perc,
consolidation_unconsolidated_rock_perc, dams_names, dams_num, dams_purposes, dams_river_names,
dams_total_lake_area, dams_total_lake_volume, dams_year_first, dams_year_last, elev_5, elev_50,
elev_95, elev_max, elev_mean, elev_min, federal_state, flow_perc_complete, flow_period_end,
flow_period_start, forests_and_seminatural_areas_perc, frac_snow, gauge_easting, gauge_elev,
gauge_elev_metadata, gauge_lat, gauge_lon, gauge_name, gauge_northing,
geochemical_rocktype_anthropogenically_modified_through_filling_perc,
geochemical_rocktype_carbonatic_perc, geochemical_rocktype_halitic_perc,
geochemical_rocktype_silicate_carbonatic_perc,
geochemical_rocktype_silicate_organic_components_perc, geochemical_rocktype_silicate_perc,
geochemical_rocktype_sulfatic_halitic_perc, geochemical_rocktype_sulfatic_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, kf_extremely_low_perc,
kf_high_perc, kf_highly_variable_perc, kf_low_perc, kf_low_to_extremely_low_perc, kf_medium_perc,
kf_medium_to_moderate_perc, kf_moderate_perc, kf_moderate_to_low_perc, kf_very_high_perc,
kf_very_high_to_high_perc, kf_very_low_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, no_data_perc, p_mean, p_seasonality, provider_id, q_mean,
rocktype_magmatite_perc, rocktype_metamorphite_perc, rocktype_sediment_perc, runoff_ratio,
sand_0_30cm_mean, sand_100_200cm_mean, sand_30_100cm_mean, silt_0_30cm_mean, silt_100_200cm_mean,
silt_30_100cm_mean, slope_fdc, soil_organic_carbon_0_30cm_mean, soil_organic_carbon_100_200cm_mean,
soil_organic_carbon_30_100cm_mean, testing_perc_complete, training_perc_complete,
validation_perc_complete, water_bodies_perc, water_body_name, waterbody_perc, wetlands_perc,
zero_q_freq
[78]:
# Pull the full static-attribute table and report its (rows, columns) shape.
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1555, 111)
[79]:
# Count missing values per column once; print the grand total, then display
# the per-column counts as the cell result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
6862
[79]:
p_mean            0
p_seasonality     0
frac_snow         0
high_prec_freq    0
high_prec_dur     0
                 ..
elev_min          0
elev_5            0
elev_50           0
elev_95           0
elev_max          0
Length: 111, dtype: int64

find those columns which have at least one NaN value

[80]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[80]:
high_prec_timing low_prec_timing dams_names dams_river_names dams_year_first dams_year_last dams_total_lake_area dams_total_lake_volume dams_purposes NSE_lstm NSE_conceptual gauge_elev_metadata
gauge_id
DEA11180 jja mam Aabachtalsperre|Borchen Hochwasserrückhaltebec... Aabookach (Afte)|Afte bzw. Wiele|Altenau (Alme... 1930.0 1996.0 10.28 71.90 Water supply|Recreational use|Flood control 0.929 0.854 30.86
DEE10940 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.094 0.140 69.77
DE911160 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.841 0.844 NaN
DE212640 jja son NaN NaN NaN NaN 0.00 0.00 NaN 0.727 0.620 326.68
DE112130 mam mam NaN NaN NaN NaN 0.00 0.00 NaN 0.688 0.605 105.48
... ... ... ... ... ... ... ... ... ... ... ... ...
DEF13210 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.917 NaN NaN
DEF10460 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.574 0.355 NaN
DE912320 jja mam NaN NaN NaN NaN 0.00 0.00 NaN 0.770 0.589 NaN
DEA11090 jja mam Aabachtalsperre|Borchen Hochwasserrückhaltebec... Afte bzw. Wiele|Aabookach (Afte)|Altenau (Alme... 1974.0 1996.0 5.37 39.41 Water supply|Flood control 0.928 0.884 63.64
DE213310 jja son NaN NaN NaN NaN 0.00 0.00 NaN 0.637 0.512 542.11

1555 rows × 12 columns

[81]:
# Missing-value count per column, restricted to columns with at least one NaN.
df.isna().sum().loc[lambda counts: counts > 0]
[81]:
high_prec_timing             8
low_prec_timing              3
dams_names                1240
dams_river_names          1240
dams_year_first           1251
dams_year_last            1251
dams_total_lake_area        41
dams_total_lake_volume       2
dams_purposes             1241
NSE_lstm                    43
NSE_conceptual             157
gauge_elev_metadata        385
dtype: int64
[82]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_max, pcp_mm_mean, pcp_mm_median, pcp_mm_min,
pcp_mm_std, q_cms_obs, q_mmd_obs, rh_%, rh_%_max, rh_%_med, rh_%_min, rh_%_std, solrad_wm2_max,
solrad_wm2_mean, solrad_wm2_med, solrad_wm2_min, solrad_wm2_std, water_level

CAMELS_FR

[83]:
# Open CAMELS_FR through the unified RainfallRunoff interface, reading directly
# from DATA_PATH, then print its one-line summary.
dataset = RainfallRunoff(
    'CAMELS_FR',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CAMELS_FR with 654 stations, 22 dynamic and 344 static features
[84]:
# sorted() builds a new list, so whatever list object the dataset attribute
# hands back is not reordered in place just to print an alphabetical listing.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
clc_1990_lvl1_1, clc_1990_lvl1_2, clc_1990_lvl1_3, clc_1990_lvl1_4, clc_1990_lvl1_5,
clc_1990_lvl1_dom_class, clc_1990_lvl1_na, clc_1990_lvl2_11, clc_1990_lvl2_12, clc_1990_lvl2_13,
clc_1990_lvl2_14, clc_1990_lvl2_21, clc_1990_lvl2_22, clc_1990_lvl2_23, clc_1990_lvl2_24,
clc_1990_lvl2_31, clc_1990_lvl2_32, clc_1990_lvl2_33, clc_1990_lvl2_41, clc_1990_lvl2_42,
clc_1990_lvl2_51, clc_1990_lvl2_52, clc_1990_lvl2_dom_class, clc_1990_lvl2_na, clc_1990_lvl3_111,
clc_1990_lvl3_112, clc_1990_lvl3_121, clc_1990_lvl3_122, clc_1990_lvl3_123, clc_1990_lvl3_124,
clc_1990_lvl3_131, clc_1990_lvl3_132, clc_1990_lvl3_133, clc_1990_lvl3_141, clc_1990_lvl3_142,
clc_1990_lvl3_211, clc_1990_lvl3_212, clc_1990_lvl3_213, clc_1990_lvl3_221, clc_1990_lvl3_222,
clc_1990_lvl3_223, clc_1990_lvl3_231, clc_1990_lvl3_241, clc_1990_lvl3_242, clc_1990_lvl3_243,
clc_1990_lvl3_244, clc_1990_lvl3_311, clc_1990_lvl3_312, clc_1990_lvl3_313, clc_1990_lvl3_321,
clc_1990_lvl3_322, clc_1990_lvl3_323, clc_1990_lvl3_324, clc_1990_lvl3_331, clc_1990_lvl3_332,
clc_1990_lvl3_333, clc_1990_lvl3_334, clc_1990_lvl3_335, clc_1990_lvl3_411, clc_1990_lvl3_412,
clc_1990_lvl3_421, clc_1990_lvl3_422, clc_1990_lvl3_423, clc_1990_lvl3_511, clc_1990_lvl3_512,
clc_1990_lvl3_521, clc_1990_lvl3_522, clc_1990_lvl3_523, clc_1990_lvl3_dom_class, clc_1990_lvl3_na,
clc_2018_lvl1_1, clc_2018_lvl1_2, clc_2018_lvl1_3, clc_2018_lvl1_4, clc_2018_lvl1_5,
clc_2018_lvl1_dom_class, clc_2018_lvl1_na, clc_2018_lvl2_11, clc_2018_lvl2_12, clc_2018_lvl2_13,
clc_2018_lvl2_14, clc_2018_lvl2_21, clc_2018_lvl2_22, clc_2018_lvl2_23, clc_2018_lvl2_24,
clc_2018_lvl2_31, clc_2018_lvl2_32, clc_2018_lvl2_33, clc_2018_lvl2_41, clc_2018_lvl2_42,
clc_2018_lvl2_51, clc_2018_lvl2_52, clc_2018_lvl2_dom_class, clc_2018_lvl2_na, clc_2018_lvl3_111,
clc_2018_lvl3_112, clc_2018_lvl3_121, clc_2018_lvl3_122, clc_2018_lvl3_123, clc_2018_lvl3_124,
clc_2018_lvl3_131, clc_2018_lvl3_132, clc_2018_lvl3_133, clc_2018_lvl3_141, clc_2018_lvl3_142,
clc_2018_lvl3_211, clc_2018_lvl3_212, clc_2018_lvl3_213, clc_2018_lvl3_221, clc_2018_lvl3_222,
clc_2018_lvl3_223, clc_2018_lvl3_231, clc_2018_lvl3_241, clc_2018_lvl3_242, clc_2018_lvl3_243,
clc_2018_lvl3_244, clc_2018_lvl3_311, clc_2018_lvl3_312, clc_2018_lvl3_313, clc_2018_lvl3_321,
clc_2018_lvl3_322, clc_2018_lvl3_323, clc_2018_lvl3_324, clc_2018_lvl3_331, clc_2018_lvl3_332,
clc_2018_lvl3_333, clc_2018_lvl3_334, clc_2018_lvl3_335, clc_2018_lvl3_411, clc_2018_lvl3_412,
clc_2018_lvl3_421, clc_2018_lvl3_422, clc_2018_lvl3_423, clc_2018_lvl3_511, clc_2018_lvl3_512,
clc_2018_lvl3_521, clc_2018_lvl3_522, clc_2018_lvl3_523, clc_2018_lvl3_dom_class, clc_2018_lvl3_na,
cli_aridity_ou, cli_aridity_pe, cli_aridity_pm, cli_assync_ou, cli_assync_pe, cli_assync_pm,
cli_pet_ou_mean, cli_pet_ou_yr, cli_pet_pe_mean, cli_pet_pe_yr, cli_pet_pm_mean, cli_pet_pm_yr,
cli_prec_date_max, cli_prec_dur_high, cli_prec_dur_low, cli_prec_freq_high, cli_prec_freq_low,
cli_prec_intensity, cli_prec_max, cli_prec_mean, cli_prec_mean_yr, cli_prec_season_pet_ou,
cli_prec_season_pet_pe, cli_prec_season_pet_pm, cli_prec_season_temp, cli_prec_timing_high,
cli_prec_timing_low, cli_psol_frac_berghuijs, cli_psol_frac_safran, cli_temp_mean, dam_influence,
dam_n, dam_volume, geo_dom_class, geo_ev, geo_ig, geo_mt, geo_nd, geo_pa, geo_pb, geo_pi, geo_py,
geo_sc, geo_sm, geo_ss, geo_su, geo_va, geo_vb, geo_vi, geo_wb, hgl_krs_karstic,
hgl_krs_not_karstic, hgl_krs_unknown, hgl_permeability, hgl_porosity, hgl_thm_alluvial,
hgl_thm_bedrock, hgl_thm_intense_folded, hgl_thm_sedimentary, hgl_thm_unknown, hgl_thm_volcanism,
hyc_jay_pet_ou, hyc_jay_pet_pe, hyc_jay_pet_pm, hyc_jay_prec_mean, hyc_jay_ratio_prec_pet_ou,
hyc_jay_ratio_prec_pet_pe, hyc_jay_ratio_prec_pet_pm, hyc_jay_ratio_q_prec, hyd_bfi_ladson,
hyd_bfi_lfstat, hyd_bfi_pelletier_pet_ou, hyd_hfd_mean, hyd_q_date_max, hyd_q_date_qmna,
hyd_q_dur_high, hyd_q_dur_low, hyd_q_freq_high, hyd_q_freq_low, hyd_q_freq_zero, hyd_q_max,
hyd_q_mean, hyd_q_mean_yr, hyd_q_qmna_min, hyd_slope_fdc, hyd_stream_elas, hym_q_anomaly_inrae,
hym_q_date_end, hym_q_date_start, hym_q_low_uncertainty_inrae, hym_q_n_year, hym_q_na_period,
hym_q_na_total, hym_q_questionable, hym_q_unqualified, sit_altitude, sit_altitude_datum,
sit_area_hydro, sit_area_topo, sit_city, sit_code_h3, sit_comment, sit_comment_impact_gene, sit_crs,
sit_date_start, sit_date_update, sit_entity, sit_flood_duration, sit_impact, sit_kp_down, sit_kp_up,
sit_label, sit_label_add, sit_label_usual, sit_latitude, sit_longitude, sit_mnemonic,
sit_month1_low_water, sit_month1_year, sit_publication_rights, sit_section, sit_section_vigilance,
sit_status, sit_test_site, sit_type, sit_type_add, sit_tz, sit_waterbody, sit_watercourse_acc,
sit_zone_hydro, sta_altitude_snap, sta_altitude_staff_gauge, sta_area_snap, sta_city,
sta_code_child, sta_code_h2, sta_code_parent, sta_comment, sta_comment_impact_local, sta_crs,
sta_date_altitude_ref, sta_date_end, sta_date_start, sta_date_update, sta_display_level,
sta_dual_staff_gauge, sta_epsg, sta_impact_local, sta_kp, sta_label, sta_label_add,
sta_main_prod_code, sta_main_prod_name, sta_main_prod_name_short, sta_monitor,
sta_publication_right, sta_purpose, sta_qual_highflow, sta_qual_lowflow, sta_qual_meanflow,
sta_territory, sta_test_station, sta_time_data_gap, sta_time_discontinuity, sta_type, sta_x_l2e,
sta_x_l2e_snap, sta_x_l93, sta_x_l93_snap, sta_x_w84, sta_x_w84_snap, sta_y_l2e, sta_y_l2e_snap,
sta_y_l93, sta_y_l93_snap, sta_y_w84, sta_y_w84_snap, top_altitude_mean, top_dist_outlet_mean,
top_drainage_density, top_itopo_mean, top_mor_circ_ratio, top_mor_compact_coef,
top_mor_elong_ratio_catchment, top_mor_elong_ratio_circ, top_mor_form_factor_horton,
top_mor_form_factor_square, top_mor_relief_ratio, top_mor_shape_factor, top_slo_flat,
top_slo_gentle, top_slo_mean, top_slo_moderate, top_slo_ori_e, top_slo_ori_n, top_slo_ori_ne,
top_slo_ori_nw, top_slo_ori_s, top_slo_ori_se, top_slo_ori_sw, top_slo_ori_w, top_slo_steep,
top_slo_strong, top_slo_very_steep
[85]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(654, 344)
[86]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
12253
[86]:
clc_1990_lvl1_1       0
clc_1990_lvl1_2       0
clc_1990_lvl1_3       0
clc_1990_lvl1_4       0
clc_1990_lvl1_5       0
                     ..
top_slo_ori_sw        0
top_slo_ori_w         0
top_slo_steep         0
top_slo_strong        0
top_slo_very_steep    0
Length: 344, dtype: int64

Find the columns that have at least one NaN value.

[87]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[87]:
clc_1990_lvl1_dom_class clc_1990_lvl2_dom_class clc_1990_lvl3_dom_class clc_2018_lvl1_dom_class clc_2018_lvl2_dom_class clc_2018_lvl3_dom_class cli_prec_timing_high cli_prec_timing_low hyd_bfi_ladson hyd_bfi_lfstat ... sta_code_h2 sta_code_parent sta_comment sta_comment_impact_local sta_date_altitude_ref sta_date_end sta_display_level sta_kp sta_label_add sta_purpose
A105003001 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.56723 0.51341 ... A1050310 NaN Mise à l'heure TU le 05/11/2009. - Remplacemen... NaN 2022-02-24 08:21:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A107020001 2.0 21.0 211.0 2.0 21.0 211.0 son son 0.56320 0.53006 ... A1072010 NaN Nivellement de juillet 2002, géomètre Faber-Sc... NaN 2020-12-14 11:19:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A112020001 2.0 31.0 211.0 2.0 31.0 211.0 jja son 0.44951 0.37740 ... A1122010 NaN Arrêt des observations le 10/01/2008. - Nivell... NaN NaN 2008-01-10 11:20:00 NaN NaN NaN Low flow monitoring - Flood forecasting
A116003002 2.0 21.0 211.0 2.0 21.0 211.0 jja son 0.53010 0.45926 ... A1080320 NaN Echelle et pont arrachés en mai 1983. Seuil re... NaN 2018-12-05 07:24:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
A140202001 3.0 31.0 311.0 3.0 31.0 311.0 djf son 0.50286 0.44606 ... A1402020 NaN Passage à l'heure TU le 29/10/2009. - Nivellé ... NaN 2020-12-14 11:20:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Y781000101 3.0 32.0 333.0 3.0 32.0 333.0 son jja 0.37525 0.29060 ... Y7804010 NaN Station du réseau de base sur seuil naturel, é... Pompages Manso et Galeria 2021-04-15 08:18:00 NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y862000101 3.0 31.0 311.0 3.0 31.0 311.0 djf jja NaN NaN ... Y8624010 NaN Courbes de tarage à partir du 31/12/1979 revue... NaN NaN NaN NaN NaN NaN Low flow monitoring - Flood forecasting
Y881000102 3.0 31.0 311.0 3.0 31.0 311.0 djf jja 0.55820 0.54672 ... Y8814020 NaN NaN NaN NaN 2012-04-30 12:00:00 NaN NaN Zoza ancien Flood forecasting - Streamflow monitoring
Y902000101 3.0 31.0 312.0 3.0 31.0 312.0 son jja 0.51000 0.46926 ... Y9025010 NaN NaN Influence forte des barrages de baigneurs en é... NaN NaN NaN NaN Pont de Noceta Low flow monitoring - Flood forecasting - Stre...
Y960000102 3.0 32.0 323.0 3.0 31.0 313.0 djf jja 0.34639 0.28346 ... Y9605230 NaN STATION EN REMPLACEMENT DE CELLE DE TAFONATO Y... Pompages amont ? 2017-09-13 09:38:00 NaN NaN NaN Canniciu Streamflow monitoring

654 rows × 57 columns

[88]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[88]:
clc_1990_lvl1_dom_class       4
clc_1990_lvl2_dom_class       5
clc_1990_lvl3_dom_class       5
clc_2018_lvl1_dom_class       5
clc_2018_lvl2_dom_class       6
clc_2018_lvl3_dom_class       7
cli_prec_timing_high         15
cli_prec_timing_low           2
hyd_bfi_ladson               42
hyd_bfi_lfstat               42
hyd_bfi_pelletier_pet_ou     42
sit_altitude                 10
sit_altitude_datum           10
sit_area_hydro              641
sit_area_topo                 7
sit_city                      2
sit_comment                 515
sit_comment_impact_gene     630
sit_crs                       2
sit_date_start              654
sit_date_update               2
sit_entity                    2
sit_flood_duration          654
sit_impact                    6
sit_kp_down                 590
sit_kp_up                   654
sit_label                     2
sit_label_add               464
sit_label_usual             326
sit_latitude                  2
sit_longitude                 2
sit_mnemonic                619
sit_month1_low_water          2
sit_month1_year               2
sit_publication_rights        2
sit_section                   2
sit_section_vigilance       127
sit_status                    2
sit_test_site                 2
sit_type                      2
sit_type_add                  2
sit_tz                        2
sit_waterbody               654
sit_watercourse_acc         632
sit_zone_hydro                2
sta_altitude_staff_gauge    120
sta_code_child              654
sta_code_h2                  13
sta_code_parent             654
sta_comment                 305
sta_comment_impact_local    624
sta_date_altitude_ref       120
sta_date_end                580
sta_display_level           654
sta_kp                      583
sta_label_add               527
sta_purpose                  17
dtype: int64
[89]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
tsd_humid, tsd_pet_ou, tsd_pet_pe, tsd_pet_pm, tsd_prec, tsd_prec_solid_frac, tsd_q_l, tsd_q_mm,
tsd_rad_dli, tsd_rad_ssi, tsd_swe_isba, tsd_swi_gr, tsd_swi_isba, tsd_temp, tsd_temp_max,
tsd_temp_min, tsd_val_c, tsd_val_i, tsd_val_m, tsd_val_q, tsd_val_s, tsd_wind

CAMELS_SE

[90]:
# Open CAMELS_SE through the RainfallRunoff interface (data lives in the
# 'CAMELS' sub-folder of DATA_PATH) and print its one-line summary.
dataset = RainfallRunoff(
    'CAMELS_SE',
    path=os.path.join(DATA_PATH, 'CAMELS'),
    verbosity=0,
)
print(dataset)
CAMELS_SE with 50 stations, 4 dynamic and 76 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2541: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(
[91]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
Agriculture_percentage, Area_km2, Bedrock_percentage_sc, Clayey_till_and_clay_till_percentage_sc,
DOR, Elevation_mabsl, Forest_percentage, Glacier_percentage_sc, Glaciers_percentage,
Glaciofluvial_sediment_percentage_sc, Latitude_WGS84, Longitude_WGS84, Name, Open_land_percentage,
Peat_percentage_sc, Pmean_mm_year, Postglacial_sand_and_gravel_percentage_sc, RegVol_m3,
S01_Qmean_CNP_61_90, S01_Qmean_CNP_91_20, S01_Qmean_hs, S02_Qcoeff_CNP_61_90, S02_Qcoeff_CNP_91_20,
S02_Qcoeff_hs, S03_COM_CNP_61_90, S03_COM_CNP_91_20, S03_COM_hs, S04_SPD_CNP_61_90,
S04_SPD_CNP_91_20, S04_SPD_hs, S05_Qmean_spring_CNP_61_90, S05_Qmean_spring_CNP_91_20,
S05_Qmean_spring_hs, S06_Qmean_summer_CNP_61_90, S06_Qmean_summer_CNP_91_20, S06_Qmean_summer_hs,
S07_Qmean_autumn_CNP_61_90, S07_Qmean_autumn_CNP_91_20, S07_Qmean_autumn_hs,
S08_Qmean_winter_CNP_61_90, S08_Qmean_winter_CNP_91_20, S08_Qmean_winter_hs, S09_LFfreq_CNP_61_90,
S09_LFfreq_CNP_91_20, S09_LFfreq_hs, S10_T_minQ_d30_CNP_61_90, S10_T_minQ_d30_CNP_91_20,
S10_T_minQ_d30_hs, S11_minQ_d7_CNP_61_90, S11_minQ_d7_CNP_91_20, S11_minQ_d7_hs,
S12_minQ_d30_CNP_61_90, S12_minQ_d30_CNP_91_20, S12_minQ_d30_hs, S13_HFfreq_CNP_61_90,
S13_HFfreq_CNP_91_20, S13_HFfreq_hs, S14_T_maxQ_d1_CNP_61_90, S14_T_maxQ_d1_CNP_91_20,
S14_T_maxQ_d1_hs, S15_maxQ_d30_CNP_61_90, S15_maxQ_d30_CNP_91_20, S15_maxQ_d30_hs,
S16_maxQ_d1_CNP_61_90, S16_maxQ_d1_CNP_91_20, S16_maxQ_d1_hs, Shrubs_and_grassland_percentage,
Silt_percentage_sc, Slope_mean_degree, Till_and_weathered_deposit_percentage_sc, Till_percentage_sc,
Tmean_C, Urban_percentage, Water_percentage, Water_percentage_sc, Wetlands_percentage
[92]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(50, 76)
[93]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
0
[93]:
Agriculture_percentage                     0
Area_km2                                   0
Bedrock_percentage_sc                      0
Clayey_till_and_clay_till_percentage_sc    0
DOR                                        0
                                          ..
Tmean_C                                    0
Urban_percentage                           0
Water_percentage                           0
Water_percentage_sc                        0
Wetlands_percentage                        0
Length: 76, dtype: int64

Find the columns that have at least one NaN value.

[94]:
# Show the columns that contain missing values, or report that there are none.
# Jupyter only auto-displays the last *top-level* expression of a cell, so a
# bare expression nested inside the `if` branch would be silently discarded;
# `display` (provided by IPython in notebook sessions) renders it explicitly.
if df.isna().any().any():
    display(df.loc[:, df.isna().any()])
else:
    print('No NaN values')
No NaN values
[95]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[95]:
Series([], dtype: float64)
[96]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
airtemp_C_mean, pcp_mm, q_cms_obs, q_mmd_obs

CAMELS_IND

[97]:
# Open CAMELS_IND through the RainfallRunoff interface (data lives in the
# 'CAMELS' sub-folder of DATA_PATH) and print its one-line summary.
dataset = RainfallRunoff(
    'CAMELS_IND',
    path=os.path.join(DATA_PATH, 'CAMELS'),
    verbosity=0,
)
print(dataset)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3223: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(os.path.join(fpath),
CAMELS_IND with 472 stations, 20 dynamic and 210 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3234: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
  df = pd.read_csv(fpath,
[98]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
aet_gleam_mean, ai_mean, annual_max_1day, annual_max_30day, annual_max_3day, annual_max_7day,
annual_max_90day, annual_min_7day, annual_q, aridity_p_pet, aridity_pet_aet, asynchronicity,
bare_frac, bfi, built_area_frac, bulkdens_sub_major, bulkdens_sub_mean, bulkdens_top_major,
bulkdense_top_mean, carb_rocks_frac, cen_time, clay_frac_sub, clay_frac_top, crops_frac,
crops_frac_1985, crops_frac_1995, crops_frac_2005, cv_apr_flow, cv_aug_flow, cv_dec_flow,
cv_feb_flow, cv_jan_flow, cv_jul_flow, cv_jun_flow, cv_mar_flow, cv_may_flow, cv_nov_flow,
cv_oct_flow, cv_sep_flow, cwc_area, cwc_lat, cwc_lon, cwc_river, cwc_site_name, dom_land_cover,
dom_land_cover_frac, doy_max_flow, doy_max_flow_7, doy_min_flow, doy_min_flow_7, drinking_frac,
dspbar, elev_max, elev_mean, elev_median, elev_min, evap_canopy_anum, evap_canopy_max,
evap_canopy_mean, evap_canopy_min, evap_surface_anum, evap_surface_max, evap_surface_mean,
evap_surface_min, fall_days, fall_rate_mean, fall_rate_median, first_dam_year, flood_frac,
flooded_veg_frac, flow_availability, freq_q_high, freq_q_low, gauge_elevation, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, geol_permeability, geol_porosity,
ghi_area, ghi_group, ghi_lat, ghi_lon, ghi_stn_id, gini_flow, gravel_frac_sub, gravel_frac_top,
high_prec_dur, high_prec_freq, high_prec_timing, hsg_major, hydroelec_frac, irrigation_frac,
lai_diff, lai_max, lai_mean, lai_min, last_dam_year, low_prec_dur, low_prec_freq, low_prec_timing,
max_high_prec_dur, max_low_prec_dur, mean_anum_flow, mean_apr_flow, mean_atmn_flow, mean_aug_flow,
mean_dec_flow, mean_feb_flow, mean_jan_flow, mean_jul_flow, mean_jun_flow, mean_mar_flow,
mean_may_flow, mean_nov_flow, mean_oct_flow, mean_sep_flow, mean_sumr_flow, mean_swmn_flow,
mean_wint_flow, month_1day_max, month_1day_min, n_dams, navigation_frac, num_dams, num_hyd_alt,
org_carb_sub_major, org_carb_sub_mean, org_carb_top_major, org_carb_top_mean, organic_frac_sub,
organic_frac_top, overflow_frac, p_annual_variability, p_max, p_mean, p_mean_anum,
p_monthly_variability, p_unif, pet_gleam_mean, pet_max, pet_mean, pet_mean_anum, pet_min,
pop_density_2000, pop_density_2005, pop_density_2010, pop_density_2015, pop_density_2020, q_10,
q_25, q_25_swmn, q_50, q_50_swmn, q_5_swmn, q_75, q_75_swmn, q_90, q_95_swmn, q_cv, q_high_days,
q_low_days, q_mean, q_mean_swmn, q_zero, range_frac, rel_hum_mean, res_store_sum, reservoir_index,
rise_days, rise_rate_mean, rise_rate_median, river_basin, runoff_ratio, sand_frac_sub,
sand_frac_top, silt_frac_sub, silt_frac_top, slope_fdc, slope_max, slope_mean, slope_median,
slope_min, sm_lvl1_mean, sm_lvl2_mean, sm_lvl3_mean, sm_lvl4_mean, soil_awc_sub, soil_awc_top,
soil_awsc_major, soil_awsc_max, soil_awsc_min, soil_conductivity_sub, soil_conductivity_top,
soil_depth, srad_lw_mean, srad_sw_mean, streamflow_elas, tailing_frac, tmax_mean, tmin_mean,
total_storage, trees_frac, urban_frac_1985, urban_frac_1995, urban_frac_2005, water_frac, wind_mean,
wtd
[99]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(472, 210)
[100]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
20322
[100]:
aet_gleam_mean        0
ai_mean               0
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
                   ...
urban_frac_1995       0
urban_frac_2005       0
water_frac            0
wind_mean             0
wtd                   0
Length: 210, dtype: int64

Find the columns that have at least one NaN value.

[101]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[101]:
annual_max_1day annual_max_30day annual_max_3day annual_max_7day annual_max_90day annual_min_7day annual_q bfi bulkdens_sub_major bulkdens_sub_mean ... q_mean_swmn q_zero reservoir_index rise_days rise_rate_mean rise_rate_median runoff_ratio slope_fdc streamflow_elas tailing_frac
gauge_id
3001 NaN NaN NaN NaN NaN NaN NaN NaN 1.33 1.291356 ... 0.568 NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3002 756.807 136.155 485.599 284.301 90.494 0.000 828.584 0.372 1.45 1.450000 ... 4.587 86.667 0.000688 62.00 38.509 3.07 0.472 NaN 3.744 0.000000
3003 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.210000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3004 NaN NaN NaN NaN NaN NaN NaN NaN 1.21 1.211649 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
3005 14640.524 3780.572 11975.203 8184.747 2305.069 5.293 21163.952 0.385 1.21 1.218816 ... 3.028 0.000 0.507788 131.75 324.324 5.46 0.331 2.859 1.925 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17021 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.291081 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17022 370.817 98.813 269.619 191.648 60.805 0.060 322.091 0.254 NaN 1.318424 ... 0.005 148.950 1.030649 78.20 12.834 0.43 0.034 NaN 2.049 0.117647
17023 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.211304 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000
17024 609.535 154.046 460.329 312.861 81.274 0.000 356.873 0.169 NaN 1.320490 ... 0.001 306.800 0.942509 21.65 55.641 5.30 0.031 NaN 2.977 0.117647
17025 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.330000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

472 rows × 86 columns

[102]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[102]:
annual_max_1day     300
annual_max_30day    300
annual_max_3day     300
annual_max_7day     300
annual_max_90day    300
                   ...
rise_rate_median    299
runoff_ratio        244
slope_fdc           331
streamflow_elas     271
tailing_frac         66
Length: 86, dtype: int64
[103]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
aet_mm_gleam, airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_canopy(kg/m2/s),
evap_surface(kg/m2/s), lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_gleam, q_cms_obs, rh_%, sm_lvl1(kg/m2),
sm_lvl2(kg/m2), sm_lvl3(kg/m2), sm_lvl4(kg/m2), solrad_wm2, windspeed_mps, windspeedu_mps,
windspeedv_mps

Caravan_DK

[104]:
# Open Caravan_DK through the RainfallRunoff interface (data is read from
# DATA_PATH directly) and print its one-line summary.
dataset = RainfallRunoff(
    'Caravan_DK',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Caravan_DK with 308 stations, 39 dynamic and 211 static features
[105]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj,
cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08,
cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn,
dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj,
fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav, gdp_ud_ssu,
gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06,
glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14,
glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22,
gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt,
inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu, low_prec_dur,
low_prec_freq, moisture_index, nli_ix_sav, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[106]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(308, 211)
[107]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
0
[107]:
aet_mm_s01    0
aet_mm_s02    0
aet_mm_s03    0
aet_mm_s04    0
aet_mm_s05    0
             ..
wet_pc_s07    0
wet_pc_s08    0
wet_pc_s09    0
wet_pc_sg1    0
wet_pc_sg2    0
Length: 211, dtype: int64

Find the columns that have at least one NaN value.

[108]:
# Keep only the columns in which at least one value is missing
# (an empty selection still shows the row index).
df.loc[:, df.isna().any()]
[108]:
240001
590006
340003
450043
100009
...
610013
180078
150046
490082
20006

308 rows × 0 columns

[109]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[109]:
Series([], dtype: float64)
[110]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
dewpoint_temperature_2m_max, dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min,
potential_evaporation_sum, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, streamflow, surface_net_solar_radiation_max,
surface_net_solar_radiation_mean, surface_net_solar_radiation_min,
surface_net_thermal_radiation_max, surface_net_thermal_radiation_mean,
surface_net_thermal_radiation_min, surface_pressure_max, surface_pressure_mean,
surface_pressure_min, temperature_2m_max, temperature_2m_mean, temperature_2m_min,
total_precipitation_sum, u_component_of_wind_10m_max, u_component_of_wind_10m_mean,
u_component_of_wind_10m_min, v_component_of_wind_10m_max, v_component_of_wind_10m_mean,
v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max, volumetric_soil_water_layer_1_mean,
volumetric_soil_water_layer_1_min, volumetric_soil_water_layer_2_max,
volumetric_soil_water_layer_2_mean, volumetric_soil_water_layer_2_min,
volumetric_soil_water_layer_3_max, volumetric_soil_water_layer_3_mean,
volumetric_soil_water_layer_3_min, volumetric_soil_water_layer_4_max,
volumetric_soil_water_layer_4_mean, volumetric_soil_water_layer_4_min

LamaHCE

[111]:
# Open LamaHCE with timestep='D' and data_type='total_upstrm' through the
# RainfallRunoff interface and print its one-line summary.
dataset = RainfallRunoff(
    'LamaHCE',
    timestep='D',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
LamaHCE with 859 stations, 22 dynamic and 80 static features
[112]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
agr_fra, area_calc, area_gov, area_ratio, aridity, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps, gc_dom, gc_ig_fra, gc_mt_fra, gc_pa_fra, gc_pb_fra,
gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra, gc_vb_fra, gc_wb_fra,
geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max, hi_prec_du, hi_prec_fr,
hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du, lo_prec_fr, lo_prec_ti, lon,
mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean,
p_season, region, river, root_dep, sand_fra, silt_fra, slope_mean, soil_condu, soil_poros,
soil_tawc, strm_dens, typimpact, urban_fra
[113]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(859, 80)
[114]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
46
[114]:
agr_fra       0
area_calc     0
area_gov      0
area_ratio    0
aridity       0
             ..
soil_poros    0
soil_tawc     0
strm_dens     0
typimpact     0
urban_fra     0
Length: 80, dtype: int64

Find the columns that have at least one NaN value.

[115]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[115]:
geol_perme hi_prec_ti lo_prec_ti
ID
826 -12.4 NaN son
819 -11.5 son djf
79 -13.3 jja djf
696 -12.2 jja djf
98 -12.0 jja djf
... ... ... ...
261 -12.1 jja djf
587 -12.9 jja djf
827 -12.6 jja son
250 -13.4 jja djf
72 -12.4 jja djf

859 rows × 3 columns

[116]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[116]:
geol_perme     1
hi_prec_ti    42
lo_prec_ti     3
dtype: int64
[117]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
airpres_hpa, airtemp_C_max, airtemp_C_mean, airtemp_C_min, dptemp_C_max_2m, dptemp_C_mean_2m,
dptemp_C_min_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm, q_cms_obs, solrad_wm2, solrad_wm2_max,
swe_mm, thermrad_wm2, thermrad_wm2_max, total_et, volsw_123, volsw_4, windspeedu_mps, windspeedv_mps
[118]:
# Re-open LamaHCE, this time with timestep='H' (same data_type and path),
# and print its one-line summary.
dataset = RainfallRunoff(
    'LamaHCE',
    timestep='H',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
LamaHCE with 859 stations, 16 dynamic and 84 static features
[119]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
agr_fra, area_calc, area_gov, area_ratio, arid_1, arid_2, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps_post, gaps_pre, gc_dom, gc_ig_fra, gc_mt_fra,
gc_pa_fra, gc_pb_fra, gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra,
gc_vb_fra, gc_wb_fra, geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max,
hi_prec_du, hi_prec_fr, hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du,
lo_prec_fr, lo_prec_ti, lon, mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, nrs_euhyd, nrs_rivat,
obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean, p_season, region, river, root_dep, sand_fra,
silt_fra, slope_mean, soil_condu, soil_poros, soil_tawc, strm_dens, typimpact, urban_fra
[120]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(859, 84)
[121]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
65
[121]:
agr_fra       0
area_calc     0
area_gov      0
area_ratio    0
arid_1        0
             ..
soil_poros    0
soil_tawc     0
strm_dens     0
typimpact     0
urban_fra     0
Length: 84, dtype: int64

Find the columns that have at least one NaN value.

[122]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[122]:
geol_perme hi_prec_ti lo_prec_ti nrs_rivat
ID
826 -12.4 NaN son 20376803.0
819 -11.5 son djf 20464042.0
79 -13.3 jja djf 20454049.0
696 -12.2 jja djf 20424102.0
98 -12.0 jja djf 20440228.0
... ... ... ... ...
261 -12.1 jja djf 20428827.0
587 -12.9 jja djf 20461304.0
827 -12.6 jja son 20379436.0
250 -13.4 jja djf 20441631.0
72 -12.4 jja djf 20451775.0

859 rows × 4 columns

[123]:
# Missing-value count for every column that contains at least one NaN.
df.isna().loc[:, df.isna().any()].sum()
[123]:
geol_perme     1
hi_prec_ti    42
lo_prec_ti     3
nrs_rivat     19
dtype: int64
[124]:
# Sort the dynamic feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
airpres_hpa, airtemp_C_mean, dptemp_C_mean_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm,
q_cms_obs, solrad_wm2, swe_mm, thermrad_wm2, total_et, volsw_123, volsw_4, windspeedu_mps,
windspeedv_mps

LamaHIce

[125]:
# Open LamaHIce with timestep='D' and data_type='total_upstrm' from the
# 'LamaHIce_daily' sub-folder of DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'LamaHIce',
    timestep='D',
    data_type='total_upstrm',
    path=os.path.join(DATA_PATH, 'LamaHIce_daily'),
    verbosity=0,
)
print(dataset)
LamaHIce with 111 stations, 36 dynamic and 154 static features
[126]:
# Sort the static feature names in place, then print them as one
# comma-separated list wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
ET_ERA5L_all_basin, ET_ERA5L_unfiltered_basin, ET_rav_all_basin, ET_rav_unfiltered_basin,
PET_ERA5L_all_basin, PET_ERA5L_unfiltered_basin, PET_rav_all_basin, PET_rav_unfiltered_basin,
P_ERA5L_all_basin, P_ERA5L_unfiltered_basin, P_rav_all_basin, P_rav_unfiltered_basin, Q5_basin,
Q5_gauge, Q95_basin, Q95_gauge, Q_all_basin, Q_unfiltered_basin, VHM_no_gauge, V_no_gauge,
agr_fra_basin, area_calc_basin, aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin,
baseflow_index_ladson_basin, baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin,
degimpact_basin, degimpact_gauge, elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin,
elevation_gauge, elon_ratio_basin, forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin,
g621_fra_basin, g701_fra_basin, g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin,
g_dom_NI_basin, g_frac_basin, g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin,
g_min_el_basin, g_slope_basin, g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin,
gbnew_fra_basin, gbold_fra_basin, gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin,
gc_23_vapy_basin, gc_23_vb_basin, gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin,
gc_pb_fra_basin, gc_va_fra_basin, gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin,
ghraun_fra_basin, glac_fra_basin, gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin,
gsn_fra_basin, gsnew_fra_basin, gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin,
hfd_mean_gauge, high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin,
high_prec_fr_basin, high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin,
high_q_dur_gauge, high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin,
lake_fra_basin, lat_gauge, lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, lon_gauge,
low_prec_du_ERA5L_basin, low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin,
low_q_dur_basin, low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin,
mvert_dist_basin, name_gauge, ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge,
obsend_day_gauge, obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin,
p_season_ERA5L_basin, p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge,
ref_et_mean_basin, river_gauge, root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge,
sand_fra_basin, scrub_fra_basin, silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mean_basin,
soil_poros_basin, soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin,
typimpact_basin, typimpact_gauge, urban_fra_basin, water_year_all_basin,
water_year_unfiltered_basin, wetl_fra_basin, zero_q_freq_gauge
[127]:
# Load the static features table and print its (rows, columns) shape.
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(111, 154)
[128]:
# Print the total number of missing entries in the table, then show the
# per-column missing-value counts as the cell output.
print(df.isna().to_numpy().sum())
df.isna().sum()
2013
[128]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
                               ..
urban_fra_basin                 0
water_year_all_basin           37
water_year_unfiltered_basin    14
wetl_fra_basin                  0
zero_q_freq_gauge              37
Length: 154, dtype: int64

Find the columns that have at least one NaN value.

[129]:
# Keep only the columns in which at least one value is missing.
df.loc[:, df.isna().any()]
[129]:
ET_ERA5L_all_basin ET_ERA5L_unfiltered_basin ET_rav_all_basin ET_rav_unfiltered_basin PET_ERA5L_all_basin PET_ERA5L_unfiltered_basin PET_rav_all_basin PET_rav_unfiltered_basin P_ERA5L_all_basin P_ERA5L_unfiltered_basin ... q_mean_gauge runoff_ratio_basin runoff_ratio_gauge slope_fdc_basin slope_fdc_gauge stream_elas_basin stream_elas_gauge water_year_all_basin water_year_unfiltered_basin zero_q_freq_gauge
id
79 0.640192 0.636492 0.634929 0.633861 1.295533 1.293016 0.547516 0.557275 4.648427 4.631319 ... 9.389204 1.786 1.785626 0.404 0.404174 0.507 0.506978 2000.272524 1999.000666 0.0
98 0.603482 0.593119 0.599214 0.588846 2.000104 1.942428 0.559626 0.536703 4.717848 4.647344 ... 5.951890 1.178 1.178091 0.561 0.560945 0.666 0.666312 2010.601150 2011.000632 0.0
25 1.378187 1.396363 0.668298 0.665824 4.609922 4.601343 0.529151 0.506576 5.546952 5.682186 ... 10.316638 2.346 2.345917 1.441 1.440679 0.579 0.579003 2004.386057 2003.470929 0.0
1 0.710055 0.718156 0.651314 0.655982 1.472727 1.473907 0.616380 0.603120 3.769304 3.859496 ... 4.546374 1.115 1.114759 2.715 2.715291 1.365 1.364999 2010.248973 2010.499609 0.0
34 NaN 0.436237 NaN 0.282329 NaN 0.656889 NaN 0.243403 NaN 3.560044 ... NaN NaN NaN NaN NaN NaN NaN NaN 1999.000666 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
94 NaN 0.565821 NaN 0.646134 NaN 1.009009 NaN 0.569345 NaN 2.851701 ... NaN NaN NaN NaN NaN NaN NaN NaN 2010.014082 NaN
54 0.286762 0.318574 0.428488 0.405101 0.409014 0.466285 0.490258 0.472720 4.375074 4.262709 ... 6.364638 1.014 1.014307 3.767 3.766742 0.929 0.929429 1997.000000 2000.226138 0.0
77 0.383486 0.368716 0.464558 0.457334 0.557757 0.534627 0.620087 0.598473 3.094146 3.139548 ... 2.217995 0.781 0.780813 1.384 1.383765 0.459 0.458724 2000.151013 2002.000000 0.0
80 0.649492 0.646736 0.529909 0.521922 1.126047 1.143551 0.468229 0.485154 4.652074 4.312049 ... 4.225174 0.871 0.870663 1.945 1.945021 0.913 0.913007 2008.890240 2009.224360 0.0
72 NaN 0.476955 NaN 0.385069 NaN 0.928519 NaN 0.511240 NaN 3.048227 ... NaN NaN NaN NaN NaN NaN NaN NaN 2007.389354 NaN

111 rows × 53 columns

[130]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[130]:
ET_ERA5L_all_basin             37
ET_ERA5L_unfiltered_basin      14
ET_rav_all_basin               37
ET_rav_unfiltered_basin        14
PET_ERA5L_all_basin            37
PET_ERA5L_unfiltered_basin     14
PET_rav_all_basin              37
PET_rav_unfiltered_basin       14
P_ERA5L_all_basin              37
P_ERA5L_unfiltered_basin       14
P_rav_all_basin                37
P_rav_unfiltered_basin         14
Q5_basin                       37
Q5_gauge                       37
Q95_basin                      37
Q95_gauge                      37
Q_all_basin                    37
Q_unfiltered_basin             14
baseflow_index_ladson_basin    37
baseflow_index_ladson_gauge    37
g_aspect_basin                 47
g_lat_basin                    47
g_lon_basin                    47
g_max_el_basin                 47
g_mean_el_basin                47
g_min_el_basin                 47
g_slope_basin                  47
g_slopel20_basin               47
gaps_hourly_gauge              35
hfd_mean_basin                 42
hfd_mean_gauge                 42
high_prec_timing_basin          4
high_q_dur_basin               67
high_q_dur_gauge               67
high_q_freq_basin              67
high_q_freq_gauge              67
low_prec_timing_ERA5L_basin     2
low_prec_timing_basin           1
low_q_dur_basin                70
low_q_dur_gauge                70
low_q_freq_basin               70
low_q_freq_gauge               70
q_mean_basin                   37
q_mean_gauge                   37
runoff_ratio_basin             37
runoff_ratio_gauge             37
slope_fdc_basin                37
slope_fdc_gauge                37
stream_elas_basin              37
stream_elas_gauge              37
water_year_all_basin           37
water_year_unfiltered_basin    14
zero_q_freq_gauge              37
dtype: int64
[131]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp_max, 2m_dp_temp_mean,
2m_dp_temp_min, 2m_qv_rav, 2m_temp_rav, fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, max_temp_C,
mean_temp_C, min_temp_C, obs_q_cms, pcp_mm, pet_mm, prec_carra, prec_rav, ref_et_mm,
surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad_max, surf_net_solar_rad_mean,
surf_net_therm_rad_max, surf_net_therm_rad_mean, surf_outg_therm_rad_rav, surf_press,
surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4
[132]:
# Load LamaHIce with timestep='H' and data_type='total_upstrm', then print
# the dataset's one-line summary (station and feature counts).
dataset = RainfallRunoff(
    'LamaHIce',
    timestep='H',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
LamaHIce with 76 stations, 28 dynamic and 138 static features
[133]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5_basin, Q5_gauge, Q95_basin, Q95_gauge, VHM_no_gauge, V_no_gauge, agr_fra_basin, area_calc_basin,
aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin, baseflow_index_ladson_basin,
baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin, degimpact_basin, degimpact_gauge,
elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin, elevation_gauge, elon_ratio_basin,
forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin, g621_fra_basin, g701_fra_basin,
g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin, g_dom_NI_basin, g_frac_basin,
g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin, g_min_el_basin, g_slope_basin,
g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin, gbnew_fra_basin, gbold_fra_basin,
gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin, gc_23_vapy_basin, gc_23_vb_basin,
gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin, gc_pb_fra_basin, gc_va_fra_basin,
gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin, ghraun_fra_basin, glac_fra_basin,
gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin, gsn_fra_basin, gsnew_fra_basin,
gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin, hfd_mean_gauge,
high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin, high_prec_fr_basin,
high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin, high_q_dur_gauge,
high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin, lake_fra_basin, lat_gauge,
lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, lon_gauge, low_prec_du_ERA5L_basin,
low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin, low_q_dur_basin,
low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin, mvert_dist_basin, name_gauge,
ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge, obsend_day_gauge,
obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin, p_season_ERA5L_basin,
p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge, ref_et_mean_basin, river_gauge,
root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge, sand_fra_basin, scrub_fra_basin,
silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mean_basin, soil_poros_basin,
soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin, typimpact_basin,
typimpact_gauge, urban_fra_basin, wetl_fra_basin, zero_q_freq_gauge
[134]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(76, 138)
[135]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
953
[135]:
Q5_basin             18
Q5_gauge             18
Q95_basin            18
Q95_gauge            18
VHM_no_gauge          0
                     ..
typimpact_basin       0
typimpact_gauge       0
urban_fra_basin       0
wetl_fra_basin        0
zero_q_freq_gauge    18
Length: 138, dtype: int64

Find the columns that have at least one NaN value.

[136]:
# Keep only the columns in which at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[136]:
Q5_basin Q5_gauge Q95_basin Q95_gauge baseflow_index_ladson_basin baseflow_index_ladson_gauge g_aspect_basin g_lat_basin g_lon_basin g_max_el_basin ... low_q_freq_gauge q_mean_basin q_mean_gauge runoff_ratio_basin runoff_ratio_gauge slope_fdc_basin slope_fdc_gauge stream_elas_basin stream_elas_gauge zero_q_freq_gauge
id
79 7.498 7.497704 12.688 12.687769 0.901 0.900813 193.420 448938.051 417760.006 1332.644 ... NaN 9.389 9.389204 1.786 1.785626 0.404 0.404174 0.507 0.506978 0.0
98 4.037 4.036522 9.131 9.131344 0.822 0.821769 161.933 462008.994 449955.798 1768.421 ... NaN 5.952 5.951890 1.178 1.178091 0.561 0.560945 0.666 0.666312 0.0
25 2.593 2.592731 24.071 24.071443 0.756 0.755520 NaN NaN NaN NaN ... 10.581360 10.317 10.316638 2.346 2.345917 1.441 1.440679 0.579 0.579003 0.0
1 0.996 0.996230 12.549 12.549089 0.561 0.560762 NaN NaN NaN NaN ... 6.084722 4.546 4.546374 1.115 1.114759 2.715 2.715291 1.365 1.364999 0.0
34 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
61 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
94 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
54 0.419 0.418586 28.214 28.213980 0.388 0.387571 105.826 465630.107 671331.688 1333.372 ... 110.276178 6.365 6.364638 1.014 1.014307 3.767 3.766742 0.929 0.929429 0.0
80 1.272 1.272295 11.077 11.076516 0.583 0.583472 NaN NaN NaN NaN ... 0.339978 4.225 4.225174 0.871 0.870663 1.945 1.945021 0.913 0.913007 0.0
72 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

76 rows × 35 columns

[137]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[137]:
Q5_basin                       18
Q5_gauge                       18
Q95_basin                      18
Q95_gauge                      18
baseflow_index_ladson_basin    18
baseflow_index_ladson_gauge    18
g_aspect_basin                 36
g_lat_basin                    36
g_lon_basin                    36
g_max_el_basin                 36
g_mean_el_basin                36
g_min_el_basin                 36
g_slope_basin                  36
g_slopel20_basin               36
hfd_mean_basin                 21
hfd_mean_gauge                 21
high_prec_timing_basin          3
high_q_dur_basin               41
high_q_dur_gauge               41
high_q_freq_basin              41
high_q_freq_gauge              41
low_prec_timing_ERA5L_basin     2
low_q_dur_basin                46
low_q_dur_gauge                46
low_q_freq_basin               46
low_q_freq_gauge               46
q_mean_basin                   18
q_mean_gauge                   18
runoff_ratio_basin             18
runoff_ratio_gauge             18
slope_fdc_basin                18
slope_fdc_gauge                18
stream_elas_basin              18
stream_elas_gauge              18
zero_q_freq_gauge              18
dtype: int64
[138]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp, 2m_qv_rav, 2m_temp_rav,
fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, mean_temp_C, obs_q_cms, pcp_mm, pet_mm, prec_rav,
surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad, surf_net_therm_rad,
surf_outg_therm_rad_rav, surf_press, surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4

HYSETS

[139]:
# HYSETS is read from its own sub-folder of DATA_PATH; print the dataset's
# one-line summary (station and feature counts).
hysets_path = os.path.join(DATA_PATH, 'HYSETS')
dataset = RainfallRunoff('HYSETS', path=hysets_path, verbosity=0)
print(dataset)
HYSETS with 14425 stations, 5 dynamic and 28 static features
[140]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Centroid_Lat_deg_N, Centroid_Lon_deg_E, Drainage_Area_GSIM_km2, Drainage_Area_km2,
Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries, Flag_Land_Use_Extraction,
Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction, Gravelius,
Land_Use_Crops_frac, Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac,
Land_Use_Snow_Ice_frac, Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name,
Official_ID, Perimeter, Permeability_logk_m2, Porosity_frac, Slope_deg, Source
[141]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(14425, 28)
[142]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
20179
[142]:
Source                            0
Name                              0
Official_ID                       0
Centroid_Lat_deg_N                0
Centroid_Lon_deg_E                0
Drainage_Area_km2                 0
Drainage_Area_GSIM_km2        13561
Flag_GSIM_boundaries              0
Flag_Artificial_Boundaries        0
Elevation_m                       6
Slope_deg                         6
Gravelius                      1633
Perimeter                      1633
Flag_Shape_Extraction             0
Aspect_deg                        6
Flag_Terrain_Extraction           0
Land_Use_Forest_frac             13
Land_Use_Grass_frac              13
Land_Use_Wetland_frac            13
Land_Use_Water_frac              13
Land_Use_Urban_frac              13
Land_Use_Shrubs_frac             13
Land_Use_Crops_frac              13
Land_Use_Snow_Ice_frac           13
Flag_Land_Use_Extraction          0
Permeability_logk_m2           1615
Porosity_frac                  1615
Flag_Subsoil_Extraction           0
dtype: int64

Find the columns that have at least one NaN value.

[143]:
# Keep only the columns in which at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[143]:
Drainage_Area_GSIM_km2 Elevation_m Slope_deg Gravelius Perimeter Aspect_deg Land_Use_Forest_frac Land_Use_Grass_frac Land_Use_Wetland_frac Land_Use_Water_frac Land_Use_Urban_frac Land_Use_Shrubs_frac Land_Use_Crops_frac Land_Use_Snow_Ice_frac Permeability_logk_m2 Porosity_frac
Watershed_ID
1 NaN 362.3 3.5329 2.7834 1194.505 130.4023 0.7869 0.0147 0.0645 0.0258 0.0089 0.0749 0.0242 0.0 -14.719327 0.180905
2 NaN 353.4 4.6633 2.0656 269.164 91.7329 0.8452 0.0102 0.0228 0.0219 0.0174 0.0410 0.0414 0.0 -14.056491 0.206450
3 2693.814 293.3 4.4690 2.0620 381.994 223.9510 0.8207 0.0093 0.0032 0.0487 0.0230 0.0351 0.0600 0.0 -14.537390 0.165357
4 NaN 276.5 4.1819 2.4682 413.839 120.7400 0.6837 0.0226 0.1024 0.0630 0.0115 0.0641 0.0528 0.0 -14.687869 0.170597
5 NaN 201.8 2.8061 NaN NaN 56.8902 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14421 NaN 1987.9 17.1982 2.0752 208.852 28.9860 0.5356 0.0330 0.0000 0.0000 0.0202 0.0170 0.3941 0.0 -13.160658 0.096755
14422 NaN 769.5 6.5921 1.5715 325.714 110.5607 0.1348 0.3106 0.0025 0.0024 0.0305 0.0300 0.4874 0.0 -12.698509 0.119993
14423 NaN 1883.2 14.7005 2.5953 1621.229 224.3422 0.8674 0.0437 0.0000 0.0026 0.0027 0.0429 0.0408 0.0 -12.976926 0.090284
14424 NaN 1791.2 12.1021 2.4269 1288.932 184.5177 0.7720 0.1524 0.0000 0.0013 0.0029 0.0474 0.0241 0.0 -12.968686 0.094042
14425 NaN 2179.1 5.9444 2.0769 165.762 112.0832 0.1605 0.5639 0.0000 0.0012 0.0091 0.1116 0.1536 0.0 -12.792099 0.168963

14425 rows × 16 columns

[144]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[144]:
Drainage_Area_GSIM_km2    13561
Elevation_m                   6
Slope_deg                     6
Gravelius                  1633
Perimeter                  1633
Aspect_deg                    6
Land_Use_Forest_frac         13
Land_Use_Grass_frac          13
Land_Use_Wetland_frac        13
Land_Use_Water_frac          13
Land_Use_Urban_frac          13
Land_Use_Shrubs_frac         13
Land_Use_Crops_frac          13
Land_Use_Snow_Ice_frac       13
Permeability_logk_m2       1615
Porosity_frac              1615
dtype: int64
[145]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
discharge, pr, swe, tasmax, tasmin

GRDCCaravan

[146]:
# Load GRDCCaravan from DATA_PATH and print the dataset's one-line summary
# (station and feature counts).
dataset = RainfallRunoff(
    'GRDCCaravan',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
GRDCCaravan with 5357 stations, 39 dynamic and 211 static features
[147]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj,
cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08,
cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn,
dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj,
fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav, gdp_ud_ssu,
gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06,
glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14,
glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22,
gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt,
inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu, low_prec_dur,
low_prec_freq, moisture_index, nli_ix_sav, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[148]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(5357, 211)
[149]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[149]:
gauge_lat         0
gauge_lon         0
gauge_name        0
country           0
area              0
                 ..
seasonality       0
high_prec_freq    0
high_prec_dur     0
low_prec_freq     0
low_prec_dur      0
Length: 211, dtype: int64

Find the columns that have at least one NaN value.

[150]:
# Show the columns that contain at least one NaN, or say that there are none.
# A bare expression nested inside an `if` block is not rendered by the notebook
# (only a cell's last top-level expression is), so the table is handed to
# display() explicitly. display() is available in the notebook namespace
# without an import under IPython.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[151]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[151]:
Series([], dtype: float64)
[152]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m, dewpoint_temperature_2m_max,
dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min, pcp_mm, potential_evaporation_sum,
q_cms_obs, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, surface_net_solar_radiation_max, surface_net_solar_radiation_mean,
surface_net_solar_radiation_min, surface_net_thermal_radiation_max,
surface_net_thermal_radiation_mean, surface_net_thermal_radiation_min, surface_pressure_max,
surface_pressure_mean, surface_pressure_min, u_component_of_wind_10m_max,
u_component_of_wind_10m_mean, u_component_of_wind_10m_min, v_component_of_wind_10m_max,
v_component_of_wind_10m_mean, v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max,
volumetric_soil_water_layer_1_mean, volumetric_soil_water_layer_1_min,
volumetric_soil_water_layer_2_max, volumetric_soil_water_layer_2_mean,
volumetric_soil_water_layer_2_min, volumetric_soil_water_layer_3_max,
volumetric_soil_water_layer_3_mean, volumetric_soil_water_layer_3_min,
volumetric_soil_water_layer_4_max, volumetric_soil_water_layer_4_mean,
volumetric_soil_water_layer_4_min

CCAM

[153]:
# Load CCAM from DATA_PATH and print the dataset's one-line summary
# (station and feature counts).
dataset = RainfallRunoff(
    'CCAM',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CCAM with 102 stations, 16 dynamic and 124 static features
[154]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, barren, bdticm, bldfie_sl1, bldfie_sl2, bldfie_sl3, bldfie_sl4, bldfie_sl5, bldfie_sl6,
bldfie_sl7, cecsol_sl1, cecsol_sl2, cecsol_sl3, cecsol_sl4, cecsol_sl5, cecsol_sl6, cecsol_sl7,
circulatory_ratio, clay, closed_shrubland, compactness_coefficient, cropland,
cropland_natural_vegetaion, deciduous_broadleaf_tree, deciduous_needleleaf_tree, elev,
elongation_ratio, ev, evergreen_broadleaf_tree, evergreen_needleleaf_tree, evp_mean, form_factor,
frac_snow_daily, geol_permeability, geol_porosity, grassland, grav, gst_mean, high_prec_dur,
high_prec_freq, high_prec_timing, ig, lai_dif, lai_max, lat, length, length_continuous_runoff,
log_k_s_l1, log_k_s_l2, log_k_s_l3, log_k_s_l4, log_k_s_l5, log_k_s_l6, lon, low_prec_dur,
low_prec_freq, low_prec_timing, mixed_forest, mt, nd, ndvi_mean, open_shrubland, orcdrc_sl1,
orcdrc_sl2, orcdrc_sl3, orcdrc_sl4, orcdrc_sl5, orcdrc_sl6, orcdrc_sl7, pa, pb, pdep,
permanent_wetland, pet_mean, phihox_sl1, phihox_sl2, phihox_sl3, phihox_sl4, phihox_sl5, phihox_sl6,
phihox_sl7, pi, pop, pop_dnsty, por, pre_mean, prs_mean, py, rhu_mean, root_depth_50, root_depth_99,
sand, savanna, sc, shape_factor, silt, slope, sm, snow_and_ice, som, ss, ssd_mean, su, tem_mean,
theta_s_l1, theta_s_l2, theta_s_l3, theta_s_l4, theta_s_l5, theta_s_l6, tksatu_l1, tksatu_l2,
tksatu_l3, tksatu_l4, tksatu_l5, tksatu_l6, urban_and_built-up_land, va, vb, vi, water_bodies, wb,
win_mean, woody_savanna
[155]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(102, 124)
[156]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[156]:
area             0
barren           0
bdticm           0
bldfie_sl1       0
bldfie_sl2       0
                ..
vi               0
water_bodies     0
wb               0
win_mean         0
woody_savanna    0
Length: 124, dtype: int64

Find the columns that have at least one NaN value.

[157]:
# Show the columns that contain at least one NaN, or say that there are none.
# A bare expression nested inside an `if` block is not rendered by the notebook
# (only a cell's last top-level expression is), so the table is handed to
# display() explicitly. display() is available in the notebook namespace
# without an import under IPython.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[158]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[158]:
Series([], dtype: float64)
[159]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
evp, gst_max, gst_mean, gst_min, pre, prs_max, prs_mean, prs_min, q, rhu, ssd, tem_max, tem_mean,
tem_min, win_max, win_mean

Japan

[160]:
# Load the Japan dataset from DATA_PATH and print its one-line summary
# (station and feature counts).
dataset = RainfallRunoff(
    'Japan',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Japan with 751 stations, 27 dynamic and 35 static features

The static features of Japan are the same as those of GSHA.

[161]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[162]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(751, 35)
[163]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
265
[163]:
EVP_uncertainty(%)      78
HYRIV_ID                 0
LRAD_uncertainty(%)     66
P_uncertainty(%)         0
SRAD_uncertainty(%)      0
T_uncertainty(%)         0
agency                   0
area                     0
cly_pc_uav               0
ele_mt_uav               0
ero_kh_uav               0
gla_pc_use               0
glc_cl_cmj               0
gwt_cm_cav               0
inu_pc_ult               0
lat                      0
lit_cl_cmj               0
long                     0
pet_uncertainty(%)     121
pnv_cl_cmj               0
prm_pc_use               0
sgr_dk_rav               0
slp_dg_uav               0
slt_pc_uav               0
snd_pc_uav               0
wet_pc_u01               0
wet_pc_u02               0
wet_pc_u03               0
wet_pc_u04               0
wet_pc_u05               0
wet_pc_u06               0
wet_pc_u07               0
wet_pc_u08               0
wet_pc_u09               0
wind_uncertainty(%)      0
dtype: int64

Find the columns that have at least one NaN value.

[164]:
# Show the columns that contain at least one NaN, or say that there are none.
# A bare expression nested inside an `if` block is not rendered by the notebook
# (only a cell's last top-level expression is), so without an explicit display()
# this cell produces no output even when NaNs are present. display() is
# available in the notebook namespace without an import under IPython.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[165]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[165]:
EVP_uncertainty(%)      78
LRAD_uncertainty(%)     66
pet_uncertainty(%)     121
dtype: int64
[166]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Ireland

[167]:
# Load the Ireland dataset from DATA_PATH and print its one-line summary
# (station and feature counts).
dataset = RainfallRunoff(
    'Ireland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Ireland with 464 stations, 10 dynamic and 208 static features

The static features of Ireland are the same as those of EStreams.

[168]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[169]:
# Fetch the static features of the current dataset as one table and
# print the table's dimensions as (rows, columns).
df = dataset.fetch_static_features()
table_shape = df.shape
print(table_shape)
(464, 208)
[170]:
# Count missing values per column once; print the overall total and
# leave the per-column counts as the cell's displayed result.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
9797
[170]:
static_features
area                16
area_calc            0
area_flag            0
area_perc           16
aridity            208
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        208
Length: 208, dtype: int64

Find the columns that have at least one NaN value.

[171]:
# Show the columns that contain at least one NaN, or say that there are none.
# A bare expression nested inside an `if` block is not rendered by the notebook
# (only a cell's last top-level expression is), so without an explicit display()
# this cell produces no output even when NaNs are present. display() is
# available in the notebook namespace without an import under IPython.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[172]:
# Missing-value counts restricted to the columns that contain at least one NaN.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[172]:
static_features
area                    16
area_perc               16
aridity                208
baseflow_index         208
bedrk_dep                1
                      ...
soil_tawc_p90            1
start_date             137
start_date_climatic    208
start_date_hydro       204
zero_q_freq            208
Length: 111, dtype: int64
[173]:
# List the dataset's dynamic features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.dynamic_features`
# is not reordered in place as a side effect of this display cell.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps

Finland

[174]:
# Load the Finland dataset from DATA_PATH and print its one-line summary
# (station and feature counts).
dataset = RainfallRunoff(
    'Finland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Finland with 669 stations, 10 dynamic and 208 static features

The static features of Finland are the same as those of EStreams.

[175]:
# List the dataset's static features alphabetically, wrapped to 100 characters.
# sorted() builds a new list, so the object returned by `dataset.static_features`
# is not reordered in place as a side effect of this display cell.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[176]:
# Fetch the static-feature table of the current dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(669, 208)
[177]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
10791
[177]:
static_features
area               126
area_calc            0
area_flag            0
area_perc          126
aridity            176
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        196
Length: 208, dtype: int64

Find the columns that contain at least one NaN value.

[178]:
# Boolean mask over columns: True where the column holds at least one NaN.
nan_mask = df.isna().any()
if nan_mask.any():
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside `if` is silently discarded, so the frame has to
    # be displayed explicitly (`display` is provided by the IPython kernel).
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
[179]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[179]:
static_features
area                   126
area_perc              126
aridity                176
baseflow_index         199
dam_yr_first           590
                      ...
soil_tawc_p90            1
start_date               6
start_date_climatic    176
start_date_hydro       196
zero_q_freq            196
Length: 111, dtype: int64
[180]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps

Italy

[181]:
# Open the 'Italy' dataset through the unified RainfallRunoff interface, using
# DATA_PATH as the data location, and print its one-line summary.
dataset = RainfallRunoff('Italy', path=DATA_PATH, verbosity=0)
print(dataset)
Italy with 294 stations, 10 dynamic and 208 static features

The static features of Italy are the same as those of EStreams.

[182]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[183]:
# Fetch the static-feature table for Italy and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(294, 208)
[184]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
4122
[184]:
static_features
area               106
area_calc            0
area_flag            0
area_perc          106
aridity             46
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq         86
Length: 208, dtype: int64

Find the columns that contain at least one NaN value.

[185]:
# Boolean mask over columns: True where the column holds at least one NaN.
nan_mask = df.isna().any()
if nan_mask.any():
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside `if` is silently discarded, so the frame has to
    # be displayed explicitly (`display` is provided by the IPython kernel).
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
[186]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[186]:
static_features
area                         106
area_perc                    106
aridity                       46
baseflow_index                87
dam_yr_first                 265
dam_yr_last                  265
duplicated_suspect           286
elevation                    219
end_date_climatic             46
end_date_hydro                85
frac_snow                     46
hfd_mean                      98
hfd_std                      105
hp_dur                        46
hp_freq                       46
hp_time                       46
hq_dur                       107
hq_freq                      107
lakes_tot_area               209
lakes_tot_vol                209
lp_dur                        46
lp_freq                       46
lp_time                       46
lq_dur                       112
lq_freq                      112
num_years_climatic            45
num_years_hydro               45
p_mean                        46
p_seasonality                 46
pet_mean                      46
q_5                           86
q_95                          86
q_elas_Sankarasubramanian     86
q_mean                        86
q_runoff_ratio                86
res_tot_sto                  265
slope_sawicz                  90
start_date_climatic           46
start_date_hydro              85
zero_q_freq                   86
dtype: int64
[187]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps

Poland

[188]:
# Open the 'Poland' dataset through the unified RainfallRunoff interface, using
# DATA_PATH as the data location, and print its one-line summary.
dataset = RainfallRunoff('Poland', path=DATA_PATH, verbosity=0)
print(dataset)
Poland with 1287 stations, 10 dynamic and 208 static features

The static features of Poland are the same as those of EStreams.

[189]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[190]:
# Fetch the static-feature table for Poland and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(1287, 208)
[191]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
16598
[191]:
static_features
area                 6
area_calc            0
area_flag            0
area_perc            6
aridity            270
                  ...
steep_area_fra       0
strm_dens            0
tot_area             0
watershed_group      0
zero_q_freq        270
Length: 208, dtype: int64

Find the columns that contain at least one NaN value.

[192]:
# Boolean mask over columns: True where the column holds at least one NaN.
nan_mask = df.isna().any()
if nan_mask.any():
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside `if` is silently discarded, so the frame has to
    # be displayed explicitly (`display` is provided by the IPython kernel).
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
[193]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[193]:
static_features
area                            6
area_perc                       6
aridity                       270
baseflow_index                270
dam_yr_first                 1099
dam_yr_last                  1099
duplicated_suspect           1284
elevation                    1287
end_date                      210
end_date_climatic             270
end_date_hydro                270
frac_snow                     270
hfd_mean                      277
hfd_std                       283
hp_dur                        270
hp_freq                       270
hp_time                       270
hq_dur                        485
hq_freq                       485
lakes_tot_area                396
lakes_tot_vol                 396
lp_dur                        270
lp_freq                       270
lp_time                       270
lq_dur                        507
lq_freq                       507
num_days_gaps                 210
num_years_climatic            270
num_years_hydro               270
p_mean                        270
p_seasonality                 270
pet_mean                      270
q_5                           270
q_95                          270
q_elas_Sankarasubramanian     270
q_mean                        270
q_runoff_ratio                270
res_tot_sto                  1101
slope_sawicz                  270
start_date                    210
start_date_climatic           270
start_date_hydro              270
zero_q_freq                   270
dtype: int64
[194]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps

Portugal

[195]:
# Open the 'Portugal' dataset through the unified RainfallRunoff interface,
# using DATA_PATH as the data location, and print its one-line summary.
dataset = RainfallRunoff('Portugal', path=DATA_PATH, verbosity=0)
print(dataset)
Portugal with 280 stations, 10 dynamic and 208 static features

The static features of Portugal are the same as those of EStreams.

[196]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[197]:
# Fetch the static-feature table for Portugal and report its (rows, columns)
# shape.
df = dataset.fetch_static_features()
print(df.shape)
(280, 208)
[198]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
2842
[198]:
static_features
area               25
area_calc           0
area_flag           0
area_perc          25
aridity            43
                   ..
steep_area_fra      0
strm_dens           0
tot_area            0
watershed_group     0
zero_q_freq        43
Length: 208, dtype: int64

Find the columns that contain at least one NaN value.

[199]:
# Boolean mask over columns: True where the column holds at least one NaN.
nan_mask = df.isna().any()
if nan_mask.any():
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside `if` is silently discarded, so the frame has to
    # be displayed explicitly (`display` is provided by the IPython kernel).
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
[200]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[200]:
static_features
area                          25
area_perc                     25
aridity                       43
baseflow_index               103
dam_yr_first                 221
dam_yr_last                  221
duplicated_suspect           280
end_date                      11
end_date_climatic             43
end_date_hydro                43
frac_snow                     43
hfd_mean                      46
hfd_std                       49
hp_dur                        43
hp_freq                       43
hp_time                       43
hq_dur                        45
hq_freq                       45
lakes_tot_area               176
lakes_tot_vol                176
lp_dur                        43
lp_freq                       43
lp_time                       43
lq_dur                        45
lq_freq                       45
num_days_gaps                 11
num_years_climatic            43
num_years_hydro               43
p_mean                        43
p_seasonality                 43
pet_mean                      43
q_5                           43
q_95                          43
q_elas_Sankarasubramanian     43
q_mean                        43
q_runoff_ratio                43
res_tot_sto                  221
slope_sawicz                  97
start_date                    11
start_date_climatic           43
start_date_hydro              43
zero_q_freq                   43
dtype: int64
[201]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps

Simbi

[202]:
# Open the 'Simbi' dataset through the unified RainfallRunoff interface, using
# DATA_PATH as the data location, and print its one-line summary.
# NOTE: in the recorded run this cell also emits a pandas "Could not infer
# format" UserWarning raised inside aqua_fetch/rr/_simbi.py (a read_csv call
# with parse_dates=True); the warning comes from the library, not this cell.
dataset = RainfallRunoff('Simbi', path= DATA_PATH, verbosity=0)
print(dataset)
Simbi with 24 stations, 3 dynamic and 232 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[203]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Alluvial aquifers with free water, Alluvial aquifers with partly confined water, Alluvium & detrital
materials_geol, Andesites & rhyodacites_geol, Area, Aridity_mon_arid, BFI1_d, BFI2_d, BFI3_d, BFI_d,
Basalt_geol, Beaches & dunes_lc_98, Carb_Rocks_Perc, Carbonate aquifers with marl intercalation,
Closed Shrubland_lc_95, Continuous urban_lc_98, Cropland_lc_95, Crystalline formation,
Cumul_Freq_1%, Cumul_Freq_10%, Cumul_Freq_100%, Cumul_Freq_11%, Cumul_Freq_12%, Cumul_Freq_13%,
Cumul_Freq_14%, Cumul_Freq_15%, Cumul_Freq_16%, Cumul_Freq_17%, Cumul_Freq_18%, Cumul_Freq_19%,
Cumul_Freq_2%, Cumul_Freq_20%, Cumul_Freq_21%, Cumul_Freq_22%, Cumul_Freq_23%, Cumul_Freq_24%,
Cumul_Freq_25%, Cumul_Freq_26%, Cumul_Freq_27%, Cumul_Freq_28%, Cumul_Freq_29%, Cumul_Freq_3%,
Cumul_Freq_30%, Cumul_Freq_31%, Cumul_Freq_32%, Cumul_Freq_33%, Cumul_Freq_34%, Cumul_Freq_35%,
Cumul_Freq_36%, Cumul_Freq_37%, Cumul_Freq_38%, Cumul_Freq_39%, Cumul_Freq_4%, Cumul_Freq_40%,
Cumul_Freq_41%, Cumul_Freq_42%, Cumul_Freq_43%, Cumul_Freq_44%, Cumul_Freq_45%, Cumul_Freq_46%,
Cumul_Freq_47%, Cumul_Freq_48%, Cumul_Freq_49%, Cumul_Freq_5%, Cumul_Freq_50%, Cumul_Freq_51%,
Cumul_Freq_52%, Cumul_Freq_53%, Cumul_Freq_54%, Cumul_Freq_55%, Cumul_Freq_56%, Cumul_Freq_57%,
Cumul_Freq_58%, Cumul_Freq_59%, Cumul_Freq_6%, Cumul_Freq_60%, Cumul_Freq_61%, Cumul_Freq_62%,
Cumul_Freq_63%, Cumul_Freq_64%, Cumul_Freq_65%, Cumul_Freq_66%, Cumul_Freq_67%, Cumul_Freq_68%,
Cumul_Freq_69%, Cumul_Freq_7%, Cumul_Freq_70%, Cumul_Freq_71%, Cumul_Freq_72%, Cumul_Freq_73%,
Cumul_Freq_74%, Cumul_Freq_75%, Cumul_Freq_76%, Cumul_Freq_77%, Cumul_Freq_78%, Cumul_Freq_79%,
Cumul_Freq_8%, Cumul_Freq_80%, Cumul_Freq_81%, Cumul_Freq_82%, Cumul_Freq_83%, Cumul_Freq_84%,
Cumul_Freq_85%, Cumul_Freq_86%, Cumul_Freq_87%, Cumul_Freq_88%, Cumul_Freq_89%, Cumul_Freq_9%,
Cumul_Freq_90%, Cumul_Freq_91%, Cumul_Freq_92%, Cumul_Freq_93%, Cumul_Freq_94%, Cumul_Freq_95%,
Cumul_Freq_96%, Cumul_Freq_97%, Cumul_Freq_98%, Cumul_Freq_99%, Deciduous Broadleaf Forest_lc_95,
Deciduous Needleleaf Forest_lc_95, Dense agricultural crops_lc_98, Dense agroforestry systems_lc_98,
Diorite & tonalite_geol, Discontinuous urban_lc_98, Dominant pastures_lc_98, ETP_5_mon_q5,
ETP_95_mon_q95, ETP_mon_avg, Evergreen Broadleaf Forest_lc_95, Evergreen Needleleaf Forest_lc_95,
Fissured & partitioned carbonate aquifers, Flysch & sandstone & limestone_geol, Forest_lc_98,
Grassland_lc_95, Gravelius, Hard limestone_geol, Highly permeable fissured & porous carbonate
aquifers, Industrial areas_lc_98, Karst aquifer, Lat_Cent, Lat_Exu, Lon_Cent, Lon_Exu, Low
permeability sedimentary formation, Magma_Perc, Mangroves_lc_98, Marl & marly limestone_geol, Marl &
sand_geol, Marly limestone_geol, Max_Elv, Medium-density agricultural crops_lc_98, Min_Elv, Mixed
Forest_lc_95, More productive alluvial area, Open Shrubland_lc_95, P_5_mon_q5, P_95_mon_q95,
P_max10_mon_QMXA10, P_min5_mon_QMNA5, P_mon_avg, Pasture with other presence_lc_98, Ports &
airports_lc_98, Q1_5_mon_q5, Q1_95_mon_q95, Q1_max10_mon_QMXA10, Q1_min5_mon_QMNA5, Q1_mm_d_hq_dur,
Q1_mm_d_hq_freq, Q1_mm_d_lq_dur, Q1_mm_d_lq_freq, Q1_mm_d_mean, Q1_mm_d_q5, Q1_mm_d_q95, Q1_mon_avg,
Q2_5_mon_q5, Q2_95_mon_q95, Q2_max10_mon_QMXA10, Q2_min5_mon_QMNA5, Q2_mm_d_hq_dur, Q2_mm_d_hq_freq,
Q2_mm_d_lq_dur, Q2_mm_d_lq_freq, Q2_mm_d_mean, Q2_mm_d_q5, Q2_mm_d_q95, Q2_mon_avg, Q3_5_mon_q5,
Q3_95_mon_q95, Q3_max10_mon_QMXA10, Q3_min5_mon_QMNA5, Q3_mm_d_hq_dur, Q3_mm_d_hq_freq,
Q3_mm_d_lq_dur, Q3_mm_d_lq_freq, Q3_mm_d_mean, Q3_mm_d_q5, Q3_mm_d_q95, Q3_mon_avg, Q_5_mon_q5,
Q_95_mon_q95, Q_max10_mon_QMXA10, Q_min5_mon_QMNA5, Q_mm_d_hq_dur, Q_mm_d_hq_freq, Q_mm_d_lq_dur,
Q_mm_d_lq_freq, Q_mm_d_mean, Q_mm_d_q5, Q_mm_d_q95, Q_mon_avg, Quarry_lc_98, River beds & recent
alluvium_lc_98, Rock outcrops & bare soil_lc_98, Runoff_Ratio_mon_arid, Saline areas_lc_98,
Savannahs with other presence_lc_98, Savannahs_lc_98, Sd_Elv, Sedim_Perc, Slope, Stream_density,
Temp_5_mon_q5, Temp_95_mon_q95, Temp_mon_avg, Ultrabasic rocks_geol, Urban_lc_95, Volcano-
sedimentary rock_geol, Water plan_lc_98, Water_lc_95, Wetlands_lc_98, Wooded Grassland_lc_95,
Woodland_lc_95
[204]:
# Fetch the static-feature table for Simbi and report its (rows, columns) shape.
# NOTE: the recorded run shows pandas date-parsing UserWarnings from
# aqua_fetch/rr/_simbi.py around this call; they originate in the library.
df = dataset.fetch_static_features()
print(df.shape)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
(24, 232)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[205]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
96
[205]:
Alluvial aquifers with free water               0
Alluvial aquifers with partly confined water    0
Alluvium & detrital materials_geol              0
Andesites & rhyodacites_geol                    0
Area                                            0
                                               ..
Water plan_lc_98                                0
Water_lc_95                                     0
Wetlands_lc_98                                  0
Wooded Grassland_lc_95                          0
Woodland_lc_95                                  0
Length: 232, dtype: int64

Find the columns that contain at least one NaN value.

[206]:
# Show only the columns that contain at least one NaN.
df.loc[:, df.isna().any()]
[206]:
BFI1_d BFI2_d BFI3_d BFI_d Q1_mm_d_hq_dur Q1_mm_d_hq_freq Q1_mm_d_lq_dur Q1_mm_d_lq_freq Q1_mm_d_mean Q1_mm_d_q5 ... Q3_mm_d_mean Q3_mm_d_q5 Q3_mm_d_q95 Q_mm_d_hq_dur Q_mm_d_hq_freq Q_mm_d_lq_dur Q_mm_d_lq_freq Q_mm_d_mean Q_mm_d_q5 Q_mm_d_q95
001 0.46 0.68 0.55 0.49 2.12 2.43 27.62 34.19 1.16 0.2 ... 1.18 0.2 3.10 1.62 0.86 0.00 0.00 1.23 0.4 3.10
004 0.59 0.38 0.42 0.38 1.98 5.38 0.00 0.00 2.26 0.6 ... 2.23 0.3 8.15 2.00 5.00 0.00 0.00 1.97 0.6 4.80
006 0.47 0.66 0.61 0.50 1.90 5.52 0.00 0.00 1.50 0.5 ... 1.36 0.4 3.50 1.89 1.00 4.00 0.80 1.35 0.4 3.60
007 0.49 0.53 0.50 0.47 2.65 5.05 19.03 29.00 1.91 0.3 ... 2.04 0.5 5.60 2.33 2.50 5.51 16.86 2.08 0.5 5.50
008 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
010 0.29 0.21 0.31 0.32 6.08 20.86 23.73 117.52 2.10 0.2 ... 2.19 0.1 7.10 2.85 8.23 9.95 35.23 2.61 0.4 6.70
023 0.16 0.34 0.29 0.20 3.65 20.14 18.33 89.05 1.87 0.2 ... 1.81 0.2 6.70 1.91 7.79 5.34 24.79 1.98 0.3 5.58
024 0.38 0.42 0.39 0.42 3.38 15.14 0.00 0.00 1.37 0.3 ... 1.38 0.4 4.50 1.75 2.75 18.19 109.00 1.13 0.1 3.70
029 0.38 0.28 0.33 0.39 2.43 6.95 34.18 61.86 2.31 0.2 ... 2.13 0.1 6.30 1.56 2.71 0.00 0.00 2.30 0.7 4.50
036 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
037 0.24 0.47 0.34 0.41 9.02 36.10 83.00 158.10 0.61 0.0 ... 0.71 0.0 2.90 1.33 0.38 12.67 62.08 1.03 0.2 2.90
041 0.39 0.61 0.52 0.39 6.14 14.90 130.76 105.86 0.89 0.0 ... 0.93 0.0 3.20 0.00 0.00 14.83 57.07 1.05 0.2 3.00
044 0.32 0.28 0.36 0.25 7.22 40.24 70.33 160.76 1.05 0.0 ... 0.89 0.0 4.10 1.38 9.86 12.38 15.57 1.36 0.2 4.46
045 0.52 0.39 0.44 0.42 2.95 2.95 0.00 0.00 0.44 0.1 ... 0.43 0.1 1.10 1.60 0.17 5.64 13.17 0.36 0.1 1.00
051 0.23 0.21 0.13 0.18 3.62 13.10 25.48 110.43 1.06 0.1 ... 1.61 0.1 6.00 2.27 12.00 7.72 49.25 1.67 0.2 4.50
052 0.49 0.62 0.58 0.29 1.90 2.71 37.16 97.33 2.66 0.2 ... 2.88 0.8 8.70 3.11 33.50 28.00 160.33 2.39 0.0 9.96
053 0.31 0.10 0.11 0.12 2.30 12.62 11.45 44.71 2.66 0.4 ... 2.72 0.0 15.60 1.89 14.33 7.15 79.33 2.41 0.2 6.90
056 0.22 0.49 0.47 0.15 3.49 11.29 18.35 70.76 1.21 0.1 ... 1.28 0.2 4.30 1.78 10.55 8.69 81.64 1.20 0.1 4.30
057 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
058 0.29 0.27 0.06 0.32 3.71 38.29 81.74 163.48 1.83 0.1 ... 1.50 0.0 6.80 2.32 20.00 20.33 86.50 0.96 0.1 2.80
060 0.44 0.53 0.41 0.26 3.03 9.67 29.38 55.95 1.53 0.3 ... 1.48 0.4 4.50 2.17 9.67 7.76 32.00 1.48 0.2 4.80
061 0.33 0.49 0.41 0.30 2.41 12.71 16.44 54.81 2.69 0.4 ... 2.87 0.5 9.20 1.85 6.92 4.80 19.00 2.86 0.6 8.90
065 0.28 0.34 0.31 0.25 2.39 17.52 28.92 50.95 1.74 0.3 ... 1.72 0.3 6.20 2.30 11.75 13.11 73.31 1.74 0.2 6.00
068 0.55 0.55 0.55 0.52 2.67 3.05 13.68 42.33 2.55 0.4 ... 2.31 0.3 6.10 2.12 2.83 8.60 7.17 2.19 0.6 4.60

24 rows × 32 columns

[207]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[207]:
BFI1_d             3
BFI2_d             3
BFI3_d             3
BFI_d              3
Q1_mm_d_hq_dur     3
Q1_mm_d_hq_freq    3
Q1_mm_d_lq_dur     3
Q1_mm_d_lq_freq    3
Q1_mm_d_mean       3
Q1_mm_d_q5         3
Q1_mm_d_q95        3
Q2_mm_d_hq_dur     3
Q2_mm_d_hq_freq    3
Q2_mm_d_lq_dur     3
Q2_mm_d_lq_freq    3
Q2_mm_d_mean       3
Q2_mm_d_q5         3
Q2_mm_d_q95        3
Q3_mm_d_hq_dur     3
Q3_mm_d_hq_freq    3
Q3_mm_d_lq_dur     3
Q3_mm_d_lq_freq    3
Q3_mm_d_mean       3
Q3_mm_d_q5         3
Q3_mm_d_q95        3
Q_mm_d_hq_dur      3
Q_mm_d_hq_freq     3
Q_mm_d_lq_dur      3
Q_mm_d_lq_freq     3
Q_mm_d_mean        3
Q_mm_d_q5          3
Q_mm_d_q95         3
dtype: int64
[208]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
pcp, q, temp

Spain

[209]:
# Open the 'Spain' dataset through the unified RainfallRunoff interface, using
# DATA_PATH as the data location, and print its one-line summary.
dataset = RainfallRunoff('Spain', path=DATA_PATH, verbosity=0)
print(dataset)
Spain with 889 stations, 27 dynamic and 35 static features

The static features of Spain are the same as those of GSHA.

[210]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[211]:
# Fetch the static-feature table for Spain and report its (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(889, 35)
[212]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
30
[212]:
EVP_uncertainty(%)     11
HYRIV_ID                0
LRAD_uncertainty(%)     6
P_uncertainty(%)        0
SRAD_uncertainty(%)     0
T_uncertainty(%)        0
agency                  0
area                    0
cly_pc_uav              0
ele_mt_uav              0
ero_kh_uav              0
gla_pc_use              0
glc_cl_cmj              0
gwt_cm_cav              0
inu_pc_ult              0
lat                     0
lit_cl_cmj              0
long                    0
pet_uncertainty(%)     13
pnv_cl_cmj              0
prm_pc_use              0
sgr_dk_rav              0
slp_dg_uav              0
slt_pc_uav              0
snd_pc_uav              0
wet_pc_u01              0
wet_pc_u02              0
wet_pc_u03              0
wet_pc_u04              0
wet_pc_u05              0
wet_pc_u06              0
wet_pc_u07              0
wet_pc_u08              0
wet_pc_u09              0
wind_uncertainty(%)     0
dtype: int64

Find the columns that contain at least one NaN value.

[213]:
# Boolean mask over columns: True where the column holds at least one NaN.
nan_mask = df.isna().any()
if nan_mask.any():
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside `if` is silently discarded, so the frame has to
    # be displayed explicitly (`display` is provided by the IPython kernel).
    display(df.loc[:, nan_mask])
else:
    print('No NaN values')
[214]:
# NaN count per column, restricted to the columns that contain any NaN.
df.loc[:, df.isna().any()].isna().sum()
[214]:
EVP_uncertainty(%)     11
LRAD_uncertainty(%)     6
pet_uncertainty(%)     13
dtype: int64
[215]:
# Print the dataset's dynamic feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.dynamic_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

Thailand

[216]:
# Open the 'Thailand' dataset through the unified RainfallRunoff interface,
# using DATA_PATH as the data location, and print its one-line summary.
dataset = RainfallRunoff('Thailand', path=DATA_PATH, verbosity=0)
print(dataset)
Thailand with 73 stations, 27 dynamic and 35 static features

The static features of Thailand are the same as those of GSHA.

[217]:
# Print the dataset's static feature names in sorted (plain string) order,
# comma-separated and wrapped at 100 characters.
# NOTE(review): .sort() reorders the list returned by dataset.static_features
# in place; if that attribute hands back the dataset's own list, the dataset's
# ordering changes too - confirm, or use sorted(...) instead.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[218]:
# Fetch the static-feature table for Thailand and report its (rows, columns)
# shape.
df = dataset.fetch_static_features()
print(df.shape)
(73, 35)
[219]:
# First print the total number of missing cells in the whole table, then let
# the cell echo the per-column NaN counts.
print(df.isna().sum().sum())
df.isna().sum()
0
[219]:
EVP_uncertainty(%)     0
HYRIV_ID               0
LRAD_uncertainty(%)    0
P_uncertainty(%)       0
SRAD_uncertainty(%)    0
T_uncertainty(%)       0
agency                 0
area                   0
cly_pc_uav             0
ele_mt_uav             0
ero_kh_uav             0
gla_pc_use             0
glc_cl_cmj             0
gwt_cm_cav             0
inu_pc_ult             0
lat                    0
lit_cl_cmj             0
long                   0
pet_uncertainty(%)     0
pnv_cl_cmj             0
prm_pc_use             0
sgr_dk_rav             0
slp_dg_uav             0
slt_pc_uav             0
snd_pc_uav             0
wet_pc_u01             0
wet_pc_u02             0
wet_pc_u03             0
wet_pc_u04             0
wet_pc_u05             0
wet_pc_u06             0
wet_pc_u07             0
wet_pc_u08             0
wet_pc_u09             0
wind_uncertainty(%)    0
dtype: int64

Find the columns that have at least one NaN value.

[220]:
# Show only the columns that contain at least one NaN, if there are any.
has_nan = df.isna().any()
if has_nan.any():
    # A bare expression inside an `if` body is evaluated but not rendered by
    # Jupyter (only a cell's last top-level expression is), so display it explicitly.
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[221]:
# Per-column NaN counts, restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[221]:
Series([], dtype: float64)
[222]:
# sorted() builds a new list, so the list returned by `dataset.dynamic_features`
# is not reordered in place (it may be shared with the dataset object).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

USGS

[223]:
# Instantiate the USGS dataset from DATA_PATH and print its summary line.
dataset = RainfallRunoff(
    'USGS',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
USGS with 12004 stations, 5 dynamic and 27 static features

The static features of USGS are the same as those of HYSETS.

[224]:
# sorted() builds a new list, so the list returned by `dataset.static_features`
# is not reordered in place (it may be shared with the dataset object).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Centroid_Lat_deg_N, Centroid_Lon_deg_E, Drainage_Area_GSIM_km2, Drainage_Area_km2,
Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries, Flag_Land_Use_Extraction,
Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction, Gravelius,
Land_Use_Crops_frac, Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac,
Land_Use_Snow_Ice_frac, Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name,
Perimeter, Permeability_logk_m2, Porosity_frac, Slope_deg, Source
[225]:
# Static features as a table; its shape is (stations, static features).
df = dataset.fetch_static_features()
n_stations, n_features = df.shape
print((n_stations, n_features))
(12004, 27)
[226]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
16551
[226]:
Source                            0
Name                              0
Centroid_Lat_deg_N                0
Centroid_Lon_deg_E                0
Drainage_Area_km2                 0
Drainage_Area_GSIM_km2        11884
Flag_GSIM_boundaries              0
Flag_Artificial_Boundaries        0
Elevation_m                       1
Slope_deg                         1
Gravelius                      1168
Perimeter                      1168
Flag_Shape_Extraction             0
Aspect_deg                        1
Flag_Terrain_Extraction           0
Land_Use_Forest_frac              3
Land_Use_Grass_frac               3
Land_Use_Wetland_frac             3
Land_Use_Water_frac               3
Land_Use_Urban_frac               3
Land_Use_Shrubs_frac              3
Land_Use_Crops_frac               3
Land_Use_Snow_Ice_frac            3
Flag_Land_Use_Extraction          0
Permeability_logk_m2           1152
Porosity_frac                  1152
Flag_Subsoil_Extraction           0
dtype: int64

Find the columns that have at least one NaN value.

[227]:
# Show only the columns that contain at least one NaN, if there are any.
has_nan = df.isna().any()
if has_nan.any():
    # A bare expression inside an `if` body is evaluated but not rendered by
    # Jupyter (only a cell's last top-level expression is), so display it explicitly.
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[228]:
# Per-column NaN counts, restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[228]:
Drainage_Area_GSIM_km2    11884
Elevation_m                   1
Slope_deg                     1
Gravelius                  1168
Perimeter                  1168
Aspect_deg                    1
Land_Use_Forest_frac          3
Land_Use_Grass_frac           3
Land_Use_Wetland_frac         3
Land_Use_Water_frac           3
Land_Use_Urban_frac           3
Land_Use_Shrubs_frac          3
Land_Use_Crops_frac           3
Land_Use_Snow_Ice_frac        3
Permeability_logk_m2       1152
Porosity_frac              1152
dtype: int64
[229]:
# sorted() builds a new list, so the list returned by `dataset.dynamic_features`
# is not reordered in place (it may be shared with the dataset object).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
obs_q_cms, pr, swe, tasmax, tasmin

WaterBenchIowa

[230]:
# Instantiate the WaterBenchIowa dataset from DATA_PATH and print its summary line.
dataset = RainfallRunoff(
    'WaterBenchIowa',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
WaterBenchIowa with 125 stations, 3 dynamic and 7 static features
[231]:
# sorted() builds a new list, so the list returned by `dataset.static_features`
# is not reordered in place (it may be shared with the dataset object).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, loam, sandy_clay_loam, silt, silty_clay_loam, slope, travel_time
[232]:
# Static features as a table; its shape is (stations, static features).
df = dataset.fetch_static_features()
n_stations, n_features = df.shape
print((n_stations, n_features))
(125, 7)
[233]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[233]:
travel_time        0
area               0
slope              0
loam               0
silt               0
sandy_clay_loam    0
silty_clay_loam    0
dtype: int64

Find the columns that have at least one NaN value.

[234]:
# Show only the columns that contain at least one NaN, if there are any.
has_nan = df.isna().any()
if has_nan.any():
    # A bare expression inside an `if` body is evaluated but not rendered by
    # Jupyter (only a cell's last top-level expression is), so display it explicitly.
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[235]:
# Per-column NaN counts, restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[235]:
Series([], dtype: float64)
[236]:
# sorted() builds a new list, so the list returned by `dataset.dynamic_features`
# is not reordered in place (it may be shared with the dataset object).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
discharge, et, precipitation

Regional Datasets without observed streamflow

The following datasets do not have observed streamflow data. However, they behave similarly to the datasets with observed streamflow data.

GSHA

This dataset contains climate (dynamic) variables and static features for catchments around the world. These dynamic and static features are used for other dataset classes like Spain, Thailand and Japan.

[237]:
# Instantiate the GSHA dataset from DATA_PATH and print its summary line.
dataset = RainfallRunoff(
    'GSHA',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
GSHA with 21568 stations, 26 dynamic and 35 static features
[238]:
# sorted() builds a new list, so the list returned by `dataset.static_features`
# is not reordered in place (it may be shared with the dataset object).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[239]:
# Static features as a table; its shape is (stations, static features).
df = dataset.fetch_static_features()
n_stations, n_features = df.shape
print((n_stations, n_features))
(21568, 35)
[240]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
3442
[240]:
EVP_uncertainty(%)     1224
HYRIV_ID                  0
LRAD_uncertainty(%)     630
P_uncertainty(%)          0
SRAD_uncertainty(%)       0
T_uncertainty(%)          8
agency                    0
area                      0
cly_pc_uav                0
ele_mt_uav                0
ero_kh_uav                0
gla_pc_use                0
glc_cl_cmj                0
gwt_cm_cav                0
inu_pc_ult                0
lat                       0
lit_cl_cmj                0
long                      0
pet_uncertainty(%)     1580
pnv_cl_cmj                0
prm_pc_use                0
sgr_dk_rav                0
slp_dg_uav                0
slt_pc_uav                0
snd_pc_uav                0
wet_pc_u01                0
wet_pc_u02                0
wet_pc_u03                0
wet_pc_u04                0
wet_pc_u05                0
wet_pc_u06                0
wet_pc_u07                0
wet_pc_u08                0
wet_pc_u09                0
wind_uncertainty(%)       0
dtype: int64

Find the columns that have at least one NaN value.

[241]:
# Show only the columns that contain at least one NaN, if there are any.
has_nan = df.isna().any()
if has_nan.any():
    # A bare expression inside an `if` body is evaluated but not rendered by
    # Jupyter (only a cell's last top-level expression is), so display it explicitly.
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[242]:
# Per-column NaN counts, restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[242]:
EVP_uncertainty(%)     1224
LRAD_uncertainty(%)     630
T_uncertainty(%)          8
pet_uncertainty(%)     1580
dtype: int64
[243]:
# sorted() builds a new list, so the list returned by `dataset.dynamic_features`
# is not reordered in place (it may be shared with the dataset object).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2, swe_mm_era5,
windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra

EStreams

The EStreams dataset does not contain observed streamflow data. However, it contains other climate (dynamic) variables and static features for European catchments. These dynamic and static features are used for other European dataset classes like Portugal, Spain, Finland, Italy, Ireland and Poland.

[244]:
# Instantiate the EStreams dataset from DATA_PATH and print its summary line.
dataset = RainfallRunoff(
    'EStreams',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
EStreams with 15047 stations, 9 dynamic and 208 static features
[245]:
# sorted() builds a new list, so the list returned by `dataset.static_features`
# is not reordered in place (it may be shared with the dataset object).
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[246]:
# Static features as a table; its shape is (stations, static features).
df = dataset.fetch_static_features()
n_stations, n_features = df.shape
print((n_stations, n_features))
(15047, 208)
[247]:
# Count NaNs once per column, print the grand total, then show the per-column counts.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
191618
[247]:
static_features
area               1115
area_calc             0
area_flag             0
area_perc          1116
aridity            3179
                   ...
steep_area_fra        0
strm_dens             0
tot_area              0
watershed_group       0
zero_q_freq        3269
Length: 208, dtype: int64

Find the columns that have at least one NaN value.

[248]:
# Show only the columns that contain at least one NaN, if there are any.
has_nan = df.isna().any()
if has_nan.any():
    # A bare expression inside an `if` body is evaluated but not rendered by
    # Jupyter (only a cell's last top-level expression is), so display it explicitly.
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[249]:
# Per-column NaN counts, restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[249]:
static_features
area                   1115
area_perc              1116
aridity                3179
baseflow_index         3380
bedrk_dep                 5
                       ...
stations_dens_tmax        1
stations_dens_tmean       1
stations_dens_tmin        1
stations_dens_ws          1
zero_q_freq            3269
Length: 158, dtype: int64
[250]:
# sorted() builds a new list, so the list returned by `dataset.dynamic_features`
# is not reordered in place (it may be shared with the dataset object).
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, rh_%, solrad_wm2,
windspeed_mps