Summary of Rainfall Runoff datasets
This file shows a summary of all rainfall-runoff datasets available in the package and how to access them through the unified interface RainfallRunoff.
At the time of running this script, the datasets had already been downloaded. Therefore, if you run this script for the first time, it may take days to run, or may not even run successfully to the end, due to internet connection issues.
[1]:
import os
import site
import textwrap

# '__file__' is deliberately a *string*: notebooks do not define __file__, so
# realpath() resolves it against the current working directory. Four dirname()
# calls then climb to the directory that contains the aqua_fetch package.
wd_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath('__file__')))))
# alternatives when the notebook sits fewer levels below that directory:
#wd_dir = os.path.dirname(os.path.dirname(os.path.realpath('__file__')))
#wd_dir = os.path.dirname(os.path.realpath('__file__'))
print(wd_dir)
# must run before the aqua_fetch imports below so the local copy is importable
site.addsitedir(wd_dir)

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from easy_mpl.utils import despine_axes

from aqua_fetch.utils import print_info
from aqua_fetch import RainfallRunoff

nice_fonts = {
    #"text.usetex": True,
    "font.family": "sans-serif",  #sans -serif
    #"font.serif" : "Times New Roman",
}
matplotlib.rcParams.update(nice_fonts)

# record package versions and machine details alongside the results
print_info()

# path where the data will be downloaded or has previously been downloaded.
# Set the AQUA_FETCH_DATA_PATH environment variable to point somewhere else
# without editing the notebook; the original location remains the default.
DATA_PATH = os.environ.get(
    'AQUA_FETCH_DATA_PATH',
    '/mnt/datawaha/hyex/atr/gscad_database/raw',
)
/home/abbaa0a/AquaFetch
numpy 1.26.4
pandas 2.2.3
water_quality 0.1.0
python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]
os posix
matplotlib 3.8.4
shapefile 2.3.1
xarray 2024.7.0
netCDF4 1.6.2
scipy 1.13.0
Script Executed on: 19 January 2025 12:00:21
tot_cpus 112
avail_cpus 112
mem_gib 251.52817153930664
[2]:
# Every dataset drawn on the map, mapped to the directory passed to
# RainfallRunoff as `path`. Datasets are plotted in this order and take their
# colour from their position in this dict.
datasets = {
    "Arcticnet" : DATA_PATH,
    "Bull" : DATA_PATH,
    "CABra" : DATA_PATH,
    # GRDC Caravan is overshadowing the other datasets
    # so better put it at start
    # NOTE(review): it is currently the 4th entry, so the three datasets above
    # are still drawn underneath it; moving it would also shift every colour.
    "GRDCCaravan": DATA_PATH,
    #"CAMELS_AUS" : os.path.join(DATA_PATH, 'CAMELS_AUS_V1'),
    "CAMELS_AUS": os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_GB" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_BR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_US" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CL" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DK" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_CH" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_DE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_FR" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_SE" : os.path.join(DATA_PATH, 'CAMELS'),
    "CAMELS_IND" : os.path.join(DATA_PATH, 'CAMELS'),
    "Caravan_DK": DATA_PATH,
    "LamaHCE" : DATA_PATH,
    "LamaHIce" : os.path.join(DATA_PATH, 'LamaHIce_daily'),
    "HYSETS": os.path.join(DATA_PATH, 'HYSETS'),
    "CCAM": DATA_PATH,
    "Japan": DATA_PATH,
    "Ireland": DATA_PATH,
    "Finland": DATA_PATH,
    "Italy": DATA_PATH,
    "Poland": DATA_PATH,
    "Portugal": DATA_PATH,
    "Simbi": DATA_PATH,
    "Spain": DATA_PATH,
    "Thailand": DATA_PATH,
    "USGS": DATA_PATH,
}

# tab20 followed by tab20b, so that each dataset gets its own colour
colors = plt.cm.tab20.colors + plt.cm.tab20b.colors

# The legend is split into three boxes placed side by side along the bottom of
# the axes; each list names the datasets shown in one box.
block1 = ['HYSETS', 'Italy', 'GRDCCaravan', 'LamaHCE', 'LamaHIce', "CABra", "CAMELS_US",
          "CAMELS_CL", 'Ireland', 'Spain', 'Poland', 'CAMELS_SE', 'USGS', "Bull", "CAMELS_BR"]
block2 = ['CAMELS_DK', 'CAMELS_FR', 'CAMELS_DE', 'Portugal',
          "CAMELS_GB", "CAMELS_CH", "Caravan_DK"]
block3 = ['Arcticnet', 'Thailand', 'CCAM', 'Japan', 'Finland', 'CAMELS_AUS',
          'CAMELS_IND', "Simbi"]


def _add_block_legend(ax, names, handles, counts, x_anchor, **legend_kws):
    """Draw one legend box for the datasets in ``names`` and return it.

    Parameters
    ----------
    ax : matplotlib axes that receives the legend
    names : dataset names to list, in display order
    handles : dict mapping dataset name -> scatter handle
    counts : dict mapping dataset name -> number of stations
    x_anchor : x position (axes fraction) of the box's lower-left corner
    **legend_kws : extra keyword arguments forwarded to ``ax.legend``
    """
    return ax.legend(
        [handles[name] for name in names],
        [f"{name} (n={counts[name]})" for name in names],
        markerscale=12,
        fontsize=8,
        borderpad=0.2,
        labelspacing=0.5,
        title_fontproperties={'weight': 'bold', 'size': 8+2},
        bbox_to_anchor=(x_anchor, 0.001),
        loc="lower left",
        **legend_kws,
    )


# collect the coords data
coords_data = {}
for src, path in datasets.items():
    kws = {}
    if src == 'LamaHCE':
        # LamaHCE is the only dataset opened with extra options
        kws = dict(timestep='D', data_type='total_upstrm')
    ds = RainfallRunoff(src, path=path, verbosity=0, **kws)
    coords_data[src] = ds.stn_coords()

# draw the figure
fig, ax = plt.subplots(figsize=(10, 12))
# named `basemap` rather than `map` so the Python builtin is not shadowed
# for the remainder of the kernel session
basemap = Basemap(ax=ax, resolution='l')
basemap.drawcoastlines(linewidth=0.3, ax=ax, color="gray", zorder=0)

rets = {}   # dataset name -> scatter handle, reused for the legends
items = {}  # dataset name -> number of stations
for idx, src in enumerate(datasets.keys()):
    coords = coords_data[src]
    ret = basemap.scatter(coords['long'].values, coords['lat'].values,
                          marker=".",
                          s=2,
                          linewidths=0.0,
                          color=colors[idx],
                          alpha=1.0,
                          label=f"{src} (n={coords.shape[0]})")
    rets[src] = ret
    items[src] = coords.shape[0]

# all three boxes list their datasets alphabetically
leg1 = _add_block_legend(ax, sorted(block1), rets, items, 0.001, framealpha=0.6)
leg2 = _add_block_legend(ax, sorted(block2), rets, items, 0.34)
leg3 = _add_block_legend(ax, sorted(block3), rets, items, 0.60)

# Each ax.legend() call replaces the axes' current legend, so only the last one
# (leg3) is still attached; the earlier two must be re-added as plain artists.
ax.add_artist(leg1)
ax.add_artist(leg2)

despine_axes(ax)
#plt.savefig("rr_stations.png", dpi=600, bbox_inches="tight")
plt.show()
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2541: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3223: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(os.path.join(fpath),
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3234: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(fpath,
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
Arcticnet
[3]:
# Open Arcticnet through the unified interface; printing the object
# gives its one-line summary (stations / dynamic / static feature counts)
dataset = RainfallRunoff(
    'Arcticnet',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Arcticnet with 106 stations, 27 dynamic and 35 static features
The static features of Arcticnet are the same as those of GSHA.
[4]:
# Names of the static features, alphabetically, wrapped at 100 characters.
# The sort is in place (whether that list is shared with `dataset` is not
# visible from here).
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[5]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(106, 35)
[6]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
22
[6]:
EVP_uncertainty(%) 9
HYRIV_ID 0
LRAD_uncertainty(%) 2
P_uncertainty(%) 0
SRAD_uncertainty(%) 0
T_uncertainty(%) 0
agency 0
area 0
cly_pc_uav 0
ele_mt_uav 0
ero_kh_uav 0
gla_pc_use 0
glc_cl_cmj 0
gwt_cm_cav 0
inu_pc_ult 0
lat 0
lit_cl_cmj 0
long 0
pet_uncertainty(%) 11
pnv_cl_cmj 0
prm_pc_use 0
sgr_dk_rav 0
slp_dg_uav 0
slt_pc_uav 0
snd_pc_uav 0
wet_pc_u01 0
wet_pc_u02 0
wet_pc_u03 0
wet_pc_u04 0
wet_pc_u05 0
wet_pc_u06 0
wet_pc_u07 0
wet_pc_u08 0
wet_pc_u09 0
wind_uncertainty(%) 0
dtype: int64
find those columns which have at least one NaN value
[7]:
# Show only the columns that contain at least one NaN.
# A bare expression is rendered only when it is the last top-level statement of
# a cell; inside an if-branch it is silently discarded, so display() is needed.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
[8]:
# NaN count per column, restricted to the columns that have any NaN
(
    df.loc[:, df.isna().any()]
    .isna()
    .sum()
)
[8]:
EVP_uncertainty(%) 9
LRAD_uncertainty(%) 2
pet_uncertainty(%) 11
dtype: int64
[9]:
# Names of the dynamic features, alphabetically (in-place sort), wrapped at 100 characters
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
Bull
[10]:
# Open the Bull dataset and print its one-line summary
dataset = RainfallRunoff(
    'Bull',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Bull with 484 stations, 55 dynamic and 214 static features
[11]:
# Names of the static features, alphabetically (in-place sort), wrapped at 100 characters
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
NSE, aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, area_hydroatlas, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav,
clz_cl_smj, cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07,
cmi_ix_s08, cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse,
dis_m3_pmn, dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav,
fec_cl_smj, fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav,
gdp_ud_ssu, gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05,
glc_pc_s06, glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13,
glc_pc_s14, glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21,
glc_pc_s22, gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq,
inu_pc_slt, inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu,
low_prec_dur, low_prec_freq, moisture_index, nli_ix_sav, non-altered, p_mean, pac_pc_sse, pet_mean,
pet_mm_s01, pet_mm_s02, pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08,
pet_mm_s09, pet_mm_s10, pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02,
pnv_pc_s03, pnv_pc_s04, pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10,
pnv_pc_s11, pnv_pc_s12, pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01,
pre_mm_s02, pre_mm_s03, pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09,
pre_mm_s10, pre_mm_s11, pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu,
ria_ha_usu, riv_tc_usu, run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav,
snw_pc_s01, snw_pc_s02, snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08,
snw_pc_s09, snw_pc_s10, snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01,
swc_pc_s02, swc_pc_s03, swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09,
swc_pc_s10, swc_pc_s11, swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02,
tmp_dc_s03, tmp_dc_s04, tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10,
tmp_dc_s11, tmp_dc_s12, tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01,
wet_pc_s02, wet_pc_s03, wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09,
wet_pc_sg1, wet_pc_sg2
[12]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(484, 214)
[13]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
0
[13]:
NSE 0
aet_mm_s01 0
aet_mm_s02 0
aet_mm_s03 0
aet_mm_s04 0
..
wet_pc_s07 0
wet_pc_s08 0
wet_pc_s09 0
wet_pc_sg1 0
wet_pc_sg2 0
Length: 214, dtype: int64
find those columns which have at least one NaN value
[14]:
# Show only the columns that contain at least one NaN.
# A bare expression is rendered only when it is the last top-level statement of
# a cell; inside an if-branch it is silently discarded, so display() is needed.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
No NaN values
[15]:
# NaN count per column, restricted to the columns that have any NaN
(
    df.loc[:, df.isna().any()]
    .isna()
    .sum()
)
[15]:
Series([], dtype: float64)
[16]:
# Names of the dynamic features, alphabetically (in-place sort), wrapped at 100 characters
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_AEMET_max, airtemp_C_AEMET_min, airtemp_C_EMO1arc_max,
airtemp_C_EMO1arc_min, airtemp_C_ERA5Land_max, airtemp_C_ERA5Land_min, airtemp_C_mean_2m,
airtemp_C_mean_AEMET, airtemp_C_mean_EMO1arc, airtemp_C_mean_ERA5Land, dptemp_C_max, dptemp_C_mean,
dptemp_C_min, pcp_mm_AEMET, pcp_mm_BULL, pcp_mm_EMO1arc, pcp_mm_ERA5Land, pet_mm_AEMET,
pet_mm_EMO1arc, pet_mm_ERA5Land, pevap_mm, q_cms_obs, solrad_wm2, solrad_wm2_max, solrad_wm2_max,
streamflow_BULL, surface_pressure_max_BULL, surface_pressure_mean_BULL, surface_pressure_min_BULL,
swe_mm, swe_mm_max, swe_mm_min, thermrad_wm2, thermrad_wm2_max, thermrad_wm2_min,
volumetric_soil_water_layer_1_max_BULL, volumetric_soil_water_layer_1_mean_BULL,
volumetric_soil_water_layer_1_min_BULL, volumetric_soil_water_layer_2_max_BULL,
volumetric_soil_water_layer_2_mean_BULL, volumetric_soil_water_layer_2_min_BULL,
volumetric_soil_water_layer_3_max_BULL, volumetric_soil_water_layer_3_mean_BULL,
volumetric_soil_water_layer_3_min_BULL, volumetric_soil_water_layer_4_max_BULL,
volumetric_soil_water_layer_4_mean_BULL, volumetric_soil_water_layer_4_min_BULL,
windspeedu_mps_max_10m, windspeedu_mps_mean_10m, windspeedu_mps_min_10m, windspeedv_mps_max_10m,
windspeedv_mps_mean_10m, windspeedv_mps_min_10m
CABra
[17]:
# Open the CABra dataset and print its one-line summary
dataset = RainfallRunoff(
    'CABra',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CABra with 735 stations, 12 dynamic and 97 static features
[18]:
# Names of the static features, alphabetically (in-place sort), wrapped at 100 characters
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, ANA_ID, aquif_name, aquif_type,
aridity_index, baseflow_index, catch_area, catch_hand, catch_lith, catch_order, catch_slope,
catch_wtd, clim_et, clim_p, clim_pet, clim_quality, clim_rh, clim_srad, clim_tmax, clim_tmin,
clim_wind, cover_bare, cover_crops, cover_crops, cover_forest, cover_grass, cover_main, cover_moss,
cover_shrub, cover_snow, cover_urban, cover_urban, cover_waterp, cover_waters, dist_urban,
elev_gauge, elev_max, elev_mean, elev_min, fdc_slope, gauge_biome, gauge_hreg, gauge_state,
hand_class, hdisturb_index, latitude, longitude, missing_data, ndvi_djf, ndvi_jja, ndvi_mam,
ndvi_son, p_seasonality, q_1, q_5, q_95, q_99, q_cv, q_elasticity, q_hcv, q_hd, q_hf, q_hfd, q_lcv,
q_ld, q_lf, q_mean, q_zero, quality_index, res_area, res_number, res_regulation, res_volume,
runoff_coef, series_length, soil_bulk, soil_carbon, soil_clay, soil_depth, soil_sand, soil_silt,
soil_textclass, soil_type, sub_hconduc, sub_permeability, sub_porosity, water_demand, well_dynamic,
well_number, well_static
[19]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(735, 97)
[20]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
0
[20]:
ANA_ID 0
ANA_ID 0
ANA_ID 0
ANA_ID 0
ANA_ID 0
..
sub_porosity 0
water_demand 0
well_dynamic 0
well_number 0
well_static 0
Length: 97, dtype: int64
find those columns which have at least one NaN value
[21]:
# Show only the columns that contain at least one NaN.
# A bare expression is rendered only when it is the last top-level statement of
# a cell; inside an if-branch it is silently discarded, so display() is needed.
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
No NaN values
[22]:
# NaN count per column, restricted to the columns that have any NaN
(
    df.loc[:, df.isna().any()]
    .isna()
    .sum()
)
[22]:
Series([], dtype: float64)
[23]:
# Names of the dynamic features, alphabetically (in-place sort), wrapped at 100 characters
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
Quality, aet_mm_ens, airtemp_C_ens_max, airtemp_C_ens_min, pcp_mm_ens, pet_mm_hg, pet_mm_pm,
pet_mm_pt, q_cms_obs, rh_%_ens, solrad_wm2_ens, windspeed_mps_ens
CAMELS_AUS
[24]:
# Version 1 of CAMELS_AUS is read from its own 'CAMELS_AUS_V1' directory
dataset = RainfallRunoff(
    'CAMELS_AUS',
    path=os.path.join(DATA_PATH, 'CAMELS_AUS_V1'),
    version=1,
    verbosity=0,
)
print(dataset)
CAMELS_AUS with 222 stations, 26 dynamic and 166 static features
[25]:
# Names of the static features, alphabetically (in-place sort), wrapped at 100 characters
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
Q5, Q95, anngro_mega, anngro_meso, anngro_micro, aridity, baseflow_index, carbnatesed,
catchment_area, catchment_di, claya, clayb, confinement, daystart, daystart_P, daystart_Q,
distupdamw, drainage_division, elev_max, elev_mean, elev_min, elev_range, elongratio, end_date,
erosivity, extract_ind_fac, flow_div_fac, flow_regime_di, frac_snow, geol_prim, geol_prim_prop,
geol_sec, geol_sec_prop, gromega_seas, gromeso_seas, gromicro_seas, hdf_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, igneous, impound_fac, infrastruc_fac,
ksat, landuse_fac, lat_centroid, lat_outlet, lc01_extracti, lc03_waterbo, lc04_saltlak,
lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar, lc11_wetlands,
lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden, lc25_shrbope,
lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac, long_centroid,
long_outlet, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq, map_zone,
mean_slope_pct, metamorph, mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4,
mrvbf_prop_5, mrvbf_prop_6, mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status,
next_station_ds, notes, npp_1, npp_10, npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7,
npp_8, npp_9, npp_ann, num_nested_within, nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n,
nvis_grasses_e, nvis_grasses_n, nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n,
nvis_woodlands_e, nvis_woodlands_n, oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1,
pop_gt_10, pop_max, pop_mean, prop_forested, prop_missing_data, q_mean, q_uncert_n,
q_uncert_num_curves, q_uncert_q10, q_uncert_q10_lower, q_uncert_q10_upper, q_uncert_q50,
q_uncert_q50_lower, q_uncert_q50_upper, q_uncert_q90, q_uncert_q90_lower, q_uncert_q90_upper,
relief, reliefratio, river_di, river_region, runoff_ratio, sanda, sedvolc, settlement_fac, silicsed,
slope_fdc, solpawhc, solum_thickness, start_date, state_alt, state_outlet, station_name, strahler,
strdensity, stream_elas, unconsoldted, upsdist, zero_q_freq
[26]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(222, 166)
[27]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
1175
[27]:
station_name 0
drainage_division 0
river_region 0
notes 0
lat_outlet 0
..
npp_8 0
npp_9 0
npp_10 0
npp_11 0
npp_12 0
Length: 166, dtype: int64
find those columns which have at least one NaN value
[28]:
# keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[28]:
| state_alt | next_station_ds | q_uncert_num_curves | q_uncert_n | q_uncert_q10 | q_uncert_q10_upper | q_uncert_q10_lower | q_uncert_q50 | q_uncert_q50_upper | q_uncert_q50_lower | q_uncert_q90 | q_uncert_q90_upper | q_uncert_q90_lower | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| station_id | |||||||||||||
| 912101A | NT | NaN | 3.0 | 15226.0 | 0.015122 | 25.07% | -21.06% | 0.027200 | 20.06% | -17.82% | 0.121670 | 18.46% | -15.13% |
| 912105A | NT | 912101A | 1.0 | 15232.0 | 0.016572 | 196.84% | -93.24% | 0.031969 | 129.72% | -77.38% | 0.161384 | 49.79% | -40.02% |
| 915011A | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 917107A | NaN | NaN | 2.0 | 15772.0 | 0.001552 | 143.47% | -66.93% | 0.036077 | 51.70% | -37.00% | 0.371124 | 26.85% | -22.30% |
| 919003A | NaN | NaN | 1.0 | 14933.0 | 0.004731 | 21.65% | -18.16% | 0.053229 | 15.45% | -13.59% | 1.273285 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 312061 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 314207 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 314213 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 315450 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 318076 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
222 rows × 13 columns
[29]:
# NaN count per column, restricted to the columns that have any NaN
(
    df.loc[:, df.isna().any()]
    .isna()
    .sum()
)
[29]:
state_alt 212
next_station_ds 192
q_uncert_num_curves 56
q_uncert_n 56
q_uncert_q10 56
q_uncert_q10_upper 118
q_uncert_q10_lower 118
q_uncert_q50 56
q_uncert_q50_upper 66
q_uncert_q50_lower 67
q_uncert_q90 56
q_uncert_q90_upper 61
q_uncert_q90_lower 61
dtype: int64
[30]:
# Names of the dynamic features, alphabetically (in-place sort), wrapped at 100 characters
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
airtemp_C_awap_max, airtemp_C_awap_min, airtemp_C_silo_max, airtemp_C_silo_min, awap_vp_hpa,
et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO, evap_syn_SILO, mslp_SILO, pcp_mm_awap,
pcp_mm_silo, precipitation_var_AWAP, q_cms_obs, q_mmd_obs, rh_%_silo_tmax, rh_%_silo_tmin,
silo_vp_hpa, solrad_wm2_awap, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO
[31]:
# Version 2 of CAMELS_AUS is read from the shared 'CAMELS' directory
dataset = RainfallRunoff(
    'CAMELS_AUS',
    path=os.path.join(DATA_PATH, 'CAMELS'),
    version=2,
    verbosity=0,
)
print(dataset)
CAMELS_AUS with 561 stations, 26 dynamic and 187 static features
[32]:
# Names of the static features, alphabetically (in-place sort), wrapped at 100 characters
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
anngro_mega, anngro_meso, anngro_micro, aridity, carbnatesed, catchment_area, catchment_di, claya,
clayb, confinement, daystart, daystart_P, daystart_Q, distupdamw, drainage_division, elev_max,
elev_mean, elev_min, elev_range, elongratio, end_date, erosivity, extract_ind_fac, flow_div_fac,
flow_regime_di, frac_snow, geol_prim, geol_prim_prop, geol_sec, geol_sec_prop, gromega_seas,
gromeso_seas, gromicro_seas, high_prec_dur, high_prec_freq, high_prec_timing, igneous, impound_fac,
infrastruc_fac, ksat, landuse_fac, lat_centroid, lat_outlet, lc01_extracti, lc03_waterbo,
lc04_saltlak, lc05_irrcrop, lc06_irrpast, lc07_irrsuga, lc08_rfcropp, lc09_rfpastu, lc10_rfsugar,
lc11_wetlands, lc14_tussclo, lc15_alpineg, lc16_openhum, lc18_opentus, lc19_shrbsca, lc24_shrbden,
lc25_shrbope, lc31_forclos, lc32_foropen, lc33_woodope, lc34_woodspa, lc35_urbanar, leveebank_fac,
long_centroid, long_outlet, low_prec_dur, low_prec_freq, low_prec_timing, map_zone, mean_slope_pct,
metamorph, mrvbf_prop_0, mrvbf_prop_1, mrvbf_prop_2, mrvbf_prop_3, mrvbf_prop_4, mrvbf_prop_5,
mrvbf_prop_6, mrvbf_prop_7, mrvbf_prop_8, mrvbf_prop_9, nested_status, next_station_ds, notes,
npp_1, npp_10, npp_11, npp_12, npp_2, npp_3, npp_4, npp_5, npp_6, npp_7, npp_8, npp_9, npp_ann,
num_nested_within, nvis_bare_e, nvis_bare_n, nvis_forests_e, nvis_forests_n, nvis_grasses_e,
nvis_grasses_n, nvis_nodata_e, nvis_nodata_n, nvis_shrubs_e, nvis_shrubs_n, nvis_woodlands_e,
nvis_woodlands_n, oldrock, othersed, p_mean, p_seasonality, pet_mean, pop_gt_1, pop_gt_10, pop_max,
pop_mean, prop_forested, prop_missing_data, q_uncert_Q_above, q_uncert_days_above,
q_uncert_rmse_all, q_uncert_rmse_lower, q_uncert_rmse_upper, q_uncert_unique_curves, relief,
reliefratio, river_di, river_region, sanda, sedvolc, settlement_fac, sig_dur_RespTime,
sig_dur_high_Q_dur, sig_dur_low_Q_dur, sig_dur_zero_Q_dur, sig_freq_high_Q_freq,
sig_freq_low_Q_freq, sig_freq_zero_Q_freq, sig_mag_BFI, sig_mag_BaseMag, sig_mag_Q5, sig_mag_Q95,
sig_mag_Q_7_day_max, sig_mag_Q_7_day_min, sig_mag_Q_CoV, sig_mag_Q_mean, sig_mag_Q_skew,
sig_mag_Q_var, sig_mag_VarIdx, sig_other_EventRR, sig_other_PeakDistribution,
sig_other_PeakDistribution_low, sig_other_QP_elasticity, sig_other_RR_seasonality,
sig_other_SnowDayRatio, sig_other_SnowStorage, sig_other_Spearmans_rho, sig_other_StorageFromBase,
sig_other_TotalRR, sig_other_ratio_Event_TotalRR, sig_roc_AC1, sig_roc_AC1_low, sig_roc_BaseRecesK,
sig_roc_FDC_slope, sig_roc_FlashIdx, sig_roc_RLD, sig_roc_RecesK_early, sig_roc_RecesVarSeasonality,
sig_timing_HFD_mean, sig_timing_HFI_mean, silicsed, solpawhc, solum_thickness, start_date,
state_alt, state_outlet, station_name, strahler, strdensity, unconsoldted, upsdist
[33]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(561, 187)
[34]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
1643
[34]:
station_name 0
drainage_division 0
river_region 0
notes 0
lat_outlet 0
..
npp_8 0
npp_9 0
npp_10 0
npp_11 0
npp_12 0
Length: 187, dtype: int64
find those columns which have at least one NaN value
[35]:
# keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[35]:
| state_alt | next_station_ds | q_uncert_unique_curves | q_uncert_rmse_all | q_uncert_rmse_lower | q_uncert_rmse_upper | q_uncert_days_above | q_uncert_Q_above | sig_mag_VarIdx | sig_roc_FDC_slope | sig_other_PeakDistribution_low | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| station_id | |||||||||||
| 912101A | NT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.292867 | -1.916733 | -2.180623 |
| 912105A | NT | 912101A | NaN | NaN | NaN | NaN | NaN | NaN | 0.304694 | -1.795139 | -1.254491 |
| 915011A | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.083646 | NaN | -6.090788 |
| 915206A | NaN | NaN | 25.0 | 25.172244 | 6.506520 | 20.955888 | 0.078362 | 19.011459 | 1.009843 | NaN | -8.491230 |
| 917107A | NaN | NaN | 16.0 | 53.380009 | 1168.007627 | 21.192680 | 0.132802 | 12.283859 | 0.641856 | -3.957062 | -3.631162 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 318150 | NaN | 318181 | 8.0 | 13.679565 | 13.569136 | 10.168856 | 0.000000 | 0.000000 | 0.459891 | -3.489307 | -5.351701 |
| 318181 | NaN | NaN | 24.0 | 8.209045 | 23.363542 | 5.920785 | 0.004200 | 0.450893 | 0.507649 | -3.661257 | -5.290249 |
| 318191 | NaN | 318150 | 11.0 | 8.226708 | 12.538870 | 6.093167 | 0.000000 | 0.000000 | 0.514683 | -3.525028 | -8.555535 |
| 318311 | NaN | 318150 | 10.0 | 19.588965 | 34.652832 | 14.517310 | 0.121428 | 11.333069 | 0.678704 | -4.723863 | -7.717046 |
| 319204 | NaN | NaN | 5.0 | 6.379150 | 20.465465 | 4.794664 | 0.005493 | 0.536084 | 0.683732 | -5.213989 | -6.477004 |
561 rows × 11 columns
[36]:
# NaN count per column, restricted to the columns that have any NaN
(
    df.loc[:, df.isna().any()]
    .isna()
    .sum()
)
[36]:
state_alt 544
next_station_ds 391
q_uncert_unique_curves 102
q_uncert_rmse_all 102
q_uncert_rmse_lower 102
q_uncert_rmse_upper 102
q_uncert_days_above 102
q_uncert_Q_above 102
sig_mag_VarIdx 2
sig_roc_FDC_slope 91
sig_other_PeakDistribution_low 3
dtype: int64
[37]:
# Names of the dynamic features, alphabetically (in-place sort), wrapped at 100 characters
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print("\n".join(textwrap.wrap(", ".join(dynamic_features), width=100)))
aet_mm_silo_morton, aet_mm_silo_morton_point, aet_mm_silo_short_crop, aet_mm_silo_tall_crop,
agcd_h09_vp_hpa, agcd_h15_vp_hpa, airtemp_C_agcd_max, airtemp_C_agcd_min, airtemp_C_silo_max,
airtemp_C_silo_min, et_morton_wet_SILO, evap_morton_lake_SILO, evap_pan_SILO, evap_syn_SILO,
mslp_SILO, pcp_mm_agcd, pcp_mm_silo, precipitation_var_AGCD, q_cms_obs, q_mmd_obs, rh_%_silo_tmax,
rh_%_silo_tmin, silo_vp_hpa, solrad_wm2_silo, streamflow_MLd_inclInfilled, vp_deficit_SILO
CAMELS_GB
[38]:
# Open CAMELS_GB from the shared 'CAMELS' directory and print its summary
dataset = RainfallRunoff(
    'CAMELS_GB',
    path=os.path.join(DATA_PATH, 'CAMELS'),
    verbosity=0,
)
print(dataset)
CAMELS_GB with 671 stations, 10 dynamic and 145 static features
[39]:
# Names of the static features, alphabetically (in-place sort), wrapped at 100 characters
static_features = dataset.static_features
static_features.sort()
print("\n".join(textwrap.wrap(", ".join(static_features), width=100)))
Q5, Q95, abs_agriculture_perc, abs_amenities_perc, abs_energy_perc, abs_environmental_perc,
abs_industry_perc, abs_watersupply_perc, area, aridity, bankfull_flow, bares_perc, baseflow_index,
baseflow_index_ceh, benchmark_catch, bulkdens, bulkdens_5, bulkdens_50, bulkdens_95,
bulkdens_missing, clay_perc, clay_perc_missing, conductivity_cosby, conductivity_cosby_5,
conductivity_cosby_50, conductivity_cosby_95, conductivity_cosby_missing, conductivity_hypres,
conductivity_hypres_5, conductivity_hypres_50, conductivity_hypres_95, conductivity_hypres_missing,
crop_perc, discharges, dom_land_cover, dpsbar, dwood_perc, elev_10, elev_50, elev_90, elev_max,
elev_mean, elev_min, ewood_perc, flow_perc_complete, flow_period_end, flow_period_start,
frac_high_perc, frac_low_perc, frac_mod_perc, frac_snow, gauge_easting, gauge_elev, gauge_lat,
gauge_lon, gauge_name, gauge_northing, grass_perc, groundwater_abs, hfd_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, inter_high_perc, inter_low_perc,
inter_mod_perc, inwater_perc, low_nsig_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, no_gw_perc, nsig_low_perc, num_reservoir, organic_perc, organic_perc_missing,
p_mean, p_seasonality, pet_mean, porosity_cosby, porosity_cosby_5, porosity_cosby_50,
porosity_cosby_95, porosity_cosby_missing, porosity_hypres, porosity_hypres_5, porosity_hypres_50,
porosity_hypres_95, porosity_hypres_missing, q25_uncert_lower, q25_uncert_upper, q50_uncert_lower,
q50_uncert_upper, q5_uncert_lower, q5_uncert_upper, q75_uncert_lower, q75_uncert_upper,
q95_uncert_lower, q95_uncert_upper, q99_uncert_lower, q99_uncert_upper, q_mean, quncert_meta,
reservoir_cap, reservoir_drain, reservoir_env, reservoir_fs, reservoir_he, reservoir_nav,
reservoir_nousedata, reservoir_wr, reservoir_year_first, reservoir_year_last, root_depth,
root_depth_5, root_depth_50, root_depth_95, root_depth_missing, runoff_ratio, sand_perc,
sand_perc_missing, shrub_perc, silt_perc, silt_perc_missing, slope_fdc, soil_depth_pelletier,
soil_depth_pelletier_5, soil_depth_pelletier_50, soil_depth_pelletier_95,
soil_depth_pelletier_missing, station_type, stream_elas, structurefull_flow, surfacewater_abs, tawc,
tawc_5, tawc_50, tawc_95, tawc_missing, urban_perc, zero_q_freq
[40]:
# Fetch the static features as a table and report its (rows, columns) size
df = dataset.fetch_static_features()
print(df.shape)
(671, 145)
[41]:
# grand total of missing values first, then the count for every column
print(df.isna().to_numpy().sum())
df.isna().sum(axis=0)
10316
[41]:
q_mean 0
runoff_ratio 0
stream_elas 0
slope_fdc 3
baseflow_index 0
..
soil_depth_pelletier 0
soil_depth_pelletier_missing 0
soil_depth_pelletier_5 0
soil_depth_pelletier_50 0
soil_depth_pelletier_95 0
Length: 145, dtype: int64
find those columns which have at least one NaN value
[42]:
# keep only the columns in which at least one value is missing
df.loc[:, df.isna().any()]
[42]:
| slope_fdc | high_prec_timing | low_prec_timing | surfacewater_abs | groundwater_abs | discharges | abs_agriculture_perc | abs_amenities_perc | abs_energy_perc | abs_environmental_perc | ... | q25_uncert_upper | q25_uncert_lower | q50_uncert_upper | q50_uncert_lower | q75_uncert_upper | q75_uncert_lower | q95_uncert_upper | q95_uncert_lower | q99_uncert_upper | q99_uncert_lower | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gauge_id | |||||||||||||||||||||
| 38017 | 1.50 | son | jja | 0.000 | 0.054 | 0.005 | 28.43 | 0.00 | 0.00 | 0.00 | ... | 21.59 | -22.91 | 17.81 | -17.37 | 13.07 | -11.96 | 12.88 | -12.67 | 10.38 | -10.58 |
| 42001 | 3.80 | son | jja | 0.004 | 0.149 | 0.003 | 2.79 | 0.00 | 0.00 | 0.00 | ... | 26.76 | -25.97 | 15.90 | -17.53 | 25.30 | -22.87 | 24.43 | -23.47 | 14.23 | -12.56 |
| 55014 | 2.78 | djf | jja | 0.001 | 0.000 | 0.000 | 47.16 | 0.00 | 0.00 | 0.00 | ... | 16.99 | -16.77 | 15.09 | -15.04 | 12.25 | -12.34 | 14.04 | -14.03 | 11.15 | -11.15 |
| 27041 | 2.04 | son | mam | 0.047 | 0.053 | 0.014 | 58.32 | 0.12 | 14.58 | 0.00 | ... | 14.67 | -15.21 | 14.57 | -14.52 | 14.56 | -14.43 | 12.83 | -13.23 | 9.28 | -10.18 |
| 39078 | 2.02 | son | jja | 0.000 | 0.090 | 0.049 | 0.65 | 0.00 | 0.00 | 0.00 | ... | 25.60 | -26.23 | 16.68 | -16.65 | 19.92 | -19.76 | 16.59 | -16.18 | 9.20 | -8.61 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66006 | 3.62 | son | jja | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 17.83 | -17.47 | 12.86 | -12.66 | 10.78 | -10.53 | 13.71 | -13.75 | 10.86 | -10.95 |
| 39014 | 1.72 | son | jja | 0.000 | 0.217 | 0.019 | 0.94 | 0.00 | 0.00 | 0.03 | ... | 14.67 | -13.54 | 14.00 | -13.29 | 12.99 | -12.63 | 9.83 | -9.67 | NaN | NaN |
| 42010 | 1.06 | son | jja | 0.714 | 0.560 | 0.046 | 66.65 | 0.00 | 0.46 | 9.92 | ... | 14.07 | -14.02 | 11.30 | -11.45 | 10.76 | -10.73 | 9.78 | -9.59 | 10.56 | -10.19 |
| 42011 | 2.06 | son | jja | 0.000 | 0.068 | 0.070 | 0.69 | 0.00 | 0.00 | 0.00 | ... | 11.80 | -11.47 | 12.42 | -12.60 | 8.67 | -8.51 | 10.04 | -9.41 | 10.72 | -10.64 |
| 43009 | 3.63 | son | jja | 0.050 | 0.008 | 0.026 | 54.02 | 1.73 | 36.56 | 0.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
671 rows × 38 columns
[43]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[43]:
slope_fdc 3
high_prec_timing 15
low_prec_timing 3
surfacewater_abs 229
groundwater_abs 229
discharges 231
abs_agriculture_perc 313
abs_amenities_perc 313
abs_energy_perc 313
abs_environmental_perc 313
abs_industry_perc 313
abs_watersupply_perc 313
reservoir_he 509
reservoir_nav 509
reservoir_drain 509
reservoir_wr 509
reservoir_fs 509
reservoir_env 509
reservoir_nousedata 509
reservoir_year_first 530
reservoir_year_last 530
dpsbar 2
elev_mean 2
station_type 1
bankfull_flow 310
structurefull_flow 408
q5_uncert_upper 235
q5_uncert_lower 235
q25_uncert_upper 173
q25_uncert_lower 173
q50_uncert_upper 168
q50_uncert_lower 168
q75_uncert_upper 170
q75_uncert_lower 170
q95_uncert_upper 195
q95_uncert_lower 195
q99_uncert_upper 250
q99_uncert_lower 250
dtype: int64
[44]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_mean, lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_intercep, q_cms_obs, q_mmd_obs, rh_%,
solrad_wm2, windspeed_mps
CAMELS_BR
[45]:
# Open CAMELS_BR through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_BR', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_BR with 897 stations, 11 dynamic and 67 static features
[46]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area, area_ana, area_gsim, area_gsim_quality, aridity, asynchronicity, barren_perc,
baseflow_index, bedrock_depth, carb_rocks_perc, clay_perc, consumptive_use, consumptive_use_perc,
crop_mosaic_perc, crop_perc, dom_land_cover, dom_land_cover_perc, elev_gauge, elev_mean, et_mean,
forest_perc, frac_snow, gauge_lat, gauge_lon, gauge_name, gauge_region, geol_class_1st,
geol_class_1st_perc, geol_class_2nd, geol_class_2nd_perc, geol_permeability, geol_porosity,
grass_perc, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq,
imperv_perc, low_prec_dur, low_prec_freq, low_prec_timing, low_q_dur, low_q_freq,
org_carbon_content, p_mean, p_seasonality, pet_mean, q_mean, q_quality_control_perc,
q_stream_stage_perc, regulation_degree, reservoirs_vol, runoff_ratio, sand_perc, shrub_perc,
silt_perc, slope_fdc, slope_mean, snow_perc, stream_elas, water_table_depth, wet_perc, zero_q_freq
[47]:
# Fetch the static features as a table and report its (rows, columns) shape
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(897, 67)
[48]:
# Count NaNs per column once; print the grand total, then display the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
133
[48]:
p_mean 0
pet_mean 0
et_mean 0
aridity 0
p_seasonality 0
..
water_table_depth 0
elev_gauge 0
elev_mean 0
slope_mean 0
area 0
Length: 67, dtype: int64
find those columns which have at least one NaN value
[49]:
# Keep only the columns in which at least one value is NaN
df.loc[:, df.isna().any()]
[49]:
| frac_snow | high_prec_timing | geol_class_2nd | slope_fdc | baseflow_index | area_ana | |
|---|---|---|---|---|---|---|
| gauge_id | ||||||
| 58030000 | 0.0 | djf | acid_plutonic_rocks | 1.08954 | 0.79986 | 796.0 |
| 57170000 | 0.0 | djf | acid_plutonic_rocks | 1.35609 | 0.76520 | 980.0 |
| 39580000 | 0.0 | mam | siliciclastic_sedimentary_rocks | 1.68712 | 0.66356 | 756.0 |
| 41818000 | 0.0 | djf | metamorphics | 1.98782 | 0.64151 | 16600.0 |
| 58870000 | 0.0 | djf | metamorphics | 1.39838 | 0.71359 | 1120.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 26720000 | 0.0 | djf | siliciclastic_sedimentary_rocks | 6.72326 | 0.57567 | 6610.0 |
| 65925000 | 0.0 | son | NaN | 2.24256 | 0.54385 | 1660.0 |
| 39560000 | 0.0 | mam | metamorphics | 2.14107 | 0.65966 | 4910.0 |
| 71550000 | 0.0 | son | siliciclastic_sedimentary_rocks | 2.51894 | 0.58372 | NaN |
| 41539998 | 0.0 | djf | metamorphics | 1.74270 | 0.69456 | NaN |
897 rows × 6 columns
[50]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[50]:
frac_snow 5
high_prec_timing 4
geol_class_2nd 47
slope_fdc 16
baseflow_index 18
area_ana 43
dtype: int64
[51]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_mgb, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cpc,
pcp_mm_mswep, pet_mm_gleam, q_cms_obs, q_mmd_obs
CAMELS_US
[52]:
# Open CAMELS_US through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_US', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_US with 671 stations, 8 dynamic and 59 static features
[53]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area_gages2, area_geospa_fabric, aridity, baseflow_index, carbonate_rocks_frac, clay_frac,
dom_land_cover, dom_land_cover_frac, elev_mean, frac_forest, frac_snow, gauge_lat, gauge_lon,
gauge_name, geol_1st_class, geol_2nd_class, geol_permeability, geol_porostiy, glim_1st_class_frac,
glim_2nd_class_frac, gvf_diff, gvf_max, hfd_mean, high_prec_dur, high_prec_freq, high_prec_timing,
high_q_dur, high_q_freq, huc_02, lai_diff, lai_max, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, max_water_content, organic_frac, other_frac, p_mean, p_seasonality, pet_mean,
q5, q95, q_mean, root_depth_50, root_depth_99, runoff_ratio, sand_frac, silt_frac, slope_fdc,
slope_mean, soil_conductivity, soil_depth_pelletier, soil_depth_statsgo, soil_porosity, stream_elas,
water_frac, zero_q_freq
[54]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_min, dayl(s), pcp_mm, q_cms_obs, solrad_wm2, swe_mm, vp_hpa
CAMELS_CL
[55]:
# Open CAMELS_CL through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CL', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_CL with 516 stations, 12 dynamic and 104 static features
[56]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, area, aridity_chirps, aridity_cr2met, aridity_mswep, aridity_tmpa, baseflow_index, big_dam,
carb_rocks_frac, crop_frac, dom_land_cover, dom_land_cover_frac, elev_gauge, elev_max, elev_mean,
elev_med, elev_min, forest_frac, fp_frac, fp_nf_index, frac_snow_chirps, frac_snow_cr2met,
frac_snow_mswep, frac_snow_tmpa, gauge_lat, gauge_lon, gauge_name, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, grass_frac, gw_rights_flow, gw_rights_n,
hfd_mean, high_prec_dur_chirps, high_prec_dur_cr2met, high_prec_dur_mswep, high_prec_dur_tmpa,
high_prec_freq_chirps, high_prec_freq_cr2met, high_prec_freq_mswep, high_prec_freq_tmpa,
high_prec_timing_chirps, high_prec_timing_cr2met, high_prec_timing_mswep, high_prec_timing_tmpa,
high_q_dur, high_q_freq, imp_frac, interv_degree, land_cover_missing, lc_barren, lc_glacier,
location_type, low_prec_dur_chirps, low_prec_dur_cr2met, low_prec_dur_mswep, low_prec_dur_tmpa,
low_prec_freq_chirps, low_prec_freq_cr2met, low_prec_freq_mswep, low_prec_freq_tmpa,
low_prec_timing_chirps, low_prec_timing_cr2met, low_prec_timing_mswep, low_prec_timing_tmpa,
low_q_dur, low_q_freq, n_obs, nested_inner, nested_outer, nf_frac, p_mean_chirps, p_mean_cr2met,
p_mean_mswep, p_mean_spread, p_mean_tmpa, p_seasonality_chirps, p_seasonality_cr2met,
p_seasonality_mswep, p_seasonality_tmpa, pet_mean, q_mean, record_period_end, record_period_start,
runoff_ratio_chirps, runoff_ratio_cr2met, runoff_ratio_mswep, runoff_ratio_tmpa, shrub_frac,
slope_fdc, slope_mean, snow_frac, stream_elas_chirps, stream_elas_cr2met, stream_elas_mswep,
stream_elas_tmpa, sur_rights_flow, sur_rights_n, swe_ratio, wet_frac, zero_q_freq
[57]:
# Fetch the static features as a table and report its (rows, columns) shape
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(516, 104)
[58]:
# Count NaNs per column once; print the grand total, then display the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12185
[58]:
gauge_id
gauge_name 0
gauge_lat 0
gauge_lon 0
record_period_start 0
record_period_end 0
..
sur_rights_flow 0
interv_degree 0
gw_rights_n 0
gw_rights_flow 0
big_dam 0
Length: 104, dtype: int64
find those columns which have at least one NaN value
[59]:
# Keep only the columns in which at least one value is NaN
df.loc[:, df.isna().any()]
[59]:
| gauge_id | location_type | geol_class_2nd | p_mean_chirps | p_mean_tmpa | aridity_chirps | aridity_tmpa | p_seasonality_chirps | p_seasonality_tmpa | frac_snow_chirps | frac_snow_tmpa | ... | baseflow_index | hfd_mean | Q95 | Q5 | high_q_freq | high_q_dur | low_q_freq | low_q_dur | zero_q_freq | swe_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8220009 | NaN | Intermediate plutonic rocks | 3.69311266 | NaN | 0.8586003 | NaN | -1.074889369 | NaN | 0.0000000000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 10362001 | coastal | Pyroclastics | 4.34012320 | NaN | 0.6072567 | NaN | -0.622200375 | NaN | 0.0002103480252 | NaN | ... | 0.5624186 | 122.0500 | 6.015702219 | 0.10735714728 | 13.44168511 | 2.714286 | 117.13468456 | 29.341772 | 0.0000000000 | NaN |
| 7317005 | NaN | Basic volcanic rocks | 4.72452895 | NaN | 0.5293816 | NaN | -0.960896391 | NaN | 0.1469334578557 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2112005 | NaN | Unconsolidated sediments | 0.21673525 | NaN | 18.4153050 | NaN | 1.336308822 | NaN | 0.0002204325151 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5746001 | NaN | Unconsolidated sediments | 1.16770048 | NaN | 3.1408824 | NaN | -1.403499734 | NaN | 0.0000000000000 | NaN | ... | 0.7183103 | 124.6000 | 1.580334352 | 0.22086357569 | 2.43736966 | 2.086957 | 8.17534408 | 13.416667 | 0.0000000000 | 0.003368185140 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5101001 | NaN | Acid plutonic rocks | 0.72182012 | NaN | 4.7505564 | NaN | -1.440400284 | NaN | 0.0038392564541 | NaN | ... | 0.6971627 | 162.1176 | 2.386918187 | 0.01539947218 | 43.72596796 | 18.340909 | 148.08311081 | 44.803279 | 0.0000000000 | 0.344224603708 |
| 10401001 | NaN | Metamorphics | 4.61487255 | NaN | 0.5567343 | NaN | -0.598618823 | NaN | 0.0007067884319 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2110002 | NaN | Siliciclastic sedimentary rocks | 0.28986612 | NaN | 11.4299221 | NaN | 1.555320147 | NaN | 0.0021539836182 | NaN | ... | 0.9021013 | 190.0625 | 0.014444286 | 0.00737807572 | 0.00000000 | 0.000000 | 0.00000000 | 0.000000 | 0.0000000000 | NaN |
| 8350001 | NaN | Acid volcanic rocks | 8.45346995 | NaN | 0.3105805 | NaN | -0.836864115 | NaN | 0.0829756992572 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 11315001 | NaN | Unconsolidated sediments | 2.79655852 | NaN | 0.7749488 | NaN | -0.306493095 | NaN | 0.1256086732201 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
516 rows × 42 columns
[60]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[60]:
gauge_id
location_type 386
geol_class_2nd 16
p_mean_chirps 43
p_mean_tmpa 516
aridity_chirps 43
aridity_tmpa 516
p_seasonality_chirps 43
p_seasonality_tmpa 516
frac_snow_chirps 43
frac_snow_tmpa 516
high_prec_freq_chirps 43
high_prec_freq_tmpa 516
high_prec_dur_chirps 43
high_prec_dur_tmpa 516
high_prec_timing_chirps 43
high_prec_timing_tmpa 516
low_prec_freq_chirps 43
low_prec_freq_tmpa 516
low_prec_dur_chirps 43
low_prec_dur_tmpa 516
low_prec_timing_chirps 43
low_prec_timing_tmpa 516
q_mean 278
runoff_ratio_cr2met 278
runoff_ratio_chirps 297
runoff_ratio_mswep 278
runoff_ratio_tmpa 516
stream_elas_cr2met 278
stream_elas_chirps 297
stream_elas_mswep 278
stream_elas_tmpa 516
slope_fdc 278
baseflow_index 278
hfd_mean 278
Q95 278
Q5 278
high_q_freq 278
high_q_dur 278
low_q_freq 278
low_q_dur 278
zero_q_freq 278
swe_ratio 397
dtype: int64
[61]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_chirps, pcp_mm_cr2met, pcp_mm_mswep,
pcp_mm_tmpa, pet_mm_hargreaves, pet_mm_modis, q_cms_obs, q_mmd_obs, swe
CAMELS_DK
[62]:
# Open CAMELS_DK through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DK', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DK with 304 stations, 13 dynamic and 119 static features
[63]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
FC, HCC, KS, MRC, THS, WP, aridity, bulk_density, catch_accum_number, catch_area, catch_flow_dir,
catch_outlet_lat, catch_outlet_lon, chalk_d, dem_max, dem_mean, dem_median, dem_min,
frac_snow_daily, gauge_record_pct, gauged_type, high_prec_dur, high_prec_freq, high_prec_timing,
low_prec_dur, low_prec_freq, low_prec_timing, p_mean, p_seasonality, pct_aeolain_sand,
pct_agriculture_corine_1990, pct_agriculture_corine_2000, pct_agriculture_corine_2006,
pct_agriculture_corine_2012, pct_agriculture_corine_2018, pct_agriculture_levin_2011,
pct_agriculture_levin_2016, pct_agriculture_levin_2018, pct_agriculture_levin_2021, pct_beach,
pct_clay, pct_claynor_100, pct_claynor_200, pct_claynor_30, pct_claynor_60, pct_down_sand,
pct_flat_area, pct_forest_corine_1990, pct_forest_corine_2000, pct_forest_corine_2006,
pct_forest_corine_2012, pct_forest_corine_2018, pct_forest_levin_2011, pct_forest_levin_2016,
pct_forest_levin_2018, pct_forest_levin_2021, pct_fsandno_100, pct_fsandno_200, pct_fsandno_30,
pct_fsandno_60, pct_glaf_sand, pct_glal_clay, pct_glam_clay, pct_gravel, pct_gsandno_100,
pct_gsandno_200, pct_gsandno_30, pct_gsandno_60, pct_marine_sand, pct_marsh,
pct_naturedry_levin_2011, pct_naturedry_levin_2016, pct_naturedry_levin_2018,
pct_naturedry_levin_2021, pct_naturewet_levin_2011, pct_naturewet_levin_2016,
pct_naturewet_levin_2018, pct_naturewet_levin_2021, pct_organic, pct_sand, pct_sandy_till, pct_silt,
pct_till, pct_urban_corine_1990, pct_urban_corine_2000, pct_urban_corine_2006,
pct_urban_corine_2012, pct_urban_corine_2018, pct_urban_levin_2011, pct_urban_levin_2016,
pct_urban_levin_2018, pct_urban_levin_2021, pct_water_corine_1990, pct_water_corine_2000,
pct_water_corine_2006, pct_water_corine_2012, pct_water_corine_2018, pct_water_deposit,
pct_water_levin_2011, pct_water_levin_2016, pct_water_levin_2018, pct_water_levin_2021,
pct_wetlands_corine_1990, pct_wetlands_corine_2000, pct_wetlands_corine_2006,
pct_wetlands_corine_2012, pct_wetlands_corine_2018, pet_mean, root_depth, slope_max, slope_mean,
slope_median, slope_min, t_mean, tawc, uaquifer_d, uaquifer_t, uclay_t, usand_t
[64]:
# Fetch the static features as a table and report its (rows, columns) shape
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(304, 119)
[65]:
# Count NaNs per column once; print the grand total, then display the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
23
[65]:
FC 0
HCC 0
KS 0
MRC 0
THS 0
..
tawc 0
uaquifer_d 3
uaquifer_t 3
uclay_t 3
usand_t 3
Length: 119, dtype: int64
find those columns which have at least one NaN value
[66]:
# Keep only the columns in which at least one value is NaN
df.loc[:, df.isna().any()]
[66]:
| chalk_d | gauge_record_pct | uaquifer_d | uaquifer_t | uclay_t | usand_t | |
|---|---|---|---|---|---|---|
| 16200607 | 348.941440 | 100.000000 | 6.166402 | 16.930742 | 5.468584 | 5.671323 |
| 37470466 | 451.491863 | 100.000000 | 0.630618 | 46.024319 | 0.550030 | 31.309439 |
| 67221267 | 82.682739 | 100.000000 | 30.316439 | 10.650221 | 28.674266 | 0.580311 |
| 35321353 | 425.690145 | 100.000000 | 7.186167 | 39.533252 | 6.985663 | 0.198207 |
| 53411137 | 287.510620 | 100.000000 | 15.445959 | 8.634338 | 15.021151 | 0.210060 |
| ... | ... | ... | ... | ... | ... | ... |
| 32211121 | 68.721796 | 54.840134 | 7.125480 | 8.509604 | 6.253781 | 1.478004 |
| 42320708 | 155.292211 | 100.000000 | 24.663791 | 9.827164 | 21.955691 | 1.744912 |
| 71270476 | 10.931980 | 100.000000 | 9.366122 | 45.329497 | 9.176942 | 0.364699 |
| 32240800 | 19.045147 | 100.000000 | 9.834996 | 28.712421 | 9.284840 | 0.198684 |
| 42600042 | 124.778317 | 90.673026 | 12.625913 | 17.442115 | 12.289986 | 2.165011 |
304 rows × 6 columns
[67]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[67]:
chalk_d 3
gauge_record_pct 8
uaquifer_d 3
uaquifer_t 3
uclay_t 3
usand_t 3
dtype: int64
[68]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
Abstraction, DKM_dtp, DKM_gwh, DKM_irr, DKM_sdr, DKM_sre, DKM_wcr, Qdkm, aet_mm, airtemp_C_mean,
pcp_mm, pet_mm, q_cms_obs
CAMELS_CH
[69]:
# Open CAMELS_CH through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_CH', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_CH with 331 stations, 9 dynamic and 209 static features
[70]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5, Q95, aap, acid_plutonic, acid_volcanic, amk, api, area, aridity, baseflow_index_landson,
basic_plutonic, basic_volcanic, bulk_dens, bulk_dens_25, bulk_dens_5, bulk_dens_50, bulk_dens_75,
bulk_dens_90, bulk_dens_missing, bulk_dens_skewness, carbonate_sedimentary, clay_perc, clay_perc_25,
clay_perc_5, clay_perc_50, clay_perc_75, clay_perc_90, clay_perc_missing, clay_perc_skewness,
coarse_fragm_perc, coarse_fragm_perc_25, coarse_fragm_perc_5, coarse_fragm_perc_50,
coarse_fragm_perc_75, coarse_fragm_perc_90, coarse_fragm_perc_missing, coarse_fragm_perc_skewness,
conductivity, conductivity_25, conductivity_5, conductivity_50, conductivity_75, conductivity_90,
conductivity_missing, conductivity_skewness, country, crop_perc, dens_inhabitants, dom_land_cover,
dup, dwood_perc, elev_max, elev_mean, elev_min, elev_percentile10, elev_percentile25,
elev_percentile50, elev_percentile75, elev_percentile90, ewood_perc, ext_area_perc, fju,
flat_area_perc, frac_snow, gauge_easting, gauge_elevation, gauge_lat, gauge_lon, gauge_name,
gauge_northing, geo_log10_permeability, geo_porosity, glac_area, glac_area_neighbours, glac_mass,
glac_vol, grass_perc, hardrock_imperm_perc, hardrock_perc, hes, hfd_mean, high_prec_dur,
high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, hp_count, hp_inst_turb, hp_max_power,
hp_qturb, ice_geo, ice_perc, id6, ind_end_date, ind_number_of_years, ind_start_date,
intermediate_plutonic, inwater_perc, karst_perc, loose_rock_perc, low_prec_dur, low_prec_freq,
low_prec_timing, low_q_dur, low_q_freq, metamorphics, mixed_sedimentary, mixed_wood_perc, mpk, mps,
n_inhabitants, null_perc, num_reservoir, omm, ood, oos, ops, organic_perc, organic_perc_25,
organic_perc_5, organic_perc_50, organic_perc_75, organic_perc_90, organic_perc_missing,
organic_perc_skewness, osm, p_mean, p_seasonality, pet_mean, porosity, porosity_25, porosity_5,
porosity_50, porosity_75, porosity_90, porosity_missing, porosity_skewness, pyroclastic, q_mean,
qua, reservoir_cap, reservoir_fs, reservoir_he, reservoir_irr, reservoir_nousedata,
reservoir_year_first, reservoir_year_last, rock_perc, root_depth, root_depth_25, root_depth_5,
root_depth_50, root_depth_75, root_depth_90, root_depth_missing, root_depth_skewness, runoff_ratio,
sal, sand_perc, sand_perc_25, sand_perc_5, sand_perc_50, sand_perc_75, sand_perc_90,
sand_perc_missing, sand_perc_skewness, scrub_perc, sign_end_date, sign_number_of_years,
sign_start_date, siliciclastic_sedimentary, silt_perc, silt_perc_25, silt_perc_5, silt_perc_50,
silt_perc_75, silt_perc_90, silt_perc_missing, silt_perc_skewness, slope_fdc, slope_mean,
steep_area_perc, stream_elas, sus, tie, tot_avail_water, tot_avail_water_25, tot_avail_water_5,
tot_avail_water_50, tot_avail_water_75, tot_avail_water_90, tot_avail_water_missing,
tot_avail_water_skewness, ukd, unconsol_coarse_perc, unconsol_fine_perc, unconsol_imperm_perc,
unconsol_medium_perc, unconsol_sediments, uod, ups, urban_perc, usm, water_body_name,
water_body_type, water_geo, water_perc, wetlands_perc, zero_q_freq
[71]:
# Fetch the static features as a table and report its (rows, columns) shape
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(331, 209)
[72]:
# Count NaNs per column once; print the grand total, then display the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2097
[72]:
ind_start_date 0
ind_end_date 0
ind_number_of_years 0
p_mean 0
pet_mean 0
..
elev_percentile90 0
elev_max 0
slope_mean 0
flat_area_perc 0
steep_area_perc 0
Length: 209, dtype: int64
find those columns which have at least one NaN value
[73]:
# Keep only the columns in which at least one value is NaN
df.loc[:, df.isna().any()]
[73]:
| p_seasonality | frac_snow | high_prec_timing | low_prec_timing | reservoir_he | reservoir_fs | reservoir_irr | reservoir_nousedata | reservoir_year_first | reservoir_year_last | ... | baseflow_index_landson | hfd_mean | Q5 | Q95 | high_q_freq | high_q_dur | low_q_freq | low_q_dur | zero_q_freq | silt_perc_skewness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gauge_id | |||||||||||||||||||||
| 2004 | 0.159 | 0.039 | jja | son | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.252 |
| 2007 | -0.118 | 0.170 | djf | son | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.635 |
| 2009 | 0.078 | 0.436 | jja | son | 0.999 | 0.0 | 0.001 | 0.0 | 1914.0 | 1989.0 | ... | 0.787 | 243.282 | 1.279 | 6.207 | 0.000 | 0.000 | 0.051 | 2.000 | 0.0 | 0.285 |
| 2011 | 0.106 | 0.474 | son | son | 0.998 | 0.0 | 0.002 | 0.0 | 1927.0 | 1989.0 | ... | 0.751 | 263.667 | 0.821 | 6.681 | 0.051 | 1.000 | 0.436 | 1.000 | 0.0 | 0.267 |
| 2014 | 0.279 | 0.223 | jja | son | 1.000 | 0.0 | 0.000 | 0.0 | 1910.0 | 2015.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.421 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6007 | 0.228 | 0.379 | son | djf | 1.000 | 0.0 | 0.000 | 0.0 | 2010.0 | 2010.0 | ... | 0.715 | 211.333 | 1.298 | 8.789 | 2.005 | 1.714 | 2.451 | 7.333 | 0.0 | 0.393 |
| 6008 | NaN | NaN | son | djf | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.602 | 188.875 | 0.632 | 10.751 | 4.385 | 2.593 | 29.315 | 8.069 | 0.0 | -0.603 |
| 6009 | NaN | NaN | son | djf | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.318 | 191.714 | 0.127 | 15.376 | 26.897 | 2.667 | 155.228 | 12.056 | 0.0 | 0.310 |
| 6010 | NaN | NaN | son | djf | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.494 | 198.400 | 1.002 | 12.617 | 12.195 | 2.103 | 4.998 | 3.571 | 0.0 | -0.744 |
| 6011 | 0.272 | 0.110 | NaN | djf | 1.000 | 0.0 | 0.000 | 0.0 | 1918.0 | 2010.0 | ... | 0.697 | 204.250 | 1.165 | 10.371 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.272 |
331 rows × 26 columns
[74]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[74]:
p_seasonality 54
frac_snow 54
high_prec_timing 13
low_prec_timing 5
reservoir_he 223
reservoir_fs 223
reservoir_irr 223
reservoir_nousedata 223
reservoir_year_first 223
reservoir_year_last 223
sign_start_date 42
sign_end_date 42
q_mean 42
runoff_ratio 42
stream_elas 44
slope_fdc 42
baseflow_index_landson 42
hfd_mean 42
Q5 42
Q95 42
high_q_freq 42
high_q_dur 42
low_q_freq 42
low_q_dur 42
zero_q_freq 42
silt_perc_skewness 1
dtype: int64
[75]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, q_cms_obs, q_mmd_obs, rel_sun_dur(%), swe_mm,
waterlevel(m)
CAMELS_DE
[76]:
# Open CAMELS_DE through the unified RainfallRunoff interface, pointing it at
# the CAMELS sub-directory of DATA_PATH.
camels_dir = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_DE', path=camels_dir, verbosity=0)
print(dataset)
CAMELS_DE with 1555 stations, 21 dynamic and 111 static features
[77]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
NSE_conceptual, NSE_lstm, Q5, Q95, agricultural_areas_perc, aquifer_aquitard_mixed_perc,
aquifer_perc, aquitard_perc, area, area_metadata, artificial_surfaces_perc,
bulk_density_0_30cm_mean, bulk_density_100_200cm_mean, bulk_density_30_100cm_mean,
cavity_fissure_karst_perc, cavity_fissure_perc, cavity_fissure_pores_perc, cavity_pores_perc,
clay_0_30cm_mean, clay_100_200cm_mean, clay_30_100cm_mean, coarse_fragments_0_30cm_mean,
coarse_fragments_100_200cm_mean, coarse_fragments_30_100cm_mean, consolidation_solid_rock_perc,
consolidation_unconsolidated_rock_perc, dams_names, dams_num, dams_purposes, dams_river_names,
dams_total_lake_area, dams_total_lake_volume, dams_year_first, dams_year_last, elev_5, elev_50,
elev_95, elev_max, elev_mean, elev_min, federal_state, flow_perc_complete, flow_period_end,
flow_period_start, forests_and_seminatural_areas_perc, frac_snow, gauge_easting, gauge_elev,
gauge_elev_metadata, gauge_lat, gauge_lon, gauge_name, gauge_northing,
geochemical_rocktype_anthropogenically_modified_through_filling_perc,
geochemical_rocktype_carbonatic_perc, geochemical_rocktype_halitic_perc,
geochemical_rocktype_silicate_carbonatic_perc,
geochemical_rocktype_silicate_organic_components_perc, geochemical_rocktype_silicate_perc,
geochemical_rocktype_sulfatic_halitic_perc, geochemical_rocktype_sulfatic_perc, hfd_mean,
high_prec_dur, high_prec_freq, high_prec_timing, high_q_dur, high_q_freq, kf_extremely_low_perc,
kf_high_perc, kf_highly_variable_perc, kf_low_perc, kf_low_to_extremely_low_perc, kf_medium_perc,
kf_medium_to_moderate_perc, kf_moderate_perc, kf_moderate_to_low_perc, kf_very_high_perc,
kf_very_high_to_high_perc, kf_very_low_perc, low_prec_dur, low_prec_freq, low_prec_timing,
low_q_dur, low_q_freq, no_data_perc, p_mean, p_seasonality, provider_id, q_mean,
rocktype_magmatite_perc, rocktype_metamorphite_perc, rocktype_sediment_perc, runoff_ratio,
sand_0_30cm_mean, sand_100_200cm_mean, sand_30_100cm_mean, silt_0_30cm_mean, silt_100_200cm_mean,
silt_30_100cm_mean, slope_fdc, soil_organic_carbon_0_30cm_mean, soil_organic_carbon_100_200cm_mean,
soil_organic_carbon_30_100cm_mean, testing_perc_complete, training_perc_complete,
validation_perc_complete, water_bodies_perc, water_body_name, waterbody_perc, wetlands_perc,
zero_q_freq
[78]:
# Fetch the static features as a table and report its (rows, columns) shape
df = dataset.fetch_static_features()
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1555, 111)
[79]:
# Count NaNs per column once; print the grand total, then display the
# per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
6862
[79]:
p_mean 0
p_seasonality 0
frac_snow 0
high_prec_freq 0
high_prec_dur 0
..
elev_min 0
elev_5 0
elev_50 0
elev_95 0
elev_max 0
Length: 111, dtype: int64
find those columns which have at least one NaN value
[80]:
# Keep only the columns in which at least one value is NaN
df.loc[:, df.isna().any()]
[80]:
| high_prec_timing | low_prec_timing | dams_names | dams_river_names | dams_year_first | dams_year_last | dams_total_lake_area | dams_total_lake_volume | dams_purposes | NSE_lstm | NSE_conceptual | gauge_elev_metadata | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gauge_id | ||||||||||||
| DEA11180 | jja | mam | Aabachtalsperre|Borchen Hochwasserrückhaltebec... | Aabookach (Afte)|Afte bzw. Wiele|Altenau (Alme... | 1930.0 | 1996.0 | 10.28 | 71.90 | Water supply|Recreational use|Flood control | 0.929 | 0.854 | 30.86 |
| DEE10940 | jja | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.094 | 0.140 | 69.77 |
| DE911160 | jja | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.841 | 0.844 | NaN |
| DE212640 | jja | son | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.727 | 0.620 | 326.68 |
| DE112130 | mam | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.688 | 0.605 | 105.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| DEF13210 | jja | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.917 | NaN | NaN |
| DEF10460 | jja | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.574 | 0.355 | NaN |
| DE912320 | jja | mam | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.770 | 0.589 | NaN |
| DEA11090 | jja | mam | Aabachtalsperre|Borchen Hochwasserrückhaltebec... | Afte bzw. Wiele|Aabookach (Afte)|Altenau (Alme... | 1974.0 | 1996.0 | 5.37 | 39.41 | Water supply|Flood control | 0.928 | 0.884 | 63.64 |
| DE213310 | jja | son | NaN | NaN | NaN | NaN | 0.00 | 0.00 | NaN | 0.637 | 0.512 | 542.11 |
1555 rows × 12 columns
[81]:
# Per-column NaN counts, limited to the columns that contain at least one NaN
df.loc[:, df.isna().any()].isna().sum()
[81]:
high_prec_timing 8
low_prec_timing 3
dams_names 1240
dams_river_names 1240
dams_year_first 1251
dams_year_last 1251
dams_total_lake_area 41
dams_total_lake_volume 2
dams_purposes 1241
NSE_lstm 43
NSE_conceptual 157
gauge_elev_metadata 385
dtype: int64
[82]:
# Names of the dynamic features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm_max, pcp_mm_mean, pcp_mm_median, pcp_mm_min,
pcp_mm_std, q_cms_obs, q_mmd_obs, rh_%, rh_%_max, rh_%_med, rh_%_min, rh_%_std, solrad_wm2_max,
solrad_wm2_mean, solrad_wm2_med, solrad_wm2_min, solrad_wm2_std, water_level
CAMELS_FR
[83]:
# Open CAMELS_FR through the unified RainfallRunoff interface.
# NOTE(review): the other CAMELS_* cells in this notebook pass
# os.path.join(DATA_PATH, 'CAMELS'); this one passes DATA_PATH itself --
# confirm that is where the CAMELS_FR files are expected.
dataset = RainfallRunoff(
    'CAMELS_FR',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CAMELS_FR with 654 stations, 22 dynamic and 344 static features
[84]:
# Names of the static features, ordered for display.
# sorted() builds a new list, so the list object returned by the dataset
# attribute is not reordered in place and the cell can be re-run safely.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
clc_1990_lvl1_1, clc_1990_lvl1_2, clc_1990_lvl1_3, clc_1990_lvl1_4, clc_1990_lvl1_5,
clc_1990_lvl1_dom_class, clc_1990_lvl1_na, clc_1990_lvl2_11, clc_1990_lvl2_12, clc_1990_lvl2_13,
clc_1990_lvl2_14, clc_1990_lvl2_21, clc_1990_lvl2_22, clc_1990_lvl2_23, clc_1990_lvl2_24,
clc_1990_lvl2_31, clc_1990_lvl2_32, clc_1990_lvl2_33, clc_1990_lvl2_41, clc_1990_lvl2_42,
clc_1990_lvl2_51, clc_1990_lvl2_52, clc_1990_lvl2_dom_class, clc_1990_lvl2_na, clc_1990_lvl3_111,
clc_1990_lvl3_112, clc_1990_lvl3_121, clc_1990_lvl3_122, clc_1990_lvl3_123, clc_1990_lvl3_124,
clc_1990_lvl3_131, clc_1990_lvl3_132, clc_1990_lvl3_133, clc_1990_lvl3_141, clc_1990_lvl3_142,
clc_1990_lvl3_211, clc_1990_lvl3_212, clc_1990_lvl3_213, clc_1990_lvl3_221, clc_1990_lvl3_222,
clc_1990_lvl3_223, clc_1990_lvl3_231, clc_1990_lvl3_241, clc_1990_lvl3_242, clc_1990_lvl3_243,
clc_1990_lvl3_244, clc_1990_lvl3_311, clc_1990_lvl3_312, clc_1990_lvl3_313, clc_1990_lvl3_321,
clc_1990_lvl3_322, clc_1990_lvl3_323, clc_1990_lvl3_324, clc_1990_lvl3_331, clc_1990_lvl3_332,
clc_1990_lvl3_333, clc_1990_lvl3_334, clc_1990_lvl3_335, clc_1990_lvl3_411, clc_1990_lvl3_412,
clc_1990_lvl3_421, clc_1990_lvl3_422, clc_1990_lvl3_423, clc_1990_lvl3_511, clc_1990_lvl3_512,
clc_1990_lvl3_521, clc_1990_lvl3_522, clc_1990_lvl3_523, clc_1990_lvl3_dom_class, clc_1990_lvl3_na,
clc_2018_lvl1_1, clc_2018_lvl1_2, clc_2018_lvl1_3, clc_2018_lvl1_4, clc_2018_lvl1_5,
clc_2018_lvl1_dom_class, clc_2018_lvl1_na, clc_2018_lvl2_11, clc_2018_lvl2_12, clc_2018_lvl2_13,
clc_2018_lvl2_14, clc_2018_lvl2_21, clc_2018_lvl2_22, clc_2018_lvl2_23, clc_2018_lvl2_24,
clc_2018_lvl2_31, clc_2018_lvl2_32, clc_2018_lvl2_33, clc_2018_lvl2_41, clc_2018_lvl2_42,
clc_2018_lvl2_51, clc_2018_lvl2_52, clc_2018_lvl2_dom_class, clc_2018_lvl2_na, clc_2018_lvl3_111,
clc_2018_lvl3_112, clc_2018_lvl3_121, clc_2018_lvl3_122, clc_2018_lvl3_123, clc_2018_lvl3_124,
clc_2018_lvl3_131, clc_2018_lvl3_132, clc_2018_lvl3_133, clc_2018_lvl3_141, clc_2018_lvl3_142,
clc_2018_lvl3_211, clc_2018_lvl3_212, clc_2018_lvl3_213, clc_2018_lvl3_221, clc_2018_lvl3_222,
clc_2018_lvl3_223, clc_2018_lvl3_231, clc_2018_lvl3_241, clc_2018_lvl3_242, clc_2018_lvl3_243,
clc_2018_lvl3_244, clc_2018_lvl3_311, clc_2018_lvl3_312, clc_2018_lvl3_313, clc_2018_lvl3_321,
clc_2018_lvl3_322, clc_2018_lvl3_323, clc_2018_lvl3_324, clc_2018_lvl3_331, clc_2018_lvl3_332,
clc_2018_lvl3_333, clc_2018_lvl3_334, clc_2018_lvl3_335, clc_2018_lvl3_411, clc_2018_lvl3_412,
clc_2018_lvl3_421, clc_2018_lvl3_422, clc_2018_lvl3_423, clc_2018_lvl3_511, clc_2018_lvl3_512,
clc_2018_lvl3_521, clc_2018_lvl3_522, clc_2018_lvl3_523, clc_2018_lvl3_dom_class, clc_2018_lvl3_na,
cli_aridity_ou, cli_aridity_pe, cli_aridity_pm, cli_assync_ou, cli_assync_pe, cli_assync_pm,
cli_pet_ou_mean, cli_pet_ou_yr, cli_pet_pe_mean, cli_pet_pe_yr, cli_pet_pm_mean, cli_pet_pm_yr,
cli_prec_date_max, cli_prec_dur_high, cli_prec_dur_low, cli_prec_freq_high, cli_prec_freq_low,
cli_prec_intensity, cli_prec_max, cli_prec_mean, cli_prec_mean_yr, cli_prec_season_pet_ou,
cli_prec_season_pet_pe, cli_prec_season_pet_pm, cli_prec_season_temp, cli_prec_timing_high,
cli_prec_timing_low, cli_psol_frac_berghuijs, cli_psol_frac_safran, cli_temp_mean, dam_influence,
dam_n, dam_volume, geo_dom_class, geo_ev, geo_ig, geo_mt, geo_nd, geo_pa, geo_pb, geo_pi, geo_py,
geo_sc, geo_sm, geo_ss, geo_su, geo_va, geo_vb, geo_vi, geo_wb, hgl_krs_karstic,
hgl_krs_not_karstic, hgl_krs_unknown, hgl_permeability, hgl_porosity, hgl_thm_alluvial,
hgl_thm_bedrock, hgl_thm_intense_folded, hgl_thm_sedimentary, hgl_thm_unknown, hgl_thm_volcanism,
hyc_jay_pet_ou, hyc_jay_pet_pe, hyc_jay_pet_pm, hyc_jay_prec_mean, hyc_jay_ratio_prec_pet_ou,
hyc_jay_ratio_prec_pet_pe, hyc_jay_ratio_prec_pet_pm, hyc_jay_ratio_q_prec, hyd_bfi_ladson,
hyd_bfi_lfstat, hyd_bfi_pelletier_pet_ou, hyd_hfd_mean, hyd_q_date_max, hyd_q_date_qmna,
hyd_q_dur_high, hyd_q_dur_low, hyd_q_freq_high, hyd_q_freq_low, hyd_q_freq_zero, hyd_q_max,
hyd_q_mean, hyd_q_mean_yr, hyd_q_qmna_min, hyd_slope_fdc, hyd_stream_elas, hym_q_anomaly_inrae,
hym_q_date_end, hym_q_date_start, hym_q_low_uncertainty_inrae, hym_q_n_year, hym_q_na_period,
hym_q_na_total, hym_q_questionable, hym_q_unqualified, sit_altitude, sit_altitude_datum,
sit_area_hydro, sit_area_topo, sit_city, sit_code_h3, sit_comment, sit_comment_impact_gene, sit_crs,
sit_date_start, sit_date_update, sit_entity, sit_flood_duration, sit_impact, sit_kp_down, sit_kp_up,
sit_label, sit_label_add, sit_label_usual, sit_latitude, sit_longitude, sit_mnemonic,
sit_month1_low_water, sit_month1_year, sit_publication_rights, sit_section, sit_section_vigilance,
sit_status, sit_test_site, sit_type, sit_type_add, sit_tz, sit_waterbody, sit_watercourse_acc,
sit_zone_hydro, sta_altitude_snap, sta_altitude_staff_gauge, sta_area_snap, sta_city,
sta_code_child, sta_code_h2, sta_code_parent, sta_comment, sta_comment_impact_local, sta_crs,
sta_date_altitude_ref, sta_date_end, sta_date_start, sta_date_update, sta_display_level,
sta_dual_staff_gauge, sta_epsg, sta_impact_local, sta_kp, sta_label, sta_label_add,
sta_main_prod_code, sta_main_prod_name, sta_main_prod_name_short, sta_monitor,
sta_publication_right, sta_purpose, sta_qual_highflow, sta_qual_lowflow, sta_qual_meanflow,
sta_territory, sta_test_station, sta_time_data_gap, sta_time_discontinuity, sta_type, sta_x_l2e,
sta_x_l2e_snap, sta_x_l93, sta_x_l93_snap, sta_x_w84, sta_x_w84_snap, sta_y_l2e, sta_y_l2e_snap,
sta_y_l93, sta_y_l93_snap, sta_y_w84, sta_y_w84_snap, top_altitude_mean, top_dist_outlet_mean,
top_drainage_density, top_itopo_mean, top_mor_circ_ratio, top_mor_compact_coef,
top_mor_elong_ratio_catchment, top_mor_elong_ratio_circ, top_mor_form_factor_horton,
top_mor_form_factor_square, top_mor_relief_ratio, top_mor_shape_factor, top_slo_flat,
top_slo_gentle, top_slo_mean, top_slo_moderate, top_slo_ori_e, top_slo_ori_n, top_slo_ori_ne,
top_slo_ori_nw, top_slo_ori_s, top_slo_ori_se, top_slo_ori_sw, top_slo_ori_w, top_slo_steep,
top_slo_strong, top_slo_very_steep
[85]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(654, 344)
[86]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
12253
[86]:
clc_1990_lvl1_1 0
clc_1990_lvl1_2 0
clc_1990_lvl1_3 0
clc_1990_lvl1_4 0
clc_1990_lvl1_5 0
..
top_slo_ori_sw 0
top_slo_ori_w 0
top_slo_steep 0
top_slo_strong 0
top_slo_very_steep 0
Length: 344, dtype: int64
Find the columns that have at least one NaN value.
[87]:
# Boolean mask over the columns: True where at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[87]:
| clc_1990_lvl1_dom_class | clc_1990_lvl2_dom_class | clc_1990_lvl3_dom_class | clc_2018_lvl1_dom_class | clc_2018_lvl2_dom_class | clc_2018_lvl3_dom_class | cli_prec_timing_high | cli_prec_timing_low | hyd_bfi_ladson | hyd_bfi_lfstat | ... | sta_code_h2 | sta_code_parent | sta_comment | sta_comment_impact_local | sta_date_altitude_ref | sta_date_end | sta_display_level | sta_kp | sta_label_add | sta_purpose | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| A105003001 | 2.0 | 31.0 | 211.0 | 2.0 | 31.0 | 211.0 | jja | son | 0.56723 | 0.51341 | ... | A1050310 | NaN | Mise à l'heure TU le 05/11/2009. - Remplacemen... | NaN | 2022-02-24 08:21:00 | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| A107020001 | 2.0 | 21.0 | 211.0 | 2.0 | 21.0 | 211.0 | son | son | 0.56320 | 0.53006 | ... | A1072010 | NaN | Nivellement de juillet 2002, géomètre Faber-Sc... | NaN | 2020-12-14 11:19:00 | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| A112020001 | 2.0 | 31.0 | 211.0 | 2.0 | 31.0 | 211.0 | jja | son | 0.44951 | 0.37740 | ... | A1122010 | NaN | Arrêt des observations le 10/01/2008. - Nivell... | NaN | NaN | 2008-01-10 11:20:00 | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| A116003002 | 2.0 | 21.0 | 211.0 | 2.0 | 21.0 | 211.0 | jja | son | 0.53010 | 0.45926 | ... | A1080320 | NaN | Echelle et pont arrachés en mai 1983. Seuil re... | NaN | 2018-12-05 07:24:00 | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| A140202001 | 3.0 | 31.0 | 311.0 | 3.0 | 31.0 | 311.0 | djf | son | 0.50286 | 0.44606 | ... | A1402020 | NaN | Passage à l'heure TU le 29/10/2009. - Nivellé ... | NaN | 2020-12-14 11:20:00 | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Y781000101 | 3.0 | 32.0 | 333.0 | 3.0 | 32.0 | 333.0 | son | jja | 0.37525 | 0.29060 | ... | Y7804010 | NaN | Station du réseau de base sur seuil naturel, é... | Pompages Manso et Galeria | 2021-04-15 08:18:00 | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| Y862000101 | 3.0 | 31.0 | 311.0 | 3.0 | 31.0 | 311.0 | djf | jja | NaN | NaN | ... | Y8624010 | NaN | Courbes de tarage à partir du 31/12/1979 revue... | NaN | NaN | NaN | NaN | NaN | NaN | Low flow monitoring - Flood forecasting |
| Y881000102 | 3.0 | 31.0 | 311.0 | 3.0 | 31.0 | 311.0 | djf | jja | 0.55820 | 0.54672 | ... | Y8814020 | NaN | NaN | NaN | NaN | 2012-04-30 12:00:00 | NaN | NaN | Zoza ancien | Flood forecasting - Streamflow monitoring |
| Y902000101 | 3.0 | 31.0 | 312.0 | 3.0 | 31.0 | 312.0 | son | jja | 0.51000 | 0.46926 | ... | Y9025010 | NaN | NaN | Influence forte des barrages de baigneurs en é... | NaN | NaN | NaN | NaN | Pont de Noceta | Low flow monitoring - Flood forecasting - Stre... |
| Y960000102 | 3.0 | 32.0 | 323.0 | 3.0 | 31.0 | 313.0 | djf | jja | 0.34639 | 0.28346 | ... | Y9605230 | NaN | STATION EN REMPLACEMENT DE CELLE DE TAFONATO Y... | Pompages amont ? | 2017-09-13 09:38:00 | NaN | NaN | NaN | Canniciu | Streamflow monitoring |
654 rows × 57 columns
[88]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[88]:
clc_1990_lvl1_dom_class 4
clc_1990_lvl2_dom_class 5
clc_1990_lvl3_dom_class 5
clc_2018_lvl1_dom_class 5
clc_2018_lvl2_dom_class 6
clc_2018_lvl3_dom_class 7
cli_prec_timing_high 15
cli_prec_timing_low 2
hyd_bfi_ladson 42
hyd_bfi_lfstat 42
hyd_bfi_pelletier_pet_ou 42
sit_altitude 10
sit_altitude_datum 10
sit_area_hydro 641
sit_area_topo 7
sit_city 2
sit_comment 515
sit_comment_impact_gene 630
sit_crs 2
sit_date_start 654
sit_date_update 2
sit_entity 2
sit_flood_duration 654
sit_impact 6
sit_kp_down 590
sit_kp_up 654
sit_label 2
sit_label_add 464
sit_label_usual 326
sit_latitude 2
sit_longitude 2
sit_mnemonic 619
sit_month1_low_water 2
sit_month1_year 2
sit_publication_rights 2
sit_section 2
sit_section_vigilance 127
sit_status 2
sit_test_site 2
sit_type 2
sit_type_add 2
sit_tz 2
sit_waterbody 654
sit_watercourse_acc 632
sit_zone_hydro 2
sta_altitude_staff_gauge 120
sta_code_child 654
sta_code_h2 13
sta_code_parent 654
sta_comment 305
sta_comment_impact_local 624
sta_date_altitude_ref 120
sta_date_end 580
sta_display_level 654
sta_kp 583
sta_label_add 527
sta_purpose 17
dtype: int64
[89]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
tsd_humid, tsd_pet_ou, tsd_pet_pe, tsd_pet_pm, tsd_prec, tsd_prec_solid_frac, tsd_q_l, tsd_q_mm,
tsd_rad_dli, tsd_rad_ssi, tsd_swe_isba, tsd_swi_gr, tsd_swi_isba, tsd_temp, tsd_temp_max,
tsd_temp_min, tsd_val_c, tsd_val_i, tsd_val_m, tsd_val_q, tsd_val_s, tsd_wind
CAMELS_SE
[90]:
# Open CAMELS_SE through the unified RainfallRunoff interface; its files are
# looked up in the 'CAMELS' sub-folder of DATA_PATH.
camels_path = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_SE', path=camels_path, verbosity=0)
# Printing the dataset shows its one-line summary.
print(dataset)
CAMELS_SE with 50 stations, 4 dynamic and 76 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:2541: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(
[91]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
Agriculture_percentage, Area_km2, Bedrock_percentage_sc, Clayey_till_and_clay_till_percentage_sc,
DOR, Elevation_mabsl, Forest_percentage, Glacier_percentage_sc, Glaciers_percentage,
Glaciofluvial_sediment_percentage_sc, Latitude_WGS84, Longitude_WGS84, Name, Open_land_percentage,
Peat_percentage_sc, Pmean_mm_year, Postglacial_sand_and_gravel_percentage_sc, RegVol_m3,
S01_Qmean_CNP_61_90, S01_Qmean_CNP_91_20, S01_Qmean_hs, S02_Qcoeff_CNP_61_90, S02_Qcoeff_CNP_91_20,
S02_Qcoeff_hs, S03_COM_CNP_61_90, S03_COM_CNP_91_20, S03_COM_hs, S04_SPD_CNP_61_90,
S04_SPD_CNP_91_20, S04_SPD_hs, S05_Qmean_spring_CNP_61_90, S05_Qmean_spring_CNP_91_20,
S05_Qmean_spring_hs, S06_Qmean_summer_CNP_61_90, S06_Qmean_summer_CNP_91_20, S06_Qmean_summer_hs,
S07_Qmean_autumn_CNP_61_90, S07_Qmean_autumn_CNP_91_20, S07_Qmean_autumn_hs,
S08_Qmean_winter_CNP_61_90, S08_Qmean_winter_CNP_91_20, S08_Qmean_winter_hs, S09_LFfreq_CNP_61_90,
S09_LFfreq_CNP_91_20, S09_LFfreq_hs, S10_T_minQ_d30_CNP_61_90, S10_T_minQ_d30_CNP_91_20,
S10_T_minQ_d30_hs, S11_minQ_d7_CNP_61_90, S11_minQ_d7_CNP_91_20, S11_minQ_d7_hs,
S12_minQ_d30_CNP_61_90, S12_minQ_d30_CNP_91_20, S12_minQ_d30_hs, S13_HFfreq_CNP_61_90,
S13_HFfreq_CNP_91_20, S13_HFfreq_hs, S14_T_maxQ_d1_CNP_61_90, S14_T_maxQ_d1_CNP_91_20,
S14_T_maxQ_d1_hs, S15_maxQ_d30_CNP_61_90, S15_maxQ_d30_CNP_91_20, S15_maxQ_d30_hs,
S16_maxQ_d1_CNP_61_90, S16_maxQ_d1_CNP_91_20, S16_maxQ_d1_hs, Shrubs_and_grassland_percentage,
Silt_percentage_sc, Slope_mean_degree, Till_and_weathered_deposit_percentage_sc, Till_percentage_sc,
Tmean_C, Urban_percentage, Water_percentage, Water_percentage_sc, Wetlands_percentage
[92]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(50, 76)
[93]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[93]:
Agriculture_percentage 0
Area_km2 0
Bedrock_percentage_sc 0
Clayey_till_and_clay_till_percentage_sc 0
DOR 0
..
Tmean_C 0
Urban_percentage 0
Water_percentage 0
Water_percentage_sc 0
Wetlands_percentage 0
Length: 76, dtype: int64
Find the columns that have at least one NaN value.
[94]:
# Show the columns that contain at least one NaN value, or say that there
# are none.
# Jupyter only auto-renders the last *top-level* expression of a cell; a bare
# expression nested inside an if-branch is evaluated and discarded, so the
# table has to be rendered explicitly with display().
if df.isna().sum().sum() > 0:
    display(df.loc[:, (df.isna().sum() > 0)])
else:
    print('No NaN values')
No NaN values
[95]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value (an empty Series when there are none).
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[95]:
Series([], dtype: float64)
[96]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
airtemp_C_mean, pcp_mm, q_cms_obs, q_mmd_obs
CAMELS_IND
[97]:
# Open CAMELS_IND through the unified RainfallRunoff interface; its files are
# looked up in the 'CAMELS' sub-folder of DATA_PATH.
camels_path = os.path.join(DATA_PATH, 'CAMELS')
dataset = RainfallRunoff('CAMELS_IND', path=camels_path, verbosity=0)
# Printing the dataset shows its one-line summary.
print(dataset)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3223: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(os.path.join(fpath),
CAMELS_IND with 472 stations, 20 dynamic and 210 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_camels.py:3234: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.
df = pd.read_csv(fpath,
[98]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
aet_gleam_mean, ai_mean, annual_max_1day, annual_max_30day, annual_max_3day, annual_max_7day,
annual_max_90day, annual_min_7day, annual_q, aridity_p_pet, aridity_pet_aet, asynchronicity,
bare_frac, bfi, built_area_frac, bulkdens_sub_major, bulkdens_sub_mean, bulkdens_top_major,
bulkdense_top_mean, carb_rocks_frac, cen_time, clay_frac_sub, clay_frac_top, crops_frac,
crops_frac_1985, crops_frac_1995, crops_frac_2005, cv_apr_flow, cv_aug_flow, cv_dec_flow,
cv_feb_flow, cv_jan_flow, cv_jul_flow, cv_jun_flow, cv_mar_flow, cv_may_flow, cv_nov_flow,
cv_oct_flow, cv_sep_flow, cwc_area, cwc_lat, cwc_lon, cwc_river, cwc_site_name, dom_land_cover,
dom_land_cover_frac, doy_max_flow, doy_max_flow_7, doy_min_flow, doy_min_flow_7, drinking_frac,
dspbar, elev_max, elev_mean, elev_median, elev_min, evap_canopy_anum, evap_canopy_max,
evap_canopy_mean, evap_canopy_min, evap_surface_anum, evap_surface_max, evap_surface_mean,
evap_surface_min, fall_days, fall_rate_mean, fall_rate_median, first_dam_year, flood_frac,
flooded_veg_frac, flow_availability, freq_q_high, freq_q_low, gauge_elevation, geol_class_1st,
geol_class_1st_frac, geol_class_2nd, geol_class_2nd_frac, geol_permeability, geol_porosity,
ghi_area, ghi_group, ghi_lat, ghi_lon, ghi_stn_id, gini_flow, gravel_frac_sub, gravel_frac_top,
high_prec_dur, high_prec_freq, high_prec_timing, hsg_major, hydroelec_frac, irrigation_frac,
lai_diff, lai_max, lai_mean, lai_min, last_dam_year, low_prec_dur, low_prec_freq, low_prec_timing,
max_high_prec_dur, max_low_prec_dur, mean_anum_flow, mean_apr_flow, mean_atmn_flow, mean_aug_flow,
mean_dec_flow, mean_feb_flow, mean_jan_flow, mean_jul_flow, mean_jun_flow, mean_mar_flow,
mean_may_flow, mean_nov_flow, mean_oct_flow, mean_sep_flow, mean_sumr_flow, mean_swmn_flow,
mean_wint_flow, month_1day_max, month_1day_min, n_dams, navigation_frac, num_dams, num_hyd_alt,
org_carb_sub_major, org_carb_sub_mean, org_carb_top_major, org_carb_top_mean, organic_frac_sub,
organic_frac_top, overflow_frac, p_annual_variability, p_max, p_mean, p_mean_anum,
p_monthly_variability, p_unif, pet_gleam_mean, pet_max, pet_mean, pet_mean_anum, pet_min,
pop_density_2000, pop_density_2005, pop_density_2010, pop_density_2015, pop_density_2020, q_10,
q_25, q_25_swmn, q_50, q_50_swmn, q_5_swmn, q_75, q_75_swmn, q_90, q_95_swmn, q_cv, q_high_days,
q_low_days, q_mean, q_mean_swmn, q_zero, range_frac, rel_hum_mean, res_store_sum, reservoir_index,
rise_days, rise_rate_mean, rise_rate_median, river_basin, runoff_ratio, sand_frac_sub,
sand_frac_top, silt_frac_sub, silt_frac_top, slope_fdc, slope_max, slope_mean, slope_median,
slope_min, sm_lvl1_mean, sm_lvl2_mean, sm_lvl3_mean, sm_lvl4_mean, soil_awc_sub, soil_awc_top,
soil_awsc_major, soil_awsc_max, soil_awsc_min, soil_conductivity_sub, soil_conductivity_top,
soil_depth, srad_lw_mean, srad_sw_mean, streamflow_elas, tailing_frac, tmax_mean, tmin_mean,
total_storage, trees_frac, urban_frac_1985, urban_frac_1995, urban_frac_2005, water_frac, wind_mean,
wtd
[99]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(472, 210)
[100]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
20322
[100]:
aet_gleam_mean 0
ai_mean 0
annual_max_1day 300
annual_max_30day 300
annual_max_3day 300
...
urban_frac_1995 0
urban_frac_2005 0
water_frac 0
wind_mean 0
wtd 0
Length: 210, dtype: int64
Find the columns that have at least one NaN value.
[101]:
# Boolean mask over the columns: True where at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[101]:
| annual_max_1day | annual_max_30day | annual_max_3day | annual_max_7day | annual_max_90day | annual_min_7day | annual_q | bfi | bulkdens_sub_major | bulkdens_sub_mean | ... | q_mean_swmn | q_zero | reservoir_index | rise_days | rise_rate_mean | rise_rate_median | runoff_ratio | slope_fdc | streamflow_elas | tailing_frac | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gauge_id | |||||||||||||||||||||
| 3001 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.33 | 1.291356 | ... | 0.568 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 3002 | 756.807 | 136.155 | 485.599 | 284.301 | 90.494 | 0.000 | 828.584 | 0.372 | 1.45 | 1.450000 | ... | 4.587 | 86.667 | 0.000688 | 62.00 | 38.509 | 3.07 | 0.472 | NaN | 3.744 | 0.000000 |
| 3003 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.21 | 1.210000 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 3004 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.21 | 1.211649 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 3005 | 14640.524 | 3780.572 | 11975.203 | 8184.747 | 2305.069 | 5.293 | 21163.952 | 0.385 | 1.21 | 1.218816 | ... | 3.028 | 0.000 | 0.507788 | 131.75 | 324.324 | 5.46 | 0.331 | 2.859 | 1.925 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17021 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.291081 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 17022 | 370.817 | 98.813 | 269.619 | 191.648 | 60.805 | 0.060 | 322.091 | 0.254 | NaN | 1.318424 | ... | 0.005 | 148.950 | 1.030649 | 78.20 | 12.834 | 0.43 | 0.034 | NaN | 2.049 | 0.117647 |
| 17023 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.211304 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 17024 | 609.535 | 154.046 | 460.329 | 312.861 | 81.274 | 0.000 | 356.873 | 0.169 | NaN | 1.320490 | ... | 0.001 | 306.800 | 0.942509 | 21.65 | 55.641 | 5.30 | 0.031 | NaN | 2.977 | 0.117647 |
| 17025 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.330000 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
472 rows × 86 columns
[102]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[102]:
annual_max_1day 300
annual_max_30day 300
annual_max_3day 300
annual_max_7day 300
annual_max_90day 300
...
rise_rate_median 299
runoff_ratio 244
slope_fdc 331
streamflow_elas 271
tailing_frac 66
Length: 86, dtype: int64
[103]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
aet_mm_gleam, airtemp_C_max, airtemp_C_mean, airtemp_C_min, evap_canopy(kg/m2/s),
evap_surface(kg/m2/s), lwdownrad_wm2, pcp_mm, pet_mm, pet_mm_gleam, q_cms_obs, rh_%, sm_lvl1(kg/m2),
sm_lvl2(kg/m2), sm_lvl3(kg/m2), sm_lvl4(kg/m2), solrad_wm2, windspeed_mps, windspeedu_mps,
windspeedv_mps
Caravan_DK
[104]:
# Open Caravan_DK through the unified RainfallRunoff interface, reading its
# files from DATA_PATH.
dataset = RainfallRunoff(
    'Caravan_DK',
    path=DATA_PATH,
    verbosity=0,
)
# Printing the dataset shows its one-line summary.
print(dataset)
Caravan_DK with 308 stations, 39 dynamic and 211 static features
[105]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj,
cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08,
cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn,
dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj,
fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav, gdp_ud_ssu,
gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06,
glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14,
glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22,
gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt,
inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu, low_prec_dur,
low_prec_freq, moisture_index, nli_ix_sav, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[106]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(308, 211)
[107]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
0
[107]:
aet_mm_s01 0
aet_mm_s02 0
aet_mm_s03 0
aet_mm_s04 0
aet_mm_s05 0
..
wet_pc_s07 0
wet_pc_s08 0
wet_pc_s09 0
wet_pc_sg1 0
wet_pc_sg2 0
Length: 211, dtype: int64
Find the columns that have at least one NaN value.
[108]:
# Boolean mask over the columns: True where at least one value is missing.
# When no column qualifies, the result keeps the row index but has no columns.
has_nan = df.isna().any()
df.loc[:, has_nan]
[108]:
| 240001 |
|---|
| 590006 |
| 340003 |
| 450043 |
| 100009 |
| ... |
| 610013 |
| 180078 |
| 150046 |
| 490082 |
| 20006 |
308 rows × 0 columns
[109]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value (an empty Series when there are none).
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[109]:
Series([], dtype: float64)
[110]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
dewpoint_temperature_2m_max, dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min,
potential_evaporation_sum, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, streamflow, surface_net_solar_radiation_max,
surface_net_solar_radiation_mean, surface_net_solar_radiation_min,
surface_net_thermal_radiation_max, surface_net_thermal_radiation_mean,
surface_net_thermal_radiation_min, surface_pressure_max, surface_pressure_mean,
surface_pressure_min, temperature_2m_max, temperature_2m_mean, temperature_2m_min,
total_precipitation_sum, u_component_of_wind_10m_max, u_component_of_wind_10m_mean,
u_component_of_wind_10m_min, v_component_of_wind_10m_max, v_component_of_wind_10m_mean,
v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max, volumetric_soil_water_layer_1_mean,
volumetric_soil_water_layer_1_min, volumetric_soil_water_layer_2_max,
volumetric_soil_water_layer_2_mean, volumetric_soil_water_layer_2_min,
volumetric_soil_water_layer_3_max, volumetric_soil_water_layer_3_mean,
volumetric_soil_water_layer_3_min, volumetric_soil_water_layer_4_max,
volumetric_soil_water_layer_4_mean, volumetric_soil_water_layer_4_min
LamaHCE
[111]:
# Open LamaHCE through the unified RainfallRunoff interface with timestep 'D'
# (presumably daily -- confirm against the package docs) and the
# 'total_upstrm' data type, reading its files from DATA_PATH.
dataset = RainfallRunoff(
    'LamaHCE',
    timestep='D',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
# Printing the dataset shows its one-line summary.
print(dataset)
LamaHCE with 859 stations, 22 dynamic and 80 static features
[112]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
agr_fra, area_calc, area_gov, area_ratio, aridity, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps, gc_dom, gc_ig_fra, gc_mt_fra, gc_pa_fra, gc_pb_fra,
gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra, gc_vb_fra, gc_wb_fra,
geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max, hi_prec_du, hi_prec_fr,
hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du, lo_prec_fr, lo_prec_ti, lon,
mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean,
p_season, region, river, root_dep, sand_fra, silt_fra, slope_mean, soil_condu, soil_poros,
soil_tawc, strm_dens, typimpact, urban_fra
[113]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(859, 80)
[114]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
46
[114]:
agr_fra 0
area_calc 0
area_gov 0
area_ratio 0
aridity 0
..
soil_poros 0
soil_tawc 0
strm_dens 0
typimpact 0
urban_fra 0
Length: 80, dtype: int64
Find the columns that have at least one NaN value.
[115]:
# Boolean mask over the columns: True where at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[115]:
| geol_perme | hi_prec_ti | lo_prec_ti | |
|---|---|---|---|
| ID | |||
| 826 | -12.4 | NaN | son |
| 819 | -11.5 | son | djf |
| 79 | -13.3 | jja | djf |
| 696 | -12.2 | jja | djf |
| 98 | -12.0 | jja | djf |
| ... | ... | ... | ... |
| 261 | -12.1 | jja | djf |
| 587 | -12.9 | jja | djf |
| 827 | -12.6 | jja | son |
| 250 | -13.4 | jja | djf |
| 72 | -12.4 | jja | djf |
859 rows × 3 columns
[116]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[116]:
geol_perme 1
hi_prec_ti 42
lo_prec_ti 3
dtype: int64
[117]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
airpres_hpa, airtemp_C_max, airtemp_C_mean, airtemp_C_min, dptemp_C_max_2m, dptemp_C_mean_2m,
dptemp_C_min_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm, q_cms_obs, solrad_wm2, solrad_wm2_max,
swe_mm, thermrad_wm2, thermrad_wm2_max, total_et, volsw_123, volsw_4, windspeedu_mps, windspeedv_mps
[118]:
# Open LamaHCE again, this time with timestep 'H' (presumably hourly --
# confirm against the package docs) and the 'total_upstrm' data type,
# reading its files from DATA_PATH.
dataset = RainfallRunoff(
    'LamaHCE',
    timestep='H',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
# Printing the dataset shows its one-line summary.
print(dataset)
LamaHCE with 859 stations, 16 dynamic and 84 static features
[119]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
agr_fra, area_calc, area_gov, area_ratio, arid_1, arid_2, bare_fra, bedrk_dep, clay_fra, country,
degimpact, diur_art, diur_glac, elev, elev_mean, elev_med, elev_ran, elev_std, elon_ratio, et0_mean,
eta_mean, fedstate, forest_fra, frac_snow, gaps_post, gaps_pre, gc_dom, gc_ig_fra, gc_mt_fra,
gc_pa_fra, gc_pb_fra, gc_pi_fra, gc_py_fra, gc_sc_fra, gc_sm_fra, gc_ss_fra, gc_su_fra, gc_va_fra,
gc_vb_fra, gc_wb_fra, geol_perme, geol_poros, glac_fra, govnr, grav_fra, gvf_diff, gvf_max,
hi_prec_du, hi_prec_fr, hi_prec_ti, lai_diff, lai_max, lake_fra, lat, lc_dom, lo_prec_du,
lo_prec_fr, lo_prec_ti, lon, mvert_ang, mvert_dist, name, ndvi_max, ndvi_min, nrs_euhyd, nrs_rivat,
obsbeg_day, obsbeg_hr, obsend, oc_fra, p_mean, p_season, region, river, root_dep, sand_fra,
silt_fra, slope_mean, soil_condu, soil_poros, soil_tawc, strm_dens, typimpact, urban_fra
[120]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(859, 84)
[121]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
65
[121]:
agr_fra 0
area_calc 0
area_gov 0
area_ratio 0
arid_1 0
..
soil_poros 0
soil_tawc 0
strm_dens 0
typimpact 0
urban_fra 0
Length: 84, dtype: int64
Find the columns that have at least one NaN value.
[122]:
# Boolean mask over the columns: True where at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[122]:
| geol_perme | hi_prec_ti | lo_prec_ti | nrs_rivat | |
|---|---|---|---|---|
| ID | ||||
| 826 | -12.4 | NaN | son | 20376803.0 |
| 819 | -11.5 | son | djf | 20464042.0 |
| 79 | -13.3 | jja | djf | 20454049.0 |
| 696 | -12.2 | jja | djf | 20424102.0 |
| 98 | -12.0 | jja | djf | 20440228.0 |
| ... | ... | ... | ... | ... |
| 261 | -12.1 | jja | djf | 20428827.0 |
| 587 | -12.9 | jja | djf | 20461304.0 |
| 827 | -12.6 | jja | son | 20379436.0 |
| 250 | -13.4 | jja | djf | 20441631.0 |
| 72 | -12.4 | jja | djf | 20451775.0 |
859 rows × 4 columns
[123]:
# Per-column count of missing values, restricted to the columns that
# contain at least one missing value.
has_nan = df.isna().any()
df.loc[:, has_nan].isna().sum()
[123]:
geol_perme 1
hi_prec_ti 42
lo_prec_ti 3
nrs_rivat 19
dtype: int64
[124]:
# List the names of the dynamic features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
joined_names = ", ".join(dynamic_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
airpres_hpa, airtemp_C_mean, dptemp_C_mean_2m, fcst_alb, lai_high_veg, lai_low_veg, pcp_mm,
q_cms_obs, solrad_wm2, swe_mm, thermrad_wm2, total_et, volsw_123, volsw_4, windspeedu_mps,
windspeedv_mps
LamaHIce
[125]:
# Open LamaHIce through the unified RainfallRunoff interface with timestep 'D'
# and the 'total_upstrm' data type; its files are looked up in the
# 'LamaHIce_daily' sub-folder of DATA_PATH.
lamahice_path = os.path.join(DATA_PATH, 'LamaHIce_daily')
dataset = RainfallRunoff(
    'LamaHIce',
    timestep='D',
    data_type='total_upstrm',
    path=lamahice_path,
    verbosity=0,
)
# Printing the dataset shows its one-line summary.
print(dataset)
LamaHIce with 111 stations, 36 dynamic and 154 static features
[126]:
# List the names of the static features of the current dataset, sorted in
# place and printed as comma-separated text wrapped at 100 characters.
static_features = dataset.static_features
static_features.sort()
joined_names = ", ".join(static_features)
wrapped_names = textwrap.fill(joined_names, width=100)
print(wrapped_names)
ET_ERA5L_all_basin, ET_ERA5L_unfiltered_basin, ET_rav_all_basin, ET_rav_unfiltered_basin,
PET_ERA5L_all_basin, PET_ERA5L_unfiltered_basin, PET_rav_all_basin, PET_rav_unfiltered_basin,
P_ERA5L_all_basin, P_ERA5L_unfiltered_basin, P_rav_all_basin, P_rav_unfiltered_basin, Q5_basin,
Q5_gauge, Q95_basin, Q95_gauge, Q_all_basin, Q_unfiltered_basin, VHM_no_gauge, V_no_gauge,
agr_fra_basin, area_calc_basin, aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin,
baseflow_index_ladson_basin, baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin,
degimpact_basin, degimpact_gauge, elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin,
elevation_gauge, elon_ratio_basin, forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin,
g621_fra_basin, g701_fra_basin, g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin,
g_dom_NI_basin, g_frac_basin, g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin,
g_min_el_basin, g_slope_basin, g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin,
gbnew_fra_basin, gbold_fra_basin, gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin,
gc_23_vapy_basin, gc_23_vb_basin, gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin,
gc_pb_fra_basin, gc_va_fra_basin, gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin,
ghraun_fra_basin, glac_fra_basin, gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin,
gsn_fra_basin, gsnew_fra_basin, gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin,
hfd_mean_gauge, high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin,
high_prec_fr_basin, high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin,
high_q_dur_gauge, high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin,
lake_fra_basin, lat_gauge, lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, lon_gauge,
low_prec_du_ERA5L_basin, low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin,
low_q_dur_basin, low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin,
mvert_dist_basin, name_gauge, ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge,
obsend_day_gauge, obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin,
p_season_ERA5L_basin, p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge,
ref_et_mean_basin, river_gauge, root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge,
sand_fra_basin, scrub_fra_basin, silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mean_basin,
soil_poros_basin, soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin,
typimpact_basin, typimpact_gauge, urban_fra_basin, water_year_all_basin,
water_year_unfiltered_basin, wetl_fra_basin, zero_q_freq_gauge
[127]:
# Fetch the static features of the current dataset and print the
# (rows, columns) shape of the returned table.
df = dataset.fetch_static_features()
print(df.shape)
(111, 154)
[128]:
# Count the missing values of every column once; print the grand total and
# leave the per-column counts as the cell output.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
2013
[128]:
ET_ERA5L_all_basin 37
ET_ERA5L_unfiltered_basin 14
ET_rav_all_basin 37
ET_rav_unfiltered_basin 14
PET_ERA5L_all_basin 37
..
urban_fra_basin 0
water_year_all_basin 37
water_year_unfiltered_basin 14
wetl_fra_basin 0
zero_q_freq_gauge 37
Length: 154, dtype: int64
Find the columns that have at least one NaN value.
[129]:
# Boolean mask over the columns: True where at least one value is missing.
has_nan = df.isna().any()
df.loc[:, has_nan]
[129]:
| ET_ERA5L_all_basin | ET_ERA5L_unfiltered_basin | ET_rav_all_basin | ET_rav_unfiltered_basin | PET_ERA5L_all_basin | PET_ERA5L_unfiltered_basin | PET_rav_all_basin | PET_rav_unfiltered_basin | P_ERA5L_all_basin | P_ERA5L_unfiltered_basin | ... | q_mean_gauge | runoff_ratio_basin | runoff_ratio_gauge | slope_fdc_basin | slope_fdc_gauge | stream_elas_basin | stream_elas_gauge | water_year_all_basin | water_year_unfiltered_basin | zero_q_freq_gauge | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 79 | 0.640192 | 0.636492 | 0.634929 | 0.633861 | 1.295533 | 1.293016 | 0.547516 | 0.557275 | 4.648427 | 4.631319 | ... | 9.389204 | 1.786 | 1.785626 | 0.404 | 0.404174 | 0.507 | 0.506978 | 2000.272524 | 1999.000666 | 0.0 |
| 98 | 0.603482 | 0.593119 | 0.599214 | 0.588846 | 2.000104 | 1.942428 | 0.559626 | 0.536703 | 4.717848 | 4.647344 | ... | 5.951890 | 1.178 | 1.178091 | 0.561 | 0.560945 | 0.666 | 0.666312 | 2010.601150 | 2011.000632 | 0.0 |
| 25 | 1.378187 | 1.396363 | 0.668298 | 0.665824 | 4.609922 | 4.601343 | 0.529151 | 0.506576 | 5.546952 | 5.682186 | ... | 10.316638 | 2.346 | 2.345917 | 1.441 | 1.440679 | 0.579 | 0.579003 | 2004.386057 | 2003.470929 | 0.0 |
| 1 | 0.710055 | 0.718156 | 0.651314 | 0.655982 | 1.472727 | 1.473907 | 0.616380 | 0.603120 | 3.769304 | 3.859496 | ... | 4.546374 | 1.115 | 1.114759 | 2.715 | 2.715291 | 1.365 | 1.364999 | 2010.248973 | 2010.499609 | 0.0 |
| 34 | NaN | 0.436237 | NaN | 0.282329 | NaN | 0.656889 | NaN | 0.243403 | NaN | 3.560044 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000666 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 94 | NaN | 0.565821 | NaN | 0.646134 | NaN | 1.009009 | NaN | 0.569345 | NaN | 2.851701 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2010.014082 | NaN |
| 54 | 0.286762 | 0.318574 | 0.428488 | 0.405101 | 0.409014 | 0.466285 | 0.490258 | 0.472720 | 4.375074 | 4.262709 | ... | 6.364638 | 1.014 | 1.014307 | 3.767 | 3.766742 | 0.929 | 0.929429 | 1997.000000 | 2000.226138 | 0.0 |
| 77 | 0.383486 | 0.368716 | 0.464558 | 0.457334 | 0.557757 | 0.534627 | 0.620087 | 0.598473 | 3.094146 | 3.139548 | ... | 2.217995 | 0.781 | 0.780813 | 1.384 | 1.383765 | 0.459 | 0.458724 | 2000.151013 | 2002.000000 | 0.0 |
| 80 | 0.649492 | 0.646736 | 0.529909 | 0.521922 | 1.126047 | 1.143551 | 0.468229 | 0.485154 | 4.652074 | 4.312049 | ... | 4.225174 | 0.871 | 0.870663 | 1.945 | 1.945021 | 0.913 | 0.913007 | 2008.890240 | 2009.224360 | 0.0 |
| 72 | NaN | 0.476955 | NaN | 0.385069 | NaN | 0.928519 | NaN | 0.511240 | NaN | 3.048227 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007.389354 | NaN |
111 rows × 53 columns
[130]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[130]:
ET_ERA5L_all_basin 37
ET_ERA5L_unfiltered_basin 14
ET_rav_all_basin 37
ET_rav_unfiltered_basin 14
PET_ERA5L_all_basin 37
PET_ERA5L_unfiltered_basin 14
PET_rav_all_basin 37
PET_rav_unfiltered_basin 14
P_ERA5L_all_basin 37
P_ERA5L_unfiltered_basin 14
P_rav_all_basin 37
P_rav_unfiltered_basin 14
Q5_basin 37
Q5_gauge 37
Q95_basin 37
Q95_gauge 37
Q_all_basin 37
Q_unfiltered_basin 14
baseflow_index_ladson_basin 37
baseflow_index_ladson_gauge 37
g_aspect_basin 47
g_lat_basin 47
g_lon_basin 47
g_max_el_basin 47
g_mean_el_basin 47
g_min_el_basin 47
g_slope_basin 47
g_slopel20_basin 47
gaps_hourly_gauge 35
hfd_mean_basin 42
hfd_mean_gauge 42
high_prec_timing_basin 4
high_q_dur_basin 67
high_q_dur_gauge 67
high_q_freq_basin 67
high_q_freq_gauge 67
low_prec_timing_ERA5L_basin 2
low_prec_timing_basin 1
low_q_dur_basin 70
low_q_dur_gauge 70
low_q_freq_basin 70
low_q_freq_gauge 70
q_mean_basin 37
q_mean_gauge 37
runoff_ratio_basin 37
runoff_ratio_gauge 37
slope_fdc_basin 37
slope_fdc_gauge 37
stream_elas_basin 37
stream_elas_gauge 37
water_year_all_basin 37
water_year_unfiltered_basin 14
zero_q_freq_gauge 37
dtype: int64
[131]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp_max, 2m_dp_temp_mean,
2m_dp_temp_min, 2m_qv_rav, 2m_temp_rav, fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, max_temp_C,
mean_temp_C, min_temp_C, obs_q_cms, pcp_mm, pet_mm, prec_carra, prec_rav, ref_et_mm,
surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad_max, surf_net_solar_rad_mean,
surf_net_therm_rad_max, surf_net_therm_rad_mean, surf_outg_therm_rad_rav, surf_press,
surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4
[132]:
# Instantiate LamaHIce from DATA_PATH with timestep='H' and data_type='total_upstrm'
# (presumably hourly data for the total upstream area -- confirm against the aqua_fetch docs),
# then print its one-line summary.
dataset = RainfallRunoff(
    'LamaHIce',
    timestep='H',
    data_type='total_upstrm',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
LamaHIce with 76 stations, 28 dynamic and 138 static features
[133]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Q5_basin, Q5_gauge, Q95_basin, Q95_gauge, VHM_no_gauge, V_no_gauge, agr_fra_basin, area_calc_basin,
aridity_ERA5L_basin, aridity_basin, asp_mean_basin, bare_fra_basin, baseflow_index_ladson_basin,
baseflow_index_ladson_gauge, bedrk_dep_basin, clay_fra_basin, degimpact_basin, degimpact_gauge,
elev_mean_basin, elev_med_basin, elev_ran_basin, elev_std_basin, elevation_gauge, elon_ratio_basin,
forest_fra_basin, frac_snow_ERA5L_basin, frac_snow_basin, g621_fra_basin, g701_fra_basin,
g743_fra_basin, g746_fra_basin, g_area_basin, g_aspect_basin, g_dom_NI_basin, g_frac_basin,
g_lat_basin, g_lon_basin, g_max_el_basin, g_mean_el_basin, g_min_el_basin, g_slope_basin,
g_slopel20_basin, gaps_hourly_gauge, gbinn_fra_basin, gbnew_fra_basin, gbold_fra_basin,
gc_23_dom_basin, gc_23_pavr_basin, gc_23_pb_basin, gc_23_vapy_basin, gc_23_vb_basin,
gc_23_vbpy_basin, gc_23_vbsr_basin, gc_dom_basin, gc_pa_fra_basin, gc_pb_fra_basin, gc_va_fra_basin,
gc_vb_fra_basin, geometry_gauge, ggnew_fra_basin, ggold_fra_basin, ghraun_fra_basin, glac_fra_basin,
gmob_fra_basin, grav_fra_basin, gsgos_fra_basin, gsinn_fra_basin, gsn_fra_basin, gsnew_fra_basin,
gsold_fra_basin, gvf_diff_basin, gvf_max_basin, hfd_mean_basin, hfd_mean_gauge,
high_prec_du_ERA5L_basin, high_prec_du_basin, high_prec_fr_ERA5L_basin, high_prec_fr_basin,
high_prec_timing_ERA5L_basin, high_prec_timing_basin, high_q_dur_basin, high_q_dur_gauge,
high_q_freq_basin, high_q_freq_gauge, lai_diff_basin, lai_max_basin, lake_fra_basin, lat_gauge,
lc_dom_basin, lo_prec_fr_ERA5L_basin, lo_prec_fr_basin, lon_gauge, low_prec_du_ERA5L_basin,
low_prec_du_basin, low_prec_timing_ERA5L_basin, low_prec_timing_basin, low_q_dur_basin,
low_q_dur_gauge, low_q_freq_basin, low_q_freq_gauge, mvert_ang_basin, mvert_dist_basin, name_gauge,
ndvi_max_basin, ndvi_min_basin, obsbeg_day_gauge, obsbeg_hr_gauge, obsend_day_gauge,
obsend_hr_gauge, oc_fra_basin, p_mean_ERA5L_basin, p_mean_basin, p_season_ERA5L_basin,
p_season_basin, pet_mean_ERA5L_basin, q_mean_basin, q_mean_gauge, ref_et_mean_basin, river_gauge,
root_dep_basin, runoff_ratio_basin, runoff_ratio_gauge, sand_fra_basin, scrub_fra_basin,
silt_fra_basin, slope_fdc_basin, slope_fdc_gauge, slope_mean_basin, soil_poros_basin,
soil_tawc_basin, stream_elas_basin, stream_elas_gauge, strm_dens_basin, typimpact_basin,
typimpact_gauge, urban_fra_basin, wetl_fra_basin, zero_q_freq_gauge
[134]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(76, 138)
[135]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
953
[135]:
Q5_basin 18
Q5_gauge 18
Q95_basin 18
Q95_gauge 18
VHM_no_gauge 0
..
typimpact_basin 0
typimpact_gauge 0
urban_fra_basin 0
wetl_fra_basin 0
zero_q_freq_gauge 18
Length: 138, dtype: int64
find those columns which have at least one NaN value
[136]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[136]:
| Q5_basin | Q5_gauge | Q95_basin | Q95_gauge | baseflow_index_ladson_basin | baseflow_index_ladson_gauge | g_aspect_basin | g_lat_basin | g_lon_basin | g_max_el_basin | ... | low_q_freq_gauge | q_mean_basin | q_mean_gauge | runoff_ratio_basin | runoff_ratio_gauge | slope_fdc_basin | slope_fdc_gauge | stream_elas_basin | stream_elas_gauge | zero_q_freq_gauge | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 79 | 7.498 | 7.497704 | 12.688 | 12.687769 | 0.901 | 0.900813 | 193.420 | 448938.051 | 417760.006 | 1332.644 | ... | NaN | 9.389 | 9.389204 | 1.786 | 1.785626 | 0.404 | 0.404174 | 0.507 | 0.506978 | 0.0 |
| 98 | 4.037 | 4.036522 | 9.131 | 9.131344 | 0.822 | 0.821769 | 161.933 | 462008.994 | 449955.798 | 1768.421 | ... | NaN | 5.952 | 5.951890 | 1.178 | 1.178091 | 0.561 | 0.560945 | 0.666 | 0.666312 | 0.0 |
| 25 | 2.593 | 2.592731 | 24.071 | 24.071443 | 0.756 | 0.755520 | NaN | NaN | NaN | NaN | ... | 10.581360 | 10.317 | 10.316638 | 2.346 | 2.345917 | 1.441 | 1.440679 | 0.579 | 0.579003 | 0.0 |
| 1 | 0.996 | 0.996230 | 12.549 | 12.549089 | 0.561 | 0.560762 | NaN | NaN | NaN | NaN | ... | 6.084722 | 4.546 | 4.546374 | 1.115 | 1.114759 | 2.715 | 2.715291 | 1.365 | 1.364999 | 0.0 |
| 34 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 94 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 54 | 0.419 | 0.418586 | 28.214 | 28.213980 | 0.388 | 0.387571 | 105.826 | 465630.107 | 671331.688 | 1333.372 | ... | 110.276178 | 6.365 | 6.364638 | 1.014 | 1.014307 | 3.767 | 3.766742 | 0.929 | 0.929429 | 0.0 |
| 80 | 1.272 | 1.272295 | 11.077 | 11.076516 | 0.583 | 0.583472 | NaN | NaN | NaN | NaN | ... | 0.339978 | 4.225 | 4.225174 | 0.871 | 0.870663 | 1.945 | 1.945021 | 0.913 | 0.913007 | 0.0 |
| 72 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
76 rows × 35 columns
[137]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[137]:
Q5_basin 18
Q5_gauge 18
Q95_basin 18
Q95_gauge 18
baseflow_index_ladson_basin 18
baseflow_index_ladson_gauge 18
g_aspect_basin 36
g_lat_basin 36
g_lon_basin 36
g_max_el_basin 36
g_mean_el_basin 36
g_min_el_basin 36
g_slope_basin 36
g_slopel20_basin 36
hfd_mean_basin 21
hfd_mean_gauge 21
high_prec_timing_basin 3
high_q_dur_basin 41
high_q_dur_gauge 41
high_q_freq_basin 41
high_q_freq_gauge 41
low_prec_timing_ERA5L_basin 2
low_q_dur_basin 46
low_q_dur_gauge 46
low_q_freq_basin 46
low_q_freq_gauge 46
q_mean_basin 18
q_mean_gauge 18
runoff_ratio_basin 18
runoff_ratio_gauge 18
slope_fdc_basin 18
slope_fdc_gauge 18
stream_elas_basin 18
stream_elas_gauge 18
zero_q_freq_gauge 18
dtype: int64
[138]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
10m_wind_u, 10m_wind_u_rav, 10m_wind_v, 10m_wind_v_rav, 2m_dp_temp, 2m_qv_rav, 2m_temp_rav,
fcst_alb, grdflx_rav, lai_high_veg, lai_low_veg, mean_temp_C, obs_q_cms, pcp_mm, pet_mm, prec_rav,
surf_dwn_solar_rad_rav, surf_dwn_therm_rad_rav, surf_net_solar_rad, surf_net_therm_rad,
surf_outg_therm_rad_rav, surf_press, surf_press_rav, swe, total_et, total_et_rav, volsw_123, volsw_4
HYSETS
[139]:
# Instantiate HYSETS from its own sub-directory of DATA_PATH and print its one-line summary.
hysets_dir = os.path.join(DATA_PATH, 'HYSETS')
dataset = RainfallRunoff('HYSETS', path=hysets_dir, verbosity=0)
print(dataset)
HYSETS with 14425 stations, 5 dynamic and 28 static features
[140]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Centroid_Lat_deg_N, Centroid_Lon_deg_E, Drainage_Area_GSIM_km2, Drainage_Area_km2,
Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries, Flag_Land_Use_Extraction,
Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction, Gravelius,
Land_Use_Crops_frac, Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac,
Land_Use_Snow_Ice_frac, Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name,
Official_ID, Perimeter, Permeability_logk_m2, Porosity_frac, Slope_deg, Source
[141]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(14425, 28)
[142]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
20179
[142]:
Source 0
Name 0
Official_ID 0
Centroid_Lat_deg_N 0
Centroid_Lon_deg_E 0
Drainage_Area_km2 0
Drainage_Area_GSIM_km2 13561
Flag_GSIM_boundaries 0
Flag_Artificial_Boundaries 0
Elevation_m 6
Slope_deg 6
Gravelius 1633
Perimeter 1633
Flag_Shape_Extraction 0
Aspect_deg 6
Flag_Terrain_Extraction 0
Land_Use_Forest_frac 13
Land_Use_Grass_frac 13
Land_Use_Wetland_frac 13
Land_Use_Water_frac 13
Land_Use_Urban_frac 13
Land_Use_Shrubs_frac 13
Land_Use_Crops_frac 13
Land_Use_Snow_Ice_frac 13
Flag_Land_Use_Extraction 0
Permeability_logk_m2 1615
Porosity_frac 1615
Flag_Subsoil_Extraction 0
dtype: int64
find those columns which have at least one NaN value
[143]:
# Keep only the columns that contain at least one missing value.
df.loc[:, df.isna().any()]
[143]:
| Drainage_Area_GSIM_km2 | Elevation_m | Slope_deg | Gravelius | Perimeter | Aspect_deg | Land_Use_Forest_frac | Land_Use_Grass_frac | Land_Use_Wetland_frac | Land_Use_Water_frac | Land_Use_Urban_frac | Land_Use_Shrubs_frac | Land_Use_Crops_frac | Land_Use_Snow_Ice_frac | Permeability_logk_m2 | Porosity_frac | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Watershed_ID | ||||||||||||||||
| 1 | NaN | 362.3 | 3.5329 | 2.7834 | 1194.505 | 130.4023 | 0.7869 | 0.0147 | 0.0645 | 0.0258 | 0.0089 | 0.0749 | 0.0242 | 0.0 | -14.719327 | 0.180905 |
| 2 | NaN | 353.4 | 4.6633 | 2.0656 | 269.164 | 91.7329 | 0.8452 | 0.0102 | 0.0228 | 0.0219 | 0.0174 | 0.0410 | 0.0414 | 0.0 | -14.056491 | 0.206450 |
| 3 | 2693.814 | 293.3 | 4.4690 | 2.0620 | 381.994 | 223.9510 | 0.8207 | 0.0093 | 0.0032 | 0.0487 | 0.0230 | 0.0351 | 0.0600 | 0.0 | -14.537390 | 0.165357 |
| 4 | NaN | 276.5 | 4.1819 | 2.4682 | 413.839 | 120.7400 | 0.6837 | 0.0226 | 0.1024 | 0.0630 | 0.0115 | 0.0641 | 0.0528 | 0.0 | -14.687869 | 0.170597 |
| 5 | NaN | 201.8 | 2.8061 | NaN | NaN | 56.8902 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 0.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14421 | NaN | 1987.9 | 17.1982 | 2.0752 | 208.852 | 28.9860 | 0.5356 | 0.0330 | 0.0000 | 0.0000 | 0.0202 | 0.0170 | 0.3941 | 0.0 | -13.160658 | 0.096755 |
| 14422 | NaN | 769.5 | 6.5921 | 1.5715 | 325.714 | 110.5607 | 0.1348 | 0.3106 | 0.0025 | 0.0024 | 0.0305 | 0.0300 | 0.4874 | 0.0 | -12.698509 | 0.119993 |
| 14423 | NaN | 1883.2 | 14.7005 | 2.5953 | 1621.229 | 224.3422 | 0.8674 | 0.0437 | 0.0000 | 0.0026 | 0.0027 | 0.0429 | 0.0408 | 0.0 | -12.976926 | 0.090284 |
| 14424 | NaN | 1791.2 | 12.1021 | 2.4269 | 1288.932 | 184.5177 | 0.7720 | 0.1524 | 0.0000 | 0.0013 | 0.0029 | 0.0474 | 0.0241 | 0.0 | -12.968686 | 0.094042 |
| 14425 | NaN | 2179.1 | 5.9444 | 2.0769 | 165.762 | 112.0832 | 0.1605 | 0.5639 | 0.0000 | 0.0012 | 0.0091 | 0.1116 | 0.1536 | 0.0 | -12.792099 | 0.168963 |
14425 rows × 16 columns
[144]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[144]:
Drainage_Area_GSIM_km2 13561
Elevation_m 6
Slope_deg 6
Gravelius 1633
Perimeter 1633
Aspect_deg 6
Land_Use_Forest_frac 13
Land_Use_Grass_frac 13
Land_Use_Wetland_frac 13
Land_Use_Water_frac 13
Land_Use_Urban_frac 13
Land_Use_Shrubs_frac 13
Land_Use_Crops_frac 13
Land_Use_Snow_Ice_frac 13
Permeability_logk_m2 1615
Porosity_frac 1615
dtype: int64
[145]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
discharge, pr, swe, tasmax, tasmin
GRDCCaravan
[146]:
# Instantiate GRDCCaravan from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'GRDCCaravan',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
GRDCCaravan with 5357 stations, 39 dynamic and 211 static features
[147]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
aet_mm_s01, aet_mm_s02, aet_mm_s03, aet_mm_s04, aet_mm_s05, aet_mm_s06, aet_mm_s07, aet_mm_s08,
aet_mm_s09, aet_mm_s10, aet_mm_s11, aet_mm_s12, aet_mm_syr, area,
area_fraction_used_for_aggregation, ari_ix_sav, aridity, cls_cl_smj, cly_pc_sav, clz_cl_smj,
cmi_ix_s01, cmi_ix_s02, cmi_ix_s03, cmi_ix_s04, cmi_ix_s05, cmi_ix_s06, cmi_ix_s07, cmi_ix_s08,
cmi_ix_s09, cmi_ix_s10, cmi_ix_s11, cmi_ix_s12, cmi_ix_syr, country, crp_pc_sse, dis_m3_pmn,
dis_m3_pmx, dis_m3_pyr, dor_pc_pva, ele_mt_sav, ele_mt_smn, ele_mt_smx, ero_kh_sav, fec_cl_smj,
fmh_cl_smj, for_pc_sse, frac_snow, gauge_lat, gauge_lon, gauge_name, gdp_ud_sav, gdp_ud_ssu,
gla_pc_sse, glc_cl_smj, glc_pc_s01, glc_pc_s02, glc_pc_s03, glc_pc_s04, glc_pc_s05, glc_pc_s06,
glc_pc_s07, glc_pc_s08, glc_pc_s09, glc_pc_s10, glc_pc_s11, glc_pc_s12, glc_pc_s13, glc_pc_s14,
glc_pc_s15, glc_pc_s16, glc_pc_s17, glc_pc_s18, glc_pc_s19, glc_pc_s20, glc_pc_s21, glc_pc_s22,
gwt_cm_sav, hdi_ix_sav, hft_ix_s09, hft_ix_s93, high_prec_dur, high_prec_freq, inu_pc_slt,
inu_pc_smn, inu_pc_smx, ire_pc_sse, kar_pc_sse, lit_cl_smj, lka_pc_sse, lkv_mc_usu, low_prec_dur,
low_prec_freq, moisture_index, nli_ix_sav, p_mean, pac_pc_sse, pet_mean, pet_mm_s01, pet_mm_s02,
pet_mm_s03, pet_mm_s04, pet_mm_s05, pet_mm_s06, pet_mm_s07, pet_mm_s08, pet_mm_s09, pet_mm_s10,
pet_mm_s11, pet_mm_s12, pet_mm_syr, pnv_cl_smj, pnv_pc_s01, pnv_pc_s02, pnv_pc_s03, pnv_pc_s04,
pnv_pc_s05, pnv_pc_s06, pnv_pc_s07, pnv_pc_s08, pnv_pc_s09, pnv_pc_s10, pnv_pc_s11, pnv_pc_s12,
pnv_pc_s13, pnv_pc_s14, pnv_pc_s15, pop_ct_usu, ppd_pk_sav, pre_mm_s01, pre_mm_s02, pre_mm_s03,
pre_mm_s04, pre_mm_s05, pre_mm_s06, pre_mm_s07, pre_mm_s08, pre_mm_s09, pre_mm_s10, pre_mm_s11,
pre_mm_s12, pre_mm_syr, prm_pc_sse, pst_pc_sse, rdd_mk_sav, rev_mc_usu, ria_ha_usu, riv_tc_usu,
run_mm_syr, seasonality, sgr_dk_sav, slp_dg_sav, slt_pc_sav, snd_pc_sav, snw_pc_s01, snw_pc_s02,
snw_pc_s03, snw_pc_s04, snw_pc_s05, snw_pc_s06, snw_pc_s07, snw_pc_s08, snw_pc_s09, snw_pc_s10,
snw_pc_s11, snw_pc_s12, snw_pc_smx, snw_pc_syr, soc_th_sav, swc_pc_s01, swc_pc_s02, swc_pc_s03,
swc_pc_s04, swc_pc_s05, swc_pc_s06, swc_pc_s07, swc_pc_s08, swc_pc_s09, swc_pc_s10, swc_pc_s11,
swc_pc_s12, swc_pc_syr, tbi_cl_smj, tec_cl_smj, tmp_dc_s01, tmp_dc_s02, tmp_dc_s03, tmp_dc_s04,
tmp_dc_s05, tmp_dc_s06, tmp_dc_s07, tmp_dc_s08, tmp_dc_s09, tmp_dc_s10, tmp_dc_s11, tmp_dc_s12,
tmp_dc_smn, tmp_dc_smx, tmp_dc_syr, urb_pc_sse, wet_cl_smj, wet_pc_s01, wet_pc_s02, wet_pc_s03,
wet_pc_s04, wet_pc_s05, wet_pc_s06, wet_pc_s07, wet_pc_s08, wet_pc_s09, wet_pc_sg1, wet_pc_sg2
[148]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(5357, 211)
[149]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[149]:
gauge_lat 0
gauge_lon 0
gauge_name 0
country 0
area 0
..
seasonality 0
high_prec_freq 0
high_prec_dur 0
low_prec_freq 0
low_prec_dur 0
Length: 211, dtype: int64
find those columns which have at least one NaN value
[150]:
# Show the columns that contain at least one NaN, or say so when there are none.
# A bare expression nested inside an `if` body is never rendered as cell output,
# so the frame is handed to display() (available as a builtin inside IPython/Jupyter
# kernels) explicitly.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[151]:
# NaN count per column, restricted to the columns that have any missing value
# (an empty Series when the table has no NaN at all).
df.loc[:, df.isna().any()].isna().sum()
[151]:
Series([], dtype: float64)
[152]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airtemp_C_2m_max, airtemp_C_2m_min, airtemp_C_mean_2m, dewpoint_temperature_2m_max,
dewpoint_temperature_2m_mean, dewpoint_temperature_2m_min, pcp_mm, potential_evaporation_sum,
q_cms_obs, snow_depth_water_equivalent_max, snow_depth_water_equivalent_mean,
snow_depth_water_equivalent_min, surface_net_solar_radiation_max, surface_net_solar_radiation_mean,
surface_net_solar_radiation_min, surface_net_thermal_radiation_max,
surface_net_thermal_radiation_mean, surface_net_thermal_radiation_min, surface_pressure_max,
surface_pressure_mean, surface_pressure_min, u_component_of_wind_10m_max,
u_component_of_wind_10m_mean, u_component_of_wind_10m_min, v_component_of_wind_10m_max,
v_component_of_wind_10m_mean, v_component_of_wind_10m_min, volumetric_soil_water_layer_1_max,
volumetric_soil_water_layer_1_mean, volumetric_soil_water_layer_1_min,
volumetric_soil_water_layer_2_max, volumetric_soil_water_layer_2_mean,
volumetric_soil_water_layer_2_min, volumetric_soil_water_layer_3_max,
volumetric_soil_water_layer_3_mean, volumetric_soil_water_layer_3_min,
volumetric_soil_water_layer_4_max, volumetric_soil_water_layer_4_mean,
volumetric_soil_water_layer_4_min
CCAM
[153]:
# Instantiate CCAM from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'CCAM',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
CCAM with 102 stations, 16 dynamic and 124 static features
[154]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, barren, bdticm, bldfie_sl1, bldfie_sl2, bldfie_sl3, bldfie_sl4, bldfie_sl5, bldfie_sl6,
bldfie_sl7, cecsol_sl1, cecsol_sl2, cecsol_sl3, cecsol_sl4, cecsol_sl5, cecsol_sl6, cecsol_sl7,
circulatory_ratio, clay, closed_shrubland, compactness_coefficient, cropland,
cropland_natural_vegetaion, deciduous_broadleaf_tree, deciduous_needleleaf_tree, elev,
elongation_ratio, ev, evergreen_broadleaf_tree, evergreen_needleleaf_tree, evp_mean, form_factor,
frac_snow_daily, geol_permeability, geol_porosity, grassland, grav, gst_mean, high_prec_dur,
high_prec_freq, high_prec_timing, ig, lai_dif, lai_max, lat, length, length_continuous_runoff,
log_k_s_l1, log_k_s_l2, log_k_s_l3, log_k_s_l4, log_k_s_l5, log_k_s_l6, lon, low_prec_dur,
low_prec_freq, low_prec_timing, mixed_forest, mt, nd, ndvi_mean, open_shrubland, orcdrc_sl1,
orcdrc_sl2, orcdrc_sl3, orcdrc_sl4, orcdrc_sl5, orcdrc_sl6, orcdrc_sl7, pa, pb, pdep,
permanent_wetland, pet_mean, phihox_sl1, phihox_sl2, phihox_sl3, phihox_sl4, phihox_sl5, phihox_sl6,
phihox_sl7, pi, pop, pop_dnsty, por, pre_mean, prs_mean, py, rhu_mean, root_depth_50, root_depth_99,
sand, savanna, sc, shape_factor, silt, slope, sm, snow_and_ice, som, ss, ssd_mean, su, tem_mean,
theta_s_l1, theta_s_l2, theta_s_l3, theta_s_l4, theta_s_l5, theta_s_l6, tksatu_l1, tksatu_l2,
tksatu_l3, tksatu_l4, tksatu_l5, tksatu_l6, urban_and_built-up_land, va, vb, vi, water_bodies, wb,
win_mean, woody_savanna
[155]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(102, 124)
[156]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[156]:
area 0
barren 0
bdticm 0
bldfie_sl1 0
bldfie_sl2 0
..
vi 0
water_bodies 0
wb 0
win_mean 0
woody_savanna 0
Length: 124, dtype: int64
find those columns which have at least one NaN value
[157]:
# Show the columns that contain at least one NaN, or say so when there are none.
# A bare expression nested inside an `if` body is never rendered as cell output,
# so the frame is handed to display() (available as a builtin inside IPython/Jupyter
# kernels) explicitly.
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
No NaN values
[158]:
# NaN count per column, restricted to the columns that have any missing value
# (an empty Series when the table has no NaN at all).
df.loc[:, df.isna().any()].isna().sum()
[158]:
Series([], dtype: float64)
[159]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
evp, gst_max, gst_mean, gst_min, pre, prs_max, prs_mean, prs_min, q, rhu, ssd, tem_max, tem_mean,
tem_min, win_max, win_mean
Japan
[160]:
# Instantiate the Japan dataset from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'Japan',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Japan with 751 stations, 27 dynamic and 35 static features
The static features of Japan are the same as those of GSHA.
[161]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[162]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(751, 35)
[163]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
265
[163]:
EVP_uncertainty(%) 78
HYRIV_ID 0
LRAD_uncertainty(%) 66
P_uncertainty(%) 0
SRAD_uncertainty(%) 0
T_uncertainty(%) 0
agency 0
area 0
cly_pc_uav 0
ele_mt_uav 0
ero_kh_uav 0
gla_pc_use 0
glc_cl_cmj 0
gwt_cm_cav 0
inu_pc_ult 0
lat 0
lit_cl_cmj 0
long 0
pet_uncertainty(%) 121
pnv_cl_cmj 0
prm_pc_use 0
sgr_dk_rav 0
slp_dg_uav 0
slt_pc_uav 0
snd_pc_uav 0
wet_pc_u01 0
wet_pc_u02 0
wet_pc_u03 0
wet_pc_u04 0
wet_pc_u05 0
wet_pc_u06 0
wet_pc_u07 0
wet_pc_u08 0
wet_pc_u09 0
wind_uncertainty(%) 0
dtype: int64
find those columns which have at least one NaN value
[164]:
# Show the columns that contain at least one NaN, or say so when there are none.
# A bare expression nested inside an `if` body is never rendered as cell output
# (this cell previously produced nothing although the table has NaNs), so the frame
# is handed to display() (available as a builtin inside IPython/Jupyter kernels).
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[165]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[165]:
EVP_uncertainty(%) 78
LRAD_uncertainty(%) 66
pet_uncertainty(%) 121
dtype: int64
[166]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
Ireland
[167]:
# Instantiate the Ireland dataset from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'Ireland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Ireland with 464 stations, 10 dynamic and 208 static features
The static features of Ireland are the same as those of EStreams.
[168]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[169]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(464, 208)
[170]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
9797
[170]:
static_features
area 16
area_calc 0
area_flag 0
area_perc 16
aridity 208
...
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 208
Length: 208, dtype: int64
find those columns which have at least one NaN value
[171]:
# Show the columns that contain at least one NaN, or say so when there are none.
# A bare expression nested inside an `if` body is never rendered as cell output
# (this cell previously produced nothing although the table has NaNs), so the frame
# is handed to display() (available as a builtin inside IPython/Jupyter kernels).
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[172]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[172]:
static_features
area 16
area_perc 16
aridity 208
baseflow_index 208
bedrk_dep 1
...
soil_tawc_p90 1
start_date 137
start_date_climatic 208
start_date_hydro 204
zero_q_freq 208
Length: 111, dtype: int64
[173]:
# List the dynamic feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.dynamic_features
# is not reordered in place as a side effect of displaying it.
dynamic_features = sorted(dataset.dynamic_features)
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps
Finland
[174]:
# Instantiate the Finland dataset from DATA_PATH and print its one-line summary.
dataset = RainfallRunoff(
    'Finland',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
Finland with 669 stations, 10 dynamic and 208 static features
The static features of Finland are the same as those of EStreams.
[175]:
# List the static feature names alphabetically, wrapped to 100 characters per line.
# sorted() builds a new list, so the object returned by dataset.static_features
# is not reordered in place as a side effect of displaying it.
static_features = sorted(dataset.static_features)
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[176]:
# Fetch the static-attribute table and show its (rows, columns) shape.
df = dataset.fetch_static_features()
print(f"{df.shape}")
(669, 208)
[177]:
# Grand total of missing cells first, then the per-column NaN counts as the cell output.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
10791
[177]:
static_features
area 126
area_calc 0
area_flag 0
area_perc 126
aridity 176
...
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 196
Length: 208, dtype: int64
find those columns which have at least one NaN value
[178]:
# Show the columns that contain at least one NaN, or say so when there are none.
# A bare expression nested inside an `if` body is never rendered as cell output
# (this cell previously produced nothing although the table has NaNs), so the frame
# is handed to display() (available as a builtin inside IPython/Jupyter kernels).
has_nan = df.isna().any()
if has_nan.any():
    display(df.loc[:, has_nan])
else:
    print('No NaN values')
[179]:
# NaN count per column, restricted to the columns that have any missing value.
df.loc[:, df.isna().any()].isna().sum()
[179]:
static_features
area 126
area_perc 126
aridity 176
baseflow_index 199
dam_yr_first 590
...
soil_tawc_p90 1
start_date 6
start_date_climatic 176
start_date_hydro 196
zero_q_freq 196
Length: 111, dtype: int64
[180]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps
Italy
[181]:
# Open the Italy dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Italy', path=DATA_PATH, verbosity=0)
print(dataset)
Italy with 294 stations, 10 dynamic and 208 static features
The static features of Italy are the same as those of EStreams.
[182]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[183]:
# Fetch the static features table of the Italy dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(294, 208)
[184]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
4122
[184]:
static_features
area 106
area_calc 0
area_flag 0
area_perc 106
aridity 46
...
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 86
Length: 208, dtype: int64
Find the columns that have at least one NaN value.
[185]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
[186]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[186]:
static_features
area 106
area_perc 106
aridity 46
baseflow_index 87
dam_yr_first 265
dam_yr_last 265
duplicated_suspect 286
elevation 219
end_date_climatic 46
end_date_hydro 85
frac_snow 46
hfd_mean 98
hfd_std 105
hp_dur 46
hp_freq 46
hp_time 46
hq_dur 107
hq_freq 107
lakes_tot_area 209
lakes_tot_vol 209
lp_dur 46
lp_freq 46
lp_time 46
lq_dur 112
lq_freq 112
num_years_climatic 45
num_years_hydro 45
p_mean 46
p_seasonality 46
pet_mean 46
q_5 86
q_95 86
q_elas_Sankarasubramanian 86
q_mean 86
q_runoff_ratio 86
res_tot_sto 265
slope_sawicz 90
start_date_climatic 46
start_date_hydro 85
zero_q_freq 86
dtype: int64
[187]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps
Poland
[188]:
# Open the Poland dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Poland', path=DATA_PATH, verbosity=0)
print(dataset)
Poland with 1287 stations, 10 dynamic and 208 static features
The static features of Poland are the same as those of EStreams.
[189]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[190]:
# Fetch the static features table of the Poland dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(1287, 208)
[191]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
16598
[191]:
static_features
area 6
area_calc 0
area_flag 0
area_perc 6
aridity 270
...
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 270
Length: 208, dtype: int64
Find the columns that have at least one NaN value.
[192]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
[193]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[193]:
static_features
area 6
area_perc 6
aridity 270
baseflow_index 270
dam_yr_first 1099
dam_yr_last 1099
duplicated_suspect 1284
elevation 1287
end_date 210
end_date_climatic 270
end_date_hydro 270
frac_snow 270
hfd_mean 277
hfd_std 283
hp_dur 270
hp_freq 270
hp_time 270
hq_dur 485
hq_freq 485
lakes_tot_area 396
lakes_tot_vol 396
lp_dur 270
lp_freq 270
lp_time 270
lq_dur 507
lq_freq 507
num_days_gaps 210
num_years_climatic 270
num_years_hydro 270
p_mean 270
p_seasonality 270
pet_mean 270
q_5 270
q_95 270
q_elas_Sankarasubramanian 270
q_mean 270
q_runoff_ratio 270
res_tot_sto 1101
slope_sawicz 270
start_date 210
start_date_climatic 270
start_date_hydro 270
zero_q_freq 270
dtype: int64
[194]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps
Portugal
[195]:
# Open the Portugal dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Portugal', path=DATA_PATH, verbosity=0)
print(dataset)
Portugal with 280 stations, 10 dynamic and 208 static features
The static features of Portugal are the same as those of EStreams.
[196]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[197]:
# Fetch the static features table of the Portugal dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(280, 208)
[198]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
2842
[198]:
static_features
area 25
area_calc 0
area_flag 0
area_perc 25
aridity 43
..
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 43
Length: 208, dtype: int64
Find the columns that have at least one NaN value.
[199]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
[200]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[200]:
static_features
area 25
area_perc 25
aridity 43
baseflow_index 103
dam_yr_first 221
dam_yr_last 221
duplicated_suspect 280
end_date 11
end_date_climatic 43
end_date_hydro 43
frac_snow 43
hfd_mean 46
hfd_std 49
hp_dur 43
hp_freq 43
hp_time 43
hq_dur 45
hq_freq 45
lakes_tot_area 176
lakes_tot_vol 176
lp_dur 43
lp_freq 43
lp_time 43
lq_dur 45
lq_freq 45
num_days_gaps 11
num_years_climatic 43
num_years_hydro 43
p_mean 43
p_seasonality 43
pet_mean 43
q_5 43
q_95 43
q_elas_Sankarasubramanian 43
q_mean 43
q_runoff_ratio 43
res_tot_sto 221
slope_sawicz 97
start_date 11
start_date_climatic 43
start_date_hydro 43
zero_q_freq 43
dtype: int64
[201]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, obs_q_cms, pcp_mm, pet_mm, rh_%,
solrad_wm2, windspeed_mps
Simbi
[202]:
# Open the Simbi dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Simbi', path= DATA_PATH, verbosity=0)
print(dataset)
Simbi with 24 stations, 3 dynamic and 232 static features
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[203]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Alluvial aquifers with free water, Alluvial aquifers with partly confined water, Alluvium & detrital
materials_geol, Andesites & rhyodacites_geol, Area, Aridity_mon_arid, BFI1_d, BFI2_d, BFI3_d, BFI_d,
Basalt_geol, Beaches & dunes_lc_98, Carb_Rocks_Perc, Carbonate aquifers with marl intercalation,
Closed Shrubland_lc_95, Continuous urban_lc_98, Cropland_lc_95, Crystalline formation,
Cumul_Freq_1%, Cumul_Freq_10%, Cumul_Freq_100%, Cumul_Freq_11%, Cumul_Freq_12%, Cumul_Freq_13%,
Cumul_Freq_14%, Cumul_Freq_15%, Cumul_Freq_16%, Cumul_Freq_17%, Cumul_Freq_18%, Cumul_Freq_19%,
Cumul_Freq_2%, Cumul_Freq_20%, Cumul_Freq_21%, Cumul_Freq_22%, Cumul_Freq_23%, Cumul_Freq_24%,
Cumul_Freq_25%, Cumul_Freq_26%, Cumul_Freq_27%, Cumul_Freq_28%, Cumul_Freq_29%, Cumul_Freq_3%,
Cumul_Freq_30%, Cumul_Freq_31%, Cumul_Freq_32%, Cumul_Freq_33%, Cumul_Freq_34%, Cumul_Freq_35%,
Cumul_Freq_36%, Cumul_Freq_37%, Cumul_Freq_38%, Cumul_Freq_39%, Cumul_Freq_4%, Cumul_Freq_40%,
Cumul_Freq_41%, Cumul_Freq_42%, Cumul_Freq_43%, Cumul_Freq_44%, Cumul_Freq_45%, Cumul_Freq_46%,
Cumul_Freq_47%, Cumul_Freq_48%, Cumul_Freq_49%, Cumul_Freq_5%, Cumul_Freq_50%, Cumul_Freq_51%,
Cumul_Freq_52%, Cumul_Freq_53%, Cumul_Freq_54%, Cumul_Freq_55%, Cumul_Freq_56%, Cumul_Freq_57%,
Cumul_Freq_58%, Cumul_Freq_59%, Cumul_Freq_6%, Cumul_Freq_60%, Cumul_Freq_61%, Cumul_Freq_62%,
Cumul_Freq_63%, Cumul_Freq_64%, Cumul_Freq_65%, Cumul_Freq_66%, Cumul_Freq_67%, Cumul_Freq_68%,
Cumul_Freq_69%, Cumul_Freq_7%, Cumul_Freq_70%, Cumul_Freq_71%, Cumul_Freq_72%, Cumul_Freq_73%,
Cumul_Freq_74%, Cumul_Freq_75%, Cumul_Freq_76%, Cumul_Freq_77%, Cumul_Freq_78%, Cumul_Freq_79%,
Cumul_Freq_8%, Cumul_Freq_80%, Cumul_Freq_81%, Cumul_Freq_82%, Cumul_Freq_83%, Cumul_Freq_84%,
Cumul_Freq_85%, Cumul_Freq_86%, Cumul_Freq_87%, Cumul_Freq_88%, Cumul_Freq_89%, Cumul_Freq_9%,
Cumul_Freq_90%, Cumul_Freq_91%, Cumul_Freq_92%, Cumul_Freq_93%, Cumul_Freq_94%, Cumul_Freq_95%,
Cumul_Freq_96%, Cumul_Freq_97%, Cumul_Freq_98%, Cumul_Freq_99%, Deciduous Broadleaf Forest_lc_95,
Deciduous Needleleaf Forest_lc_95, Dense agricultural crops_lc_98, Dense agroforestry systems_lc_98,
Diorite & tonalite_geol, Discontinuous urban_lc_98, Dominant pastures_lc_98, ETP_5_mon_q5,
ETP_95_mon_q95, ETP_mon_avg, Evergreen Broadleaf Forest_lc_95, Evergreen Needleleaf Forest_lc_95,
Fissured & partitioned carbonate aquifers, Flysch & sandstone & limestone_geol, Forest_lc_98,
Grassland_lc_95, Gravelius, Hard limestone_geol, Highly permeable fissured & porous carbonate
aquifers, Industrial areas_lc_98, Karst aquifer, Lat_Cent, Lat_Exu, Lon_Cent, Lon_Exu, Low
permeability sedimentary formation, Magma_Perc, Mangroves_lc_98, Marl & marly limestone_geol, Marl &
sand_geol, Marly limestone_geol, Max_Elv, Medium-density agricultural crops_lc_98, Min_Elv, Mixed
Forest_lc_95, More productive alluvial area, Open Shrubland_lc_95, P_5_mon_q5, P_95_mon_q95,
P_max10_mon_QMXA10, P_min5_mon_QMNA5, P_mon_avg, Pasture with other presence_lc_98, Ports &
airports_lc_98, Q1_5_mon_q5, Q1_95_mon_q95, Q1_max10_mon_QMXA10, Q1_min5_mon_QMNA5, Q1_mm_d_hq_dur,
Q1_mm_d_hq_freq, Q1_mm_d_lq_dur, Q1_mm_d_lq_freq, Q1_mm_d_mean, Q1_mm_d_q5, Q1_mm_d_q95, Q1_mon_avg,
Q2_5_mon_q5, Q2_95_mon_q95, Q2_max10_mon_QMXA10, Q2_min5_mon_QMNA5, Q2_mm_d_hq_dur, Q2_mm_d_hq_freq,
Q2_mm_d_lq_dur, Q2_mm_d_lq_freq, Q2_mm_d_mean, Q2_mm_d_q5, Q2_mm_d_q95, Q2_mon_avg, Q3_5_mon_q5,
Q3_95_mon_q95, Q3_max10_mon_QMXA10, Q3_min5_mon_QMNA5, Q3_mm_d_hq_dur, Q3_mm_d_hq_freq,
Q3_mm_d_lq_dur, Q3_mm_d_lq_freq, Q3_mm_d_mean, Q3_mm_d_q5, Q3_mm_d_q95, Q3_mon_avg, Q_5_mon_q5,
Q_95_mon_q95, Q_max10_mon_QMXA10, Q_min5_mon_QMNA5, Q_mm_d_hq_dur, Q_mm_d_hq_freq, Q_mm_d_lq_dur,
Q_mm_d_lq_freq, Q_mm_d_mean, Q_mm_d_q5, Q_mm_d_q95, Q_mon_avg, Quarry_lc_98, River beds & recent
alluvium_lc_98, Rock outcrops & bare soil_lc_98, Runoff_Ratio_mon_arid, Saline areas_lc_98,
Savannahs with other presence_lc_98, Savannahs_lc_98, Sd_Elv, Sedim_Perc, Slope, Stream_density,
Temp_5_mon_q5, Temp_95_mon_q95, Temp_mon_avg, Ultrabasic rocks_geol, Urban_lc_95, Volcano-
sedimentary rock_geol, Water plan_lc_98, Water_lc_95, Wetlands_lc_98, Wooded Grassland_lc_95,
Woodland_lc_95
[204]:
# Fetch the static features table of the Simbi dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
(24, 232)
/home/abbaa0a/AquaFetch/aqua_fetch/rr/_simbi.py:299: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
df = pd.read_csv(fpath, parse_dates=True, index_col=0)
[205]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
96
[205]:
Alluvial aquifers with free water 0
Alluvial aquifers with partly confined water 0
Alluvium & detrital materials_geol 0
Andesites & rhyodacites_geol 0
Area 0
..
Water plan_lc_98 0
Water_lc_95 0
Wetlands_lc_98 0
Wooded Grassland_lc_95 0
Woodland_lc_95 0
Length: 232, dtype: int64
Find the columns that have at least one NaN value.
[206]:
# Display only the columns that contain at least one NaN value.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan]
[206]:
| BFI1_d | BFI2_d | BFI3_d | BFI_d | Q1_mm_d_hq_dur | Q1_mm_d_hq_freq | Q1_mm_d_lq_dur | Q1_mm_d_lq_freq | Q1_mm_d_mean | Q1_mm_d_q5 | ... | Q3_mm_d_mean | Q3_mm_d_q5 | Q3_mm_d_q95 | Q_mm_d_hq_dur | Q_mm_d_hq_freq | Q_mm_d_lq_dur | Q_mm_d_lq_freq | Q_mm_d_mean | Q_mm_d_q5 | Q_mm_d_q95 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 001 | 0.46 | 0.68 | 0.55 | 0.49 | 2.12 | 2.43 | 27.62 | 34.19 | 1.16 | 0.2 | ... | 1.18 | 0.2 | 3.10 | 1.62 | 0.86 | 0.00 | 0.00 | 1.23 | 0.4 | 3.10 |
| 004 | 0.59 | 0.38 | 0.42 | 0.38 | 1.98 | 5.38 | 0.00 | 0.00 | 2.26 | 0.6 | ... | 2.23 | 0.3 | 8.15 | 2.00 | 5.00 | 0.00 | 0.00 | 1.97 | 0.6 | 4.80 |
| 006 | 0.47 | 0.66 | 0.61 | 0.50 | 1.90 | 5.52 | 0.00 | 0.00 | 1.50 | 0.5 | ... | 1.36 | 0.4 | 3.50 | 1.89 | 1.00 | 4.00 | 0.80 | 1.35 | 0.4 | 3.60 |
| 007 | 0.49 | 0.53 | 0.50 | 0.47 | 2.65 | 5.05 | 19.03 | 29.00 | 1.91 | 0.3 | ... | 2.04 | 0.5 | 5.60 | 2.33 | 2.50 | 5.51 | 16.86 | 2.08 | 0.5 | 5.50 |
| 008 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 010 | 0.29 | 0.21 | 0.31 | 0.32 | 6.08 | 20.86 | 23.73 | 117.52 | 2.10 | 0.2 | ... | 2.19 | 0.1 | 7.10 | 2.85 | 8.23 | 9.95 | 35.23 | 2.61 | 0.4 | 6.70 |
| 023 | 0.16 | 0.34 | 0.29 | 0.20 | 3.65 | 20.14 | 18.33 | 89.05 | 1.87 | 0.2 | ... | 1.81 | 0.2 | 6.70 | 1.91 | 7.79 | 5.34 | 24.79 | 1.98 | 0.3 | 5.58 |
| 024 | 0.38 | 0.42 | 0.39 | 0.42 | 3.38 | 15.14 | 0.00 | 0.00 | 1.37 | 0.3 | ... | 1.38 | 0.4 | 4.50 | 1.75 | 2.75 | 18.19 | 109.00 | 1.13 | 0.1 | 3.70 |
| 029 | 0.38 | 0.28 | 0.33 | 0.39 | 2.43 | 6.95 | 34.18 | 61.86 | 2.31 | 0.2 | ... | 2.13 | 0.1 | 6.30 | 1.56 | 2.71 | 0.00 | 0.00 | 2.30 | 0.7 | 4.50 |
| 036 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 037 | 0.24 | 0.47 | 0.34 | 0.41 | 9.02 | 36.10 | 83.00 | 158.10 | 0.61 | 0.0 | ... | 0.71 | 0.0 | 2.90 | 1.33 | 0.38 | 12.67 | 62.08 | 1.03 | 0.2 | 2.90 |
| 041 | 0.39 | 0.61 | 0.52 | 0.39 | 6.14 | 14.90 | 130.76 | 105.86 | 0.89 | 0.0 | ... | 0.93 | 0.0 | 3.20 | 0.00 | 0.00 | 14.83 | 57.07 | 1.05 | 0.2 | 3.00 |
| 044 | 0.32 | 0.28 | 0.36 | 0.25 | 7.22 | 40.24 | 70.33 | 160.76 | 1.05 | 0.0 | ... | 0.89 | 0.0 | 4.10 | 1.38 | 9.86 | 12.38 | 15.57 | 1.36 | 0.2 | 4.46 |
| 045 | 0.52 | 0.39 | 0.44 | 0.42 | 2.95 | 2.95 | 0.00 | 0.00 | 0.44 | 0.1 | ... | 0.43 | 0.1 | 1.10 | 1.60 | 0.17 | 5.64 | 13.17 | 0.36 | 0.1 | 1.00 |
| 051 | 0.23 | 0.21 | 0.13 | 0.18 | 3.62 | 13.10 | 25.48 | 110.43 | 1.06 | 0.1 | ... | 1.61 | 0.1 | 6.00 | 2.27 | 12.00 | 7.72 | 49.25 | 1.67 | 0.2 | 4.50 |
| 052 | 0.49 | 0.62 | 0.58 | 0.29 | 1.90 | 2.71 | 37.16 | 97.33 | 2.66 | 0.2 | ... | 2.88 | 0.8 | 8.70 | 3.11 | 33.50 | 28.00 | 160.33 | 2.39 | 0.0 | 9.96 |
| 053 | 0.31 | 0.10 | 0.11 | 0.12 | 2.30 | 12.62 | 11.45 | 44.71 | 2.66 | 0.4 | ... | 2.72 | 0.0 | 15.60 | 1.89 | 14.33 | 7.15 | 79.33 | 2.41 | 0.2 | 6.90 |
| 056 | 0.22 | 0.49 | 0.47 | 0.15 | 3.49 | 11.29 | 18.35 | 70.76 | 1.21 | 0.1 | ... | 1.28 | 0.2 | 4.30 | 1.78 | 10.55 | 8.69 | 81.64 | 1.20 | 0.1 | 4.30 |
| 057 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 058 | 0.29 | 0.27 | 0.06 | 0.32 | 3.71 | 38.29 | 81.74 | 163.48 | 1.83 | 0.1 | ... | 1.50 | 0.0 | 6.80 | 2.32 | 20.00 | 20.33 | 86.50 | 0.96 | 0.1 | 2.80 |
| 060 | 0.44 | 0.53 | 0.41 | 0.26 | 3.03 | 9.67 | 29.38 | 55.95 | 1.53 | 0.3 | ... | 1.48 | 0.4 | 4.50 | 2.17 | 9.67 | 7.76 | 32.00 | 1.48 | 0.2 | 4.80 |
| 061 | 0.33 | 0.49 | 0.41 | 0.30 | 2.41 | 12.71 | 16.44 | 54.81 | 2.69 | 0.4 | ... | 2.87 | 0.5 | 9.20 | 1.85 | 6.92 | 4.80 | 19.00 | 2.86 | 0.6 | 8.90 |
| 065 | 0.28 | 0.34 | 0.31 | 0.25 | 2.39 | 17.52 | 28.92 | 50.95 | 1.74 | 0.3 | ... | 1.72 | 0.3 | 6.20 | 2.30 | 11.75 | 13.11 | 73.31 | 1.74 | 0.2 | 6.00 |
| 068 | 0.55 | 0.55 | 0.55 | 0.52 | 2.67 | 3.05 | 13.68 | 42.33 | 2.55 | 0.4 | ... | 2.31 | 0.3 | 6.10 | 2.12 | 2.83 | 8.60 | 7.17 | 2.19 | 0.6 | 4.60 |
24 rows × 32 columns
[207]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[207]:
BFI1_d 3
BFI2_d 3
BFI3_d 3
BFI_d 3
Q1_mm_d_hq_dur 3
Q1_mm_d_hq_freq 3
Q1_mm_d_lq_dur 3
Q1_mm_d_lq_freq 3
Q1_mm_d_mean 3
Q1_mm_d_q5 3
Q1_mm_d_q95 3
Q2_mm_d_hq_dur 3
Q2_mm_d_hq_freq 3
Q2_mm_d_lq_dur 3
Q2_mm_d_lq_freq 3
Q2_mm_d_mean 3
Q2_mm_d_q5 3
Q2_mm_d_q95 3
Q3_mm_d_hq_dur 3
Q3_mm_d_hq_freq 3
Q3_mm_d_lq_dur 3
Q3_mm_d_lq_freq 3
Q3_mm_d_mean 3
Q3_mm_d_q5 3
Q3_mm_d_q95 3
Q_mm_d_hq_dur 3
Q_mm_d_hq_freq 3
Q_mm_d_lq_dur 3
Q_mm_d_lq_freq 3
Q_mm_d_mean 3
Q_mm_d_q5 3
Q_mm_d_q95 3
dtype: int64
[208]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
pcp, q, temp
Spain
[209]:
# Open the Spain dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Spain', path=DATA_PATH, verbosity=0)
print(dataset)
Spain with 889 stations, 27 dynamic and 35 static features
The static features of Spain are the same as those of GSHA.
[210]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[211]:
# Fetch the static features table of the Spain dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(889, 35)
[212]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
30
[212]:
EVP_uncertainty(%) 11
HYRIV_ID 0
LRAD_uncertainty(%) 6
P_uncertainty(%) 0
SRAD_uncertainty(%) 0
T_uncertainty(%) 0
agency 0
area 0
cly_pc_uav 0
ele_mt_uav 0
ero_kh_uav 0
gla_pc_use 0
glc_cl_cmj 0
gwt_cm_cav 0
inu_pc_ult 0
lat 0
lit_cl_cmj 0
long 0
pet_uncertainty(%) 13
pnv_cl_cmj 0
prm_pc_use 0
sgr_dk_rav 0
slp_dg_uav 0
slt_pc_uav 0
snd_pc_uav 0
wet_pc_u01 0
wet_pc_u02 0
wet_pc_u03 0
wet_pc_u04 0
wet_pc_u05 0
wet_pc_u06 0
wet_pc_u07 0
wet_pc_u08 0
wet_pc_u09 0
wind_uncertainty(%) 0
dtype: int64
Find the columns that have at least one NaN value.
[213]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
[214]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[214]:
EVP_uncertainty(%) 11
LRAD_uncertainty(%) 6
pet_uncertainty(%) 13
dtype: int64
[215]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
Thailand
[216]:
# Open the Thailand dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('Thailand', path=DATA_PATH, verbosity=0)
print(dataset)
Thailand with 73 stations, 27 dynamic and 35 static features
The static features of Thailand are the same as those of GSHA.
[217]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[218]:
# Fetch the static features table of the Thailand dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(73, 35)
[219]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[219]:
EVP_uncertainty(%) 0
HYRIV_ID 0
LRAD_uncertainty(%) 0
P_uncertainty(%) 0
SRAD_uncertainty(%) 0
T_uncertainty(%) 0
agency 0
area 0
cly_pc_uav 0
ele_mt_uav 0
ero_kh_uav 0
gla_pc_use 0
glc_cl_cmj 0
gwt_cm_cav 0
inu_pc_ult 0
lat 0
lit_cl_cmj 0
long 0
pet_uncertainty(%) 0
pnv_cl_cmj 0
prm_pc_use 0
sgr_dk_rav 0
slp_dg_uav 0
slt_pc_uav 0
snd_pc_uav 0
wet_pc_u01 0
wet_pc_u02 0
wet_pc_u03 0
wet_pc_u04 0
wet_pc_u05 0
wet_pc_u06 0
wet_pc_u07 0
wet_pc_u08 0
wet_pc_u09 0
wind_uncertainty(%) 0
dtype: int64
Find the columns that have at least one NaN value.
[220]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
No NaN values
[221]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[221]:
Series([], dtype: float64)
[222]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, q_cms_obs, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2,
swe_mm_era5, windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
USGS
[223]:
# Open the USGS dataset through the unified RainfallRunoff interface, reading
# from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('USGS', path=DATA_PATH, verbosity=0)
print(dataset)
USGS with 12004 stations, 5 dynamic and 27 static features
The static features of USGS are the same as those of HYSETS.
[224]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
Aspect_deg, Centroid_Lat_deg_N, Centroid_Lon_deg_E, Drainage_Area_GSIM_km2, Drainage_Area_km2,
Elevation_m, Flag_Artificial_Boundaries, Flag_GSIM_boundaries, Flag_Land_Use_Extraction,
Flag_Shape_Extraction, Flag_Subsoil_Extraction, Flag_Terrain_Extraction, Gravelius,
Land_Use_Crops_frac, Land_Use_Forest_frac, Land_Use_Grass_frac, Land_Use_Shrubs_frac,
Land_Use_Snow_Ice_frac, Land_Use_Urban_frac, Land_Use_Water_frac, Land_Use_Wetland_frac, Name,
Perimeter, Permeability_logk_m2, Porosity_frac, Slope_deg, Source
[225]:
# Fetch the static features table of the USGS dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(12004, 27)
[226]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
16551
[226]:
Source 0
Name 0
Centroid_Lat_deg_N 0
Centroid_Lon_deg_E 0
Drainage_Area_km2 0
Drainage_Area_GSIM_km2 11884
Flag_GSIM_boundaries 0
Flag_Artificial_Boundaries 0
Elevation_m 1
Slope_deg 1
Gravelius 1168
Perimeter 1168
Flag_Shape_Extraction 0
Aspect_deg 1
Flag_Terrain_Extraction 0
Land_Use_Forest_frac 3
Land_Use_Grass_frac 3
Land_Use_Wetland_frac 3
Land_Use_Water_frac 3
Land_Use_Urban_frac 3
Land_Use_Shrubs_frac 3
Land_Use_Crops_frac 3
Land_Use_Snow_Ice_frac 3
Flag_Land_Use_Extraction 0
Permeability_logk_m2 1152
Porosity_frac 1152
Flag_Subsoil_Extraction 0
dtype: int64
Find the columns that have at least one NaN value.
[227]:
# Show only the columns that contain at least one NaN value.
# A bare expression nested inside `if` is not auto-displayed by the notebook
# (only the last top-level expression of a cell is), so render it explicitly.
nan_mask = df.isna().sum() > 0
if nan_mask.any():
    display(df.loc[:, nan_mask])  # `display` is injected by the IPython kernel
else:
    print('No NaN values')
[228]:
# Per-column NaN counts, limited to the columns that actually contain NaNs.
has_nan = df.isna().sum() > 0
df.loc[:, has_nan].isna().sum()
[228]:
Drainage_Area_GSIM_km2 11884
Elevation_m 1
Slope_deg 1
Gravelius 1168
Perimeter 1168
Aspect_deg 1
Land_Use_Forest_frac 3
Land_Use_Grass_frac 3
Land_Use_Wetland_frac 3
Land_Use_Water_frac 3
Land_Use_Urban_frac 3
Land_Use_Shrubs_frac 3
Land_Use_Crops_frac 3
Land_Use_Snow_Ice_frac 3
Permeability_logk_m2 1152
Porosity_frac 1152
dtype: int64
[229]:
# Print the dynamic feature names in sorted order, wrapped at 100 characters.
# Note: .sort() sorts the list returned by the property in place.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()
print(textwrap.fill(", ".join(dynamic_features), width=100))
obs_q_cms, pr, swe, tasmax, tasmin
WaterBenchIowa
[230]:
# Open the WaterBenchIowa dataset through the unified RainfallRunoff interface,
# reading from DATA_PATH; printing the object reports its station and feature counts.
dataset = RainfallRunoff('WaterBenchIowa', path=DATA_PATH, verbosity=0)
print(dataset)
WaterBenchIowa with 125 stations, 3 dynamic and 7 static features
[231]:
# Print the static feature names in sorted order, wrapped at 100 characters.
# NOTE(review): .sort() reorders the returned list in place; if the property
# returns the dataset's own list rather than a copy, later calls on `dataset`
# may observe the sorted order -- confirm.
static_features = dataset.static_features
static_features.sort()
print(textwrap.fill(", ".join(static_features), width=100))
area, loam, sandy_clay_loam, silt, silty_clay_loam, slope, travel_time
[232]:
# Fetch the static features table of the WaterBenchIowa dataset and report its
# (rows, columns) shape.
df = dataset.fetch_static_features()
print(df.shape)
(125, 7)
[233]:
# Total number of missing values, followed by the per-column breakdown.
nan_counts = df.isna().sum()
print(nan_counts.sum())
nan_counts
0
[233]:
travel_time 0
area 0
slope 0
loam 0
silt 0
sandy_clay_loam 0
silty_clay_loam 0
dtype: int64
Find the columns that have at least one NaN value.
[234]:
# Show the columns that contain at least one NaN value.
# A bare expression inside an `if` block is never echoed by the notebook
# (only a cell's final top-level expression is), so the selected frame is
# printed explicitly; otherwise that branch would produce no output at all.
nan_columns = df.isna().sum() > 0
if nan_columns.any():
    print(df.loc[:, nan_columns])
else:
    print('No NaN values')
No NaN values
[235]:
# Keep only the columns holding at least one NaN, then count NaNs in each.
# With no such columns the result is an empty Series.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[235]:
Series([], dtype: float64)
[236]:
# Print the dynamic feature names in sorted order as a comma-separated list,
# wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the returned list, as before
dynamic_text = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(dynamic_text, width=100)))
discharge, et, precipitation
Regional Datasets without observed streamflow
The following datasets do not have observed streamflow data. However, they behave similarly to the datasets with observed streamflow data.
GSHA
This dataset contains climate (dynamic) variables and static features for catchments around the world. These dynamic and static features are used for other dataset classes like Spain, Thailand and Japan.
[237]:
# Open the GSHA dataset through the unified RainfallRunoff interface
# (data is read from / downloaded to DATA_PATH) and print its summary.
dataset = RainfallRunoff(
    'GSHA',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
GSHA with 21568 stations, 26 dynamic and 35 static features
[238]:
# Print the static feature names in sorted order as a comma-separated list,
# wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the returned list, as before
static_text = ", ".join(static_features)
print("\n".join(textwrap.wrap(static_text, width=100)))
EVP_uncertainty(%), HYRIV_ID, LRAD_uncertainty(%), P_uncertainty(%), SRAD_uncertainty(%),
T_uncertainty(%), agency, area, cly_pc_uav, ele_mt_uav, ero_kh_uav, gla_pc_use, glc_cl_cmj,
gwt_cm_cav, inu_pc_ult, lat, lit_cl_cmj, long, pet_uncertainty(%), pnv_cl_cmj, prm_pc_use,
sgr_dk_rav, slp_dg_uav, slt_pc_uav, snd_pc_uav, wet_pc_u01, wet_pc_u02, wet_pc_u03, wet_pc_u04,
wet_pc_u05, wet_pc_u06, wet_pc_u07, wet_pc_u08, wet_pc_u09, wind_uncertainty(%)
[239]:
# Load the static features of the current dataset into a DataFrame and
# report its size as (rows, columns).
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(21568, 35)
[240]:
# Count missing values per column once, print the grand total, then leave the
# per-column Series as the cell's last expression so the notebook displays it.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
3442
[240]:
EVP_uncertainty(%) 1224
HYRIV_ID 0
LRAD_uncertainty(%) 630
P_uncertainty(%) 0
SRAD_uncertainty(%) 0
T_uncertainty(%) 8
agency 0
area 0
cly_pc_uav 0
ele_mt_uav 0
ero_kh_uav 0
gla_pc_use 0
glc_cl_cmj 0
gwt_cm_cav 0
inu_pc_ult 0
lat 0
lit_cl_cmj 0
long 0
pet_uncertainty(%) 1580
pnv_cl_cmj 0
prm_pc_use 0
sgr_dk_rav 0
slp_dg_uav 0
slt_pc_uav 0
snd_pc_uav 0
wet_pc_u01 0
wet_pc_u02 0
wet_pc_u03 0
wet_pc_u04 0
wet_pc_u05 0
wet_pc_u06 0
wet_pc_u07 0
wet_pc_u08 0
wet_pc_u09 0
wind_uncertainty(%) 0
dtype: int64
Find the columns that have at least one NaN value.
[241]:
# Show the columns that contain at least one NaN value.
# A bare expression inside an `if` block is never echoed by the notebook
# (only a cell's final top-level expression is), so the selected frame is
# printed explicitly; otherwise this branch produces no output at all.
nan_columns = df.isna().sum() > 0
if nan_columns.any():
    print(df.loc[:, nan_columns])
else:
    print('No NaN values')
[242]:
# Keep only the columns holding at least one NaN, then count NaNs in each.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[242]:
EVP_uncertainty(%) 1224
LRAD_uncertainty(%) 630
T_uncertainty(%) 8
pet_uncertainty(%) 1580
dtype: int64
[243]:
# Print the dynamic feature names in sorted order as a comma-separated list,
# wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the returned list, as before
dynamic_text = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(dynamic_text, width=100)))
aet_mm_gleam, aet_mm_rea, airtemp_C_mean_era5, airtemp_C_mean_eustace, airtemp_C_mean_merra2,
gw_percent, lai, lwdownrad_wm2_era5, lwdownrad_wm2_merra2, pcp_mm_emearth, pcp_mm_mswep,
pet_mm_gleam, pet_mm_hpet, sml1, sml2, sml3, sml4, solrad_wm2_era5, solrad_wm2_merra2, swe_mm_era5,
windspeed_mps_era5, windspeed_mps_merra, windspeedu_mps_era5, windspeedu_mps_merra,
windspeedv_mps_era5, windspeedv_mps_merra
EStreams
The EStreams dataset does not contain observed streamflow data. However, it contains other climate (dynamic) variables and static features for European catchments. These dynamic and static features are used for other European dataset classes like Portugal, Spain, Finland, Italy, Ireland and Poland.
[244]:
# Open the EStreams dataset through the unified RainfallRunoff interface
# (data is read from / downloaded to DATA_PATH) and print its summary.
dataset = RainfallRunoff(
    'EStreams',
    path=DATA_PATH,
    verbosity=0,
)
print(dataset)
EStreams with 15047 stations, 9 dynamic and 208 static features
[245]:
# Print the static feature names in sorted order as a comma-separated list,
# wrapped at 100 characters per line.
static_features = dataset.static_features
static_features.sort()  # in-place sort of the returned list, as before
static_text = ", ".join(static_features)
print("\n".join(textwrap.wrap(static_text, width=100)))
area, area_calc, area_flag, area_perc, aridity, baseflow_index, bedrk_dep, dam_num, dam_yr_first,
dam_yr_last, duplicated_suspect, ele_mt_max, ele_mt_mean, ele_mt_min, elevation, elon_ratio,
end_date, end_date_climatic, end_date_hydro, flat_area_fra, frac_snow, gauge_country, gauge_id,
gauge_name, gauge_provider, gauges_upstream, hfd_mean, hfd_std, hp_dur, hp_freq, hp_time, hq_dur,
hq_freq, lai_01, lai_02, lai_03, lai_04, lai_05, lai_06, lai_07, lai_08, lai_09, lai_10, lai_11,
lai_12, lai_mean, lakes_num, lakes_tot_area, lakes_tot_vol, lat, lat_snap, lit_dom, lit_fra_ev,
lit_fra_ig, lit_fra_mt, lit_fra_nd, lit_fra_pa, lit_fra_pb, lit_fra_pi, lit_fra_py, lit_fra_sc,
lit_fra_sm, lit_fra_ss, lit_fra_su, lit_fra_va, lit_fra_vb, lit_fra_vi, lon, lon_snap, lp_dur,
lp_freq, lp_time, lq_dur, lq_freq, ndvi_01, ndvi_02, ndvi_03, ndvi_04, ndvi_05, ndvi_06, ndvi_07,
ndvi_08, ndvi_09, ndvi_10, ndvi_11, ndvi_12, ndvi_mean, num_continuous_days, num_days,
num_days_gaps, num_months, num_years, num_years_climatic, num_years_hydro, p_mean, p_seasonality,
pet_mean, q_5, q_95, q_elas_Sankarasubramanian, q_mean, q_runoff_ratio, res_num, res_tot_sto, river,
root_dep_max, root_dep_mean, root_dep_med, root_dep_min, root_dep_p05, root_dep_p25, root_dep_p75,
root_dep_p90, slope_sawicz, slp_dg_mean, sno_cov_01, sno_cov_02, sno_cov_03, sno_cov_04, sno_cov_05,
sno_cov_06, sno_cov_07, sno_cov_08, sno_cov_09, sno_cov_10, sno_cov_11, sno_cov_12, sno_cov_mean,
soil_bd_max, soil_bd_mean, soil_bd_med, soil_bd_min, soil_bd_p05, soil_bd_p25, soil_bd_p75,
soil_bd_p90, soil_fra_clay_max, soil_fra_clay_mean, soil_fra_clay_med, soil_fra_clay_min,
soil_fra_clay_p05, soil_fra_clay_p25, soil_fra_clay_p75, soil_fra_clay_p90, soil_fra_grav_max,
soil_fra_grav_mean, soil_fra_grav_med, soil_fra_grav_min, soil_fra_grav_p05, soil_fra_grav_p25,
soil_fra_grav_p75, soil_fra_grav_p90, soil_fra_sand_max, soil_fra_sand_mean, soil_fra_sand_med,
soil_fra_sand_min, soil_fra_sand_p05, soil_fra_sand_p25, soil_fra_sand_p75, soil_fra_sand_p90,
soil_fra_silt_max, soil_fra_silt_mean, soil_fra_silt_med, soil_fra_silt_min, soil_fra_silt_p05,
soil_fra_silt_p25, soil_fra_silt_p75, soil_fra_silt_p90, soil_oc_max, soil_oc_mean, soil_oc_med,
soil_oc_min, soil_oc_p05, soil_oc_p25, soil_oc_p75, soil_oc_p90, soil_tawc_max, soil_tawc_mean,
soil_tawc_med, soil_tawc_min, soil_tawc_p05, soil_tawc_p25, soil_tawc_p75, soil_tawc_p90,
start_date, start_date_climatic, start_date_hydro, stations_dens_p, stations_dens_rh,
stations_dens_sp, stations_dens_swr, stations_dens_tmax, stations_dens_tmean, stations_dens_tmin,
stations_dens_ws, stations_num_p, stations_num_rh, stations_num_sp, stations_num_swr,
stations_num_tmax, stations_num_tmean, stations_num_tmin, stations_num_ws, steep_area_fra,
strm_dens, tot_area, watershed_group, zero_q_freq
[246]:
# Load the static features of the current dataset into a DataFrame and
# report its size as (rows, columns).
df = dataset.fetch_static_features()
print((len(df.index), len(df.columns)))
(15047, 208)
[247]:
# Count missing values per column once, print the grand total, then leave the
# per-column Series as the cell's last expression so the notebook displays it.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())
nan_per_column
191618
[247]:
static_features
area 1115
area_calc 0
area_flag 0
area_perc 1116
aridity 3179
...
steep_area_fra 0
strm_dens 0
tot_area 0
watershed_group 0
zero_q_freq 3269
Length: 208, dtype: int64
Find the columns that have at least one NaN value.
[248]:
# Show the columns that contain at least one NaN value.
# A bare expression inside an `if` block is never echoed by the notebook
# (only a cell's final top-level expression is), so the selected frame is
# printed explicitly; otherwise this branch produces no output at all.
nan_columns = df.isna().sum() > 0
if nan_columns.any():
    print(df.loc[:, nan_columns])
else:
    print('No NaN values')
[249]:
# Keep only the columns holding at least one NaN, then count NaNs in each.
columns_with_nan = df.isna().any()
df.loc[:, columns_with_nan].isna().sum()
[249]:
static_features
area 1115
area_perc 1116
aridity 3179
baseflow_index 3380
bedrk_dep 5
...
stations_dens_tmax 1
stations_dens_tmean 1
stations_dens_tmin 1
stations_dens_ws 1
zero_q_freq 3269
Length: 158, dtype: int64
[250]:
# Print the dynamic feature names in sorted order as a comma-separated list,
# wrapped at 100 characters per line.
dynamic_features = dataset.dynamic_features
dynamic_features.sort()  # in-place sort of the returned list, as before
dynamic_text = ", ".join(dynamic_features)
print("\n".join(textwrap.wrap(dynamic_text, width=100)))
airpres_hpamin_, airtemp_C_max, airtemp_C_mean, airtemp_C_min, pcp_mm, pet_mm, rh_%, solrad_wm2,
windspeed_mps