CAMELS Australia

This example demonstrates how to use the aqua_fetch package to download and explore the CAMELS Australia dataset using the :py:class:`aqua_fetch.RainfallRunoff` class. Although we show it for CAMELS Australia, the same can be done for all other rainfall-runoff datasets.

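Note: This file runs online on Read the Docs every time the documentation is built. The server hosting the CAMELS_AUS data is sometimes down, in which case the download fails with an HTTPError (for example ``HTTP Error 500: Internal Server Error`` or ``HTTP Error 503: Service Unavailable``) and every subsequent cell fails with a NameError because ``dataset`` was never created.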

[1]:
import os
import site

# Make the repository checkout importable when this notebook is executed
# directly (e.g. during the documentation build).
if __name__ == '__main__':
    # NOTE: '__file__' is a quoted string, not the module attribute, so
    # realpath() resolves it against the current working directory. The first
    # dirname() strips that dummy component and the remaining three walk up
    # three directory levels from the working directory.
    wd_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath('__file__')))))
    #wd_dir = os.path.dirname(os.path.realpath('__file__'))
    #wd_dir = os.path.dirname(os.path.dirname(os.path.realpath('__file__')))
    print(wd_dir)
    # Add the resolved directory to sys.path so the imports below can find
    # the package inside the checkout.
    site.addsitedir(wd_dir)

from tabulight import EDA
import matplotlib.pyplot as plt
from easy_mpl import scatter, hist
from easy_mpl.utils import process_cbar
from aqua_fetch import RainfallRunoff
from aqua_fetch.utils import print_info
/home/docs/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest
[2]:
print_info()
numpy 1.26.4
pandas 2.1.4
water_quality 0.1.0
python 3.12.7 (main, Nov  5 2024, 17:00:24) [GCC 9.4.0]
os posix
matplotlib 3.8.4
xarray 2024.7.0
netCDF4 1.7.2
scipy 1.15.1
Script Executed on:  19 January 2025 12:13:12
tot_cpus 2
avail_cpus 2
mem_gib 7.612831115722656
[3]:
dataset = RainfallRunoff('CAMELS_AUS', version=1,
                         #overwrite=True,
                         #path='/mnt/datawaha/hyex/atr/gscad_database/raw/CAMELS_AUS_V1'
                         )
01_id_name_metadata.zip already exists at /home/docs/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/data/CAMELS/CAMELS_AUS
Downloading 02_location_boundary_area.zip from https://download.pangaea.de/dataset/921850/files/02_location_boundary_area.zip at /home/docs/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/data/CAMELS/CAMELS_AUS/02_location_boundary_area.zip
HTTP Error for https://download.pangaea.de/dataset/921850/files/02_location_boundary_area.zip to download 02_location_boundary_area.zip
/home/docs/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/rr/_camels.py:276: SyntaxWarning: invalid escape sequence '\s'
  sep="\s+|;|:",
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 dataset = RainfallRunoff('CAMELS_AUS', version=1, 
      2                          #overwrite=True,
      3                          #path='/mnt/datawaha/hyex/atr/gscad_database/raw/CAMELS_AUS_V1'
      4                          )

File ~/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/rr/__init__.py:296, in RainfallRunoff.__init__(self, dataset, path, overwrite, to_netcdf, processes, remove_zip, verbosity, **kwargs)
    293 if dataset not in DATASETS:
    294     raise ValueError(f"Dataset {dataset} not available")
--> 296 self.dataset = DATASETS[dataset](
    297     path=path,
    298     overwrite=overwrite,
    299     to_netcdf=to_netcdf,
    300     processes=processes,
    301     remove_zip=remove_zip,
    302     verbosity=verbosity,
    303     **kwargs
    304 )

File ~/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/rr/_camels.py:835, in CAMELS_AUS.__init__(self, path, version, to_netcdf, overwrite, verbosity, **kwargs)
    833     if verbosity > 0:
    834         print(f"Downloading {_file} from {url + _file} at {fpath}")
--> 835     download(url + _file, outdir=self.path, fname=_file)
    836 elif verbosity > 0:
    837     print(f"{_file} already exists at {self.path}")

File ~/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/utils.py:101, in download(url, outdir, fname, verbosity)
     99 except ulib.HTTPError as e:
    100     print(f"HTTP Error for {url} to download {fname}")
--> 101     raise e
    103 filename = filename_from_url(url)
    105 if fname:

File ~/checkouts/readthedocs.org/user_builds/water-datasets/checkouts/latest/aqua_fetch/utils.py:98, in download(url, outdir, fname, verbosity)
     95 binurl = urlparse.urlunsplit(binurl)
     97 try:
---> 98     (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
     99 except ulib.HTTPError as e:
    100     print(f"HTTP Error for {url} to download {fname}")

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:240, in urlretrieve(url, filename, reporthook, data)
    223 """
    224 Retrieve a URL into a temporary location on disk.
    225
   (...)
    236 data file as well as the resulting HTTPMessage object.
    237 """
    238 url_type, path = _splittype(url)
--> 240 with contextlib.closing(urlopen(url, data)) as fp:
    241     headers = fp.info()
    243     # Just return the local path and the "headers" for file://
    244     # URLs. No sense in performing a copy unless requested.

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:215, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    213 else:
    214     opener = _opener
--> 215 return opener.open(url, data, timeout)

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:521, in OpenerDirector.open(self, fullurl, data, timeout)
    519 for processor in self.process_response.get(protocol, []):
    520     meth = getattr(processor, meth_name)
--> 521     response = meth(req, response)
    523 return response

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:630, in HTTPErrorProcessor.http_response(self, request, response)
    627 # According to RFC 2616, "2xx" code indicates that the client's
    628 # request was successfully received, understood, and accepted.
    629 if not (200 <= code < 300):
--> 630     response = self.parent.error(
    631         'http', request, response, code, msg, hdrs)
    633 return response

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:559, in OpenerDirector.error(self, proto, *args)
    557 if http_err:
    558     args = (dict, 'default', 'http_error_default') + orig_args
--> 559     return self._call_chain(*args)

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:492, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    490 for handler in handlers:
    491     func = getattr(handler, meth_name)
--> 492     result = func(*args)
    493     if result is not None:
    494         return result

File ~/.asdf/installs/python/3.12.7/lib/python3.12/urllib/request.py:639, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
    638 def http_error_default(self, req, fp, code, msg, hdrs):
--> 639     raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 503: Service Unavailable
[4]:
dataset.start
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 dataset.start

NameError: name 'dataset' is not defined
[5]:
dataset.end
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 dataset.end

NameError: name 'dataset' is not defined
[6]:
stations = dataset.stations()
len(stations)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 stations = dataset.stations()
      2 len(stations)

NameError: name 'dataset' is not defined
[7]:
stations[0:10]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 stations[0:10]

NameError: name 'stations' is not defined

Static Features

[8]:
dataset.static_features
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 dataset.static_features

NameError: name 'dataset' is not defined
[9]:
len(dataset.static_features)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 len(dataset.static_features)

NameError: name 'dataset' is not defined
[10]:
mrvbf = 'proportion of catchment occupied by classes of MultiResolution Valley Bottom Flatness'
lc01 = 'land cover codes'
nvis = 'vegetation sub-groups'
anngro = 'Average annual growth index value for some plants'
gromega = 'Seasonality of growth index value'
npp = 'net primary productivity'
[11]:
static = dataset.fetch_static_features(stations=stations)
static.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 static = dataset.fetch_static_features(stations=stations)
      2 static.shape

NameError: name 'dataset' is not defined
[12]:
# EDA(data=static, save=False).heatmap()
[13]:
physical_features = []
soil_features = []
geological_features = []
flow_characteristics = []

static = static.dropna(axis=1)
static.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 6
      3 geological_features = []
      4 flow_characteristics = []
----> 6 static = static.dropna(axis=1)
      7 static.shape

NameError: name 'static' is not defined
[14]:
coords = dataset.stn_coords()
coords
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 1
----> 1 coords = dataset.stn_coords()
      2 coords

NameError: name 'dataset' is not defined
[15]:
dataset.plot_stations()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 1
----> 1 dataset.plot_stations()

NameError: name 'dataset' is not defined
[16]:
lat = coords['lat'].astype(float).values.reshape(-1,)
long = coords['long'].astype(float).values.reshape(-1,)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 lat = coords['lat'].astype(float).values.reshape(-1,)
      2 long = coords['long'].astype(float).values.reshape(-1,)

NameError: name 'coords' is not defined
[17]:
idx = 0
ax_num = 0
# Upper bound on the number of points drawn per map; presumably the number
# of stations in CAMELS_AUS version 1 -- TODO confirm.
en = 222

fig, axes = plt.subplots(5, 5, figsize=(15, 12))
axes = axes.flatten()

# Fill the 5x5 grid with one map per static feature, starting at column
# ``idx``. Columns that raise ValueError (e.g. values that cannot be cast to
# float) are skipped without consuming an axis. The ``idx < n_features``
# guard ends the loop once the columns are exhausted instead of failing with
# an IndexError when fewer than 25 plottable columns remain.
n_features = static.shape[1]
while ax_num < len(axes) and idx < n_features:

    val = static.iloc[:, idx]
    idx += 1

    try:
        c = val.astype(float).values.reshape(-1,)

        ax = axes[ax_num]
        ax, sc = scatter(long[0:en], lat[0:en], c=c[0:en], cmap="hot", show=False, ax=ax)

        process_cbar(ax, sc, border=False, title=val.name)
        ax_num += 1
    except ValueError:
        continue

plt.tight_layout()
plt.show()
print(idx)  # index of the first column that has not been visited yet
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 9
      5 axes = axes.flatten()
      7 while ax_num < 25:
----> 9     val = static.iloc[:, idx]
     10     idx += 1
     12     try:

NameError: name 'static' is not defined
../_images/auto_examples_camels_australia_18_1.svg
[18]:
idx = 32
ax_num = 0
# Upper bound on the number of points drawn per map; presumably the number
# of stations in CAMELS_AUS version 1 -- TODO confirm.
en = 222

fig, axes = plt.subplots(5, 5, figsize=(15, 12))
axes = axes.flatten()

# Fill the 5x5 grid with one map per static feature, starting at column
# ``idx``. Columns that raise ValueError (e.g. values that cannot be cast to
# float) are skipped without consuming an axis. The ``idx < n_features``
# guard ends the loop once the columns are exhausted instead of failing with
# an IndexError when fewer than 25 plottable columns remain.
n_features = static.shape[1]
while ax_num < len(axes) and idx < n_features:

    val = static.iloc[:, idx]
    idx += 1

    try:
        c = val.astype(float).values.reshape(-1,)

        ax = axes[ax_num]
        ax, sc = scatter(long[0:en], lat[0:en], c=c[0:en], cmap="hot", show=False, ax=ax)

        process_cbar(ax, sc, border=False, title=val.name)
        ax_num += 1
    except ValueError:
        continue

plt.tight_layout()
plt.show()
print(idx)  # index of the first column that has not been visited yet
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 9
      5 axes = axes.flatten()
      7 while ax_num < 25:
----> 9     val = static.iloc[:, idx]
     10     idx += 1
     12     try:

NameError: name 'static' is not defined
../_images/auto_examples_camels_australia_19_1.svg
[19]:
idx = 59
ax_num = 0
# Upper bound on the number of points drawn per map; presumably the number
# of stations in CAMELS_AUS version 1 -- TODO confirm.
en = 222

fig, axes = plt.subplots(5, 5, figsize=(15, 12))
axes = axes.flatten()

# Fill the 5x5 grid with one map per static feature, starting at column
# ``idx``. Columns that raise ValueError (e.g. values that cannot be cast to
# float) are skipped without consuming an axis. The ``idx < n_features``
# guard ends the loop once the columns are exhausted instead of failing with
# an IndexError when fewer than 25 plottable columns remain.
n_features = static.shape[1]
while ax_num < len(axes) and idx < n_features:

    val = static.iloc[:, idx]
    idx += 1

    try:
        c = val.astype(float).values.reshape(-1,)

        ax = axes[ax_num]
        ax, sc = scatter(long[0:en], lat[0:en], c=c[0:en], cmap="hot", show=False, ax=ax)

        process_cbar(ax, sc, border=False, title=val.name)
        ax_num += 1
    except ValueError:
        continue

plt.tight_layout()
plt.show()
print(idx)  # index of the first column that has not been visited yet
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 9
      5 axes = axes.flatten()
      7 while ax_num < 25:
----> 9     val = static.iloc[:, idx]
     10     idx += 1
     12     try:

NameError: name 'static' is not defined
../_images/auto_examples_camels_australia_20_1.svg

Dynamic Features

[20]:
dataset.dynamic_features
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 dataset.dynamic_features

NameError: name 'dataset' is not defined

Streamflow

[21]:
streamflow = dataset.q_mmd()

streamflow.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 streamflow = dataset.q_mmd()
      3 streamflow.shape

NameError: name 'dataset' is not defined
[22]:
streamflow
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 streamflow

NameError: name 'streamflow' is not defined
[23]:
# EDA(data=streamflow, save=False).heatmap()
[24]:
# Histogram grid of streamflow: the 7x7 layout provides 49 axes, so only the
# first 49 columns of ``streamflow`` are drawn, one column per axis.
fig, axes = plt.subplots(7, 7, figsize=(10, 10), sharey="all")

for idx, ax in enumerate(axes.flat):

    # show=False defers rendering until every subplot has been filled
    hist(streamflow.iloc[:, idx].values.reshape(-1,),
         bins=20,
         ax=ax,
         show=False
        )

plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 5
      1 fig, axes = plt.subplots(7, 7, figsize=(10, 10), sharey="all")
      3 for idx, ax in enumerate(axes.flat):
----> 5     hist(streamflow.iloc[:, idx].values.reshape(-1,),
      6          bins=20,
      7          ax=ax,
      8          show=False
      9         )
     11 plt.show()

NameError: name 'streamflow' is not defined
../_images/auto_examples_camels_australia_27_1.svg
[25]:
_ = hist(streamflow.skew().values.reshape(-1,), bins=50)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 _ = hist(streamflow.skew().values.reshape(-1,), bins=50)

NameError: name 'streamflow' is not defined
[26]:
df = dataset.fetch(stations=1, as_dataframe=True)
df = df.unstack() # the returned dataframe is a multi-indexed dataframe so we have to unstack it
df.columns = df.columns.get_level_values('dynamic_features')
df.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 df = dataset.fetch(stations=1, as_dataframe=True)
      2 df = df.unstack() # the returned dataframe is a multi-indexed dataframe so we have to unstack it
      3 df.columns = df.columns.get_level_values('dynamic_features')

NameError: name 'dataset' is not defined
[27]:
df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 df

NameError: name 'df' is not defined
[28]:
# get name of all stations as list
stns = dataset.stations()
len(stns)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 2
      1 # get name of all stations as list
----> 2 stns = dataset.stations()
      3 len(stns)

NameError: name 'dataset' is not defined

Get data of 10% of the stations as a dataframe

[29]:
df = dataset.fetch(0.1, as_dataframe=True)
df.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 df = dataset.fetch(0.1, as_dataframe=True)
      2 df.shape

NameError: name 'dataset' is not defined
[30]:
df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 df

NameError: name 'df' is not defined

The returned dataframe is a multi-indexed dataframe

[31]:
# NOTE(review): the result of this comparison is discarded -- a notebook only
# displays the value of the last expression in a cell. Wrap it in ``assert``
# or ``print`` if the check is meant to be visible/enforced.
df.index.names == ['time', 'dynamic_features']

df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 1
----> 1 df.index.names == ['time', 'dynamic_features']
      3 df

NameError: name 'df' is not defined

get data by station id

[32]:
df = dataset.fetch(stations='224214A', as_dataframe=True).unstack()
df.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 df = dataset.fetch(stations='224214A', as_dataframe=True).unstack()
      2 df.shape

NameError: name 'dataset' is not defined
[33]:
df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 df

NameError: name 'df' is not defined

get names of available dynamic features

[34]:
dataset.dynamic_features
# get only selected dynamic features
data = dataset.fetch(1, as_dataframe=True,
dynamic_features=['airtemp_C_awap_max', 'pcp_mm_awap', 'aet_mm_silo_morton', 'q_cms_obs']).unstack()
data.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 dataset.dynamic_features
      2 # get only selected dynamic features
      3 data = dataset.fetch(1, as_dataframe=True,
      4 dynamic_features=['airtemp_C_awap_max', 'pcp_mm_awap', 'aet_mm_silo_morton', 'q_cms_obs']).unstack()

NameError: name 'dataset' is not defined
[35]:
data
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 data

NameError: name 'data' is not defined
[36]:
# get names of available static features
dataset.static_features
# get data of 10 random stations
df = dataset.fetch(10, as_dataframe=True)
df.shape  # remember this is a multiindexed dataframe
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 2
      1 # get names of available static features
----> 2 dataset.static_features
      3 # get data of 10 random stations
      4 df = dataset.fetch(10, as_dataframe=True)

NameError: name 'dataset' is not defined
[37]:
# when we get both static and dynamic data, the returned data is a dictionary
# with ``static`` and ``dynamic`` keys.
data = dataset.fetch(stations='224214A', static_features="all", as_dataframe=True)
data['static'].shape, data['dynamic'].shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 3
      1 # when we get both static and dynamic data, the returned data is a dictionary
      2 # with ``static`` and ``dyanic`` keys.
----> 3 data = dataset.fetch(stations='224214A', static_features="all", as_dataframe=True)
      4 data['static'].shape, data['dynamic'].shape

NameError: name 'dataset' is not defined
[38]:
data['static']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 data['static']

NameError: name 'data' is not defined
[39]:
data['dynamic']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 1
----> 1 data['dynamic']

NameError: name 'data' is not defined

Get data of all stations as an xarray dataset

[40]:
data = dataset.fetch()
data
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 data = dataset.fetch()
      2 data

NameError: name 'dataset' is not defined