import os
from typing import List, Union, Dict
import pandas as pd
from .camels import Camels
from ..utils import check_attributes
from ._map import (
catchment_area,
gauge_latitude,
gauge_longitude,
slope
)
class Simbi(Camels):
    """
    Hydro-meteorological dataset for 24 catchments of Haiti.

    It provides monthly rainfall from 1905 - 2005, daily rainfall from
    1920-1940, 70 daily streamflow series, and 23 monthly temperature series,
    following `Bathelemy et al., 2023 <https://doi.org/10.23708/02POK6>`_ and
    `Bathelemy et al., 2024 <https://doi.org/10.5194/essd-16-2073-2024>`_ .

    Examples
    ---------
    >>> from water_datasets import Simbi
    >>> simbi = Simbi()
    """

    # Mapping of file name -> download link.
    # NOTE(review): '08_SIMBI_METADATA.zip' and 'SIMBI_README.txt' point to the
    # same datafile id (43644); one of the two is probably wrong - confirm
    # against the dataverse record before changing.
    url = {
        '00_SIMBI_OBSERVED_DATA.zip': "https://dataverse.ird.fr/api/access/datafile/44141",
        '01_SIMBI_CATCHMENT.zip': "https://dataverse.ird.fr/api/access/datafile/43638",
        '02_SIMBI_SIMULATED_STREAMFLOW.zip': "https://dataverse.ird.fr/api/access/datafile/43639",
        '03_SIMBI_ATTRIBUTE.zip': "https://dataverse.ird.fr/api/access/datafile/43640",
        "04_SIMBI_MAP.zip": "https://dataverse.ird.fr/api/access/datafile/43646",
        "08_SIMBI_METADATA.zip": "https://dataverse.ird.fr/api/access/datafile/43644",
        'SIMBI_README.txt': 'https://dataverse.ird.fr/api/access/datafile/43644'
    }

    def __init__(
            self,
            path: Union[str, None] = None,
            overwrite: bool = False,
            verbosity: int = 1,
            **kwargs
    ):
        """
        Arguments:
            path: path where the Simbi dataset has been downloaded. This path
                must contain the files listed in :attr:`url`. If None, then
                the data will be downloaded.
            overwrite: forwarded to ``_download``.
            verbosity: forwarded to the parent class constructor.
            **kwargs: any further keyword arguments for the parent class
                constructor.
        """
        super().__init__(path=path, verbosity=verbosity, **kwargs)
        self._download(overwrite=overwrite)

        # the static feature names are the columns of the merged static table
        self._static_features = self.static_data().columns.tolist()
        self._dynamic_features = ['q', 'pcp', 'temp']

        self.boundary_file = os.path.join(
            self.path, '01_SIMBI_CATCHMENT', 'Haitian_Catchment.shp')
        self._create_boundary_id_map(self.boundary_file, 1)

        self.dyn_fname = ''

    # ------------------------------------------------------------------
    # names and meta information
    # ------------------------------------------------------------------
    @property
    def static_map(self) -> Dict[str, str]:
        """Mapping from column names of this dataset to the standard names."""
        # NOTE(review): latitude/longitude are mapped from the ``*_Cent``
        # columns here while ``_coords_name`` uses the ``*_Exu`` columns -
        # confirm that this difference is intended.
        return {
            'Area': catchment_area(),
            'Lat_Cent': gauge_latitude(),
            'Slope': slope('degrees'),
            'Lon_Cent': gauge_longitude(),
        }

    @property
    def static_features(self):
        """Names of static features, collected once in ``__init__``."""
        return self._static_features

    @property
    def dynamic_features(self):
        """Names of dynamic features i.e. ``['q', 'pcp', 'temp']``."""
        return self._dynamic_features

    @property
    def _coords_name(self) -> List[str]:
        return ['Lat_Exu', 'Lon_Exu']

    @property
    def _area_name(self) -> str:
        return 'Area'

    @property
    def start(self):
        return pd.Timestamp("19200101")

    @property
    def end(self):
        return pd.Timestamp("20051231")

    # ------------------------------------------------------------------
    # folders inside ``self.path``
    # ------------------------------------------------------------------
    @property
    def daily_q_path(self):
        return os.path.join(self.path, '00_SIMBI_OBSERVED_DATA', '02_DAILY_STREAMFLOW')

    @property
    def daily_pcp_path(self):
        return os.path.join(self.path, '00_SIMBI_OBSERVED_DATA', '01_DAILY_RAINFALL')

    @property
    def daily_pcp_20_40_path(self):
        return os.path.join(self.daily_pcp_path, '1920_1940')

    @property
    def daily_pcp_48_60_path(self):
        # the property name says 48_60 but the folder is '1948_1966'; the
        # name is kept because it is part of the public interface
        return os.path.join(self.daily_pcp_path, '1948_1966')

    @property
    def attributes_path(self):
        return os.path.join(self.path, '03_SIMBI_ATTRIBUTE')

    @property
    def clim_sig_path(self):
        return os.path.join(self.attributes_path, '01_CLIMATIC_SIGNATURE')

    @property
    def daily_clim_sig_path(self):
        return os.path.join(self.clim_sig_path, '02_DAILY')

    @property
    def monthly_clim_sig_path(self):
        return os.path.join(self.clim_sig_path, '01_MONTHLY')

    @property
    def other_attrs_path(self):
        return os.path.join(self.attributes_path, '02_OTHERS')

    @property
    def temp_path(self):
        return os.path.join(
            self.path, '00_SIMBI_OBSERVED_DATA', '05_DAILY_LONG_TERM_AVERAGE_TEMPERATURE')

    # ------------------------------------------------------------------
    # station listings
    # ------------------------------------------------------------------
    def stations(self) -> List[str]:
        """Returns names/IDs of 24 stations which have all (boundary, streamflow,
        static features) data. Although there are 70 stations which have daily
        streamflow data, only 24 of them have static + boundary data.
        """
        return self.boundary_stations()

    def all_stations(self) -> List[str]:
        """
        Returns the IDs '001' ... '070'. Not all stations have all data.
        """
        return [f"{i:03d}" for i in range(1, 71)]

    def q_stations(self) -> List[str]:
        """
        Returns names/IDs ('001' ... '070') of 70 stations with daily
        streamflow data.
        """
        return [f"{i:03d}" for i in range(1, 71)]

    def pcp_stations(self) -> List[str]:
        """
        Returns IDs of 74 stations with daily rainfall data.

        The IDs are parsed from the file names found in the two daily
        rainfall folders and returned sorted, without duplicates.
        """
        ids = set()
        for folder in (self.daily_pcp_20_40_path, self.daily_pcp_48_60_path):
            # file names look like '<prefix>_<id>.<ext>'; keep the <id> part
            ids.update(
                fname.split('.')[0].split('_')[1] for fname in os.listdir(folder)
            )
        # sorted so that the order is deterministic between runs
        return sorted(ids)

    def temp_stations(self) -> List[str]:
        """
        Returns names/IDs of 21 stations with daily temperature data.

        The IDs are parsed from the file names found in ``temp_path``.
        """
        return [stn.split('.')[0].split('_')[1] for stn in os.listdir(self.temp_path)]

    def boundary_stations(self) -> List[str]:
        """
        Returns names/IDs of 24 stations with boundary data.
        """
        # keys of the boundary map look like '<prefix>-<id>'
        return [stn.split('-')[1] for stn in self.bndry_id_map.keys()]

    def static_data_stations(self) -> List[str]:
        """
        Returns names/IDs of 24 stations with static data.
        """
        return self.static_data().index.tolist()

    # ------------------------------------------------------------------
    # static data
    # ------------------------------------------------------------------
    def _read_attr_csv(
            self,
            folder: str,
            fname: str,
            suffix: Union[str, None] = None,
    ) -> pd.DataFrame:
        """
        Read one attribute/signature csv file, indexed by station ID.

        Parameters
        ----------
        folder : str
            directory which contains the file
        fname : str
            name of the csv file
        suffix : str, optional
            if given, it is appended to every column name so that columns of
            different files do not clash when they are concatenated.

        Returns
        -------
        pd.DataFrame
            dataframe whose index holds the token after the first '-' of the
            row labels in the file.
        """
        df = pd.read_csv(os.path.join(folder, fname), index_col=0)
        # row labels look like '<prefix>-<id>'; only the <id> is kept
        df.index = [label.split('-')[1] for label in df.index]
        if suffix is not None:
            df.columns = [f"{col}{suffix}" for col in df.columns]
        return df

    def daily_bsi(self) -> pd.DataFrame:
        """
        Read the daily baseflow index values ('baseflow_index.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'baseflow_index.csv', '_d')

    def daily_high_q_dur(self) -> pd.DataFrame:
        """
        Read the daily high flow duration values ('high_q_dur.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'high_q_dur.csv', '_d_hq_dur')

    def daily_high_q_freq(self) -> pd.DataFrame:
        """
        Read the daily high flow frequency values ('high_q_freq.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'high_q_freq.csv', '_d_hq_freq')

    def daily_low_q_dur(self) -> pd.DataFrame:
        """
        Read the daily low flow duration values ('low_q_dur.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'low_q_dur.csv', '_d_lq_dur')

    def daily_low_q_freq(self) -> pd.DataFrame:
        """
        Read the daily low flow frequency values ('low_q_freq.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'low_q_freq.csv', '_d_lq_freq')

    def daily_q_mean(self) -> pd.DataFrame:
        """
        Read the daily mean flow values ('q_mean.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'q_mean.csv', '_d_mean')

    def daily_quantile_5(self) -> pd.DataFrame:
        """
        Read the daily 5th quantile flow values ('quantile_5.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'quantile_5.csv', '_d_q5')

    def daily_quantile_95(self) -> pd.DataFrame:
        """
        Read the daily 95th quantile flow values ('quantile_95.csv').
        """
        return self._read_attr_csv(self.daily_clim_sig_path, 'quantile_95.csv', '_d_q95')

    def daily_clim_sigs(self) -> pd.DataFrame:
        """
        Read all daily climate signatures, concatenated column-wise.
        """
        return pd.concat([
            self.daily_bsi(),
            self.daily_high_q_dur(),
            self.daily_high_q_freq(),
            self.daily_low_q_dur(),
            self.daily_low_q_freq(),
            self.daily_q_mean(),
            self.daily_quantile_5(),
            self.daily_quantile_95()
        ], axis=1)

    def monthly_aridity_runoff(self) -> pd.DataFrame:
        """
        Read the monthly aridity runoff values ('aridity_runoff.csv').
        """
        return self._read_attr_csv(self.monthly_clim_sig_path, 'aridity_runoff.csv', '_mon_arid')

    def monthly_average(self) -> pd.DataFrame:
        """
        Read the monthly average flow values ('average.csv').
        """
        # The index holds station labels, not dates, so it is read without
        # date parsing exactly like every other signature file.
        return self._read_attr_csv(self.monthly_clim_sig_path, 'average.csv', '_mon_avg')

    def monthly_QMNA5(self) -> pd.DataFrame:
        """
        Read the monthly QMNA5 flow values ('QMNA5.csv').
        """
        return self._read_attr_csv(self.monthly_clim_sig_path, 'QMNA5.csv', '_mon_QMNA5')

    def monthly_QMXA10(self) -> pd.DataFrame:
        """
        Read the monthly QMXA10 flow values ('QMXA10.csv').
        """
        return self._read_attr_csv(self.monthly_clim_sig_path, 'QMXA10.csv', '_mon_QMXA10')

    def monthly_quantile_5(self) -> pd.DataFrame:
        """
        Read the monthly 5th quantile flow values ('quantile_5.csv').
        """
        return self._read_attr_csv(self.monthly_clim_sig_path, 'quantile_5.csv', '_mon_q5')

    def monthly_quantile_95(self) -> pd.DataFrame:
        """
        Read the monthly 95th quantile flow values ('quantile_95.csv').
        """
        return self._read_attr_csv(self.monthly_clim_sig_path, 'quantile_95.csv', '_mon_q95')

    def monthly_clim_sigs(self) -> pd.DataFrame:
        """
        Read all monthly climate signatures, concatenated column-wise.
        """
        return pd.concat([
            self.monthly_aridity_runoff(),
            self.monthly_average(),
            self.monthly_QMNA5(),
            self.monthly_QMXA10(),
            self.monthly_quantile_5(),
            self.monthly_quantile_95()
        ], axis=1)

    def stream_density(self) -> pd.DataFrame:
        """
        Read the stream density values ('stream_density.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'stream_density.csv')

    def percent_lc_98(self) -> pd.DataFrame:
        """
        Read the land cover percentage values ('Percent_land_cover_98.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'Percent_land_cover_98.csv', '_lc_98')

    def percent_lc_95(self) -> pd.DataFrame:
        """
        Read the land cover percentage values ('Percent_land_cover_95.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'Percent_land_cover_95.csv', '_lc_95')

    def percent_geology(self) -> pd.DataFrame:
        """
        Read the geology percentage values ('Percent_geologic_class.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'Percent_geologic_class.csv', '_geol')

    def topography(self) -> pd.DataFrame:
        """
        Read the location and topography values ('location_and_topography.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'location_and_topography.csv')

    def hypsometric_curve(self) -> pd.DataFrame:
        """
        Read the hypsometric curve values ('hypsometric_curve.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'hypsometric_curve.csv')

    def aquifer_class(self) -> pd.DataFrame:
        """
        Read the aquifer class values ('Percent_aquifer_class.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'Percent_aquifer_class.csv')

    def carb_sed_magma(self) -> pd.DataFrame:
        """
        Read the carbonated sedimentary and magmatic values
        ('Percent_carb_sediment_magma.csv').
        """
        return self._read_attr_csv(self.other_attrs_path, 'Percent_carb_sediment_magma.csv')

    def other_attributes(self) -> pd.DataFrame:
        """
        Read all attributes from the '02_OTHERS' folder, concatenated
        column-wise.
        """
        return pd.concat([
            self.stream_density(),
            self.percent_lc_98(),
            self.percent_lc_95(),
            self.percent_geology(),
            self.topography(),
            self.hypsometric_curve(),
            self.aquifer_class(),
            self.carb_sed_magma()
        ], axis=1)

    def clim_sigs(self) -> pd.DataFrame:
        """
        Read the daily and monthly climate signatures, concatenated
        column-wise.
        """
        return pd.concat([
            self.daily_clim_sigs(),
            self.monthly_clim_sigs()
        ], axis=1)

    def static_data(self) -> pd.DataFrame:
        """
        Read the complete static data i.e. other attributes followed by the
        climate signatures. The files are read from disk on every call.
        """
        return pd.concat([
            self.other_attributes(),
            self.clim_sigs()
        ], axis=1)

    def fetch_static_features(
            self,
            stn_id: Union[str, list] = 'all',
            static_features: Union[str, list] = 'all'
    ) -> pd.DataFrame:
        """
        Returns static features of one or more stations.

        Parameters
        ----------
        stn_id : str
            name/id of station/stations of which to extract the data
        static_features : list/str, optional (default="all")
            The name/names of features to fetch. By default, all available
            static features are returned.

        Returns
        -------
        pd.DataFrame
            a pandas dataframe of shape (stations, features)

        Examples
        ---------
        >>> from water_datasets import Simbi
        >>> dataset = Simbi()
        get all static data of all stations
        >>> stns = dataset.static_data_stations()
        >>> static_data = dataset.fetch_static_features(stns)
        >>> static_data.shape
        (24, 232)
        get static data of one station only
        >>> static_data = dataset.fetch_static_features('001')
        >>> static_data.shape
        (1, 232)
        get the names of static features
        >>> dataset.static_features
        get only selected features of all stations
        >>> static_data = dataset.fetch_static_features(stns, ['stream_density', 'pcp', 'Forest_lc_98'])
        >>> static_data.shape
        (24, 3)
        >>> data = dataset.fetch_static_features('001', static_features=['stream_density', 'pcp', 'Forest_lc_98'])
        >>> data.shape
        (1, 3)
        """
        # Build the static table only once; its index is exactly what
        # ``static_data_stations`` would return after a second full read.
        df = self.static_data()
        stations = check_attributes(stn_id, df.index.tolist())
        features = check_attributes(static_features, self.static_features,
                                    "static_features")
        return df.loc[stations, features]

    # ------------------------------------------------------------------
    # dynamic data
    # ------------------------------------------------------------------
    @property
    def dyn_map(self):
        """Mapping from dynamic feature names of this dataset to the standard names."""
        return {
            'q': 'obs_q_cms',
            'temp': 'mean_temp_C',
            'pcp': 'pcp_mm',
        }

    def _read_dynamic_from_csv(
            self,
            stations,
            dynamic_features,
            st="1919-01-01",
            en="2005-12-31"
    ) -> dict:
        """
        reads dynamic data of one or more catchments

        Returns a dictionary which maps each station ID to a dataframe of the
        requested features between ``st`` and ``en``.
        """
        attributes = check_attributes(dynamic_features, self.dynamic_features)
        stations = check_attributes(stations, self.stations())

        return {
            stn: self._read_dynamic_for_stn(stn).loc[st:en, attributes]
            for stn in stations
        }

    def read_stn_q(self, stn: str) -> pd.DataFrame:
        """
        Read the daily streamflow data for a station from 'Q_<stn>.csv'.
        """
        fpath = os.path.join(self.daily_q_path, f'Q_{stn}.csv')
        return pd.read_csv(fpath, parse_dates=True, index_col=0)

    def read_stn_pcp(self, stn: str) -> pd.DataFrame:
        """
        Read the daily rainfall data for a station.

        The station file 'P_<stn>.csv' is looked up in both daily rainfall
        folders and whatever is found is stacked row-wise. If the station has
        no file in either folder, an empty dataframe with a single column 'P'
        is returned.
        """
        # Only frames which really exist are concatenated; mixing in empty
        # placeholder frames would degrade the datetime index and relies on
        # pandas behaviour for empty frames which is deprecated.
        frames = []
        for folder in (self.daily_pcp_20_40_path, self.daily_pcp_48_60_path):
            fpath = os.path.join(folder, f'P_{stn}.csv')
            if os.path.exists(fpath):
                frames.append(pd.read_csv(fpath, parse_dates=True, index_col=0))

        if not frames:
            return pd.DataFrame(columns=['P'])
        return pd.concat(frames)

    def read_stn_temp(self, stn: str) -> pd.DataFrame:
        """
        Read the daily temperature data for a station.

        If the station has no file, an empty dataframe with a single column
        'temp' is returned.
        """
        # NOTE(review): the file name uses the 'P_' prefix inside the
        # temperature folder - confirm against the extracted data that this
        # is the real naming and not a copy/paste from ``read_stn_pcp``.
        fpath = os.path.join(self.temp_path, f'P_{stn}.csv')
        if os.path.exists(fpath):
            return pd.read_csv(fpath, parse_dates=True, index_col=0)
        return pd.DataFrame(columns=['temp'])

    def _read_dynamic_for_stn(self, stn: str) -> pd.DataFrame:
        """
        Read the daily streamflow, rainfall, and temperature data for a station.

        Returns a dataframe with columns ``['q', 'pcp', 'temp']`` and a sorted
        datetime index named 'time'.
        """
        df = pd.concat(
            [self.read_stn_q(stn), self.read_stn_pcp(stn), self.read_stn_temp(stn)],
            axis=1)
        # assumes each of the three readers contributes exactly one column
        df.columns = ['q', 'pcp', 'temp']
        df.index = pd.to_datetime(df.index)
        df.columns.name = 'dynamic_features'
        df.index.name = 'time'
        return df.sort_index()