Source code for aqua_fetch.wq._misc


# Public API of this module: the dataset classes defined below.
__all__ = [
    "SanFranciscoBay",
    "WhiteClayCreek",
    "BuzzardsBay",
    "SeluneRiver",
]


import os
from typing import Union, List

import pandas as pd

from .._datasets import Datasets
from ..utils import _unzip, check_attributes


class SanFranciscoBay(Datasets):
    """
    Time series of water quality parameters from 59 stations in San-Francisco
    from 1969 - 2015. For details on data see
    `Cloern et al., 2017 <https://doi.org/10.1002/lno.10537>`_ and
    `Schraga et al., 2017 <https://doi.org/10.1038/sdata.2017.98>`_.
    Following parameters are available:

    - ``Depth``
    - ``Discrete_Chlorophyll``
    - ``Ratio_DiscreteChlorophyll_Pheopigment``
    - ``Calculated_Chlorophyll``
    - ``Discrete_Oxygen``
    - ``Calculated_Oxygen``
    - ``Oxygen_Percent_Saturation``
    - ``Discrete_SPM``
    - ``Calculated_SPM``
    - ``Extinction_Coefficient``
    - ``Salinity``
    - ``Temperature``
    - ``Sigma_t``
    - ``Nitrite``
    - ``Nitrate_Nitrite``
    - ``Ammonium``
    - ``Phosphate``
    - ``Silicate``

    Examples
    --------
    >>> from water_datasets import SanFranciscoBay
    >>> ds = SanFranciscoBay()
    >>> data = ds.data()
    >>> data.shape
    (212472, 19)
    >>> stations = ds.stations()
    >>> len(stations)
    59
    >>> parameters = ds.parameters()
    >>> len(parameters)
    18
    ... # fetch data for station 18
    >>> stn18 = ds.fetch(stations='18')
    >>> stn18.shape
    (13944, 18)
    """

    url = {
        "SanFranciscoBay.zip":
            "https://www.sciencebase.gov/catalog/file/get/64248ee5d34e370832fe343d"
    }

    def __init__(self, path=None, **kwargs):
        """
        Download the dataset (if required) and cache the station and
        parameter names.

        Parameters
        ----------
        path : optional
            Forwarded unchanged to the parent ``Datasets`` initializer.
        **kwargs
            Forwarded unchanged to the parent ``Datasets`` initializer.
        """
        super().__init__(path=path, **kwargs)
        self._download()

        # Parse the CSV only once; both cached lists come from the same frame.
        data = self.data()
        self._stations = data['Station_Number'].unique().tolist()
        # The first column is skipped so that it is not reported as a
        # parameter (presumably ``Station_Number`` -- verify against the file).
        self._parameters = data.columns.tolist()[1:]

    def stations(self) -> List[str]:
        """Return the unique station numbers found in the data."""
        return self._stations

    def parameters(self) -> List[str]:
        """Return the names of the available water quality parameters."""
        return self._parameters

    def data(self) -> pd.DataFrame:
        """
        Read the complete water quality table from disk.

        Returns
        -------
        pd.DataFrame
            Table indexed by the timestamp built from the ``Date`` and
            ``Time`` columns. The ``Date``, ``Time`` and ``Julian_Date``
            columns are removed; ``Station_Number`` is read as string.
        """
        fpath = os.path.join(
            self.path,
            'SanFranciscoBay',
            'SanFranciscoBayWaterQualityData1969-2015v4.csv',
        )
        df = pd.read_csv(fpath, dtype={'Station_Number': str})

        # Join the Date and Time columns into a single datetime index.
        # The explicit format is month/day/two-digit-year hour:minute.
        df.index = pd.to_datetime(
            df.pop('Date') + ' ' + df.pop('Time'),
            format='%m/%d/%y %H:%M',
        )
        df.pop('Julian_Date')
        return df

    def stn_data(
            self,
            stations: Union[str, List[str]] = 'all',
    ) -> pd.DataFrame:
        """
        Get station metadata.

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            Station(s) whose metadata is required. The default is 'all'.

        Returns
        -------
        pd.DataFrame
            Rows of the station locations file, indexed by
            ``Station_Number``, for the requested stations.
        """
        fpath = os.path.join(
            self.path, 'SanFranciscoBay', 'SFBstation_locations19692015.csv')
        df = pd.read_csv(fpath, dtype={'Station_Number': str})
        df.index = df.pop('Station_Number')
        # NOTE(review): rows with any missing value are discarded before the
        # lookup, so requesting such a station would fail in ``.loc`` below.
        df = df.dropna()

        stations = check_attributes(stations, self.stations(), 'stations')
        return df.loc[stations, :]

    def fetch(
            self,
            stations: Union[str, List[str]] = 'all',
            parameters: Union[str, List[str]] = 'all',
    ) -> pd.DataFrame:
        """
        Fetch the requested parameters for the requested stations.

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            The stations whose rows are returned. The default is 'all'.
        parameters : Union[str, List[str]], optional
            The parameters to return. The default is 'all'.

        Returns
        -------
        pd.DataFrame
            Rows of :meth:`data` whose ``Station_Number`` is among
            ``stations``, restricted to the ``parameters`` columns.
        """
        parameters = check_attributes(parameters, self.parameters(), 'parameters')
        stations = check_attributes(stations, self.stations(), 'stations')

        data = self.data()
        in_stations = data['Station_Number'].isin(stations)
        return data.loc[in_stations, parameters]
class WhiteClayCreek(Datasets):
    """
    Time series of water quality parameters from White Clay Creek.

    - chl-a : 2001 - 2012
    - Dissolved Organic Carbon : 1977 - 2017

    Examples
    --------
    >>> from water_datasets import WhiteClayCreek
    >>> ds = WhiteClayCreek()
    >>> doc = ds.doc()
    >>> doc.shape
    (11092, 4)
    >>> chla = ds.chla()
    >>> chla.shape
    (1028, 10)
    """

    url = {
        "WCC_CHLA_2001_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2001_1.csv",
        "WCC_CHLA_2001.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2001.csv",
        "WCC_CHLA_2002_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2002_1.csv",
        "WCC_CHLA_2002.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2002.csv",
        "WCC_CHLA_2003_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2003_1.csv",
        "WCC_CHLA_2003.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2003.csv",
        "WCC_CHLA_2004_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2004_1.csv",
        "WCC_CHLA_2004.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2004.csv",
        "WCC_CHLA_2005_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2005_1.csv",
        "WCC_CHLA_2005.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2005.csv",
        "WCC_CHLA_2006_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2006_1.csv",
        "WCC_CHLA_2006.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2006.csv",
        "WCC_CHLA_2007_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2007_1.csv",
        "WCC_CHLA_2007.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2007.csv",
        "WCC_CHLA_2008_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2008_1.csv",
        "WCC_CHLA_2008.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2008.csv",
        "WCC_CHLA_2009_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2009_1.csv",
        "WCC_CHLA_2009.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2009.csv",
        "WCC_CHLA_2010_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2010_1.csv",
        "WCC_CHLA_2010.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2010.csv",
        "WCC_CHLA_2011_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2011_1.csv",
        "WCC_CHLA_2011.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2011.csv",
        "WCC_CHLA_2012_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2012_1.csv",
        "WCC_CHLA_2012.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2012.csv",
        "doc.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=edi.386.1&entityid=3f802081eda955b2b0b405b55b85d11c",
    }

    def __init__(self, path=None, **kwargs):
        """
        Initialize the parent ``Datasets`` class and download the files.

        Parameters
        ----------
        path : optional
            Forwarded unchanged to the parent ``Datasets`` initializer.
        **kwargs
            Forwarded unchanged to the parent ``Datasets`` initializer.
        """
        super().__init__(path=path, **kwargs)
        self._download()

    def fetch(
            self,
            stations: Union[str, List[str]] = 'all',
            parameters: Union[str, List[str]] = 'all',
    ):
        """
        Not implemented for this dataset; use :meth:`doc` or :meth:`chla`.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError

    def doc(self) -> pd.DataFrame:
        """
        Dissolved Organic Carbon data

        Returns
        -------
        pd.DataFrame
            Contents of ``doc.csv`` with the first column used as the index
            (parsed as dates) and the ``site`` column read as string.
        """
        fpath = os.path.join(self.path, 'doc.csv')
        return pd.read_csv(
            fpath, index_col=0, parse_dates=True, dtype={'site': str})

    def chla(self) -> pd.DataFrame:
        """
        Chlorophyll-a data

        Every file in the dataset directory whose name starts with
        ``WCC_CHLA`` is read and the results are concatenated row-wise.
        Files are processed in sorted name order so that the row order of
        the result does not depend on the directory listing order.

        Returns
        -------
        pd.DataFrame
            A ``date`` column, a ``site`` column and the chlorophyll-a /
            pheophytin measurement columns.
        """
        column_names = [
            'date', 'site', 'junk',
            'chla_chlaspec',
            'chlafluor1', 'chlafluor2', 'chlafluor3',
            'pheophytin_pheospec',
            'Pheophytinfluor1', 'Pheophytinfluor2', 'Pheophytinfluor3',
        ]

        fnames = sorted(
            name for name in os.listdir(self.path)
            if name.startswith("WCC_CHLA")
        )

        dfs = []
        for fname in fnames:
            with open(os.path.join(self.path, fname), 'r') as handle:
                # Skip the preamble: the table starts on the line after the
                # one that begins with a literal backslash followed by "data".
                for line in handle:
                    if line.startswith(r"\data"):
                        break
                # The handle is now positioned at the first data row.
                df = pd.read_csv(handle, sep=',', header=None)

            # The second raw column is parsed into a new leading 'date'
            # column; the raw column itself is then named 'junk' and dropped.
            df.insert(0, 'date', pd.to_datetime(df.iloc[:, 1]))
            df.columns = column_names
            df = df.drop(columns=['junk'])
            dfs.append(df)

        return pd.concat(dfs, axis=0)
class BuzzardsBay(Datasets):
    """
    Water quality measurements in Buzzards Bay from 1992 - 2018. For more
    details on data see
    `Jakuba et al., <https://doi.org/10.1038/s41597-021-00856-4>`_
    data is downloaded from
    `MBLWHOI Library <https://darchive.mblwhoilibrary.org/entities/publication/f31123f1-2097-5742-8ce9-69010ea36460>`_

    Examples
    --------
    >>> from water_datasets import BuzzardsBay
    >>> ds = BuzzardsBay()
    >>> data = ds.data()
    >>> stations = ds.stations()
    >>> parameters = ds.parameters
    >>> subset = ds.fetch(parameters=parameters[:2])
    """

    url = {
        "buzzards_bay.xlsx":
            "https://darchive.mblwhoilibrary.org/bitstreams/87c25cf4-21b5-551c-bb7d-4604806109b4/download"
    }

    def __init__(self, path=None, **kwargs):
        """
        Download the workbook (if required) and cache the station ids and
        the column names of the data sheet.

        Parameters
        ----------
        path : optional
            Forwarded unchanged to the parent ``Datasets`` initializer.
        **kwargs
            Forwarded unchanged to the parent ``Datasets`` initializer.
        """
        super().__init__(path=path, **kwargs)
        self._download()
        self._stations = self.read_stations()['STN_ID'].unique().tolist()
        self._parameters = self.data().columns.tolist()

    @property
    def fpath(self):
        """Path of the downloaded ``buzzards_bay.xlsx`` workbook."""
        return os.path.join(self.path, 'buzzards_bay.xlsx')

    def stations(self) -> List[str]:
        """Return the unique ``STN_ID`` values of the ``Stations`` sheet."""
        return self._stations

    @property
    def parameters(self) -> List[str]:
        """Column names of the data sheet (this is a property, not a method)."""
        return self._parameters

    def fetch(
            self,
            parameters: Union[str, List[str]] = 'all',
    ) -> pd.DataFrame:
        """
        Fetch data for the specified parameters.

        Parameters
        ----------
        parameters : Union[str, List[str]], optional
            The columns to return. The default is 'all'.

        Returns
        -------
        pd.DataFrame
            The requested columns of :meth:`data`.
        """
        # ``parameters`` is a property on this class, so it must not be
        # called; calling it raises "TypeError: 'list' object is not callable".
        parameters = check_attributes(parameters, self.parameters, 'parameters')
        data = self.data()
        return data.loc[:, parameters]

    def data(self):
        """
        Read the ``all`` sheet of the workbook.

        Returns
        -------
        pd.DataFrame
            The sheet contents without the ``Unnamed: 0`` column, if present.
        """
        # ``self.fp`` is not defined in this class; presumably it is the
        # floating point dtype provided by the parent class -- TODO confirm.
        data = pd.read_excel(
            self.fpath,
            sheet_name='all',
            dtype={
                'STN_ID': str,
                'STN_EQUIV': str,
                'SOURCE': str,
                'GEN_QC': self.fp,
                'PREC': self.fp,
                'WHTR': self.fp,
                #'TIME_QC': self.ip,
                'SAMPDEP_QC': self.fp,
                'SECCHI_M': self.fp,
                'SECC_QC': self.fp,
                #'TOTDEP_QC': self.ip,
                'TEMP_C': self.fp,
                #'TEMP_QC': self.ip
            }
        )

        if 'Unnamed: 0' in data.columns:
            data.pop('Unnamed: 0')
        return data

    def metadata(self):
        """Read and return the ``META`` sheet of the workbook."""
        meta = pd.read_excel(self.fpath, sheet_name='META')
        return meta

    def read_stations(self) -> pd.DataFrame:
        """
        Read the ``Stations`` sheet of the workbook, skipping its first row.

        Returns
        -------
        pd.DataFrame
            Station table with ``STN_ID`` read as string.
        """
        stations = pd.read_excel(
            self.fpath,
            sheet_name='Stations',
            skiprows=1,
            dtype={
                'STN_ID': str,
                'LATITUDE': self.fp,
                'LONGITUDE': self.fp,
                'Town': str,
                'EMBAYMENT': str,
                'WQI_Area': str,
            }
        )
        return stations
class SeluneRiver(Datasets):
    """
    Dataset of physico-chemical variables measured at different levels, for
    2021 and 2022 for characterization of Hyporheic zone of Selune River,
    Manche, Normandie, France following
    `Moustapha Ba et al., 2023 <https://doi.org/10.1016/j.dib.2022.108837>`_ .
    The data is available at `data.gouv.fr <https://doi.org/10.57745/SBXWUC>`_ .
    The following variables are available:

    - water level
    - temperature
    - conductivity
    - oxygen
    - pressure
    """

    url = {
        "data_downstream_signy-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150676",
        "data_baro_upstream-virey.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/151002",
        "data_conduc_upstream-virey-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150783",
        "data_mini-lomos_downstream-signy.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150678",
        "data_mini-lomos_upstream-virey.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150780",
        "data_oxygen_downstream-signy-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150771",
        "data_oxygen_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150782",
        "data_oxygen_upstream-virey-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150781",
        "data_station_downstream-signy-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150868",
        "data_station_oxygen_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150865",
        "data_station_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150866",
        "data_water-level_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150779",
        "readme.txt": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/151001",
        "readme1.0.txt": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/156508",
    }

    def __init__(self, path=None, **kwargs):
        """
        Initialize the parent ``Datasets`` class and download the files.

        Parameters
        ----------
        path : optional
            Forwarded unchanged to the parent ``Datasets`` initializer.
        **kwargs
            Forwarded unchanged to the parent ``Datasets`` initializer.
        """
        super().__init__(path=path, **kwargs)
        self._download()

    def _read_tab(self, fname, suffix, dtype=float, date_format=None):
        """
        Read one tab-separated file into a date-indexed DataFrame.

        Parameters
        ----------
        fname : str
            File name inside ``self.path``.
        suffix : str
            Text appended to every column name so that columns coming from
            different files stay distinguishable after concatenation.
        dtype : optional
            Passed to ``pd.read_csv`` as ``dtype``. The default is float.
        date_format : str, optional
            Passed to ``pd.to_datetime`` as ``format`` when given.

        Returns
        -------
        pd.DataFrame
            The file contents with suffixed column names and a datetime
            index named ``date``.
        """
        fpath = os.path.join(self.path, fname)
        df = pd.read_csv(
            fpath, sep='\t', index_col=0, parse_dates=True, dtype=dtype)
        df.columns = [col + suffix for col in df.columns]
        if date_format is None:
            df.index = pd.to_datetime(df.index)
        else:
            df.index = pd.to_datetime(df.index, format=date_format)
        df.index.name = 'date'
        return df

    def data(self) -> pd.DataFrame:
        """
        Return a DataFrame of the data

        All twelve ``.tab`` files are read and joined column-wise on their
        datetime index.

        Returns
        -------
        pd.DataFrame
            One block of columns per source file. Column names carry a
            file-specific suffix (for example ``_oxy_upstr_virey_stn`` for
            the station oxygen file and ``_oxy_upstr_virey`` for the oxygen
            file), so every file contributes its own columns.
        """
        # (file name, column suffix, read_csv dtype, to_datetime format).
        # The order defines the column order of the result.
        sources = [
            ('data_downstream_signy-zh.tab', '_dwnstr_signyzh',
             {'id': str}, None),
            ('data_baro_upstream-virey.tab', '_baro_upstr_virey',
             {'barometric': float}, None),
            ('data_conduc_upstream-virey-zh.tab', '_cond_upstream_virey',
             {'cond_30cm': float, 't_30cm_sensor_cond': float}, None),
            # diff_press, t_river, t at 10,20,30,40 cm
            ('data_mini-lomos_downstream-signy.tab', '_mini_lomos_dwnstr_signy',
             float, None),
            # temp, oxy_sat, oxy_conc
            ('data_oxygen_downstream-signy-river.tab', '_oxy_dwnstr_signy',
             float, None),
            # cond, turb, wl
            ('data_station_downstream-signy-river.tab', '_dwnstr_signy',
             float, "mixed"),
            # con_oxy
            ('data_station_oxygen_upstream-virey-river.tab', '_oxy_upstr_virey_stn',
             float, "mixed"),
            # cond, turb, wl
            ('data_station_upstream-virey-river.tab', '_upstr_virey_stn',
             float, "mixed"),
            # wl, temp
            ('data_water-level_upstream-virey-river.tab', '_upstr_virey',
             float, None),
            # diff_press, t_river, t at 10,20,30,40 cm
            ('data_mini-lomos_upstream-virey.tab', '_mini_lomos_upstr_virey',
             float, None),
            # temp, oxy_sat, oxy_conc
            ('data_oxygen_upstream-virey-river.tab', '_oxy_upstr_virey',
             float, None),
            # oxy_conc and t_ at 15 and 30 cm
            ('data_oxygen_upstream-virey-zh.tab', '_oxy_upstr_virey_zh',
             float, None),
        ]

        # Each file gets its own frame. Reusing one variable name for the
        # two upstream-virey oxygen files would drop the station oxygen
        # columns and include the other file twice.
        frames = [
            self._read_tab(fname, suffix, dtype=dtype, date_format=date_format)
            for fname, suffix, dtype, date_format in sources
        ]

        # concatenate all dataframes
        return pd.concat(frames, axis=1)