Source code for aqua_fetch.wq._misc


# Public names of this module: the dataset classes defined below.
__all__ = [
    'SanFranciscoBay', 
    'WhiteClayCreek', 
    'BuzzardsBay',
    'SeluneRiver'
           ]


import os
from typing import Union, List

import pandas as pd

from .._datasets import Datasets
from ..utils import _unzip, check_attributes



[docs]
class SanFranciscoBay(Datasets):
    """
    Time series of water quality parameters from 59 stations in San Francisco
    Bay from 1969 - 2015.
    For details on data see `Cloern et al., 2017 <https://doi.org/10.1002/lno.10537>`_
    and `Schraga et al., 2017 <https://doi.org/10.1038/sdata.2017.98>`_.
    The following parameters are available:

        - ``Depth``
        - ``Discrete_Chlorophyll``
        - ``Ratio_DiscreteChlorophyll_Pheopigment``
        - ``Calculated_Chlorophyll``
        - ``Discrete_Oxygen``
        - ``Calculated_Oxygen``
        - ``Oxygen_Percent_Saturation``
        - ``Discrete_SPM``
        - ``Calculated_SPM``
        - ``Extinction_Coefficient``
        - ``Salinity``
        - ``Temperature``
        - ``Sigma_t``
        - ``Nitrite``
        - ``Nitrate_Nitrite``
        - ``Ammonium``
        - ``Phosphate``
        - ``Silicate``

    Examples
    --------
    >>> from water_datasets import SanFranciscoBay
    >>> ds = SanFranciscoBay()
    >>> data = ds.data()
    >>> data.shape
    (212472, 19)
    >>> stations = ds.stations()
    >>> len(stations)
    59
    >>> parameters = ds.parameters()
    >>> len(parameters)
    18
    >>> # fetch data for station 18
    >>> stn18 = ds.fetch(stations='18')
    >>> stn18.shape
    (13944, 18)

    """
    # Maps the local archive name to its download location.
    # NOTE(review): presumably consumed by the inherited ``_download`` -- confirm.
    url = {
"SanFranciscoBay.zip": "https://www.sciencebase.gov/catalog/file/get/64248ee5d34e370832fe343d"
}


[docs]
    def __init__(self, path=None, **kwargs):
        """
        Set up the dataset and cache its station and parameter names.

        Parameters
        ----------
        path : optional
            Forwarded unchanged to the parent ``Datasets`` initialiser.
        **kwargs
            Forwarded unchanged to the parent ``Datasets`` initialiser.
        """
        super().__init__(path=path, **kwargs)
        self._download()

        # Parse the CSV a single time; calling ``self.data()`` once per
        # attribute would re-read and re-parse the whole file for each of them.
        df = self.data()

        self._stations = df['Station_Number'].unique().tolist()
        # Every column except the first one is exposed as a parameter.
        # NOTE(review): the first column is presumably ``Station_Number`` -- confirm.
        self._parameters = df.columns.tolist()[1:]


    def stations(self)->List[str]:
        """Return the station identifiers cached by ``__init__``."""
        return self._stations
    
    def parameters(self)->List[str]:
        """Return the parameter (column) names cached by ``__init__``."""
        return self._parameters

    def data(self)->pd.DataFrame:
        """
        Read the complete water quality table from disk.

        Returns
        -------
        pd.DataFrame
            All rows of the CSV file, indexed by the timestamp built from the
            ``Date`` and ``Time`` columns. ``Date``, ``Time`` and
            ``Julian_Date`` are removed from the columns; ``Station_Number``
            is kept as a string column.
        """
        csv_file = os.path.join(
            self.path,
            'SanFranciscoBay',
            'SanFranciscoBayWaterQualityData1969-2015v4.csv',
        )

        # station numbers are identifiers, so keep them as text
        table = pd.read_csv(csv_file, dtype={'Station_Number': str})

        # Build the timestamp index from the two text columns. The date part
        # is month/day/two-digit-year, followed by hours:minutes.
        date_part = table.pop('Date')
        time_part = table.pop('Time')
        table.index = pd.to_datetime(date_part + ' ' + time_part, format='%m/%d/%y %H:%M')

        # the Julian date column is discarded
        table.pop('Julian_Date')

        return table


[docs]
    def stn_data(
            self,
            stations:Union[str, List[str]]='all',
            )->pd.DataFrame:
        """
        Get station metadata.

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            Station(s) whose metadata is wanted. The default is 'all'.
            The value is passed through ``check_attributes`` together with
            the list returned by :meth:`stations`.

        Returns
        -------
        pd.DataFrame
            Rows of the station locations file for the requested stations,
            indexed by ``Station_Number``. Rows of the file that contain
            missing values are dropped before the selection.
        """
        csv_file = os.path.join(self.path, 'SanFranciscoBay', 'SFBstation_locations19692015.csv')

        locations = pd.read_csv(csv_file, dtype={'Station_Number': str})
        locations.index = locations.pop('Station_Number')
        complete_rows = locations.dropna()

        # NOTE(review): check_attributes presumably validates the request and
        # expands 'all' into the full station list -- confirm in ..utils.
        stations = check_attributes(stations, self.stations(), 'stations')

        return complete_rows.loc[stations, :]



[docs]
    def fetch(
            self,
            stations:Union[str, List[str]]='all',
            parameters:Union[str, List[str]]='all',
    )->pd.DataFrame:
        """
        Fetch the selected parameters for the selected stations.

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            The stations whose rows are returned. The default is 'all'.
        parameters : Union[str, List[str]], optional
            The parameters (columns) to return. The default is 'all'.

        Returns
        -------
        pd.DataFrame
            Rows of :meth:`data` whose ``Station_Number`` is among the
            requested stations, restricted to the requested parameter
            columns.

        """
        # NOTE(review): check_attributes presumably validates the request and
        # expands 'all' into the full list -- confirm in ..utils.
        parameters = check_attributes(parameters, self.parameters(), 'parameters')
        stations = check_attributes(stations, self.stations(), 'stations')

        table = self.data()

        # row filter on station, column filter on parameter, in one step
        wanted_rows = table['Station_Number'].isin(stations)
        return table.loc[wanted_rows, parameters]





[docs]
class WhiteClayCreek(Datasets):
    """
    Time series of water quality parameters from White Clay Creek.

        - chl-a : 2001 - 2012
        - Dissolved Organic Carbon : 1977 - 2017

    The chlorophyll-a data is read with :meth:`chla` and the dissolved
    organic carbon data with :meth:`doc`. :meth:`fetch` is not implemented
    for this dataset.
    """

    # Maps each local file name to its download location: two chlorophyll-a
    # files per year for 2001 - 2012, plus the dissolved organic carbon file.
    url = {
"WCC_CHLA_2001_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2001_1.csv",
"WCC_CHLA_2001.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2001.csv",
"WCC_CHLA_2002_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2002_1.csv",
"WCC_CHLA_2002.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2002.csv",
"WCC_CHLA_2003_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2003_1.csv",
"WCC_CHLA_2003.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2003.csv",
"WCC_CHLA_2004_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2004_1.csv",
"WCC_CHLA_2004.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2004.csv",
"WCC_CHLA_2005_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2005_1.csv",
"WCC_CHLA_2005.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2005.csv",
"WCC_CHLA_2006_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2006_1.csv",
"WCC_CHLA_2006.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2006.csv",
"WCC_CHLA_2007_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2007_1.csv",
"WCC_CHLA_2007.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2007.csv",
"WCC_CHLA_2008_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2008_1.csv",
"WCC_CHLA_2008.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2008.csv",
"WCC_CHLA_2009_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2009_1.csv",
"WCC_CHLA_2009.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2009.csv",
"WCC_CHLA_2010_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2010_1.csv",
"WCC_CHLA_2010.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2010.csv",
"WCC_CHLA_2011_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2011_1.csv",
"WCC_CHLA_2011.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2011.csv",
"WCC_CHLA_2012_1.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2012_1.csv",
"WCC_CHLA_2012.csv": "https://www.hydroshare.org/resource/d841f99381424ebc850842a1dbb5630b/data/contents/WCC_CHLA_2012.csv",
"doc.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=edi.386.1&entityid=3f802081eda955b2b0b405b55b85d11c"
        }



[docs]
    def __init__(self, path=None, **kwargs):
        """
        Set up the dataset.

        ``path`` and ``kwargs`` are forwarded unchanged to the parent
        ``Datasets`` initialiser, after which ``self._download()`` is called.
        """
        super().__init__(path=path, **kwargs)
        self._download()


    def fetch(
            self,
            stations:Union[str, List[str]]='all',
            parameters:Union[str, List[str]]='all',
        ):
        """
        Not implemented for this dataset.

        Both arguments are accepted but ignored.

        Raises
        ------
        NotImplementedError
            Always.
        """
    
        raise NotImplementedError
    

[docs]
    def doc(self)->pd.DataFrame:
        """
        Dissolved Organic Carbon data

        Returns
        -------
        pd.DataFrame
            Contents of ``doc.csv``. The first column of the file is used as
            the index and parsed as dates; the ``site`` column is read as
            string.
        """
        # ``pd`` is the module-level pandas import (the return annotation
        # above already depends on it), so no function-level import is needed.
        fpath = os.path.join(self.path, 'doc.csv')
        df = pd.read_csv(fpath, index_col=0, parse_dates=True,
                         dtype={'site': str})
        return df

    

[docs]
    def chla(self)->pd.DataFrame:
        """
        Chlorophyll-a data

        Every file in ``self.path`` whose name starts with ``WCC_CHLA`` is
        read. In each file everything up to and including the first line
        that begins with a backslash followed by ``data`` is skipped, and the
        remaining lines are parsed as comma separated values without a
        header row.

        Returns
        -------
        pd.DataFrame
            The rows of all files stacked on top of each other (files are
            processed in sorted name order), with the columns ``date``,
            ``site`` and the eight chlorophyll / pheophytin columns named
            below. The index is not reset, so index labels repeat across
            files.

        Raises
        ------
        ValueError
            Raised by ``pd.concat`` when no ``WCC_CHLA*`` file is found.
        """
        # os.listdir returns names in arbitrary order; sorting makes the row
        # order of the result reproducible across platforms and runs.
        fnames = sorted(
            fname for fname in os.listdir(self.path) if fname.startswith("WCC_CHLA")
        )

        # Names assigned after the parsed date has been inserted as the first
        # column. 'junk' is the raw (unparsed) date text, dropped afterwards.
        col_names = ['date', 'site', 'junk',
                     'chla_chlaspec', 'chlafluor1', 'chlafluor2', 'chlafluor3',
                     'pheophytin_pheospec', 'Pheophytinfluor1', 'Pheophytinfluor2', 'Pheophytinfluor3',
                     ]

        dfs = []
        for fname in fnames:
            with open(os.path.join(self.path, fname), 'r') as fh:
                # consume the preamble; the data records start on the line
                # after the "\data" marker
                for line in fh:
                    if line.startswith("\\data"):
                        break

                # the handle is now positioned on the first record; the
                # records carry no header row
                df = pd.read_csv(fh, sep=',', header=None)

            # second raw column holds the date text
            df.insert(0, 'date', pd.to_datetime(df.iloc[:, 1]))

            df.columns = col_names

            df = df.drop(columns=['junk'])

            dfs.append(df)

        return pd.concat(dfs, axis=0)





[docs]
class BuzzardsBay(Datasets):
    """
    Water quality measurements in Buzzards Bay from 1992 - 2018. For more details on data
    see `Jakuba et al., <https://doi.org/10.1038/s41597-021-00856-4>`_
    data is downloaded from `MBLWHOI Library <https://darchive.mblwhoilibrary.org/entities/publication/f31123f1-2097-5742-8ce9-69010ea36460>`_

    Examples
    --------
    >>> from water_datasets import BuzzardsBay
    >>> ds = BuzzardsBay()
    >>> data = ds.data()
    >>> stations = ds.read_stations()
    >>> meta = ds.metadata()
    """
    # Maps the local workbook name to its download location.
    url = {
"buzzards_bay.xlsx": "https://darchive.mblwhoilibrary.org/bitstreams/87c25cf4-21b5-551c-bb7d-4604806109b4/download"}


[docs]
    def __init__(self, path=None, **kwargs):
        """
        Set up the dataset and cache its station and parameter names.

        ``path`` and ``kwargs`` are forwarded unchanged to the parent
        ``Datasets`` initialiser, after which ``self._download()`` is called.
        """
        super().__init__(path=path, **kwargs)
        self._download()

        # unique station ids taken from the 'Stations' sheet
        self._stations = self.read_stations()['STN_ID'].unique().tolist()

        # every column of the 'all' sheet counts as a parameter
        self._parameters = self.data().columns.tolist()


    @property
    def fpath(self):
        """Path of the ``buzzards_bay.xlsx`` workbook inside ``self.path``."""
        return os.path.join(self.path, 'buzzards_bay.xlsx')

    def stations(self)->List[str]:
        """Return the station identifiers cached by ``__init__``."""
        return self._stations
    
    @property
    def parameters(self)->List[str]:
        """
        Parameter (column) names cached by ``__init__``.

        This is a property: read it as ``ds.parameters``, without calling it.
        """
        return self._parameters


[docs]
    def fetch(
            self,
            parameters:Union[str, List[str]]='all',
    )->pd.DataFrame:
        """
        Fetch data for the specified parameters.

        Parameters
        ----------
        parameters : Union[str, List[str]], optional
            The parameters (columns) to return. The default is 'all'.

        Returns
        -------
        pd.DataFrame
            The requested columns of :meth:`data`.
        """
        # ``parameters`` is declared as a property on this class, so it has to
        # be read as an attribute: calling it would call the returned list and
        # raise TypeError. The callable check keeps this method working if
        # ``parameters`` is ever turned into a regular method.
        available = self.parameters
        if callable(available):
            available = available()

        # NOTE(review): check_attributes presumably validates the request and
        # expands 'all' into the full list -- confirm in ..utils.
        parameters = check_attributes(parameters, available, 'parameters')
        data = self.data()
        return data.loc[:, parameters]

   
    def data(self):
        """
        Read the ``all`` sheet of the workbook.

        Returns
        -------
        pd.DataFrame
            The sheet contents, without the ``Unnamed: 0`` column if the
            sheet has one.
        """
        text_columns = ('STN_ID', 'STN_EQUIV', 'SOURCE')
        # NOTE(review): ``self.fp`` is not defined in this class; presumably
        # the float dtype configured on the parent ``Datasets`` -- confirm.
        float_columns = ('GEN_QC', 'PREC', 'WHTR', 'SAMPDEP_QC',
                         'SECCHI_M', 'SECC_QC', 'TEMP_C')
        # TIME_QC, TOTDEP_QC and TEMP_QC get no explicit dtype.

        column_types = {name: str for name in text_columns}
        column_types.update({name: self.fp for name in float_columns})

        sheet = pd.read_excel(self.fpath, sheet_name='all', dtype=column_types)

        # discard the unnamed first column when the sheet carries one
        if 'Unnamed: 0' in sheet.columns:
            sheet.pop('Unnamed: 0')

        return sheet

    def metadata(self):
        """Return the ``META`` sheet of the workbook as a DataFrame."""
        return pd.read_excel(self.fpath, sheet_name='META')

    def read_stations(self)->pd.DataFrame:
        """
        Read the ``Stations`` sheet of the workbook.

        The first row of the sheet is skipped before the header is read.

        Returns
        -------
        pd.DataFrame
            One row per entry of the sheet; ``STN_ID``, ``Town``,
            ``EMBAYMENT`` and ``WQI_Area`` are read as strings.
        """
        # NOTE(review): ``self.fp`` is not defined in this class; presumably
        # the float dtype configured on the parent ``Datasets`` -- confirm.
        column_types = {
            'STN_ID': str,
            'LATITUDE': self.fp,
            'LONGITUDE': self.fp,
            'Town': str,
            'EMBAYMENT': str,
            'WQI_Area': str,
        }

        return pd.read_excel(
            self.fpath,
            sheet_name='Stations',
            skiprows=1,
            dtype=column_types,
        )





[docs]
class SeluneRiver(Datasets):
    """
    Dataset of physico-chemical variables measured at different levels
    during 2021 and 2022 for the characterization of the hyporheic zone of
    the Selune River, Manche, Normandie, France following
    `Moustapha Ba et al., 2023 <https://doi.org/10.1016/j.dib.2022.108837>`_ .
    The data is available at `data.gouv.fr <https://doi.org/10.57745/SBXWUC>`_ .
    The following variables are available:

        - water level
        - temperature
        - conductivity
        - oxygen
        - pressure
    """
    # Maps each local file name to its download location.
    url = {
    "data_downstream_signy-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150676",
    "data_baro_upstream-virey.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/151002",
    "data_conduc_upstream-virey-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150783",
    "data_mini-lomos_downstream-signy.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150678",
    "data_mini-lomos_upstream-virey.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150780",
    "data_oxygen_downstream-signy-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150771",
    "data_oxygen_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150782",
    "data_oxygen_upstream-virey-zh.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150781",
    "data_station_downstream-signy-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150868",
    "data_station_oxygen_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150865",
    "data_station_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150866",
    "data_water-level_upstream-virey-river.tab": "https://entrepot.recherche.data.gouv.fr/api/access/datafile/150779",
    "readme.txt":"https://entrepot.recherche.data.gouv.fr/api/access/datafile/151001",
    "readme1.0.txt":"https://entrepot.recherche.data.gouv.fr/api/access/datafile/156508",
    }


[docs]
    def __init__(self, path=None, **kwargs):
        """
        Set up the dataset.

        ``path`` and ``kwargs`` are forwarded unchanged to the parent
        ``Datasets`` initialiser, after which ``self._download()`` is called.
        """
        super().__init__(path=path, **kwargs)
        self._download()



[docs]
    def data(self)->pd.DataFrame:
        """
        Return a DataFrame of the data

        Each of the twelve ``.tab`` files of the dataset is read from
        ``self.path`` as a tab separated table whose first column is the
        timestamp index. A file specific suffix is appended to every column
        name, and the twelve tables are then joined side by side on their
        index.

        Returns
        -------
        pd.DataFrame
            All tables concatenated along the column axis. The index of each
            input table is converted with ``pd.to_datetime`` and named
            ``date``.
        """

        def _read_tab(fname, suffix, dtype, mixed=False):
            # Load one file, tag its columns with ``suffix`` and convert its
            # index to datetimes (``format="mixed"`` when ``mixed`` is set).
            frame = pd.read_csv(os.path.join(self.path, fname), sep='\t',
                                index_col=0, parse_dates=True, dtype=dtype)
            frame.columns = [col + suffix for col in frame.columns]
            if mixed:
                frame.index = pd.to_datetime(frame.index, format="mixed")
            else:
                frame.index = pd.to_datetime(frame.index)
            frame.index.name = 'date'
            return frame

        # (file name, column suffix, dtype, index needs format="mixed")
        # The order of the entries is the column order of the result.
        sources = [
            ('data_downstream_signy-zh.tab', '_dwnstr_signyzh',
             {'id': str}, False),
            ('data_baro_upstream-virey.tab', '_baro_upstr_virey',
             {'barometric': float}, False),
            ('data_conduc_upstream-virey-zh.tab', '_cond_upstream_virey',
             {'cond_30cm': float, 't_30cm_sensor_cond': float}, False),
            # diff_press, t_river, t at 10,20,30,40 cm
            ('data_mini-lomos_downstream-signy.tab', '_mini_lomos_dwnstr_signy',
             float, False),
            # temp, oxy_sat, oxy_conc
            ('data_oxygen_downstream-signy-river.tab', '_oxy_dwnstr_signy',
             float, False),
            # cond, turb, wl
            ('data_station_downstream-signy-river.tab', '_dwnstr_signy',
             float, True),
            # con_oxy
            # This table must stay a separate entry from the river oxygen
            # table further down: when both shared one variable name, the
            # station data was overwritten and the river data appeared twice.
            ('data_station_oxygen_upstream-virey-river.tab', '_oxy_upstr_virey_stn',
             float, True),
            # cond, turb, wl
            ('data_station_upstream-virey-river.tab', '_upstr_virey_stn',
             float, True),
            # wl, temp
            ('data_water-level_upstream-virey-river.tab', '_upstr_virey',
             float, False),
            # diff_press, t_river, t at 10,20,30,40 cm
            ('data_mini-lomos_upstream-virey.tab', '_mini_lomos_upstr_virey',
             float, False),
            # temp, oxy_sat, oxy_conc
            ('data_oxygen_upstream-virey-river.tab', '_oxy_upstr_virey',
             float, False),
            # oxy_conc and t_ at 15 and 30 cm
            ('data_oxygen_upstream-virey-zh.tab', '_oxy_upstr_virey_zh',
             float, False),
        ]

        frames = [_read_tab(*spec) for spec in sources]

        # concatenate all dataframes
        return pd.concat(frames, axis=1)