Source code for aqua_fetch.wq._sylt_roads

__all__ = ["SyltRoads"]

import os
from typing import Union, List

import pandas as pd

from .._datasets import Datasets
from ..utils import check_attributes, download_and_unzip

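# TODO: 'entrance' and 'entrace' refer to the same station; 'entrace' is a
# misspelling used in the 2017 - 2019 file names and should be unified.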


[docs]
class SyltRoads(Datasets):
    """
    Dataset of physico-hydro-chemical time series data at Sylt Roads from 1973 - 2019
    following `Rick et al., 2023 <https://doi.org/10.5194/essd-15-1037-2023>`_ .
    The following parameters are available:

        - ``location``
        - ``Depth water [m]``
        - ``Sal``
        - ``Temp [°C]``
        - ``[PO4]3- [µmol/l]``
        - ``[NH4]+ [µmol/l]``
        - ``[NO2]- [µmol/l]``
        - ``[NO3]- [µmol/l]``
        - ``Si(OH)4 [µmol/l]``
        - ``SPM [mg/l]``
        - ``pH``
        - ``O2 [µmol/l]``
        - ``Chl a [µg/l]``
        - ``DON [µmol/l]``
        - ``DOP [µmol/l]``
        - ``DIN [µmol/l]``

    Examples
    --------
    >>> from aqua_fetch import SyltRoads
    >>> ds = SyltRoads()

    """
    # Maps a local file name to the PANGAEA URL of the corresponding text file.
    # The yearly files (2014 - 2019) are looked up in ``self.path`` by their
    # name prefix, so the keys must keep their exact spelling; note that the
    # entrance station is spelled 'entrace' for 2017 - 2019.
    # "1973_2013.txt" is the table from which the DOIs of the yearly files of
    # 1973 - 2013 are read.
    url = {
        "list_entrance_2014.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.873545?format=textfile",
        "list_reede_2014.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.873549?format=textfile",
        "list_ferry_2014.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.873547?format=textfile",
        "list_reede_2015.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918018?format=textfile",
        "list_entrance_2015.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918032?format=textfile",
        "list_ferry_2015.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918027?format=textfile",
        "list_reede_2016.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918023?format=textfile",
        "list_entrance_2016.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918033?format=textfile",
        "list_ferry_2016.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918028?format=textfile",
        "list_reede_2017.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918024?format=textfile",
        "list_entrace_2017.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918034?format=textfile",
        "list_ferry_2017.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918029?format=textfile",
        "list_reede_2018.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918025?format=textfile",
        "list_entrace_2018.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918035?format=textfile",
        "list_ferry_2018.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918030?format=textfile",
        "list_reede_2019.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918026?format=textfile",
        "list_entrace_2019.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918036?format=textfile",
        "list_ferry_2019.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.918031?format=textfile",
        "1973_2013.txt":
            "https://doi.pangaea.de/10.1594/PANGAEA.150032?format=textfile"
    }


[docs]
    def __init__(self, path=None, **kwargs):
        """
        Initialise the parent class, trigger the download and record the
        names of the available parameters.

        Parameters
        ----------
        path : str, optional
            Forwarded to the parent class and also assigned to ``ds_dir``.
        **kwargs
            Further keyword arguments forwarded to the parent class.
        """
        super().__init__(path=path, **kwargs)
        self.ds_dir = path
        self._download()

        # the complete data is read once here to learn the column names
        data = self._get_data()
        self._parameters = data.columns.tolist()


    @property
    def parameters(self)->List[str]:
        """
        Names of the parameters in the dataset.

        These are the column names of the complete data, collected once
        when the object is created.
        """
        return self._parameters

    @property
    def raw_data_path(self):
        path = os.path.join(self.path, 'raw_data')
        if not os.path.exists(path):
            os.makedirs(path)
        return path


[docs]
    def stn_coords(self)->pd.DataFrame:
        """
        Returns the coordinates of all the stations in the dataset in wgs84
        projection.

        Returns
        -------
        pd.DataFrame
            A dataframe with columns 'lat', 'long'
        """
        entrace = 55.038300 , 8.438300
        ferry = 55.015530 , 8.439900
        reede = 55.030000 , 8.460000

        return pd.DataFrame([entrace, ferry, reede], 
                            columns=['lat', 'long'],
                    index=['entrace', 'ferry', 'reede'])



[docs]
    def fetch(
            self,
            parameters: Union[str, List[str]] = "all",
    )->pd.DataFrame:
        """
        Return the data of the requested parameters.

        Parameters
        ----------
        parameters : str or List[str], optional
            Name(s) of the parameters to fetch. The default ``"all"``
            fetches all parameters.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the requested parameters as columns

        Examples
        --------
        >>> from aqua_fetch import SyltRoads
        >>> ds = SyltRoads()
        >>> df = ds.fetch()
        >>> df.shape
        (5710, 16)
        >>> len(ds.parameters)
        16
        >>> ds.fetch(['Sal', 'Temp [°C]', 'pH']).shape
        (5710, 3)
        """
        # validate the request against the known parameter names first
        selected = check_attributes(parameters, self.parameters, 'parameters')
        data = self._get_data()
        return data[selected]

    
    def _get_data(self)->pd.DataFrame:
        df = self._get_historical_data()
        df1 = self._read_data_2014_2019()

        df = pd.concat([df, df1])

        return df

    def _read_files(self, location:str)->pd.DataFrame:
        entrace_files = [f for f in os.listdir(self.path) if f.startswith(location)]
        # read first file and skip all rows in the file until the row which starts with Date
        # then read the rest of the file and use the rown which starts with Date as header
        entrace = []
        for file in entrace_files:
            with open(os.path.join(self.path, file)) as f:
                for line in f:
                    if line.startswith('*/'):
                        break
                df = pd.read_csv(f, sep='\t', index_col=0)
            
            entrace.append(df)

        entrace = pd.concat(entrace)

        return entrace
    
    def _read_data_2014_2019(self)->pd.DataFrame:

        dfs = []
        for loc in ["list_entrace", "list_reede", "list_ferry", "list_entrance"]:
            df = self._read_files(loc)
            df.insert(0, 'location', loc)

            dfs.append(df)

        return pd.concat(dfs)
    
    def _get_historical_data(self)->pd.DataFrame:
        """gets data from 1973 - 2013"""

        fpath = os.path.join(self.path, "1973_2013.csv")

        if os.path.exists(fpath):
            if self.verbosity:
                print(f"Reading data from pre-existing {fpath}")
            return pd.read_csv(fpath, index_col=0)
        
        if self.verbosity:
            print(f"Downloading data from 1973 - 2013")

        dfs = []
        for stn, col_idx in {"list_reede": 0, "list_entrance": 1,  "list_ferry": 3}.items():
            df = self._get_historical_data_stn(stn, col_idx)
        
            dfs.append(df)
        
        df = pd.concat(dfs)

        df.to_csv(fpath, index_label='Date/Time')

        return df            
    
    def _get_historical_data_stn(self, stn, col_idx):
        """
        Download and read the 1973 - 2013 files of a single station.

        ``1973_2013.txt`` in ``self.path`` is read as a tab-separated table
        (after skipping its header, which ends with a line starting with
        ``*/``). For every row whose cell in column ``col_idx`` is a string,
        the part after the last ``.`` of that string is used as PANGAEA
        dataset id, the corresponding text file is downloaded into
        ``self.raw_data_path`` as ``{index}_{stn}.txt`` and read.

        Parameters
        ----------
        stn : str
            Station name; used in the file names and as value of the
            ``location`` column.
        col_idx : int
            Positional index of the column of ``1973_2013.txt`` that holds
            the DOIs of this station.

        Returns
        -------
        pd.DataFrame
            Data of all downloaded files stacked row-wise, with ``location``
            as first column.
        """
        # The files contain non-ASCII characters (e.g. ° and µ in the column
        # names), so the encoding is fixed instead of relying on the platform
        # default. NOTE(review): assumes PANGAEA text files are UTF-8 - confirm.
        with open(os.path.join(self.path, "1973_2013.txt"), encoding='utf-8') as fp:
            for line in fp:
                if line.startswith('*/'):
                    break
            doi_df = pd.read_csv(fp, sep='\t', index_col=0)

        dfs = []
        for year, doi in zip(doi_df.index, doi_df.iloc[:, col_idx]):
            # non-string cells (presumably missing DOIs read as NaN) are skipped
            if not isinstance(doi, str):
                continue

            dataset_id = doi.split('.')[-1]
            url = f"https://doi.pangaea.de/10.1594/PANGAEA.{dataset_id}?format=textfile"
            fname = f'{year}_{stn}.txt'
            download_and_unzip(self.raw_data_path, {fname: url}, verbosity=0)

            fpath = os.path.join(self.raw_data_path, fname)

            with open(fpath, encoding='utf-8') as fp:
                # the table starts right after the header line '*/'
                for line in fp:
                    if line.startswith('*/'):
                        break
                df_ = pd.read_csv(fp, sep='\t', index_col=0)
            df_.insert(0, 'location', stn)

            dfs.append(df_)

        df = pd.concat(dfs)
        return df