Source code for aqua_fetch.wq._grimedb

__all__ = ["GRiMeDB"]

import os
from typing import Union, List

import pandas as pd
import numpy as np

from .._datasets import Datasets
from ..utils import check_attributes



[docs]
class GRiMeDB(Datasets):
    """
    Global river database of methane concentrations and fluxes
    from 5029 stations of 305 rivers following
    `Stanley et al., 2023 <https://doi.org/10.5194/essd-15-2879-2023>`_

    The data is read from four csv files (``concentrations.csv``,
    ``fluxes.csv``, ``sites.csv`` and ``sources.csv``) located in ``self.path``.
    Every accessor re-reads its csv file on each call.

    Examples
    --------
    >>> from water_datasets import GRiMeDB
    >>> ds = GRiMeDB(path='/path/to/dataset')
    >>> ds.stations()
    >>> ds.streams
    >>> ds.stn_coords()
    >>> ds.shape
    5029, 2
    """
    url = {
        "concentrations.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-ntl.420.1&entityid=ba3e270bcab8ace5d157c995e4b791e4",
        "fluxes.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-ntl.420.1&entityid=1a559f00566ed9f9f33ccb0daab0bef5",
        "sites.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-ntl.420.1&entityid=3faa64303d5f5bcd043bb88f6768e603",
        "sources.csv": "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-ntl.420.1&entityid=3615386d27a2d148be09e70ac22799e4"
    }

    def __init__(self, path=None, **kwargs):
        """
        Parameters
        ----------
        path : str, optional
            forwarded to the parent class and also stored as ``ds_dir``.
        **kwargs
            forwarded unchanged to the parent class.
        """
        super().__init__(path=path, **kwargs)
        self.ds_dir = path
        # NOTE(review): ``_download`` comes from the parent class; presumably
        # it fetches the files listed in ``url`` -- confirm against Datasets.
        self._download()

        # Cached once; the IDs are strings because ``sites`` reads
        # ``Site_ID`` with dtype str.
        self._stations = self.sites()['Site_ID'].unique().tolist()

    def stations(self) -> List[str]:
        """returns the unique ``Site_ID`` values of sites.csv as strings"""
        return self._stations

    @property
    def streams(self) -> List[str]:
        """returns names of streams"""
        return self.sites()['Stream_Name'].unique().tolist()

    def stn_coords(self) -> pd.DataFrame:
        """
        Returns the coordinates of all the stations in the dataset in wgs84
        projection.

        Returns
        -------
        pd.DataFrame
            A dataframe with columns 'lat', 'long'
        """
        coords = self.sites()[['Latitude', 'Longitude']]
        coords = coords.rename(columns={'Latitude': 'lat', 'Longitude': 'long'})
        return coords.astype(np.float32)

    @staticmethod
    def _select_sites(df: pd.DataFrame, site_ids) -> pd.DataFrame:
        """
        Returns the rows of ``df`` whose ``Site_ID`` is one of ``site_ids``.

        The comparison is made on the string form of ``Site_ID``. Station IDs
        handed out by this class are strings (see ``sites``), whereas the
        data files are read without a dtype for ``Site_ID``, so pandas may
        infer a numeric column; a direct ``isin`` of numbers against strings
        matches nothing and would silently return an empty dataframe.
        The dtype of the returned ``Site_ID`` column is left as it was read.
        """
        wanted = [str(site_id) for site_id in site_ids]
        return df[df['Site_ID'].astype(str).isin(wanted)]

    def concentrations(
            self,
            stations: Union[str, List[str]] = "all",
            streams: Union[str, List[str]] = "all",
            parameters: Union[str, List[str]] = "all"
            ):

        """
        Get concentrations data.

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            station ID or list of station IDs, by default "all".
            If given, then ``streams`` must not be given. Check `.stations()` method
            for available stations.
        streams : Union[str, List[str]], optional
            stream name or list of stream names, by default "all".
            If given, then ``stations`` must not be given. Check `.streams` attribute
            for available streams.
        parameters : Union[str, List[str]], optional
            column name or list of column names of concentrations.csv to
            return, by default "all" which returns every column.

        Returns
        -------
        The contents of concentrations.csv restricted to the requested
        stations/streams and columns. ``Date_start`` and ``Date_end`` are
        converted with ``pd.to_datetime`` and ``pH`` is read as float32.

        Raises
        ------
        ValueError
            if both ``stations`` and ``streams`` are given.
        """

        if stations != "all" and streams != "all":
            raise ValueError("Either stations or streams must be provided, not both.")

        fpath = os.path.join(self.path, 'concentrations.csv')

        df = pd.read_csv(fpath,
                         dtype={
                             'pH': np.float32,
                             #'Site_ID': int,
                             #'Aggregated_Space': bool,
                             #'Aggregated_Time': bool,
                             #'FluxYesNo': bool
                                },
                         # NOTE: the per-value converters are slow on this file
                         converters={'Date_start': pd.to_datetime,
                                     'Date_end': pd.to_datetime},
                         # '.' is used as a missing-value marker in the pH column
                         na_values={'pH': '.'})

        if stations != "all":
            stations = check_attributes(stations, self._stations, 'stations')
            df = self._select_sites(df, stations)
        elif streams != "all":
            # read sites.csv once and use it both for validating the stream
            # names and for mapping them to station IDs
            sites = self.sites()
            available_streams = sites['Stream_Name'].unique().tolist()
            streams = check_attributes(streams, available_streams, 'streams')
            stations = sites.loc[sites['Stream_Name'].isin(streams), 'Site_ID'].values.tolist()
            df = self._select_sites(df, stations)

        if parameters != "all":
            df = df[parameters]

        return df

    def sites(self):
        """
        returns the contents of sites.csv as a pandas dataframe with the
        ``Site_Name``, ``Stream_Name``, ``Basin_Region`` and ``Site_ID``
        columns read as strings
        """
        fpath = os.path.join(self.path, 'sites.csv')
        df = pd.read_csv(
            fpath,
            dtype={'Site_Name': str, 'Stream_Name': str, 'Basin_Region': str, 'Site_ID': str}
            )
        return df

    def fluxes(
            self,
            stations: Union[str, List[str]] = "all",
            ) -> pd.DataFrame:
        """
        returns fluxes data as a pandas dataframe

        Parameters
        ----------
        stations : Union[str, List[str]], optional
            station ID or list of station IDs, by default "all" which
            returns the data of all stations. Check `.stations()` method
            for available stations.
        """
        fpath = os.path.join(self.path, 'fluxes.csv')
        df = pd.read_csv(fpath)

        if stations != "all":
            stations = check_attributes(stations, self._stations, 'stations')
            df = self._select_sites(df, stations)
        return df

    def sources(self):
        """returns the contents of sources.csv as a pandas dataframe"""
        fpath = os.path.join(self.path, 'sources.csv')
        df = pd.read_csv(fpath)
        return df