Source code for aqua_fetch.wwt.photocatalysis


__all__ = [
    "mg_degradation",
    "dye_removal",
    "dichlorophenoxyacetic_acid_removal",
    "pms_removal",
    "tetracycline_degradation",
    "tio2_degradation"
]

from typing import Union, Tuple, Any, List, Dict

import numpy as np
import pandas as pd

from ..utils import (
    check_attributes,
    LabelEncoder,
    OneHotEncoder,
    maybe_download_and_read_data,
    encode_cols
)


[docs]
def mg_degradation(
        parameters: Union[str, List[str]] = "all",
        encoding: str = None
)->Tuple[pd.DataFrame, Dict[str, Union[OneHotEncoder, LabelEncoder, Any]]]:
    """
    This data is about photocatalytic degradation of melachite green dye using
    nobel metal dobe BiFeO3. For further description of this data see
    `Jafari et al., 2023 <https://doi.org/10.1016/j.jhazmat.2022.130031>`_ and
    for the use of this data for removal efficiency prediction `see <https://github.com/ZeeshanHJ/Photocatalytic_Performance_Prediction>`_ .
    This dataset consists of 1200 points collected during ~135 experiments.

    Parameters
    ----------
        parameters : list, optional
            features to use as input. By default following features are used as input

                - ``Catalyst_type``
                - ``Surface area``
                - ``Pore Volume``
                - ``Catalyst_loading (g/L)``
                - ``Light_intensity (W)``
                - ``time (min)``
                - ``solution_pH``
                - ``HA (mg/L)``
                - ``Anions``
                - ``Ci (mg/L)``
                - ``Cf (mg/L)``
                - ``Efficiency (%)``
                - ``k_first``
                - ``k_2nd``

        encoding : str, default=None
            type of encoding to use for the two categorical features i.e., ``catalyst_type``
            and ``anions``, to convert them into numberical. Available options are ``ohe``,
            ``le`` and None. If ``ohe`` is selected the original input columns are replaced
            with ohe hot encoded columns. This will result in 6 columns for Anions and
            15 columns for catalyst_type.

    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (1200, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``catalyst_type`` and ``anions`` as keys.

    Examples
    --------
    >>> from water_datasets import mg_degradation
    >>> mg_data, encoders = mg_degradation()
    >>> mg_data.shape
    (1200, 14)
    ... # the default encoding is None, but if we want to use one hot encoder
    >>> mg_data_ohe, encoders = mg_degradation(encoding="ohe")
    >>> mg_data_ohe.shape
    (1200, 33)
    >>> encoders['catalyst_type'].inverse_transform(mg_data_ohe.loc[:, [col for col in data.columns if col.startswith('catalyst_type')]].values)
    >>> encoders['anions'].inverse_transform(mg_data_ohe.loc[:, [col for col in data.columns if col.startswith('anions')]].values)
    ... # if we want to use label encoder
    >>> mg_data_le, cat_enc, an_enc = mg_degradation(encoding="le")
    >>> mg_data_le.shape
    (1200, 14)
    >>> encoders['catalyst_type'].inverse_transform(mg_data_le.loc[:, 'catalyst_type'].values.astype(int))
    >>> encoders['anions'].inverse_transform(mg_data_le.loc[:, 'anions'].values.astype(int))
    ... # By default the target is efficiency but if we want
    ... # to use first order k as target
    >>> mg_data_k, _ = mg_degradation()
    ... # if we want to use 2nd order k as target
    >>> mg_data_k2, _ = mg_degradation()

    """

    url = "https://raw.githubusercontent.com/ZeeshanHJ/Photocatalytic_Performance_Prediction/main/Raw%20data.csv"
    data = maybe_download_and_read_data(url, "mg_degradation.csv")

    columns = {
        'Catalyst_type': 'catalyst_type',
        'Anions': 'anions',
        'Ci (mg/L)': 'ini_conc_mg/l',
        "Cf (mg/L)": 'final_conc_mg/l',
        "time (min)": 'time_min',
        'Catalyst_loading (g/L)': 'catalyst_loading_g/l',
        'Surface area': 'surface_area',
        'Pore Volume': 'pore_volume',
    }

    data.rename(columns=columns, inplace=True)

    # first order
    data["k_first"] = np.log(data['ini_conc_mg/l'] / data['final_conc_mg/l']) / data["time_min"]

    # k second order
    data["k_2nd"] = ((1 / data['final_conc_mg/l']) - (1 / data['ini_conc_mg/l'])) / data["time_min"]

    def_paras = ['surface_area', 'pore_volume', 'catalyst_loading_g/l',
                      'Light_intensity (W)', 'time_min', 'solution_pH', 'HA (mg/L)',
                      'ini_conc_mg/l', 'final_conc_mg/l', 'catalyst_type', 'anions',
                      ] + ['Efficiency (%)', 'k_first', 'k_2nd']

    parameters = check_attributes(parameters, def_paras, "parameters")

    data = data[parameters]

    # consider encoding of categorical features
    data, encoders = encode_cols(data, ['catalyst_type', 'anions'], encoding)

    return data, encoders




[docs]
def dye_removal(
        parameters: Union[str, List[str]] = "all",
        encoding: str = None
)->Tuple[pd.DataFrame, Dict[str, Union[OneHotEncoder, LabelEncoder, Any]]]:
    """
    Data from experiments conducted to measure dye removal rate from wastewater
    treatment using photocatalysis method. For more information on data see
    `Kim et al., 2024 <https://doi.org/10.1016/j.jhazmat.2023.132995>`_ .

    Parameters
    ----------
    parameters : list, optional
        features to use as input. It must be a subset of the following features

            - ``catalyst``
            - ``hydrothermal_synthesis_time_min)``
            - ``energy_Band_gap_Eg) eV``
            - ``C_%``
            - ``O_%``
            - ``Fe_%``
            - ``Al_%``
            - ``Ni_%``
            - ``Mo_%``
            - ``S_%``
            - ``Bi``
            - ``Ag``
            - ``Pd``
            - ``Pt``
            - ``surface_area_m2/g``
            - ``pore_volume_cm3/g``
            - ``pore_size_nm``
            - ``volume_L``
            - ``loading_g``
            - ``catalyst_loading_mg``
            - ``light_intensity_watt``
            - ``light_source_distance_cm``
            - ``time_m``
            - ``dye``
            - ``log_Kw``
            - ``hydrogen_bonding_acceptor_count``
            - ``hydrogen_bonding_donor_count``
            - ``solubility_g/L``
            - ``molecular_wt_g/mol``
            - ``pka1``
            - ``pka2``
            - ``dye_concentration_mg/L``
            - ``solution_pH``
            - ``HA_mg/L``
            - ``anions``

        encoding : str, default=None
            type of encoding to use for the two categorical features i.e., ``Catalyst_type``
            ``dye`` and ``Anions``, to convert them into numberical. Available options are ``ohe``,
            ``le`` and None.

    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (1200, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``catalyst_type`` and ``anions`` as keys.

    Examples
    --------
    >>> from water_datasets import dye_removal

    >>> data, encoders = dye_removal()
    >>> assert data.shape == (1527, 36)
    # using label encoding to encode the categorical variables
    >>> data, encoders = dye_removal(encoding='le')
    >>> assert data.shape == (1527, 36), data.shape
    >>> catalysts = encoders['catalyst'].inverse_transform(data.loc[:, 'catalyst'].values)
    >>> len(set(catalysts.tolist()))
    18
    >>> dye = encoders['dye'].inverse_transform(data.loc[:, "dye"].values)
    >>> set(dye.tolist())
    {'Melachite Green', 'Indigo'}
    >>> anions = encoders['anions'].inverse_transform(data.loc[:,'anions'].values)
    >>> set(anions.tolist())
    {'NaCO3', 'N/A', 'Na2SO4', 'Na2HPO4', 'NaHCO3', 'NaCl'}
    # using one hot encoding for categroicla parameters
    >>> data, encoders = dye_removal(encoding='ohe')
    >>> assert data.shape == (1527, 59), data.shape
    >>> catalysts = encoders['catalyst'].inverse_transform(data.loc[:, [col for col in data.columns if col.startswith('catalyst')]].values)
    >>> len(set(catalysts.tolist()))
    18
    >>> dye = encoders['dye'].inverse_transform(data.loc[:, ["dye_0", "dye_1"]].values)
    >>> set(dye.tolist())
    {'Melachite Green', 'Indigo'}
    >>> anions = encoders['anions'].inverse_transform(data.loc[:, [col for col in data.columns if col.startswith('anions')]].values)
    >>> set(anions.tolist())
    {'NaCO3', 'N/A', 'Na2SO4', 'Na2HPO4', 'NaHCO3', 'NaCl'}

    """

    url = "https://gitlab.com/atrcheema/bajwachor/-/raw/main/scripts/data/230613_Photocatalysis_with_Zeeshan_data_CMKim_Updated.csv"
    df = maybe_download_and_read_data(url, "dye_removal.csv")

    columns = {
        'Catalyst': 'catalyst',
        'Hydrothermal synthesis time (min)': 'hydrothermal_synthesis_time_min',
    'Energy Band gap (Eg) eV': 'energy_band_gap_eV',
        'C (At%)': 'C_%',
        'O (At%)': 'O_%',
        'Fe (At%)': 'Fe_%',
        'Al (At%)': 'Al_%',
    'Ni (At%)': "Ni_%",
    'Mo (At%)': 'Mo_%',
        'S (At%)': 'S_%',
        'Bi': 'Bi', 'Ag': 'Ag', 'Pd': 'Pd', 'Pt': 'Pt',
    'Surface area (m2/g)': "surface_area_m2/g",
        'Pore volume (cm3/g)': 'pore_volume_cm3/g',
        'Pore size (nm)': 'pore_size_nm',
    'volume (L)': 'volume_l',
        # consider one of loading or catalysing loadnig
    'loading (g)': 'loading_g',  # 'Catalyst_loading_mg',
    'Light intensity (watt)': 'light_intensity_watt',
        'Light source distance (cm)': 'light_source_dist_cm',
        'Time (m)': 'time_m',
    'Dye': 'dye',
        # pollutant (dye) properties)
    'log_Kw': 'log_kw',
        'hydrogen_bonding_acceptor_count': 'hydrogen_bonding_accep_count',
        'hydrogen_bonding_donor_count': 'hydrogen_bonding_donor_count',
    'solubility (g/L)': 'solubility_g/l',
        'molecular_wt (g/mol)': 'molecular_wt_g/M',
        'pka1': 'pka1',
        'pka2': 'pka2',
        # instead of Ci we consider Dye Concentration
    'Dye concentration (mg/L)': 'dye_conc_mg/l',
    'Solution pH': 'solution_ph',  # 'Ci',
    'HA (mg/L)': 'ha_mg/l',
    'Anions': 'anions',
    }

    df.rename(columns=columns, inplace=True)

    # first order k following https://doi.org/10.1016/j.seppur.2019.116195
    k = np.log(df["Ci"] / df["Cf"]) / df["time_m"]
    df["k_1st"] = k

    k_2nd = ((1 / df["Cf"]) - (1 / df["Ci"])) / df["time_m"]
    df["k_2nd"] = k_2nd

    # at Time 0, let k==0
    df.loc[df['time_m'] <= 0.0, "k"] = 0.0

    # when final concentration is very low, k is not calculable (will be inf)
    # therefore inserting very small value of k
    df.loc[df['Cf'] == 0.0, "k"] = 0.001

    # mass_ratio = (loading / volume )/dye_conc.

    # when no anions are present, represent them as N/A
    df.loc[df['anions'].isin(['0', 'without Anion']), "anions"] = "N/A"

    default_paras = list(columns.values()) + ['k_1st', 'k_2nd']

    parameters = check_attributes(parameters, default_paras, 'parameters')

    df = df[parameters]

    # consider encoding of categorical features
    df, encoders = encode_cols(df, ["catalyst", "dye", "anions"], encoding)

    return df, encoders




[docs]
def dichlorophenoxyacetic_acid_removal(
    parameters: Union[str, List[str]] = "all",
    encoding: str = None,
)->Tuple[pd.DataFrame, Dict[str, Union[OneHotEncoder, LabelEncoder, Any]]]:

    """
    Data for photodegradation of 2,4-dichlorophenoxyacetic acid using gold-doped bismuth ferrite

    Parameters
    ----------
    parameters : list, optional
        features to use as input. It must be a subset of the following features

            - ``catalyst``
            - ``surface_area``
            - ``pore_volume``
            - ``energy_band_gap_eV``
            - ``Au_%``
            - ``Bi_%``
            - ``Fe_%``
            - ``O_%``
            - ``catalyst_loading_g/l``
            - ``light_intensity_watt``
            - ``time_min
            - ``solution_ph``
            - ``anions``
            - ``ini_conc_mg/l``
            - ``final_conc_mg/l``
            - ``efficiency_%``

        encoding : str, default=None
            type of encoding to use for the two categorical features i.e., ``Catalyst_type``
            ``dye`` and ``Anions``, to convert them into numberical. Available options are ``ohe``,
            ``le`` and None.

    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (1200, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``catalyst_type`` and ``anions`` as keys.

    Examples
    --------
    >>> from water_datasets import dichlorophenoxyacetic_acid_removal
    ... # by default all parameters are returned
    >>> data, encoders = dichlorophenoxyacetic_acid_removal()
    >>> assert data.shape == (1044, 16), data.shape
    # using label encoding for categorical parameters
    >>> data, encoders = dichlorophenoxyacetic_acid_removal(encoding='le')
    >>> assert data.shape == (1044, 16), data.shape
    >>> catalysts = encoders['catalyst'].inverse_transform(data.loc[:, 'catalyst'].values)
    >>> assert len(set(catalysts.tolist())) == 7
    >>> anions = encoders['anions'].inverse_transform(data.loc[:,'anions'].values)
    >>> set(anions.tolist())
    {'Na2SO4', 'Without Anions', 'Na2HPO4', 'NaHCO3', 'NaCO3', 'NaCl'}
    # using one hot encoding for categorical parameters
    >>> data, encoders = dichlorophenoxyacetic_acid_removal(encoding='ohe')
    >>> assert data.shape == (1044, 27), data.shape
    >>> catalysts = encoders['catalyst'].inverse_transform(data.loc[:, ['catalyst_0', 'catalyst_1', 'catalyst_2',
       'catalyst_3', 'catalyst_4', 'catalyst_5', 'catalyst_6']].values)
    >>> assert len(set(catalysts.tolist())) == 7
    >>> anions = encoders['anions'].inverse_transform(data.loc[:, [col for col in data.columns if col.startswith('anions')]].values)
    >>> set(anions.tolist())
    {'Na2SO4', 'Without Anions', 'Na2HPO4', 'NaHCO3', 'NaCO3', 'NaCl'}

    """

    url = "https://gitlab.com/atrcheema/envai106/-/raw/main/data/data.xlsx"
    data = maybe_download_and_read_data(url, "dichlorophenoxyacetic_acid_removal.xlsx")

    columns = {
        'Catalyst type': 'catalyst',
        'Surface area': 'surface_area',
        'Pore volume': 'pore_volume',
        'BandGap (eV)': 'energy_band_gap_eV',
        'Au': 'Au_%',
        'Bi': 'Bi_%',
        'Fe': 'Fe_%',
        'O': 'O_%',
        'Catalyst loading (g/L)': 'catalyst_loading_g/l',
        'Light intensity (W)': 'light_intensity_watt',
        'time (min)': 'time_min',
        'solution pH': 'solution_ph',
        'Anions': 'anions',
        'Ci (mg/L)': 'ini_conc_mg/l',
        'Cf (mg/L)': 'final_conc_mg/l',
        'Efficiency (%)': 'efficiency_%',
    }

    data.rename(columns=columns, inplace=True)

    default_parameters = list(columns.values())

    parameters = check_attributes(parameters, default_parameters, 'parameters')

    data = data[parameters]

    data, encoders = encode_cols(data, ['catalyst', 'anions'], encoding)
    return data, encoders




[docs]
def pms_removal(
    parameters: Union[str, List[str]] = "all",
    encoding: str = None,
)->Tuple[pd.DataFrame, Dict[str, Union[OneHotEncoder, LabelEncoder, Any]]]:

    """
    Data for photodegradation of phenol using peroxymonosulfate.

    Parameters
    ----------
        parameters : list, optional
            Names of the parameters to use. By default following parameters are used

                - ``time_min``
                - ``catalyst_type``
                - ``magnetization_Ms_emu/g``
                - ``energy_band_gap_eV``
                - ``calcination_temp_C``
                - ``min_calcination_time``
                - ``surface_area``
                - ``pore_size``
                - ``pollutant``
                - ``poll_mol_formula``
                - ``pms_concentration_g/l``
                - ``light_intensity_watt``
                - ``light_type``
                - ``catalyst_dosage_g/l``
                - ``ini_conc_ppm``
                - ``solution_ph``
                - ``H2O2_Conc_ppm``
                - ``volume_ml``
                - ``stirring_speed_rpm``
                - ``radical_scavenger``
                - ``inorganic anions``
                - ``water_type``
                - ``cycle_num``
                - ``final_conc_ppm``
                - ``removal_efficiency_%``
        encoding : str, default=None
            type of encoding to use for the two categorical features i.e., ``Catalyst_type``
            ``dye`` and ``Anions``, to convert them into numberical. Available options are ``ohe``,
            ``le`` and None.
        
    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (2078, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``catalyst_type``, ``pollutant``, ``poll_mol_formula`` and ``water_type`` as keys.
    
    Examples
    --------
    >>> from water_datasets import pms_removal
    >>> data, encoders = pms_removal()
    >>> data.shape
    (2078, 25)
    ... # the default encoding is None, but if we want to use one hot encoder
    >>> data_ohe, encoders = pms_removal(encoding="ohe")
    >>> data_ohe.shape
    (2078, 100)
    >>> catalysts = encoders['catalyst_type'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('catalyst_type')]].values)
    >>> len(set(catalysts))
    42
    >>> pollutants = encoders['pollutant'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('pollutant')]].values)
    >>> len(set(pollutants))
    14
    >>> poll_mol_formula = encoders['poll_mol_formula'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('poll_mol_formula')]].values)
    >>> len(set(poll_mol_formula))
    14
    >>> water_type = encoders['water_type'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('water_type')]].values)
    >>> len(set(water_type))
    9
    ... # if we want to use label encoder
    >>> data_le, encoders = pms_removal(encoding="le")
    >>> data_le.shape
    (2078, 25)
    >>> catalysts = encoders['catalyst_type'].inverse_transform(data_le.loc[:, 'catalyst_type'].values)
    >>> len(set(catalysts))
    42
    >>> pollutants = encoders['pollutant'].inverse_transform(data_le.loc[:, 'pollutant'].values)
    >>> len(set(pollutants))
    14
    >>> poll_mol_formula = encoders['poll_mol_formula'].inverse_transform(data_le.loc[:, 'poll_mol_formula'].values)
    >>> len(set(poll_mol_formula))
    14
    >>> water_type = encoders['water_type'].inverse_transform(data_le.loc[:, 'water_type'].values)
    >>> len(set(water_type))
    9
    """

    url = "https://gitlab.com/atrcheema/envai105/-/raw/main/data/Final_data_sheet_0716.xlsx"
    data = maybe_download_and_read_data(url, "pms_removal.xlsx")

    columns = {
        'time (min)': 'time_min',
        'Photocatalyst': 'catalyst_type',
        'Magnetization (Ms) (emu/g)': 'magnetization_Ms_emu/g',
        'band gap energy Eg (eV)': 'energy_band_gap_eV',
        'Calcination Temp. (oC)': 'calcination_temp_C',
        'Calcination Time (min)': 'min_calcination_time',
    'Surface area': 'surface_area',
        'Pore size': 'pore_size',
    'Pollutant': 'pollutant',
        'Pollutant molecular formula': 'poll_mol_formula',
    'PMS concentration (g/L)': 'pms_concentration_g/l',
        'Light intensity (W)': 'light_intensity_watt',
        'Light type': 'light_type',
        'Catalyst dosage (g/L)': 'catalyst_dosage_g/l',
        'Initial concentration (ppm)': 'ini_conc_ppm',
        'Solution pH': 'solution_ph',
        'H2O2 Concentration (mM)': 'H2O2_Conc_ppm',
        'Volume (mL)': 'volume_ml',
        'stirring speed (rpm)': 'stirring_speed_rpm',
        'Radical Scavenger': 'radical_scavenger',
        'Inorganic Anions': 'inorganic anions',
    'Type of the water': 'water_type',
        'No of Cycle': 'cycle_num',
        'Final Concentration (ppm)': 'final_conc_ppm',
        'Removal efficiency (%)': 'removal_efficiency_%'
    }
    data.rename(columns=columns, inplace=True)

    default_parameters = list(columns.values())

    parameters = check_attributes(parameters, default_parameters, 'parameters')

    data = data[parameters]

    data, encoders = encode_cols(
        data,
        ['catalyst_type', 'pollutant', 'poll_mol_formula', 'water_type'],
        encoding)
    return data, encoders




[docs]
def tetracycline_degradation(
    parameters: Union[str, List[str]] = "all",
    encoding: str = None,
)->Tuple[pd.DataFrame, dict]:
    """
    Data for photodegradation of tetracycline. For details on data see
    `Abdi et al., 2022 <https://doi.org/10.1016/j.chemosphere.2021.132135>`_ .

    Parameters
    ----------
    parameters : list, optional
        Names of the parameters to use. By default, following parameters are used

            - ``surf_area_m2g``
            - ``pore_vol_cm3g``
            - ``catalyst_dosage_gL``
            - ``antibiotic_dosage_mgL``
            - ``illumination_time_min``
            - ``pH``
            - ``metallic_org_framework``
            - ``efficiency_%``

    encoding : str, default=None
            type of encoding to use for the categorical features. It can be either
            'ohe', 'le' or None. If 'ohe' is selected the original categroical column
            (``metallic_org_framework``) is replaced with one hot encoded columns.
            If 'le' is selected the original column is replaced with a label encoded column.
            If None is selected, the original column is not replaced.
        
    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (474, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``metallic_org_framework`` as key.
    
    Examples
    --------
    >>> from water_datasets import tetracycline_degradation
    >>> data, encoders = tetracycline_degradation()
    >>> data.shape
    (374, 8)

    >>> data, encoders = tetracycline_degradation(encoding='le')
    >>> data.shape
    (374, 8)
    >>> mofs = encoders['metallic_org_framework'].inverse_transform(data.loc[:, 'metallic_org_framework'].values)
    >>> len(set(mofs))
    10

    >>> data, encoders = tetracycline_degradation(encoding='ohe')
    >>> data.shape
    (374, 17)
    >>> mofs = encoders['metallic_org_framework'].inverse_transform(data.loc[:, [col for col in data.columns if col.startswith('metallic_org_framework')]].values)
    >>> len(set(mofs))
    10

    """

    url = "https://ars.els-cdn.com/content/image/1-s2.0-S0045653521026072-mmc1.zip"

    data = maybe_download_and_read_data(url, "tetracycline_degradation.csv")

    columns = {
        'Surface area (m2/g)': 'surf_area_m2g',
        'Pore Volume (cm3/g)': 'pore_vol_cm3g',
        'Catalyst dosage (g/L)': 'catalyst_dosage_gL',
        'Antibiotic dosage (mg/L)': 'antibiotic_dosage_mgL',
        'Illumination time (min)': 'illumination_time_min',
        'pH': 'pH',
        'Metallic organic framework': 'metallic_org_framework',
        'Degradation efficiency (%)': 'efficiency_%',
    }

    data.rename(columns=columns, inplace=True)

    parameters = check_attributes(parameters, list(columns.values()), 'parameters')

    data = data[parameters]

    data, encoders = encode_cols(data, ['metallic_org_framework'], encoding)

    return data, encoders




[docs]
def tio2_degradation(
    parameters: Union[str, List[str]] = "all",
    encoding: str = None,
)->Tuple[pd.DataFrame, dict]:
    """
    Data for photodegradation of tio2

    For details on data see `Jiang et al., 2020 <https://doi.org/10.1016/j.envres.2020.109697>`_ .

    Parameters
    ----------
    parameters : list, optional
        Names of the parameters to use. By default following parameters are used

            - ``OC``
            - ``i_mWpercm2``
            - ``temp_C``
            - ``D_gl``
            - ``C0_mgl``
            - ``pH``
            - ``neglog_k_permin``

    encoding : str, default=None
        type of encoding to use for the categorical features.

    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (446, len(parameters))
        while the second element is an empty dictionary.
    
    Examples
    --------
    >>> from water_datasets import tio2_degradation
    >>> data, encoders = tio2_degradation()
    >>> data.shape
    (446, 7)
    """

    url = "https://ars.els-cdn.com/content/image/1-s2.0-S0045653521026072-mmc1.zip"

    data = maybe_download_and_read_data(url, "tio2_degradation.csv")

    columns = {
        'OC': 'OC',
        'I_mW/cm2': 'i_mWpercm2',
        'T_C': 'temp_C',
        'D_gl': 'D_gl',
        'C0_mgl': 'C0_mgl',
        'pH': 'pH',
        'neglog_k_permin': 'neglog_k_permin',
    }

    data.rename(columns=columns, inplace=True)

    parameters = check_attributes(parameters, list(columns.values()), 'parameters')

    data = data[parameters]

    data, encoders = encode_cols(data, [], encoding)

    return data, encoders




[docs]
def photodegradation_Jiang(
        parameters: Union[str, List[str]] = "all",
        encoding: str = None,
)->Tuple[pd.DataFrame, Dict[str, Union[OneHotEncoder, LabelEncoder, Any]]]:
    """
    Data for photodegradation of multiple pollutants using various photocatalysts.
    For details on data see `Jiang et al., 2021 <https://doi.org/10.3390/catal11091107>`_ .

    Parameters
    ----------
        parameters : list, optional
            Names of the parameters to use. By default following parameters are used

                - ``photocatalyst``
                - ``contaminants``
                - ``photocat_dosage_gl``
                - ``photocat_size_nm``
                - ``initial_conc_mgl``
                - ``pH``
                - ``light_type``
                - ``k_min-1``
        
        encoding : str, default=None
            type of encoding to use for the categorical features.
            It can be either ``ohe``, ``le`` or None. If ``ohe`` is selected the original categroical column
            is replaced with one hot encoded columns. If ``le`` is selected the original column is replaced
            with a label encoded column. If None is selected, the original column is not replaced.
    
    Returns
    --------
    tuple
        A tuple of length two. The first element is a DataFrame of shape (446, len(parameters))
        while the second element is a dictionary consisting of encoders with
        ``photocatalyst`` and ``contaminants`` as keys.
    
    Examples
    --------
    >>> from water_datasets import photodegradation_Jiang
    >>> data, encoders = photodegradation_Jiang()
    >>> data.shape
    (449, 8)
    ... # the default encoding is None, but if we want to use one hot encoder
    >>> data_ohe, encoders = photodegradation_Jiang(encoding="ohe")
    >>> data_ohe.shape
    (449, 16)
    >>> photocatalysts = encoders['photocatalyst'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('photocatalyst')]].values)
    >>> len(set(photocatalysts))
    100
    >>> contaminants = encoders['contaminants'].inverse_transform(data_ohe.loc[:, [col for col in data.columns if col.startswith('contaminants')]].values)
    >>> len(set(contaminants))
    47
    ... # if we want to use label encoder
    >>> data_le, encoders = photodegradation_Jiang(encoding="le")
    >>> data_le.shape
    (449, 8)
    >>> photocatalysts = encoders['photocatalyst'].inverse_transform(data_le.loc[:, 'photocatalyst'].values)
    >>> len(set(photocatalysts))
    100
    >>> contaminants = encoders['contaminants'].inverse_transform(data_le.loc[:, 'contaminants'].values)
    >>> len(set(contaminants))
    47    
    """

    url = "https://www.mdpi.com/2073-4344/11/9/1107#app1-catalysts-11-01107"

    data = maybe_download_and_read_data(url, "photodegradation_Jiang.csv")

    # replace "N/A" with np.nan in photocat_size_nm column
    data.loc[data["Photocat. size (nm)"] == "N/A", "Photocat. size (nm)"] = np.nan
    data.loc[data["Photocat. size (nm)"] == "N/A ", "Photocat. size (nm)"] = np.nan
    # convert '100-4000 ' to np.nan
    data.loc[data["Photocat. size (nm)"] == '100-4000 ', "Photocat. size (nm)"] = np.nan
    # convert '~200 ' to 200
    data.loc[data["Photocat. size (nm)"] == '~200 ', "Photocat. size (nm)"] = 200
    # convert '>1000 ' to np.nan
    data.loc[data["Photocat. size (nm)"] == '>1000 ', "Photocat. size (nm)"] = np.nan
    # convert '20-50 ' to np.nan
    data.loc[data["Photocat. size (nm)"] == '20-50 ', "Photocat. size (nm)"] = np.nan
    # convert '<44000 ' to np.nan
    data.loc[data["Photocat. size (nm)"] == '<44000 ', "Photocat. size (nm)"] = np.nan
    data.loc[data["pH"] == "N/A", "pH"] = np.nan
    data.loc[data["pH"] == "N/A ", "pH"] = np.nan
    # convert "Photocat. size (nm)" and 'pH' to float
    #data["Photocat. size (nm)"] = data["Photocat. size (nm)"].astype(np.float32)
    data['pH'] = data['pH'].astype(np.float32)

    # remove trailing empty space in contaiminant column
    data["Contaminants"] = data["Contaminants"].str.strip()

    columns = {
        "Photocatalyst": "photocatalyst",
        "Contaminants": "contaminants",
        "Photocat. dosage (g/L)": "photocat_dosage_gl",
        "Photocat. size (nm)": "photocat_size_nm",
        "Initial conc. (mg/L)": "initial_conc_mgl",
        "pH": "pH",
        "Light type": "light_type",
        "k (min-1)": "k_min-1"
    }

    data.rename(columns=columns, inplace=True)

    parameters = check_attributes(parameters, list(columns.values()), 'parameters')

    data = data[parameters]


    data, encoders = encode_cols(data, ['photocatalyst', 
                                        'contaminants'
                                        ], encoding)

    return data, encoders