# Source code for ampworks.datasets

"""
The `datasets` module provides example datasets bundled with `ampworks`. The
available functions allow users to list, download, or load in example datasets.
The datasets are used in tutorials and tests. They provide a convenient intro
to package functions without the overhead of requiring users to perform their
own experiments.

Datasets come from combinations of real-world experiments and model-generated
data (ECM, SPM, P2D). While the `ampworks` algorithms are designed to work with
real experimental data, model-generated data has also been useful in testing and
demonstrating the algorithms in a controlled setting. Note that the included
datasets are not intended to cover all use cases, and users are encouraged to
apply the algorithms to their own data after learning from examples. Datasets
are organized into subfolders by module, e.g., `ici` for ICI datasets. A brief
description of each dataset is given below:

dQdV datasets:
    1. `cell1_rough` - noisy beginning of life full cell pseudo-OCV curve
    2. `cell1_smooth` - smoothed version of `cell1_rough`
    3. `cell2_rough` - noisy aged full cell pseudo-OCV curve
    4. `cell2_smooth` - smoothed version of `cell2_rough`
    5. `gr_smooth` - smoothed graphite electrode pseudo-OCP voltage curve
    6. `nmc_smooth` - smoothed NMC electrode pseudo-OCP voltage curve

GITT datasets:
    1. `gitt_charge` - example GITT data (using charge/rest sequences)
    2. `gitt_discharge` - example GITT data (using discharge/rest sequences)

HPPC datasets:
    1. `hppc_discharge` - example HPPC data (using discharge sequences)

ICI datasets:
    1. `ici_charge` - example ICI data (using charge/rest sequences)
    2. `ici_discharge` - example ICI data (using discharge/rest sequences)

"""

from __future__ import annotations

from typing import TYPE_CHECKING
from warnings import catch_warnings, filterwarnings

import os
import shutil
import pathlib

if TYPE_CHECKING:  # pragma: no cover
    from ampworks import Dataset

# Names exported by `from ampworks.datasets import *`.
__all__ = [
    'download_all',
    'list_datasets',
    'load_datasets',
]

# Folder named 'resources' located next to this file; the functions in this
# module read the example datasets from here.
RESOURCES = pathlib.Path(os.path.dirname(__file__), 'resources')

# Names of the entries found directly inside `RESOURCES`, captured once at
# import time. `list_datasets` treats each entry as a module subfolder name
# (e.g., 'ici'). NOTE(review): `os.listdir` returns entries in arbitrary order
# and would also include any non-folder entries; presumably `resources` only
# holds subfolders -- confirm against the packaged data.
DATAFOLDERS = os.listdir(RESOURCES)



# [docs]
def list_datasets(*modules: str) -> list[str]:
    """
    List names of available example datasets.

    Parameters
    ----------
    *modules : str, optional
        If given, only list datasets related to the given module(s) ('gitt',
        'ici', etc.). Leaving empty (default) lists all datasets.

    Returns
    -------
    names : list[str]
        Example file names from an internal `resources` folder, formatted as
        '<module>/<filename>'. Names are grouped by module in the order the
        modules were requested (alphabetical when none are given), and files
        are listed alphabetically within each module.

    Raises
    ------
    ValueError
        Requested module(s) not found or empty. See the list of modules that
        have available datasets by printing `ampworks.datasets.DATAFOLDERS`.

    Examples
    --------
    The code snippets below show how to use the `list_datasets` function to list
    available datasets. The first example lists all datasets, while the second
    and third examples filter the list by module name.

    >>> from ampworks.datasets import list_datasets
    >>> names = list_datasets()
    >>> print(names)

    >>> names = list_datasets('gitt')
    >>> print(names)

    >>> names = list_datasets('gitt', 'ici')
    >>> print(names)

    """
    missing = set(modules) - set(DATAFOLDERS)
    if missing:
        raise ValueError(f"Requested module(s) not found, or empty: {missing=}."
                         f" Available modules are {DATAFOLDERS=}.")

    # `os.listdir` gives entries in arbitrary, platform-dependent order, so
    # sort both the default module list and each folder's files to keep the
    # returned listing reproducible across machines.
    selected = modules or sorted(DATAFOLDERS)

    names = []
    for m in selected:
        files = sorted(os.listdir(RESOURCES.joinpath(m)))
        # Always join with '/', regardless of OS, so the names are portable
        names.extend(f"{m}/{f}" for f in files)

    return names




# [docs]
def download_all(path: str | os.PathLike | None = None) -> None:
    """
    Copy the bundled example datasets to a folder on disk.

    Parameters
    ----------
    path : str or PathLike or None, optional
        Parent directory in which an `ampworks_datasets` folder is created
        and filled with the example datasets. When None (default), or any
        other falsy value, the current working directory is used.

    Returns
    -------
    None
        The datasets are written to disk; nothing is returned.

    """
    parent = pathlib.Path(path) if path else pathlib.Path('.')
    target = parent / 'ampworks_datasets'

    # Make sure the destination (and any missing parents) exists up front
    target.mkdir(parents=True, exist_ok=True)

    # Merge into the destination, so repeated calls do not fail on an
    # already-existing folder
    shutil.copytree(RESOURCES, target, dirs_exist_ok=True)




# [docs]
def load_datasets(*names: str) -> Dataset | tuple[Dataset, ...]:
    """
    Load example datasets by name.

    Parameters
    ----------
    *names : str
        One or more dataset names to load. Check `list_datasets()` for available
        filenames. Note that including the '.csv' extension is optional.

    Returns
    -------
    data : Dataset or tuple[Dataset]
        A single dataset if one name, otherwise a tuple of datasets in the same
        order as the given `names`.

    Raises
    ------
    ValueError
        No names were given, or a requested dataset is not available.

    Examples
    --------
    In the following example, the `load_datasets` function is used to load a
    single HPPC dataset and the optional `.csv` extension is included. The names
    of the available datasets can be found using the `list_datasets` function.

    >>> from ampworks.datasets import load_datasets
    >>> hppc_data = load_datasets('hppc/hppc_discharge.csv')
    >>> print(hppc_data)

    In the next example, two ICI datasets are loaded at once by providing their
    names. Here, the `.csv` extension is omitted, but the function internally
    appends it as needed. The returned datasets are provided in the same order
    as the given names.

    >>> ici_c, ici_d = load_datasets('ici/ici_charge', 'ici/ici_discharge')
    >>> print(ici_c)
    >>> print(ici_d)

    """
    # Check the cheapest precondition first so a bad call fails before any
    # directory scan or package import takes place.
    if not names:
        raise ValueError("At least one dataset name must be given.")

    # The '.csv' extension is optional for callers; add it where missing
    names = [n if n.endswith('.csv') else n + '.csv' for n in names]

    available = set(list_datasets())
    not_available = [n for n in names if n not in available]
    if not_available:
        raise ValueError(f"Requested dataset(s) not found: {not_available}.")

    # Imported inside the function rather than at module level.
    # NOTE(review): presumably to avoid a circular import with the parent
    # package -- confirm before moving it to the top of the file.
    from ampworks import read_csv

    datasets = []
    for name in names:
        # Silence only the 'No valid aliases' warning while reading; any
        # other warning still reaches the caller
        with catch_warnings():
            filterwarnings('ignore', message='.*No valid aliases.*')
            data = read_csv(RESOURCES.joinpath(name))

        datasets.append(data)

    if len(datasets) == 1:
        return datasets[0]

    return tuple(datasets)