# Source code for ampworks.mathutils

"""
General-purpose mathematical utilities for array and numerical computations.
Provides reusable functions to simplify common tasks in data analysis and math.

"""
from __future__ import annotations

from typing import Sequence, TYPE_CHECKING

import numpy as np
import pandas as pd

# Names exported by `from ampworks.mathutils import *`.
__all__ = [
    'combinations',
    'aggregate_over_x',
]

# `Dataset` is imported here for static type checkers only; at runtime
# `aggregate_over_x` imports it inside the function body instead.
# NOTE(review): presumably this avoids a circular import with the top-level
# `ampworks` package - confirm.
if TYPE_CHECKING:  # pragma: no cover
    from ampworks import Dataset



# [docs]
def aggregate_over_x(
    datasets: Sequence[Dataset],
    x: str,
    y: str,
    n: int = 100,
) -> Dataset:
    """
    Aggregate datasets over a shared `x` grid.

    The function finds the overlapping range of `x` across all datasets,
    interpolates each dataset's `y` onto an evenly spaced `x` grid, and then
    computes summary statistics for `y` at each grid point.

    Parameters
    ----------
    datasets : Sequence[Dataset]
        Datasets to aggregate. Each dataset must contain requested x/y columns.
    x : str
        Column name used to build the interpolation grid.
    y : str
        Column name interpolated and aggregated on the shared grid.
    n : int, optional
        Number of evenly spaced points in the shared grid. Default is 100.

    Returns
    -------
    data : Dataset
        Dataset with columns for the `x` grid and aggregated `y` statistics,
        including: mean, standard deviation, minimum, and maximum. The columns
        are named `x`, `{y}_mean`, `{y}_std`, `{y}_min`, and `{y}_max`.

    Raises
    ------
    TypeError
        If any input argument has an invalid type.
    ValueError
        If `datasets` is empty, if `n < 2`, if the requested columns are missing
        from any dataset, or if the datasets have no overlapping range in `x`
        (including the case where a dataset has no valid, i.e., non-NaN, `x`
        values at all).

    Notes
    -----
    Rows do not need to be sorted by `x`. Rows whose `x` value is NaN are
    ignored. When a dataset contains duplicate `x` values, the `y` value from
    the first such row (in the dataset's original row order) is used.

    The standard deviation is the sample standard deviation (`ddof=1`). When
    only a single dataset is given it is reported as zero.

    Examples
    --------
    The code snippet below demonstrates how to use `aggregate_over_x`. Here, we
    load a beginning-of-life and end-of-life cell dataset from the `datasets`
    subpackage. Combining these datasets has no particular physical meaning, but
    serves to illustrate the function.

    .. code-block:: python

        import ampworks as amp
        import matplotlib.pyplot as plt

        data1, data2 = amp.datasets.load_datasets(
            'dqdv/cell1_rough', 'dqdv/cell2_rough',
        )

        avg = amp.mathutils.aggregate_over_x([data1, data2], 'Volts', 'Ah')
        dwn = avg.downsample(n=25)

        errbar = plt.errorbar(
            dwn['Ah_mean'], dwn['Volts'], xerr=dwn['Ah_std'], fmt='.',
        )
        fill_x = plt.fill_betweenx(
            dwn['Volts'], dwn['Ah_min'], dwn['Ah_max'], alpha=0.2,
        )

        plt.legend([errbar, fill_x], ["Mean +/- Std", "Min-Max Range"])
        plt.xlabel("Discharge Capacity [Ah]")
        plt.ylabel("Voltage [V]")

        plt.show()

    """
    from ampworks import Dataset
    from ampworks._checks import _check_columns, _check_type

    _check_type('datasets', datasets, Sequence)
    _check_type('x', x, str)
    _check_type('y', y, str)
    _check_type('n', n, int)

    if len(datasets) == 0:
        raise ValueError("'datasets' must contain at least one dataset.")
    if n < 2:
        raise ValueError("'n' must be at least 2.")

    for i, data in enumerate(datasets):
        _check_type(f"datasets[{i}]", data, (Dataset, pd.DataFrame))
        _check_columns(data, [x, y])

    # pandas min/max skip NaN, so a NaN bound means that the column has no
    # valid x values at all (empty or all-NaN). NaN compares False against
    # everything, so it must be rejected explicitly rather than via lo >= hi.
    lows = [data[x].min() for data in datasets]
    highs = [data[x].max() for data in datasets]
    if any(pd.isna(bound) for bound in (*lows, *highs)):
        raise ValueError(f"No overlapping range found for x='{x}'.")

    lo, hi = max(lows), min(highs)
    if lo >= hi:
        raise ValueError(f"No overlapping range found for x='{x}'.")

    x_grid = np.linspace(lo, hi, n)

    interpolated = np.empty((len(datasets), n))
    for i, data in enumerate(datasets):
        x_vals = data[x].to_numpy()
        y_vals = data[y].to_numpy()

        # The bounds above ignore NaN x values, so drop those rows here too;
        # np.interp requires increasing sample points without NaN.
        valid = ~pd.isna(x_vals)
        x_vals, y_vals = x_vals[valid], y_vals[valid]

        # np.unique returns the sorted unique x values together with the index
        # of each value's first occurrence in the (unsorted) input. This both
        # orders the samples for np.interp and deterministically keeps the
        # first row of any duplicated x value.
        uniq_x, first_idx = np.unique(x_vals, return_index=True)

        interpolated[i] = np.interp(x_grid, uniq_x, y_vals[first_idx])

    # A sample standard deviation (ddof=1) is undefined for a single dataset,
    # so report zeros instead of NaN in that case.
    zero_std = np.zeros(n)
    use_std = len(datasets) > 1

    return Dataset({
        x: x_grid,
        f"{y}_mean": interpolated.mean(axis=0),
        f"{y}_std": interpolated.std(axis=0, ddof=1) if use_std else zero_std,
        f"{y}_min": interpolated.min(axis=0),
        f"{y}_max": interpolated.max(axis=0),
    })




# [docs]
def combinations(
    values: Sequence[np.ndarray],
    names: Sequence[str] | None = None,
) -> list[dict]:
    """
    Generate all value combinations.

    Builds the Cartesian product of the arrays in `values` and returns one
    dictionary per combination, keyed by the corresponding variable name.

    Parameters
    ----------
    values : Sequence[1D array]
        Variable values. Array `i` corresponds to `names[i]`, if provided.
    names : Sequence[str], optional
        Variable names. Defaults to `range(N)` when not provided, where `N`
        is the length of 'values', i.e., how many arrays are in the sequence.

    Returns
    -------
    combinations : list[dict]
        Dictionaries for each possible combination of values. The last
        variable varies fastest. If any array in `values` is empty the list
        is empty; if `values` itself is empty the list holds one empty dict.

    Raises
    ------
    ValueError
        If `names` is provided and its length differs from that of `values`.

    """
    import itertools

    if names is None:
        names = list(range(len(values)))
    elif len(names) != len(values):
        # zip() would otherwise silently drop the unmatched names/values.
        raise ValueError(
            f"'names' has length {len(names)} but 'values' has length"
            f" {len(values)}; they must match."
        )

    return [dict(zip(names, combo)) for combo in itertools.product(*values)]