Source code for ampworks._core._dataset

from __future__ import annotations

import numpy as np
import pandas as pd
import plotly.express as px



[docs]
class Dataset(pd.DataFrame):
    """
    General dataset.

    A subclass of `pandas.DataFrame` that adds convenience methods for
    cleaning tabular data (e.g., downsampling, enforcing monotonicity, and
    zeroing values) and for interactive plotting.
    """

    @property
    def _constructor(self) -> type[Dataset]:
        """
        Class used by pandas to build new objects from this one.

        Returning `Dataset` (the class, not an instance) is the pandas
        subclassing hook that makes operations such as slicing and copying
        produce a `Dataset` rather than a plain `DataFrame`.
        """
        return Dataset


[docs]
    def downsample(
        self,
        *,
        n: int | None = None,
        frac: float | None = None,
        resolution: tuple[str, float] | None = None,
        inplace: bool = False,
        ignore_index: bool = False,
        keep_last: bool = False,
    ) -> Dataset | None:
        """
        Downsample the dataset by eliminating rows using one of the following:

        - Keep a given number of rows
        - Keep a given fraction of rows
        - Keep rows based on the resolution of a given column

        Exactly one of 'n', 'frac', or 'resolution' must be given. The first
        row is always kept, regardless of the chosen method.

        Parameters
        ----------
        n : int, optional
            Number of evenly spaced rows to keep, by default None. If 'n' is
            larger than the number of rows, all rows are kept.
        frac : float, optional
            Fraction (in (0, 1]) of evenly spaced rows to keep, by default None.
            The number of kept rows is ``floor(len(self) * frac)``, but never
            fewer than one.
        resolution : tuple[str, float], optional
            Column (str) and resolution (float) to use for downsampling based on
            adjacent values. A row is kept when its value differs from the value
            of the most recently kept row by at least the absolute value of the
            resolution. By default None.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.
        keep_last : bool, optional
            If True, always keep the last row. Default is False.

        Returns
        -------
        data : Dataset or None
            The downsampled Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If more than one of n, frac, resolution is specified, or if they are
            all None. Also, if the dataset is empty, if n is not positive, if
            frac is not in (0, 1], or if resolution does not have length 2.

        Examples
        --------
        Below are examples of how to use the downsample method. In the first two
        examples, the rows are dropped evenly across the dataset. In the third
        example, rows are dropped based on the resolution of the 'Volts' column,
        ensuring that adjacent voltage readings are at least 1 mV apart.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('dqdv/cell1_rough')

            # keep 100 evenly spaced rows
            sample1 = data.downsample(n=100)

            # keep 50% of the data, dropping evenly spaced rows
            sample2 = data.downsample(frac=0.5)

            # ensure adjacent voltage readings are at least 1 mV apart
            sample3 = data.downsample(resolution=('Volts', 1e-3))

        """
        from ampworks._checks import (
            _check_type, _check_only_one, _check_columns,
        )

        _check_only_one(
            conditions=[x is not None for x in [n, frac, resolution]],
            message="Specify exactly one of: n, frac, resolution.",
        )

        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)
        _check_type('keep_last', keep_last, bool)

        total = len(self)
        if total == 0:
            raise ValueError("Cannot downsample an empty dataset.")

        # keep a specified number of rows
        if n is not None:
            _check_type('n', n, int)

            if n <= 0:
                raise ValueError("'n' must be a positive integer.")

            count = min(n, total)
            indices = np.linspace(0, total - 1, count, dtype=int)

        # keep a specified fraction of rows
        elif frac is not None:
            _check_type('frac', frac, (float, int))

            if not (0 < frac <= 1):
                raise ValueError("'frac' must be in the range (0, 1].")

            # Strip binary floating-point noise before truncating; e.g.,
            # 100*0.29 evaluates to 28.999999999999996, which would otherwise
            # keep 28 rows instead of 29. Always keep at least one row.
            count = max(1, int(round(total * frac, 9)))
            indices = np.linspace(0, total - 1, count, dtype=int)

        # keep rows based on a resolution between adjacent values; this is the
        # only remaining option since exactly one selector was verified above
        else:
            _check_type('resolution', resolution, (tuple, list))

            if len(resolution) != 2:
                raise ValueError("'resolution' must be length 2.")

            _check_type('resolution[0]', resolution[0], str)
            _check_type('resolution[1]', resolution[1], (float, int))

            column, atol = resolution
            _check_columns(self, [column])

            column_data = self[column].to_numpy()
            tolerance = np.abs(atol)  # loop-invariant, sign is ignored

            # Greedy scan: each candidate is compared against the most recently
            # kept value (not its immediate neighbor), so slow drifts are still
            # captured once they accumulate past the tolerance.
            indices = [0]  # always keep the first row
            last_val = column_data[0]
            for i, val in enumerate(column_data[1:], start=1):
                if np.abs(val - last_val) >= tolerance:
                    indices.append(i)
                    last_val = val

        mask = np.zeros(total, dtype=bool)
        mask[indices] = True

        if keep_last:
            mask[-1] = True

        # Boolean-mask indexing already returns a new object, so no up-front
        # copy of the full (pre-downsampled) dataset is needed.
        result = self[mask]

        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
            return None

        return result



[docs]
    def enforce_monotonic(
        self,
        column: str,
        increasing: bool = True,
        strict: bool = False,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> Dataset | None:
        """
        Drop the rows that break a monotonic trend in a given column.

        Rows are scanned from top to bottom. The first row is always kept, and
        every later row is kept only if its value satisfies the requested
        ordering relative to the value of the most recently kept row.

        Parameters
        ----------
        column : str
            Name of the column whose values must be monotonic.
        increasing : bool, optional
            Enforce an increasing trend if True (default), or a decreasing
            trend if False.
        strict : bool, optional
            If True, equal adjacent values are not allowed and the repeated
            rows are dropped. If False (default), equal values are kept.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset has no rows.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('column', column, str)

        bool_flags = (
            ('increasing', increasing),
            ('strict', strict),
            ('inplace', inplace),
            ('ignore_index', ignore_index),
        )
        for flag_name, flag_value in bool_flags:
            _check_type(flag_name, flag_value, bool)

        snapshot = self.copy()
        n_rows = len(snapshot)

        if n_rows == 0:
            raise ValueError("Cannot enforce monotonicity on an empty dataset.")

        # pick the comparison that a new value must pass against the last kept
        if strict:
            is_ordered = np.greater if increasing else np.less
        else:
            is_ordered = np.greater_equal if increasing else np.less_equal

        _check_columns(snapshot, [column])

        values = snapshot[column].to_numpy()

        # flag the rows to keep; the first row is the initial reference
        keep = np.zeros(n_rows, dtype=bool)
        keep[0] = True

        reference = values[0]
        for pos in range(1, len(values)):
            candidate = values[pos]
            if is_ordered(candidate, reference):
                keep[pos] = True
                reference = candidate

        filtered = snapshot[keep]

        if ignore_index:
            filtered = filtered.reset_index(drop=True)

        if not inplace:
            return filtered

        self._update_inplace(filtered)



[docs]
    def interactive_xy_plot(
        self,
        x: str,
        y: str,
        tips: list[str] | None = None,
        figsize: tuple[int, int] = (800, 450),
        save: str = None,
    ) -> None:
        """
        Build an interactive XY line plot (with markers) using Plotly.

        The plot supports hover tips, zooming, and more. It can optionally be
        saved to an html file, which can be shared and opened in a web browser
        without needing Python and/or ampworks.

        Hover tips are especially handy when exploring the data to locate
        specific cycles and steps for slicing and further analysis.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            Column names to show as hover tips. The default (None) requests no
            extra hover tips.
        figsize : tuple[int, int], optional
            Figure size (width, height) in pixels, by default (800, 450).
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Notes
        -----
        When run inside a Jupyter notebook, the plot will be rendered inline. If
        instead this function is called from a script, the plot will be saved to
        a temporary directory and automatically opened in a local web browser.

        Examples
        --------
        The first example below loads the 'hppc_discharge' dataset and plots
        'Seconds' vs. 'Volts' with the step number shown as a hover tip. Hover
        tips must always be given as a list, even when there is only one. To
        show more, add more column names to the list.

        Only one x and one y variable can be plotted, and both must be existing
        columns in the dataset. The second example therefore adds an 'Hours'
        column first so that time can be plotted in hours instead of seconds.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_xy_plot('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_xy_plot('Hours', 'Volts', tips=['Step'])

        """
        from ampworks.plotutils._plotly import PLOTLY_TEMPLATE, _render_plotly

        # every requested column is switched on in the hover box
        hover_columns = [] if tips is None else tips
        hover_flags = dict.fromkeys(hover_columns, True)

        figure = px.line(self, x=x, y=y, markers=True, hover_data=hover_flags)
        figure.update_layout(template=PLOTLY_TEMPLATE)

        _render_plotly(fig=figure, figsize=figsize, save=save)



[docs]
    def zero_below(
        self,
        column: str,
        threshold: float,
        inplace: bool = False,
    ) -> Dataset | None:
        """
        Set values in 'column' below 'threshold' to zero.

        Parameters
        ----------
        column : str
            Column name to apply thresholding.
        threshold : float
            Values with absolute value below this threshold are set to zero.
            Note that values exactly equal to the threshold are not zeroed.
        inplace : bool, optional
            If True, modify the Dataset in place. Otherwise, return a new
            Dataset. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Examples
        --------
        Occasionally, there may be small non-zero values in the data that can
        be considered as noise and set to zero. When not appropriately zeroed,
        these can cause issues with automatic pulse detection (i.e., where the
        algorithm detects changes from rests to non-rests and vice versa). So,
        in the following example, we load the 'hppc_discharge' dataset and zero
        out current values below a certain threshold. The threshold here is set
        to 1% of the mean current from non-rest data, however, the appropriate
        threshold should be determined based on the specific characteristics of
        the dataset.

        .. code-block:: python

            import ampworks as amp

            # zero out currents below a threshold from non-rest data
            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            threshold = data.loc[data['State'] != 'R', 'Amps'].mean()*1e-2

            data_zeroed = data.zero_below(column='Amps', threshold=threshold)

        """
        result = self.copy()
        mask = result[column].abs() < abs(threshold)
        result.loc[mask, column] = 0.0

        if inplace:
            self._update_inplace(result)
        else:
            return result



[docs]
    def zero_time(self, inplace: bool = False) -> Dataset | None:
        """
        Shift the `Seconds` column by subtracting the value in the first row,
        creating a new zero time reference.

        Parameters
        ----------
        inplace : bool, optional
            If True, modify the Dataset in place. Otherwise, return a new
            Dataset. Default is False.

        Returns
        -------
        data : Dataset | None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset has no rows, since there is no first row to use as
            the zero time reference.

        Notes
        -----
        This method does not sort by time, nor does it use the minimum time when
        subtracting. It simply shifts the time values so that the first row has
        a time of zero, regardless of the actual order of time values. Consider
        sorting by time first, if needed, using `data.sort_values('Seconds')`.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('inplace', inplace, bool)
        _check_columns(self, ['Seconds'])

        # Fail with a clear message instead of the out-of-bounds IndexError
        # that `.iloc[0]` would raise on an empty column.
        if len(self) == 0:
            raise ValueError("Cannot zero the time of an empty dataset.")

        result = self.copy()
        result['Seconds'] -= result['Seconds'].iloc[0]

        if inplace:
            self._update_inplace(result)
            return None

        return result