Source code for ampworks._core._dataset

from __future__ import annotations

from typing import Literal

import numpy as np
import pandas as pd
import plotly.express as px

from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource



[docs]
class Dataset(pd.DataFrame):
    """General dataset."""

    @property
    def _constructor(self) -> type[Dataset]:
        """Class pandas uses to build new objects derived from this one.

        Returning ``Dataset`` (the class itself, not an instance) follows the
        pandas subclassing convention so that results derived from a Dataset
        are constructed as Dataset rather than a plain ``pandas.DataFrame``.
        """
        return Dataset


[docs]
    def downsample(
        self,
        *,
        n: int | None = None,
        frac: float | None = None,
        resolution: tuple[str, float] | None = None,
        inplace: bool = False,
        ignore_index: bool = False,
        keep_last: bool = False,
    ) -> Dataset | None:
        """
        Downsample the dataset by eliminating rows given exactly one of:

        - number of rows
        - fraction of rows
        - resolution of a specified column

        Parameters
        ----------
        n : int, optional
            Number of evenly spaced rows to keep, by default None. Values
            larger than the number of rows keep every row.
        frac : float, optional
            Fraction (in (0, 1]) of evenly spaced rows to keep, by default None.
            The row count is rounded down, but at least one row is kept.
        resolution : tuple[str, float], optional
            Column (str) and resolution (float) to use for downsampling based on
            the absolute difference between adjacent kept values. The sign of
            the resolution is ignored. By default None.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.
        keep_last : bool, optional
            If True, always keep the last row. Default is False.

        Returns
        -------
        data : Dataset or None
            The downsampled Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If more than one of n, frac, resolution is specified, or if they are
            all None. Also, if the dataset is empty, if n is not positive, if
            frac is not in (0, 1], or if resolution is not of length 2.

        Examples
        --------
        The following demonstrates three ways to downsample a dataset. Note that
        only the `resolution` option requires a column to operate on.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('dqdv/cell1_rough')

            # keep 100 evenly spaced rows
            sample1 = data.downsample(n=100)

            # keep 50% of the data, dropping evenly spaced rows
            sample2 = data.downsample(frac=0.5)

            # ensure adjacent voltage readings are at least 1 mV apart
            sample3 = data.downsample(resolution=('Volts', 1e-3))

        """
        from ampworks._checks import (
            _check_type, _check_only_one, _check_columns,
        )

        _check_only_one(
            conditions=[x is not None for x in [n, frac, resolution]],
            message="Specify exactly one of: n, frac, resolution.",
        )

        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)
        _check_type('keep_last', keep_last, bool)

        result = self.copy()

        n_rows = len(result)
        if n_rows == 0:
            raise ValueError("Cannot downsample an empty dataset.")

        mask = np.zeros(n_rows, dtype=bool)

        # keep a specified number of rows
        if n is not None:
            _check_type('n', n, int)

            if n <= 0:
                raise ValueError("'n' must be a positive integer.")

            count = min(n, n_rows)
            indices = np.linspace(0, n_rows - 1, count, dtype=int)

        # keep a specified fraction of rows
        elif frac is not None:
            _check_type('frac', frac, (float, int))

            if not (0 < frac <= 1):
                raise ValueError("'frac' must be in the range (0, 1].")

            # Round away float noise before truncating: 100 * 0.29 evaluates
            # to 28.999999999999996, which a bare int() would turn into 28.
            count = int(round(n_rows * frac, 9)) or 1  # keep at least one row
            indices = np.linspace(0, n_rows - 1, count, dtype=int)

        # keep rows based on a resolution between adjacent values
        elif resolution is not None:
            _check_type('resolution', resolution, (tuple, list))

            if len(resolution) != 2:
                raise ValueError("'resolution' must be length 2.")

            _check_type('resolution[0]', resolution[0], str)
            _check_type('resolution[1]', resolution[1], (float, int))

            column, atol = resolution
            _check_columns(result, [column])

            column_data = result[column].to_numpy()
            min_step = np.abs(atol)  # loop-invariant, so computed once

            # Greedy pass: each value is compared against the last *kept*
            # value (not its immediate neighbor), so the loop is sequential.
            indices = [0]  # always keep the first row
            last_val = column_data[0]
            for i, val in enumerate(column_data[1:], start=1):
                if np.abs(val - last_val) >= min_step:
                    indices.append(i)
                    last_val = val

        mask[indices] = True

        if keep_last:
            mask[-1] = True

        result = result[mask]

        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
        else:
            return result



[docs]
    def enforce_monotonic(
        self,
        column: str,
        increasing: bool = True,
        strict: bool = False,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> Dataset | None:
        """
        Drop every row that breaks a monotonic trend in the given column.

        The first row is always kept. Each later row is kept only when its
        value satisfies the requested comparison against the most recently
        kept value, which then becomes the new reference.

        Parameters
        ----------
        column : str
            Name of the column whose values must be monotonic.
        increasing : bool, optional
            Enforce an increasing trend if True (default), or a decreasing
            trend if False.
        strict : bool, optional
            If True, equal adjacent values are also dropped. If False (default),
            equal adjacent values are allowed.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset has no rows.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('column', column, str)
        for label, flag in (
            ('increasing', increasing),
            ('strict', strict),
            ('inplace', inplace),
            ('ignore_index', ignore_index),
        ):
            _check_type(label, flag, bool)

        frame = self.copy()
        n_rows = len(frame)

        if n_rows == 0:
            raise ValueError("Cannot enforce monotonicity on an empty dataset.")

        # comparison to apply, keyed by (increasing, strict)
        comparators = {
            (True, True): np.greater,
            (True, False): np.greater_equal,
            (False, True): np.less,
            (False, False): np.less_equal,
        }
        accepts = comparators[(bool(increasing), bool(strict))]

        _check_columns(frame, [column])
        values = frame[column].to_numpy()

        # mark rows to keep; the first row anchors the trend
        keep = np.zeros(n_rows, dtype=bool)
        keep[0] = True

        reference = values[0]
        for pos in range(1, n_rows):
            candidate = values[pos]
            if accepts(candidate, reference):
                keep[pos] = True
                reference = candidate

        frame = frame[keep]

        if ignore_index:
            frame = frame.reset_index(drop=True)

        if not inplace:
            return frame

        self._update_inplace(frame)



[docs]
    def interactive_plotly(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        kind: Literal['line', 'scatter', 'both'] = 'line',
        save: str = None,
    ) -> None:
        """
        Draw an interactive plotly figure of one column against another, with
        optional hover tips. The figure can also be saved as a standalone HTML
        file, viewable without installing Python/ampworks.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            Column names to show as additional hover tips, by default None.
        figsize : tuple[int | None, int | None], optional
            Figure size (width, height) in pixels, by default (800, 450). Set
            either or both dimensions to None to allow them to stretch.
        kind : {'line', 'scatter', 'both'}, optional
            Kind of plot to draw (case-insensitive). 'line' (default) for a
            line plot, 'scatter' for a scatter plot, or 'both' to show both a
            line and markers.
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Raises
        ------
        ValueError
            If 'kind' is not one of 'line', 'scatter', or 'both'.

        See Also
        --------
        interactive_bokeh
            Interactive plots using bokeh. Typically has higher performance for
            large (>250k) datasets and better support for notebook exports.

        Notes
        -----
        The responsive height size option is limited in notebook environments
        since output cells do not have adjustable heights. In these cases, the
        height is set to a default minimum value.

        Examples
        --------
        The following creates an interactive plot of an HPPC discharge dataset.
        The x, y, and tips values must be existing columns, but new columns can
        be computed and added before plotting, as done with the 'Hours' column
        below. Hover tips must be passed as a list, even for a single column.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_plotly('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_plotly('Hours', 'Volts', tips=['Step', 'Amps'])

        """
        from ampworks.plotutils._plotly import (
            _apply_plotly_style, _render_plotly,
        )

        # every requested tip column is switched on in the hover box
        hover_columns = dict.fromkeys(tips, True) if tips is not None else {}

        kind = kind.lower()

        if kind == 'scatter':
            fig = px.scatter(self, x=x, y=y, hover_data=hover_columns)
        elif kind in ('line', 'both'):
            with_markers = kind == 'both'
            fig = px.line(
                self, x=x, y=y, markers=with_markers, hover_data=hover_columns,
            )
        else:
            raise ValueError(
                "Invalid value for 'kind'. Expected one of {'line', 'scatter',"
                " 'both'}, but got " + f"{kind=}."
            )

        _apply_plotly_style(fig)
        _render_plotly(fig=fig, figsize=figsize, save=save)



[docs]
    def interactive_bokeh(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        kind: Literal['line', 'scatter', 'both'] = 'line',
        save: str = None,
    ) -> None:
        """
        Create an interactive bokeh figure with hover tips. Optionally save as
        a standalone HTML file, viewable without installing Python/ampworks.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            List of column names to display as hover tips, by default None.
            Names that repeat `x`, `y`, or an earlier tip are shown only once.
        figsize : tuple[int | None, int | None], optional
            Figure size (width, height) in pixels, by default (800, 450). Set
            either or both dimensions to None to allow them to stretch.
        kind : {'line', 'scatter', 'both'}, optional
            Type of plot to create (case-insensitive). 'line' (default) for a
            line plot, 'scatter' for a scatter plot, or 'both' to show both a
            line and markers.
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Raises
        ------
        ValueError
            If 'kind' is not one of 'line', 'scatter', or 'both'.

        See Also
        --------
        interactive_plotly
            Interactive plots using plotly. Typically has lower performance for
            large (>250k) datasets, but is compatible with `dash` apps.

        Notes
        -----
        The responsive height size option is limited in notebook environments
        since output cells do not have adjustable heights. In these cases, the
        height is set to a default minimum value.

        Examples
        --------
        The following creates an interactive plot of an HPPC discharge dataset.
        Note that the x, y, and tips values must be existing columns; however,
        you can compute or add new columns before plotting, if needed, as shown
        by adding an 'Hours' column in the second figure below. Also, hovertips
        must be passed as a list, even if only one column is requested.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_bokeh('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_bokeh('Hours', 'Volts', tips=['Step', 'Amps'])

        """
        from ampworks.plotutils._bokeh import (
            BOKEH_CONFIG, _apply_bokeh_style, _render_bokeh,
        )

        # Validate up front so a bad 'kind' fails before any figure is built
        kind = kind.lower()

        if kind not in ['line', 'scatter', 'both']:
            raise ValueError(
                "Invalid value for 'kind'. Expected one of {'line', 'scatter',"
                " 'both'}, but got " + f"{kind=}."
            )

        # Accept any sequence of names (not only a list) for the tips
        tips = [] if tips is None else list(tips)

        color = '#636EFA'  # adopt color from plotly's default

        # dict.fromkeys drops repeated names while keeping first-seen order, so
        # a tip that repeats x or y neither selects a duplicate column nor
        # produces a duplicate tooltip row
        cols = list(dict.fromkeys([x, y, *tips]))
        source = ColumnDataSource(data=self[cols])

        # One (label, field) pair per column; the "@{name}" form also works
        # for column names that contain spaces
        tooltips = [(col, "@{" + col + "}") for col in cols]

        fig = figure(
            x_axis_label=x,
            y_axis_label=y,
            width=figsize[0],
            height=figsize[1],
            **BOKEH_CONFIG,
        )

        # The line is always drawn because the hover tool is attached to it
        line = fig.line(x=x, y=y, source=source, color=color, line_width=2)

        # hide line if only scatter is requested, done for hover tool, to reduce
        # too many points showing when dense or overlapping (discussed below)
        if kind == 'scatter':
            line.glyph.line_alpha = 0

        if kind in ['scatter', 'both']:
            fig.scatter(x=x, y=y, source=source, color=color, size=4.5)

        # Attach hover only to the line so a single tooltip fires even when
        # markers are densely overlapping at zoomed-out views
        hover = HoverTool(mode='vline', renderers=[line], tooltips=tooltips)
        fig.add_tools(hover)

        _apply_bokeh_style(fig)
        _render_bokeh(fig=fig, figsize=figsize, save=save)


    def interactive_xy_plot(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        save: str = None,
    ) -> None:
        """
        Deprecated alias kept for backward compatibility; it will be removed in
        a future release. Prefer `interactive_plotly` or `interactive_bokeh`.

        Emits a DeprecationWarning and then forwards all arguments to
        `interactive_plotly` with ``kind='both'``.

        """
        import warnings

        message = (
            "interactive_xy_plot() is deprecated and will be removed in a"
            " future release. Use interactive_plotly() or interactive_bokeh()."
        )

        # stacklevel=2 attributes the warning to the caller of this method
        warnings.warn(message, category=DeprecationWarning, stacklevel=2)

        forwarded = {
            'x': x,
            'y': y,
            'tips': tips,
            'figsize': figsize,
            'kind': 'both',
            'save': save,
        }
        self.interactive_plotly(**forwarded)


[docs]
    def zero_below(
        self,
        column: str,
        threshold: float,
        inplace: bool = False,
    ) -> Dataset | None:
        """
        Set values in 'column' below 'threshold' to zero.

        Parameters
        ----------
        column : str
            Column name to apply thresholding.
        threshold : float
            Values whose absolute value is below this threshold are set to zero.
            Note that values equal to the threshold are not zeroed.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Examples
        --------
        Small non-zero values that can be attributed to noise can interfere with
        some analysis methods. For example, automatic pulse detection identifies
        pulses based on transitions from zero to non-zero current. This example
        filters out currents below 1% of the mean non-rest current, though the
        thresholds should be tailored to your specific use case and data.

        .. code-block:: python

            import ampworks as amp

            # zero out currents below a threshold from non-rest data
            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            threshold = data.loc[data['State'] != 'R', 'Amps'].mean()*1e-2

            data_zeroed = data.zero_below(column='Amps', threshold=threshold)

        """
        result = self.copy()
        mask = result[column].abs() < abs(threshold)
        result.loc[mask, column] = 0.0

        if inplace:
            self._update_inplace(result)
        else:
            return result



[docs]
    def zero_time(self, inplace: bool = False) -> Dataset | None:
        """
        Shifts the `Seconds` column by subtracting the first row's value to set
        a new zero reference.

        Parameters
        ----------
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.

        Returns
        -------
        data : Dataset | None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset has no rows, since there is no first time value to
            use as the zero reference.

        Notes
        -----
        This method does not sort by time, nor does it use the minimum time when
        subtracting. It simply shifts the time values so that the first row has
        a time of zero, regardless of the actual order of time values. Consider
        sorting by time first, if needed, using `data.sort_values('Seconds')`.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('inplace', inplace, bool)
        _check_columns(self, ['Seconds'])

        # Fail with a clear message instead of the IndexError that `.iloc[0]`
        # would raise on an empty column
        if len(self) == 0:
            raise ValueError("Cannot zero time on an empty dataset.")

        result = self.copy()
        result['Seconds'] -= result['Seconds'].iloc[0]

        if inplace:
            self._update_inplace(result)
        else:
            return result