# Source code for ampworks._core._dataset

from __future__ import annotations

from typing import Literal

import numpy as np
import pandas as pd
import plotly.express as px

from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource


class Dataset(pd.DataFrame):
    """
    General dataset.

    A `pandas.DataFrame` subclass that adds helpers for downsampling,
    monotonicity filtering, thresholding, time zeroing, and interactive
    plotting with plotly or bokeh.

    """

    @property
    def _constructor(self) -> type[Dataset]:
        # pandas subclassing hook: results of DataFrame operations are built
        # with this class so they stay Dataset instances.
        return Dataset

    def downsample(
        self,
        *,
        n: int | None = None,
        frac: float | None = None,
        resolution: tuple[str, float] | None = None,
        inplace: bool = False,
        ignore_index: bool = False,
        keep_last: bool = False,
    ) -> Dataset | None:
        """
        Downsample the dataset by eliminating rows given:

        - number of rows
        - fraction of rows
        - resolution of a specified column

        Parameters
        ----------
        n : int, optional
            Number of evenly spaced rows to keep, by default None. Values
            larger than the number of rows keep every row.
        frac : float, optional
            Fraction (in (0, 1]) of evenly spaced rows to keep, by default
            None. At least one row is always kept.
        resolution : tuple[str, float], optional
            Column (str) and resolution (float) to use for downsampling based
            on the absolute difference between adjacent values. By default
            None. The first row is always kept, and each later row is kept
            only if it differs from the last kept row by at least the
            resolution.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.
        keep_last : bool, optional
            If True, always keep the last row. Default is False.

        Returns
        -------
        data : Dataset or None
            The downsampled Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If more than one of n, frac, resolution is specified, or if they
            are all None. Also, if n is not positive, frac is not in (0, 1],
            resolution is not length 2, or the dataset is empty.

        Examples
        --------
        The following demonstrates three ways to downsample a dataset. Note
        that only the `resolution` option requires a column to operate on.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('dqdv/cell1_rough')

            # keep 100 evenly spaced rows
            sample1 = data.downsample(n=100)

            # keep 50% of the data, dropping evenly spaced rows
            sample2 = data.downsample(frac=0.5)

            # ensure adjacent voltage readings are at least 1 mV apart
            sample3 = data.downsample(resolution=('Volts', 1e-3))

        """
        from ampworks._checks import (
            _check_type, _check_only_one, _check_columns,
        )

        _check_only_one(
            conditions=[x is not None for x in [n, frac, resolution]],
            message="Specify exactly one of: n, frac, resolution.",
        )

        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)
        _check_type('keep_last', keep_last, bool)

        # Work on a copy so 'self' is untouched unless 'inplace' is requested.
        result = self.copy()
        if len(result) == 0:
            raise ValueError("Cannot downsample an empty dataset.")

        mask = np.zeros(len(result), dtype=bool)

        # keep a specified number of rows
        if n is not None:
            _check_type('n', n, int)
            if n <= 0:
                raise ValueError("'n' must be a positive integer.")

            count = min(n, len(result))
            indices = np.linspace(0, len(result) - 1, count, dtype=int)

        # keep a specified fraction of rows
        elif frac is not None:
            _check_type('frac', frac, (float, int))
            if not (0 < frac <= 1):
                raise ValueError("'frac' must be in the range (0, 1].")

            count = int(len(result) * frac) or 1  # keep at least one row
            indices = np.linspace(0, len(result) - 1, count, dtype=int)

        # keep rows based on a resolution between adjacent values
        elif resolution is not None:
            _check_type('resolution', resolution, (tuple, list))
            if len(resolution) != 2:
                raise ValueError("'resolution' must be length 2.")

            _check_type('resolution[0]', resolution[0], str)
            _check_type('resolution[1]', resolution[1], (float, int))

            column, atol = resolution
            _check_columns(result, [column])

            column_data = result[column].to_numpy()
            tolerance = np.abs(atol)  # loop-invariant, sign is ignored

            indices = [0]  # always keep the first row
            last_val = column_data[0]

            # Each row is compared against the last *kept* row (not simply its
            # neighbor), so slow drifts still produce a kept row eventually.
            for i, val in enumerate(column_data[1:], start=1):
                if np.abs(val - last_val) >= tolerance:
                    indices.append(i)
                    last_val = val

        mask[indices] = True
        if keep_last:
            mask[-1] = True

        result = result[mask]
        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def enforce_monotonic(
        self,
        column: str,
        increasing: bool = True,
        strict: bool = False,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> Dataset | None:
        """
        Enforce monotonicity in a column by dropping rows that break the trend.

        The first row is always kept. Each later row is kept only if its value
        satisfies the requested ordering relative to the last kept row.

        Parameters
        ----------
        column : str
            Column name to enforce monotonicity on.
        increasing : bool, optional
            If True (default), enforce increasing monotonicity. Otherwise,
            apply decreasing monotonicity.
        strict : bool, optional
            If True, enforce strict monotonicity (no equal adjacent values).
            The default is False, which allows equal adjacent values.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset is empty.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('column', column, str)
        _check_type('increasing', increasing, bool)
        _check_type('strict', strict, bool)
        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)

        # Work on a copy so 'self' is untouched unless 'inplace' is requested.
        result = self.copy()
        if len(result) == 0:
            raise ValueError("Cannot enforce monotonicity on an empty dataset.")

        # loop over indices and store which to keep
        mask = np.zeros(len(result), dtype=bool)

        if increasing:
            compare = np.greater if strict else np.greater_equal
        else:
            compare = np.less if strict else np.less_equal

        _check_columns(result, [column])
        column_data = result[column].to_numpy()

        indices = [0]  # always keep the first row
        last_val = column_data[0]

        for i, val in enumerate(column_data[1:], start=1):
            if compare(val, last_val):
                indices.append(i)
                last_val = val

        # keep the rows where the monotonicity condition is met
        mask[indices] = True

        result = result[mask]
        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def interactive_plotly(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        kind: Literal['line', 'scatter', 'both'] = 'line',
        save: str | None = None,
    ) -> None:
        """
        Create an interactive plotly figure with hover tips. Optionally save
        as a standalone HTML file, viewable without installing Python/ampworks.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            List of column names to display as hover tips, by default None.
        figsize : tuple[int | None, int | None], optional
            Figure size (width, height) in pixels, by default (800, 450). Set
            either or both dimensions to None to allow them to stretch.
        kind : {'line', 'scatter', 'both'}, optional
            Kind of plot to draw. 'line' (default) for a line plot, 'scatter'
            for a scatter plot, or 'both' to show both a line and markers.
            Matching is case-insensitive.
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Raises
        ------
        ValueError
            If 'kind' is not one of 'line', 'scatter', or 'both'.

        See Also
        --------
        interactive_bokeh
            Interactive plots using bokeh. Typically has higher performance
            for large (>250k) datasets and better support for notebook exports.

        Notes
        -----
        The responsive height size option is limited in notebook environments
        since output cells do not have adjustable heights. In these cases, the
        height is set to a default minimum value.

        Examples
        --------
        The following creates an interactive plot of an HPPC discharge dataset.
        Note that the x, y, and tips values must be existing columns; however,
        you can compute or add new columns before plotting, if needed, as shown
        by adding an 'Hours' column in the second figure below. Also, hovertips
        must be passed as a list, even if only one column is requested.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_plotly('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_plotly('Hours', 'Volts', tips=['Step', 'Amps'])

        """
        from ampworks.plotutils._plotly import (
            _apply_plotly_style, _render_plotly,
        )

        hover = {} if tips is None else {col: True for col in tips}

        kind = kind.lower()
        if kind in ['line', 'both']:
            markers = kind == 'both'
            fig = px.line(self, x=x, y=y, markers=markers, hover_data=hover)
        elif kind == 'scatter':
            fig = px.scatter(self, x=x, y=y, hover_data=hover)
        else:
            raise ValueError(
                "Invalid value for 'kind'. Expected one of {'line', 'scatter',"
                " 'both'}, but got " + f"{kind=}."
            )

        _apply_plotly_style(fig)
        _render_plotly(fig=fig, figsize=figsize, save=save)

    def interactive_bokeh(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        kind: Literal['line', 'scatter', 'both'] = 'line',
        save: str | None = None,
    ) -> None:
        """
        Create an interactive bokeh figure with hover tips. Optionally save
        as a standalone HTML file, viewable without installing Python/ampworks.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            List of column names to display as hover tips, by default None.
        figsize : tuple[int | None, int | None], optional
            Figure size (width, height) in pixels, by default (800, 450). Set
            either or both dimensions to None to allow them to stretch.
        kind : {'line', 'scatter', 'both'}, optional
            Type of plot to create. 'line' (default) for a line plot, 'scatter'
            for a scatter plot, or 'both' to show both a line and markers.
            Matching is case-insensitive.
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Raises
        ------
        ValueError
            If 'kind' is not one of 'line', 'scatter', or 'both'.

        See Also
        --------
        interactive_plotly
            Interactive plots using plotly. Typically has lower performance
            for large (>250k) datasets, but is compatible with `dash` apps.

        Notes
        -----
        The responsive height size option is limited in notebook environments
        since output cells do not have adjustable heights. In these cases, the
        height is set to a default minimum value.

        Examples
        --------
        The following creates an interactive plot of an HPPC discharge dataset.
        Note that the x, y, and tips values must be existing columns; however,
        you can compute or add new columns before plotting, if needed, as shown
        by adding an 'Hours' column in the second figure below. Also, hovertips
        must be passed as a list, even if only one column is requested.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_bokeh('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_bokeh('Hours', 'Volts', tips=['Step', 'Amps'])

        """
        from ampworks.plotutils._bokeh import (
            BOKEH_CONFIG, _apply_bokeh_style, _render_bokeh,
        )

        if tips is None:
            tips = []

        # Validate up front so an invalid 'kind' fails before any data source
        # or figure is constructed.
        kind = kind.lower()
        if kind not in ['line', 'scatter', 'both']:
            raise ValueError(
                "Invalid value for 'kind'. Expected one of {'line', 'scatter',"
                " 'both'}, but got " + f"{kind=}."
            )

        color = '#636EFA'  # adopt color from plotly's default

        cols = [x, y] + tips
        source = ColumnDataSource(data=self[cols])

        # Horizontal HTML tooltip to match Plotly's compact single-row layout
        tooltips = [(x, '$x'), (y, '$y')]
        for tip in tips:
            tooltips.append((tip, "@{" + tip + "}"))

        fig = figure(
            x_axis_label=x, y_axis_label=y,
            width=figsize[0], height=figsize[1],
            **BOKEH_CONFIG,
        )

        line = fig.line(x=x, y=y, source=source, color=color, line_width=2)

        # hide line if only scatter is requested, done for hover tool, to reduce
        # too many points showing when dense or overlapping (discussed below)
        if kind == 'scatter':
            line.glyph.line_alpha = 0

        if kind in ['scatter', 'both']:
            fig.scatter(x=x, y=y, source=source, color=color, size=4.5)

        # Attach hover only to the line so a single tooltip fires even when
        # markers are densely overlapping at zoomed-out views
        hover = HoverTool(mode='vline', renderers=[line], tooltips=tooltips)
        fig.add_tools(hover)

        _apply_bokeh_style(fig)
        _render_bokeh(fig=fig, figsize=figsize, save=save)

    def interactive_xy_plot(
        self,
        x: str,
        y: str,
        *,
        tips: list[str] | None = None,
        figsize: tuple[int | None, int | None] = (800, 450),
        save: str | None = None,
    ) -> None:
        """
        Deprecated. This method will be removed in a future release. Use either
        `interactive_plotly` or `interactive_bokeh` instead.

        Emits a DeprecationWarning and forwards all arguments to
        `interactive_plotly` with ``kind='both'``.

        """
        import warnings

        warnings.warn(
            "interactive_xy_plot() is deprecated and will be removed in a"
            " future release. Use interactive_plotly() or interactive_bokeh().",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )

        self.interactive_plotly(
            x=x, y=y, tips=tips, figsize=figsize, kind='both', save=save,
        )

    def zero_below(
        self,
        column: str,
        threshold: float,
        inplace: bool = False,
    ) -> Dataset | None:
        """
        Set values in 'column' whose magnitude is below 'threshold' to zero.

        Parameters
        ----------
        column : str
            Column name to apply thresholding.
        threshold : float
            Values whose absolute value is below this threshold are set to
            zero. Note that values equal to the threshold are not zeroed. The
            sign of the threshold is ignored.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Examples
        --------
        Small non-zero values that can be attributed to noise can interfere
        with some analysis methods. For example, automatic pulse detection
        identifies pulses based on transitions from zero to non-zero current.
        This example filters out currents below 1% of the mean non-rest
        current, though the thresholds should be tailored to your specific use
        case and data.

        .. code-block:: python

            import ampworks as amp

            # zero out currents below a threshold from non-rest data
            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            threshold = data.loc[data['State'] != 'R', 'Amps'].mean()*1e-2

            data_zeroed = data.zero_below(column='Amps', threshold=threshold)

        """
        # Work on a copy so 'self' is untouched unless 'inplace' is requested.
        result = self.copy()

        # Compare magnitudes so small negative values are zeroed as well.
        mask = result[column].abs() < abs(threshold)
        result.loc[mask, column] = 0.0

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def zero_time(self, inplace: bool = False) -> Dataset | None:
        """
        Shifts the `Seconds` column by subtracting the first row's value to set
        a new zero reference.

        Parameters
        ----------
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.

        Returns
        -------
        data : Dataset | None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset is empty, since there is no first row to use as
            the zero reference.

        Notes
        -----
        This method does not sort by time, nor does it use the minimum time
        when subtracting. It simply shifts the time values so that the first
        row has a time of zero, regardless of the actual order of time values.
        Consider sorting by time first, if needed, using
        `data.sort_values('Seconds')`.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('inplace', inplace, bool)
        _check_columns(self, ['Seconds'])

        # Fail with a clear message, like the other methods do, instead of
        # letting `.iloc[0]` raise an out-of-bounds IndexError.
        if len(self) == 0:
            raise ValueError("Cannot zero time on an empty dataset.")

        result = self.copy()
        result['Seconds'] -= result['Seconds'].iloc[0]

        if inplace:
            self._update_inplace(result)
            return None

        return result