Source code for ampworks._core._dataset

from __future__ import annotations

import numpy as np
import pandas as pd
import plotly.express as px


class Dataset(pd.DataFrame):
    """
    General dataset.

    A `pandas.DataFrame` subclass that adds helpers for downsampling,
    monotonic filtering, thresholding, time shifting, and interactive
    plotting.

    """

    @property
    def _constructor(self) -> type[Dataset]:
        """Class pandas uses to build same-dimension results.

        Returning ``Dataset`` keeps the outputs of DataFrame operations
        (e.g., slicing or copying) as ``Dataset`` instances.
        """
        return Dataset

    def downsample(
        self,
        *,
        n: int | None = None,
        frac: float | None = None,
        resolution: tuple[str, float] | None = None,
        inplace: bool = False,
        ignore_index: bool = False,
        keep_last: bool = False,
    ) -> Dataset | None:
        """
        Downsample the dataset by eliminating rows using one of the following:

        - Keep a given number of rows
        - Keep a given fraction of rows
        - Keep rows based on the resolution of a given column

        Parameters
        ----------
        n : int, optional
            Number of evenly spaced rows to keep, by default None. Values
            larger than the number of rows keep every row.
        frac : float, optional
            Fraction (in (0, 1]) of evenly spaced rows to keep, by default
            None. At least one row is always kept.
        resolution : tuple[str, float], optional
            Column (str) and resolution (float) to use for downsampling based
            on adjacent values. The first row is always kept, and each later
            row is kept only if its value differs from the last *kept* value
            by at least the resolution (the sign of the resolution is
            ignored). By default None.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.
        keep_last : bool, optional
            If True, always keep the last row. Default is False.

        Returns
        -------
        data : Dataset or None
            The downsampled Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If more than one of n, frac, resolution is specified, or if they
            are all None. Also, if n is not positive, frac is not in (0, 1],
            resolution is not length 2, or the dataset is empty.

        Examples
        --------
        Below are examples of how to use the downsample method. In the first
        two examples, the rows are dropped evenly across the dataset. In the
        third example, rows are dropped based on the resolution of the 'Volts'
        column, ensuring that adjacent voltage readings are at least 1 mV
        apart.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('dqdv/cell1_rough')

            # keep 100 evenly spaced rows
            sample1 = data.downsample(n=100)

            # keep 50% of the data, dropping evenly spaced rows
            sample2 = data.downsample(frac=0.5)

            # ensure adjacent voltage readings are at least 1 mV apart
            sample3 = data.downsample(resolution=('Volts', 1e-3))

        """
        from ampworks._checks import (
            _check_type, _check_only_one, _check_columns,
        )

        _check_only_one(
            conditions=[x is not None for x in [n, frac, resolution]],
            message="Specify exactly one of: n, frac, resolution.",
        )

        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)
        _check_type('keep_last', keep_last, bool)

        n_rows = len(self)
        if n_rows == 0:
            raise ValueError("Cannot downsample an empty dataset.")

        mask = np.zeros(n_rows, dtype=bool)

        # keep a specified number of rows
        if n is not None:
            _check_type('n', n, int)
            if n <= 0:
                raise ValueError("'n' must be a positive integer.")

            count = min(n, n_rows)
            indices = np.linspace(0, n_rows - 1, count, dtype=int)

        # keep a specified fraction of rows
        elif frac is not None:
            _check_type('frac', frac, (float, int))
            if not (0 < frac <= 1):
                raise ValueError("'frac' must be in the range (0, 1].")

            count = int(n_rows * frac) or 1  # keep at least one row
            indices = np.linspace(0, n_rows - 1, count, dtype=int)

        # keep rows based on a resolution between adjacent values
        elif resolution is not None:
            _check_type('resolution', resolution, (tuple, list))
            if len(resolution) != 2:
                raise ValueError("'resolution' must be length 2.")

            _check_type('resolution[0]', resolution[0], str)
            _check_type('resolution[1]', resolution[1], (float, int))

            column, atol = resolution
            _check_columns(self, [column])

            column_data = self[column].to_numpy()
            min_step = np.abs(atol)  # loop-invariant, so computed once

            indices = [0]  # always keep the first row
            last_val = column_data[0]
            for i, val in enumerate(column_data[1:], start=1):
                # compare against the last kept value, not the previous row
                if np.abs(val - last_val) >= min_step:
                    indices.append(i)
                    last_val = val

        mask[indices] = True
        if keep_last:
            mask[-1] = True

        # boolean indexing already yields a new object, so no prior copy
        # of the full dataset is needed
        result = self[mask]
        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def enforce_monotonic(
        self,
        column: str,
        increasing: bool = True,
        strict: bool = False,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> Dataset | None:
        """
        Enforce monotonicity in a column by dropping rows that break the
        trend.

        The first row is always kept. Each later row is compared against the
        last *kept* value and is dropped if it breaks the requested trend.

        Parameters
        ----------
        column : str
            Column name to enforce monotonicity on.
        increasing : bool, optional
            If True (default), enforce increasing monotonicity. Otherwise,
            apply decreasing monotonicity.
        strict : bool, optional
            If True, enforce strict monotonicity (no equal adjacent values).
            The default is False, which allows equal adjacent values.
        inplace : bool, optional
            Modify in place if True. If False (default), return a new Dataset.
        ignore_index : bool, optional
            If True, reset the indices. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        ValueError
            If the dataset is empty.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('column', column, str)
        _check_type('increasing', increasing, bool)
        _check_type('strict', strict, bool)
        _check_type('inplace', inplace, bool)
        _check_type('ignore_index', ignore_index, bool)

        n_rows = len(self)
        if n_rows == 0:
            raise ValueError(
                "Cannot enforce monotonicity on an empty dataset."
            )

        # loop over indices and store which to keep
        mask = np.zeros(n_rows, dtype=bool)

        if increasing:
            compare = np.greater if strict else np.greater_equal
        else:
            compare = np.less if strict else np.less_equal

        _check_columns(self, [column])
        column_data = self[column].to_numpy()

        indices = [0]  # always keep the first row
        last_val = column_data[0]
        for i, val in enumerate(column_data[1:], start=1):
            if compare(val, last_val):
                indices.append(i)
                last_val = val

        # keep the rows where the monotonicity condition is met; boolean
        # indexing already yields a new object, so no prior copy is needed
        mask[indices] = True
        result = self[mask]

        if ignore_index:
            result = result.reset_index(drop=True)

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def interactive_xy_plot(
        self,
        x: str,
        y: str,
        tips: list[str] | None = None,
        figsize: tuple[int, int] = (800, 450),
        save: str | None = None,
    ) -> None:
        """
        Create an interactive XY plot using Plotly. Allows hovertips, zooming,
        and more. Optionally, save the plot to an html file, which can be sent
        and opened in a web browser, without needing Python and/or ampworks.

        The hovertips are particularly useful for exploring the data and
        finding specific cycle and steps for slicing and further analysis.

        Parameters
        ----------
        x : str
            Column name for the variable to plot on the x-axis.
        y : str
            Column name for the variable to plot on the y-axis.
        tips : list[str] or None, optional
            List of column names to display as hover tips, by default None.
        figsize : tuple[int, int], optional
            Figure size (width, height) in pixels, by default (800, 450).
        save : str, optional
            File path to save the plot as an HTML file, by default None.

        Notes
        -----
        When run inside a Jupyter notebook, the plot will be rendered inline.
        If instead this function is called from a script, the plot will be
        saved to a temporary directory and automatically opened in a local
        web browser.

        Examples
        --------
        The following example uses the 'hppc_discharge' dataset and creates an
        interactive XY plot of 'Seconds' vs. 'Volts', with a hover tip showing
        the step number. Even though only one hover tip is requested, it must
        be passed in a list. For more than one hover tip, simply add more
        column names to the list.

        The interactive plots only allow one x and one y variable, and both
        are required to be existing columns in the dataset. In the second
        example, we compute a new column for time in hours so that we can
        change the x-axis to 'Hours' instead of 'Seconds'.

        .. code-block:: python

            import ampworks as amp

            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            data.interactive_xy_plot('Seconds', 'Volts', tips=['Step'])

            # Add new column to plot time in hours instead of seconds
            data['Hours'] = data['Seconds'] / 3600
            data.interactive_xy_plot('Hours', 'Volts', tips=['Step'])

        """
        from ampworks.plotutils._plotly import PLOTLY_TEMPLATE, _render_plotly

        if tips is None:
            tips = []

        fig = px.line(
            self, x=x, y=y, markers=True,
            hover_data={col: True for col in tips},
        )

        fig.update_layout(template=PLOTLY_TEMPLATE)

        _render_plotly(fig=fig, figsize=figsize, save=save)

    def zero_below(
        self,
        column: str,
        threshold: float,
        inplace: bool = False,
    ) -> Dataset | None:
        """
        Set values in 'column' whose magnitude is below 'threshold' to zero.

        Parameters
        ----------
        column : str
            Column name to apply thresholding.
        threshold : float
            Values with absolute value below this threshold are set to zero.
            The sign of the threshold is ignored. Note that values exactly
            equal to the threshold are not zeroed.
        inplace : bool, optional
            If True, modify the Dataset in place. Otherwise, return a new
            Dataset. Default is False.

        Returns
        -------
        data : Dataset or None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Examples
        --------
        Occasionally, there may be small non-zero values in the data that can
        be considered as noise and set to zero. When not appropriately zeroed,
        these can cause issues with automatic pulse detection (i.e., where the
        algorithm detects changes from rests to non-rests and vice versa). So,
        in the following example, we load the 'hppc_discharge' dataset and
        zero out current values below a certain threshold. The threshold here
        is set to 1% of the mean current from non-rest data, however, the
        appropriate threshold should be determined based on the specific
        characteristics of the dataset.

        .. code-block:: python

            import ampworks as amp

            # zero out currents below a threshold from non-rest data
            data = amp.datasets.load_datasets('hppc/hppc_discharge')
            threshold = data.loc[data['State'] != 'R', 'Amps'].mean()*1e-2

            data_zeroed = data.zero_below(column='Amps', threshold=threshold)

        """
        # work on a copy so the original is untouched unless 'inplace' is set
        result = self.copy()

        mask = result[column].abs() < abs(threshold)
        result.loc[mask, column] = 0.0

        if inplace:
            self._update_inplace(result)
            return None

        return result

    def zero_time(self, inplace: bool = False) -> Dataset | None:
        """
        Shift the `Seconds` column by subtracting the value in the first row,
        creating a new zero time reference.

        Parameters
        ----------
        inplace : bool, optional
            If True, modify the Dataset in place. Otherwise, return a new
            Dataset. Default is False.

        Returns
        -------
        data : Dataset | None
            The modified Dataset if 'inplace' is False. Otherwise, None.

        Raises
        ------
        IndexError
            If the dataset has a 'Seconds' column but no rows, because there
            is no first row to use as the reference.

        Notes
        -----
        This method does not sort by time, nor does it use the minimum time
        when subtracting. It simply shifts the time values so that the first
        row has a time of zero, regardless of the actual order of time values.
        Consider sorting by time first, if needed, using
        `data.sort_values('Seconds')`.

        """
        from ampworks._checks import _check_type, _check_columns

        _check_type('inplace', inplace, bool)
        _check_columns(self, ['Seconds'])

        # work on a copy so the original is untouched unless 'inplace' is set
        result = self.copy()
        result['Seconds'] -= result['Seconds'].iloc[0]

        if inplace:
            self._update_inplace(result)
            return None

        return result