Source code for ampworks._core._headers

from __future__ import annotations

import textwrap

from warnings import warn
from typing import TYPE_CHECKING, Generator, Sequence

import pandas as pd

if TYPE_CHECKING:  # pragma: no cover
    from ampworks import Dataset


def format_alias(names: Sequence[str], units: Sequence[str]) -> list[str]:
    """
    Build alias strings from names and units.

    Parameters
    ----------
    names : Sequence[str]
        Base signal names.
    units : Sequence[str]
        Unit labels used in the source files.

    Returns
    -------
    aliases : list[str]
        Alias strings containing unit-only, name-only, and name.unit forms.

    """
    aliases = list(units)

    for name in names:
        aliases.append(name)
        for unit in units:
            aliases.append(f"{name}.{unit}")

    return aliases


def strip_chars(string: str | list[str] | None) -> str | list[str] | None:
    """
    Normalize header text for matching.

    Parameters
    ----------
    string : str or list[str] or None
        Header text to normalize.

    Returns
    -------
    stripped : str or list[str] or None
        Lowercased text with common separators removed.

    """
    if string is None:
        return None
    if isinstance(string, list):
        return [strip_chars(s) for s in string]

    transmap = str.maketrans('(/,', '...', ' _-#<>)')
    return string.lower().translate(transmap)


t_names = ['t', 'time', 'testtime', 'totaltime']
t_units = ['s', 'sec', 'seconds', 'min', 'minutes', 'h', 'hrs', 'hours']

i_names = ['i', 'amperage', 'current']
i_units = ['a', 'amps', 'ma', 'milliamps']

v_names = ['voltage', 'potential', 'ecell']
v_units = ['v', 'volts']

q_names = ['capacity', 'amphours']
q_units = ['ah', 'ahr', 'amphr', 'mah', 'mahr', 'mamphr']

e_names = ['energy', 'watthours']
e_units = ['wh', 'whr', 'watthr']

HEADER_ALIASES = {
    'Seconds': format_alias(t_names, t_units),
    'Amps': format_alias(i_names, i_units),
    'Volts': format_alias(v_names, v_units),
    'Cycle': ['cycle', 'cyc', 'cycleindex', 'cyclenumber', 'cyclec', 'cyclep'],
    'Step': ['step', 'ns', 'stepindex'],
    'State': ['state', 'md', 'mode'],
    'Ah': format_alias(q_names, q_units),
    'Wh': format_alias(e_names, e_units),
    'DateTime': ['datetime', 'dpttime', 'realtime'],
}

REQUIRED_HEADERS = ['Seconds', 'Amps', 'Volts']



[docs]
class HeaderAliases:
    """Header alias definitions."""

    __slots__ = ('Seconds', 'Amps', 'Volts', 'Cycle', 'Step', 'State', 'Ah',
                 'Wh', 'DateTime')

    def __init__(
        self,
        *,
        Seconds: str | list[str] | None = None,
        Amps: str | list[str] | None = None,
        Volts: str | list[str] | None = None,
        Cycle: str | list[str] | None = None,
        Step: str | list[str] | None = None,
        State: str | list[str] | None = None,
        Ah: str | list[str] | None = None,
        Wh: str | list[str] | None = None,
        DateTime: str | list[str] | None = None,
    ) -> None:
        """
        A container that allows users to specify custom header aliases for their
        data. These are used to automatically find and standardize columns when
        loading data. `ampworks` uses default aliases for any headers that are
        not provided here.

        Parameters
        ----------
        Seconds : str or list[str] or None, optional
            Aliases for the standardized Seconds column.
        Amps : str or list[str] or None, optional
            Aliases for the standardized Amps column.
        Volts : str or list[str] or None, optional
            Aliases for the standardized Volts column.
        Cycle : str or list[str] or None, optional
            Aliases for the standardized Cycle column.
        Step : str or list[str] or None, optional
            Aliases for the standardized Step column.
        State : str or list[str] or None, optional
            Aliases for the standardized State column.
        Ah : str or list[str] or None, optional
            Aliases for the standardized Ah column.
        Wh : str or list[str] or None, optional
            Aliases for the standardized Wh column.
        DateTime : str or list[str] or None, optional
            Aliases for the standardized DateTime column.

        Examples
        --------
        The following example shows how to use `HeaderAliases` to specify custom
        aliases. Any inputs that are skipped will use a list of defaults. Note
        that you can provide a single alias or many for each standard header. Be
        aware that all parameters must be provided as keywords to avoid improper
        ordering.

        >>> import ampworks as amp
        >>> aliases = amp.HeaderAliases(
        ...     Seconds='elapsed_s',
        ...     Amps=['current_amps', 'current_a'],
        ... )

        """
        from ampworks._checks import _check_inner_type, _check_type

        params = {
            'Seconds': Seconds,
            'Amps': Amps,
            'Volts': Volts,
            'Cycle': Cycle,
            'Step': Step,
            'State': State,
            'Ah': Ah,
            'Wh': Wh,
            'DateTime': DateTime,
        }

        # convert inputs to list[str] or use defaults if None
        def make_list_or_default(key, value):
            if value is None:
                return HEADER_ALIASES[key]
            if isinstance(value, str):
                return strip_chars([value])
            return strip_chars(value)

        # loop over fields and add to class instance
        for name, value in params.items():

            _check_type(name, value, (str, list, None))
            value = make_list_or_default(name, value)
            _check_inner_type(name, value, str)

            setattr(self, name, value)

    def __getitem__(self, key: str) -> list[str]:
        """Return aliases for a standardized header name."""
        if key in self.__slots__:
            return getattr(self, key)
        raise KeyError(f"{key} not found in {self.__class__.__name__}")

    def __repr__(self) -> str:  # pragma: no cover
        data = {k: v for k, v in self.items()}
        summary = "\n".join([f"{k}={v!r}," for k, v in data.items()])
        summary = textwrap.indent(summary, " " * 4)
        return f"{self.__class__.__name__}(\n{summary}\n)"


[docs]
    def keys(self) -> list[str]:
        """Return standardized header names supported by the alias set."""
        return list(self.__slots__)



[docs]
    def items(self) -> Generator[tuple[str, list[str]], None, None]:
        """Iterate over `(std_header, aliases)` pairs."""
        for slot in self.__slots__:
            yield (slot, getattr(self, slot))




def header_matches(
    headers: list[str],
    targets: list[str],
    aliases: HeaderAliases,
) -> bool:
    """
    Check headers for required targets.

    Parameters
    ----------
    headers : list[str]
        Source headers to evaluate.
    targets : list[str]
        Standardized target names that must be present.
    aliases : HeaderAliases
        Alias definitions used for matching.

    Returns
    -------
    checks : bool
        True when all target headers are matched.

    """
    normalized = strip_chars(headers)

    checks = {}
    for key in targets:
        checks[key] = any(alias in normalized for alias in aliases[key])

    return all(checks.values())



[docs]
def standardize_headers(
    data: pd.DataFrame,
    aliases: HeaderAliases | None = None,
    extra_columns: dict[str, type | None] | None = None,
) -> Dataset:
    """
    Map source columns to `ampworks` standards.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data frame with raw cycler headers.
    aliases : HeaderAliases or None, optional
        Alias mapping used to identify standardized columns. If None, defaults
        are used.
    extra_columns : dict[str, type or None] or None, optional
        Extra source columns to keep in output using exact source names as
        keys. Values define cast type. Use None to keep inferred dtype.

    Returns
    -------
    data : Dataset
        Standardized dataset.

    Warnings
    --------
    UserWarning
        Raised when standardized aliases are missing, requested extra columns
        are not found, or requested extra columns conflict with standardized
        output columns.

    """
    from ampworks import Dataset

    if aliases is None:
        aliases = HeaderAliases()

    df = Dataset()

    unit_factors = {
        'Amps': {
            ('ma', 'mamps', 'milliamps'): 0.001,
        },
        'Ah': {
            ('mah', 'mahr', 'mamphr'): 0.001,
        },
        'Seconds': {
            ('min', 'mins', 'minute', 'minutes'): 60.0,
            ('h', 'hr', 'hrs', 'hour', 'hours'): 3600.0,
        },
    }

    # Match as-imported headers with standardized headers
    for std_header in aliases.keys():
        for raw_header in data.columns:
            normalized = strip_chars(raw_header)
            if normalized not in aliases[std_header]:
                continue

            if std_header not in df.columns:
                df[std_header] = data[raw_header]

            # Standardize units
            if std_header in unit_factors:
                for units, factor in unit_factors[std_header].items():
                    if any(unit in normalized for unit in units):
                        df[std_header] = df[std_header].astype(float) * factor
                        break

    # Create 'State' data if not present
    if ('State' not in df.columns) and ('Amps' in df.columns):
        df['Amps'] = df['Amps'].astype(float)

        df['State'] = 'R'
        df.loc[df['Amps'] > 0, 'State'] = 'C'
        df.loc[df['Amps'] < 0, 'State'] = 'D'

    # Guarantee sign 'Amps' sign convention (+ charge, - discharge)
    if 'State' in df.columns:
        rename_bitrode = {'REST': 'R', 'DCHG': 'D', 'CHRG': 'C'}
        df['State'] = df['State'].replace(rename_bitrode)

        df['Amps'] = df['Amps'].astype(float)
        df['State'] = df['State'].astype(str)

        sign = df['State'].map({'R': 0.0, 'C': 1.0, 'D': -1.0}).fillna(1.0)
        df['Amps'] = sign * df['Amps'].abs()

    # Create 'Ah' and 'Wh' from separate charge and discharge columns
    if any(header not in df.columns for header in ['Ah', 'Wh']):
        ah_headers = ['charge' + header for header in aliases['Ah']]
        wh_headers = ['charge' + header for header in aliases['Wh']]
        for raw_header in data.columns:
            normalized = strip_chars(raw_header)
            if normalized in ah_headers:
                df['Ah'] = data[raw_header]
                discharge_ah = data[raw_header.replace('Charge', 'Discharge')]
                df.loc[df['State'] == 'D', 'Ah'] = discharge_ah
            if normalized in wh_headers:
                df['Wh'] = data[raw_header]
                discharge_wh = data[raw_header.replace('Charge', 'Discharge')]
                df.loc[df['State'] == 'D', 'Wh'] = discharge_wh

    # Final data typing, unit normalization, and checks for missing headers
    missing = []
    for std_header in aliases.keys():

        # Convert types
        if std_header in df.columns:
            if std_header in ['State', 'DateTime']:
                df[std_header] = df[std_header].astype('string')
            elif std_header in ['Cycle', 'Step']:
                df[std_header] = df[std_header].astype('Int64')
            else:
                df[std_header] = df[std_header].replace('#', '', regex=True)
                df[std_header] = df[std_header].replace(',', '', regex=True)
                df[std_header] = pd.to_numeric(df[std_header], errors='coerce')
        else:
            missing.append(std_header)

    if missing:
        warn(f"No valid aliases found for {missing}.")

    # Keep user-requested non-standardized columns from source data
    if extra_columns is not None:
        missing_extra = []
        skipped_extra = []

        for col_name, col_type in extra_columns.items():
            if col_name not in data.columns:
                missing_extra.append(col_name)
                continue

            if col_name in df.columns:
                skipped_extra.append(col_name)
                continue

            df[col_name] = data[col_name]
            if col_type is not None:
                df[col_name] = df[col_name].astype(col_type)

        if missing_extra:
            warn(f"'extra_columns' not found: {missing_extra=}. Only found"
                 f"{set(data.columns)}.")

        if skipped_extra:
            warn(f"Skipped some conflicting 'extra_columns': {skipped_extra=}."
                 f" Existing are {set(df.columns)}.")

    return df