Source code for ampworks._core._read

from __future__ import annotations

import csv

from warnings import warn
from typing import TYPE_CHECKING, Sequence

import pandas as pd
import polars as pl

if TYPE_CHECKING:  # pragma: no cover
    from os import PathLike
    from ampworks import Dataset, HeaderAliases


def _read_delimited(
    filepath: PathLike,
    delimiter: str,
    aliases: HeaderAliases | None,
    extra_columns: dict[str, type | None] | None,
) -> Dataset:
    r"""
    Generic internal reader for delimited files. Holds the logic shared by the
    csv and tab-delimited readers.

    The file is first scanned record by record with the standard `csv` module
    until a row matching the required headers is found. The file is then read
    with polars, starting at that row, converted to pandas, and passed through
    the header standardization.

    Parameters
    ----------
    filepath : PathLike
        Path to the file, including extension.
    delimiter : str
        Delimiter to use for parsing the file. For example, `','` for csv files
        and `'\t'` for tab-delimited files.
    aliases : HeaderAliases or None
        Column alias mapping for the header standardization. If None, a default
        `HeaderAliases()` instance is used.
    extra_columns : dict[str, type or None] or None
        Additional columns to include in the standardized dataset. Include both
        the exact source column names and their corresponding data types in a
        dictionary. Use value None to keep pandas-inferred dtype. The `type` is
        also compatible with pandas dtypes, e.g., `'string'`, `'Int64'`, etc.

    Returns
    -------
    data : Dataset
        Standardized battery dataset. An empty `Dataset` is returned when no
        row in the file matches the required headers.

    Warnings
    --------
    UserWarning
        If no header row is found and an empty dataset is returned.

    """
    from ampworks import Dataset, HeaderAliases
    from ampworks._checks import _check_type
    from ampworks._core._headers import (
        standardize_headers, header_matches, REQUIRED_HEADERS,
    )

    if aliases is None:
        aliases = HeaderAliases()

    _check_type('aliases', aliases, HeaderAliases)

    # Locate the header record. latin1 is used for the scan because it can
    # decode any byte sequence. newline='' lets the csv module handle line
    # endings itself, as required by the csv documentation.
    skip_rows = None
    with open(filepath, encoding='latin1', newline='') as datafile:
        reader = csv.reader(datafile, delimiter=delimiter)

        for idx, line in enumerate(reader):
            if header_matches(line, REQUIRED_HEADERS, aliases):
                skip_rows = idx
                break

    if skip_rows is None:
        # stacklevel=3 attributes the warning to the caller of the public
        # reader (read_csv / read_table) rather than to library internals.
        warn(f"No valid aliases found for {REQUIRED_HEADERS} in {filepath}."
             " Returning empty dataset.", stacklevel=3)

        return Dataset()

    options = {
        'ignore_errors': True,
        'separator': delimiter,
        'truncate_ragged_lines': True,
        'skip_rows': skip_rows,
    }

    df = pl.read_csv(filepath, **options).to_pandas()
    return standardize_headers(df, aliases, extra_columns)



[docs]
def read_csv(
    filepath: PathLike,
    aliases: HeaderAliases | None = None,
    extra_columns: dict[str, type | None] | None = None,
) -> Dataset:
    """
    Read a csv file.

    Reader for comma-separated values (CSV) files. The file is scanned for a
    row containing the expected headers (see Notes), so the routine is not
    tied to any particular cycler. Either the internal default aliases or the
    user-supplied ones are used to locate and standardize the headers,
    columns, and data types.

    Parameters
    ----------
    filepath : PathLike
        Path to the file, including extension.
    aliases : HeaderAliases or None, optional
        Column alias mapping for the header standardization. If None (default),
        a set of internal default aliases is used.
    extra_columns : dict[str, type or None] or None, optional
        Extra source columns to preserve using exact source names as keys. The
        values define cast type. Use None to keep inferred dtype. Both Python
        types and pandas dtypes are accepted, e.g., `'string'`, `'Int64'`, etc.

    Returns
    -------
    data : Dataset
        Standardized battery dataset.

    Warnings
    --------
    UserWarning
        If `extra_columns` are not found in the source data or conflict with any
        of the standardized headers. Also, if no valid headers are found and an
        empty dataset is returned.

    See Also
    --------
    ~ampworks.HeaderAliases : Custom column mapping for standardization.

    Notes
    -----
    By default, only aliases of Seconds, Amps, Volts, Cycle, Step, State, Ah,
    Wh, and DateTime are included. To guarantee that other data columns are
    kept, list them in the `extra_columns` parameter.

    Examples
    --------
    The example below reads data from a `.csv` file using a few of the
    available options.

    .. code-block:: python

        import ampworks as amp

        # read in the file using all default options
        data = amp.read_csv('data.csv')

        # specify custom aliases for a couple column headers
        aliases = amp.HeaderAliases(Seconds='Time_s', Amps='Current_A')
        data = amp.read_csv('data.csv', aliases=aliases)

        # include extra columns for temperature and notes
        extra_cols = {'Temperature': float, 'Notes': None}
        data = amp.read_csv('data.csv', extra_columns=extra_cols)

    """
    # All parsing is delegated to the shared delimited-file reader.
    return _read_delimited(
        filepath=filepath,
        delimiter=',',
        aliases=aliases,
        extra_columns=extra_columns,
    )




[docs]
def read_table(
    filepath: PathLike,
    aliases: HeaderAliases | None = None,
    extra_columns: dict[str, type | None] | None = None,
) -> Dataset:
    """
    Read a tab-delimited file.

    Reader for tab-delimited files. The file is scanned for a row containing
    the expected headers (see Notes), so the routine is not tied to any
    particular cycler. Either the internal default aliases or the
    user-supplied ones are used to locate and standardize the headers,
    columns, and data types.

    Parameters
    ----------
    filepath : PathLike
        Path to the file, including extension.
    aliases : HeaderAliases or None, optional
        Column alias mapping for the header standardization. If None (default),
        a set of internal default aliases is used.
    extra_columns : dict[str, type or None] or None, optional
        Extra source columns to preserve using exact source names as keys. The
        values define cast type. Use None to keep inferred dtype. Both Python
        types and pandas dtypes are accepted, e.g., `'string'`, `'Int64'`, etc.

    Returns
    -------
    data : Dataset
        Standardized battery dataset.

    Warnings
    --------
    UserWarning
        If `extra_columns` are not found in the source data or conflict with any
        of the standardized headers. Also, if no valid headers are found and an
        empty dataset is returned.

    See Also
    --------
    ~ampworks.HeaderAliases : Custom column mapping for standardization.

    Notes
    -----
    By default, only aliases of Seconds, Amps, Volts, Cycle, Step, State, Ah,
    Wh, and DateTime are included. To guarantee that other data columns are
    kept, list them in the `extra_columns` parameter.

    Examples
    --------
    The example below reads data from a `.txt` file using a few of the
    available options.

    .. code-block:: python

        import ampworks as amp

        # read in the file using all default options
        data = amp.read_table('data.txt')

        # specify custom aliases for a couple column headers
        aliases = amp.HeaderAliases(Seconds='Time_s', Amps='Current_A')
        data = amp.read_table('data.txt', aliases=aliases)

        # include extra columns for temperature and notes
        extra_cols = {'Temperature': float, 'Notes': None}
        data = amp.read_table('data.txt', extra_columns=extra_cols)

    """
    # All parsing is delegated to the shared delimited-file reader.
    return _read_delimited(
        filepath=filepath,
        delimiter='\t',
        aliases=aliases,
        extra_columns=extra_columns,
    )




[docs]
def read_excel(
    filepath: PathLike,
    sheet_name: str | int | Sequence[str | int] | None = None,
    stack_sheets: bool = False,
    aliases: HeaderAliases | None = None,
    extra_columns: dict[str, type | None] | None = None,
) -> Dataset | dict[str | int, Dataset]:
    """
    Read an Excel file.

    Custom reading function for Excel files. Scans all (or some) of the sheets
    to identify expected headers (see Notes for specifics). This routine is not
    specific to any particular cycler. Instead, it uses internal or user-defined
    aliases to find and standardize the headers, columns, and data types.

    Parameters
    ----------
    filepath : PathLike
        Path to the file, including extension.
    sheet_name : str or int or Sequence[str or int] or None, optional
        Name or index of the sheet(s) to read. For integers, use natural indices
        from 1 to the number of sheets. None (default) will scan for the first
        sheet with valid headers. Use `'all'` to read all sheets.
    stack_sheets : bool, optional
        If True, concatenate all parsed sheets into one dataset.
    aliases : HeaderAliases or None, optional
        Column alias mapping for the header standardization. If None (default),
        a set of internal default aliases is used.
    extra_columns : dict[str, type or None] or None, optional
        Extra source columns to preserve using exact source names as keys. The
        values define cast type. Use None to keep inferred dtype. Both Python
        types and pandas dtypes are accepted, e.g., `'string'`, `'Int64'`, etc.

    Returns
    -------
    data : Dataset or dict[str or int, Dataset]
        Standardized dataset output. A dictionary, keyed by the requested sheet
        names/indices, is returned if multiple sheets are read and
        `stack_sheets` is False.

    Raises
    ------
    ValueError
        If the parameter (or any of the elements in) `sheet_name` are invalid
        names or indices.

    Warnings
    --------
    UserWarning
        If `extra_columns` are not found in the source data or conflict with any
        of the standardized headers. Also, if no valid headers are found and an
        empty dataset is returned, or if `sheet_name='all'` is passed while a
        sheet literally named `'all'` exists.

    See Also
    --------
    ~ampworks.HeaderAliases : Custom column mapping for standardization.

    Notes
    -----
    By default, only aliases of Seconds, Amps, Volts, Cycle, Step, State, Ah,
    Wh, and DateTime are included. If you'd like to ensure that additional data
    columns are included, use the `extra_columns` parameter.

    Only the first 20 rows of each sheet are searched for the header row.

    Examples
    --------
    The following example shows how to read in data from an Excel file using a
    few of the available options. Note that the examples demonstrate different
    extensions that are both types of Excel files.

    .. code-block:: python

        import ampworks as amp

        # read in the file using all default options
        data = amp.read_excel('data.xls')

        # specify custom aliases for a couple column headers
        aliases = amp.HeaderAliases(Seconds='Time_s', Amps='Current_A')
        data = amp.read_excel('data.xls', aliases=aliases)

        # include extra columns for temperature and notes
        extra_cols = {'Temperature': float, 'Notes': None}
        data = amp.read_excel('data.xls', extra_columns=extra_cols)

        # specify the second sheet and a sheet named 'last'
        data = amp.read_excel('data.xlsx', sheet_name=[2, 'last'])

        # read in all sheets and concatenate the results
        data = amp.read_excel('data.xlsx', sheet_name='all', stack_sheets=True)

    """
    from ampworks import Dataset, HeaderAliases
    from ampworks._checks import _check_type, _check_inner_type
    from ampworks._core._headers import (
        standardize_headers, header_matches, REQUIRED_HEADERS,
    )

    # Phase 1: validate the request and locate the header row of each sheet.
    # The context manager releases the workbook's file handle even when the
    # validation below raises.
    with pd.ExcelFile(filepath) as workbook:
        all_sheets = workbook.sheet_names
        num_sheets = len(all_sheets)

        # warn if 'all' matches a sheet name
        if sheet_name == 'all' and 'all' in all_sheets:
            warn("sheet_name='all' is interpreted as ALL sheets, but a sheet named"
                 " 'all' exists. To read only that sheet, pass ['all'] explicitly.",
                 stacklevel=2)

        # Set which sheets to iterate through
        _check_type('sheet_name', sheet_name, (str, int, Sequence, None))

        if sheet_name is None or sheet_name == 'all':
            iter_sheets = all_sheets
        elif isinstance(sheet_name, (str, int)):
            iter_sheets = [sheet_name]
        elif isinstance(sheet_name, Sequence):
            iter_sheets = list(sheet_name)

        # Raise errors if invalid indices/names
        _check_inner_type('sheet_name', iter_sheets, (str, int))

        strings = [value for value in iter_sheets if isinstance(value, str)]
        indices = [value for value in iter_sheets if isinstance(value, int)]

        bad_str = [value for value in strings if value not in all_sheets]
        bad_ind = [value for value in indices if not 1 <= value <= num_sheets]

        if bad_str:
            raise ValueError(f"Invalid worksheet names {bad_str}.")
        if bad_ind:
            raise ValueError(f"Invalid sheet indices {bad_ind}, must be between 1"
                             f" and {num_sheets}.")

        # Set up aliases or use defaults
        if aliases is None:
            aliases = HeaderAliases()

        _check_type('aliases', aliases, HeaderAliases)

        failed = []
        header_rows = {}
        for sheet in iter_sheets:
            # pandas treats integer sheets as ZERO-based positions, whereas
            # this function (and polars' sheet_id below) uses natural
            # one-based indices, so shift integers before previewing.
            pd_sheet = sheet - 1 if isinstance(sheet, int) else sheet
            preview = workbook.parse(
                pd_sheet, header=None, nrows=20, dtype=str,
            )

            # Find header row
            header_row = None
            for idx, row in preview.iterrows():
                tmp_headers = row.fillna('NaN').astype(str).to_list()
                if header_matches(tmp_headers, REQUIRED_HEADERS, aliases):
                    header_row = idx
                    break

            if header_row is None:
                failed.append(sheet)
                continue

            header_rows[sheet] = header_row

            # Auto-detection stops at the first sheet with valid headers
            if sheet_name is None:
                break

    # Phase 2: read every sheet that has a header row and standardize it.
    datasets = {}
    for sheet, header_row in header_rows.items():
        # Exactly one of sheet_id (one-based int) / sheet_name (str) is set
        sheet_int = sheet if isinstance(sheet, int) else None
        sheet_str = sheet if isinstance(sheet, str) else None
        read_options = {'header_row': header_row} if header_row > 0 else {}

        df = pl.read_excel(
            filepath,
            sheet_id=sheet_int,
            sheet_name=sheet_str,
            read_options=read_options,
        )

        datasets[sheet] = standardize_headers(
            df.to_pandas(), aliases, extra_columns,
        )

    # Prepare outputs (only warn about failed when auto-detecting)
    if sheet_name is None and failed:
        warn(f"No valid aliases found in requested sheets: {failed}.",
             stacklevel=2)

    if not datasets:
        warn(f"No valid aliases found in requested sheets of {filepath}.",
             stacklevel=2)
        return Dataset()

    if stack_sheets:
        stack = pd.concat(list(datasets.values()), ignore_index=True)
        return Dataset(stack)

    if len(datasets) == 1:
        (single,) = datasets.values()
        return single

    return datasets