Source code for src.utils.pandas

"""Helper methods for manipulating pandas dataframes, e.g.
- merging tables
- dropping list entries
- indexing by a certain feature
"""
from loguru import logger
import pandas as pd
import numpy as np


[docs]
def mergeTables(tables: list[pd.DataFrame]) -> pd.DataFrame:
    """Join a list of dataframes side by side into a single dataframe.

    The first table is the left-hand side; every following table is attached
    to it with ``DataFrame.join`` using its default join settings. Column
    names are expected to be unique across the tables; a clashing column
    coming from the right-hand table is renamed with the suffix ``_other``.

    Parameters
    ----------
    tables : list[pd.DataFrame]
        the dataframes to combine, in join order

    Returns
    -------
    dataframe : pd.DataFrame
        the joined dataframe; an empty dataframe if ``tables`` is empty, or
        the first table itself (not a copy) if it is the only one

    """
    logger.trace("Starting merging dataframes with each other.")
    if not tables:
        return pd.DataFrame()

    # Fold the remaining tables onto the first one, left to right.
    merged, *others = tables
    for other in others:
        merged = merged.join(other, rsuffix="_other")
    return merged



[docs]
def concatOnPrefixes(table: pd.DataFrame, prefixes: list[str], ident: str) -> pd.DataFrame:
    """Partition columns by prefix and stack the partitions row-wise.

    For each prefix in ``prefixes`` that matches at least one column:
      - take all columns starting with the prefix and strip the prefix from
        their names
      - append all columns that match none of the prefixes
      - label the rows of this partition with the prefix
    The partitions are then concatenated along the rows with a fresh
    0..n-1 index, and the labels are stored in a column named ``ident``.

    Parameters
    ----------
    table : pd.DataFrame
        the dataframe to split; it is not modified
    prefixes : list[str]
        the column-name prefixes that define the partitions
    ident : str
        name of the column that receives the prefix label of each row; an
        already existing column of that name is overwritten

    Returns
    -------
    dataframe : pd.DataFrame
        the stacked partitions including the ``ident`` column. Special cases:
        an empty dataframe if ``table`` is ``None`` or empty, a copy of
        ``table`` if ``prefixes`` is empty, and a copy of the non-matching
        columns if no prefix matches any column (no ``ident`` column is
        added in these cases because there is no prefix to record).

    """
    if table is None or table.empty:
        return pd.DataFrame()
    if not prefixes:
        return table.copy()

    columns = list(table.columns)
    # str.startswith accepts a tuple, so one call tests all prefixes at once.
    all_prefixes = tuple(prefixes)
    shared_cols = [c for c in columns if not c.startswith(all_prefixes)]

    parts: list[pd.DataFrame] = []
    labels: list[str] = []
    for prefix in prefixes:
        prefixed_cols = [c for c in columns if c.startswith(prefix)]
        if not prefixed_cols:
            continue
        part = table[prefixed_cols + shared_cols].copy()
        # Only the matching columns carry the prefix; shared names stay as is.
        part.columns = [c.removeprefix(prefix) for c in prefixed_cols] + shared_cols
        parts.append(part)
        labels.extend([prefix] * len(part))

    if not parts:
        return table[shared_cols].copy()

    res = pd.concat(parts, axis=0, ignore_index=True)
    # The labels were collected in stacking order, so they line up row by row.
    res[ident] = labels
    return res


[docs]
def dropListEntries(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Remove every column in which at least one cell holds a ``list``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        the dataframe to filter; it is not modified

    Returns
    -------
    dataframe : pd.DataFrame
        a new dataframe without the list-bearing columns

    """
    logger.trace("dropped List entries of a dataframe.")

    def holds_list(label) -> bool:
        # A single list-valued cell is enough to mark the whole column.
        return any(isinstance(cell, list) for cell in dataframe[label])

    columns_to_drop = [label for label in dataframe.columns if holds_list(label)]
    return dataframe.drop(columns=columns_to_drop)




[docs]
def indexByOneVariable(df: pd.DataFrame, var: str) -> pd.DataFrame:
    """Reshape a dataframe to one row per distinct value of ``var``.

    Rows sharing the same value of ``var`` are numbered 0, 1, 2, ... in their
    order of appearance. Every remaining column is then spread out into one
    column per occurrence number, named ``<column>_<number>``. The ``var``
    column itself is not part of the result, and the result gets a fresh
    0..n-1 index.

    Parameters
    ----------
    df : pd.DataFrame
        the dataframe to reshape; it is not modified
    var : str
        the name of the feature/variable to group the rows by

    Returns
    -------
    dataframe : pd.DataFrame
        the new dataframe with one row per value of ``var``

    Raises
    ------
    ValueError
        if ``var`` is not a column of ``df``

    """
    logger.trace(f"Starting index a dataframe via the feature {var}.")
    if var not in df.columns:
        raise ValueError(f"'{var}' is not a column of the dataframe.")

    # Position of each row within its group of equal ``var`` values.
    occurrence = df.groupby(var).cumcount()
    value_cols = [col for col in df.columns if col != var]

    # Key the rows by (var, occurrence) directly instead of going through a
    # temporary helper column: a fixed helper name would overwrite (and then
    # lose) an input column that happens to carry the same name.
    keyed = df[value_cols].set_index([df[var], occurrence])
    indexed_df = keyed.unstack(level=-1)

    # Flatten the (column, occurrence) MultiIndex into plain string names.
    indexed_df.columns = [f"{a}_{b}" for a, b in indexed_df.columns]
    indexed_df.index = list(range(len(indexed_df.index)))

    return indexed_df