Source code for src.utils.pandas

"""Helper methods for manipulating pandas DataFrames, e.g.
- merging tables
- stacking prefix-grouped columns row-wise
- dropping columns that contain list entries
- indexing by a certain feature
"""
from loguru import logger
import pandas as pd
import numpy as np

def mergeTables(tables: list[pd.DataFrame]) -> pd.DataFrame:
    """Join several dataframes side by side on their index.

    The first table is the starting point; every following table is joined
    onto the running result with ``DataFrame.join`` (index-aligned, pandas'
    default join type). Column names are expected to be unique across the
    tables; a clashing column coming from the right-hand table receives the
    suffix ``"_other"``.

    Parameters
    ----------
    tables : list[pd.DataFrame]
        the dataframes to combine, in join order

    Returns
    -------
    dataframe : pd.DataFrame
        the joined dataframe, or an empty dataframe if ``tables`` is empty
    """
    logger.trace("Starting merging dataframes with each other.")
    if not tables:
        return pd.DataFrame()

    merged, *remaining = tables
    for right in remaining:
        merged = merged.join(right, rsuffix="_other")
    return merged
def concatOnPrefixes(table: pd.DataFrame, prefixes: list[str], ident: str) -> pd.DataFrame:
    """Partition columns by prefix and stack the partitions row-wise.

    For each prefix in ``prefixes`` that matches at least one column:

    - take all columns whose name starts with the prefix
    - append all columns that match none of the prefixes
    - strip the prefix from the column names
    - add a column named ``ident`` holding the prefix label

    The partitions are then concatenated along the rows with a fresh
    0..n-1 index. Prefixes that match no column are skipped.

    Parameters
    ----------
    table : pd.DataFrame
        the dataframe to reshape; column names must be strings
    prefixes : list[str]
        the column-name prefixes defining the partitions
    ident : str
        name of the column that records which prefix a row came from.
        If empty/None no such column is added. An existing column of the
        same name inside a partition is overwritten.

    Returns
    -------
    dataframe : pd.DataFrame
        - an empty dataframe if ``table`` is None or empty
        - a copy of ``table`` if ``prefixes`` is empty or nothing matches
          (no ``ident`` column is added in these cases)
        - otherwise the stacked partitions
    """
    if table is None or table.empty:
        return pd.DataFrame()
    if not prefixes:
        return table.copy()

    cols = list(table.columns)
    # str.startswith accepts a tuple: one call instead of an any() loop.
    prefix_options = tuple(prefixes)
    nonmatch_cols = [c for c in cols if not c.startswith(prefix_options)]

    parts: list[pd.DataFrame] = []
    for prefix in prefix_options:
        match_cols = [c for c in cols if c.startswith(prefix)]
        if not match_cols:
            continue
        # Copy so that renaming / adding columns never touches ``table``.
        part = table[match_cols + nonmatch_cols].copy()
        # Non-matching columns start with none of the prefixes, so
        # removeprefix leaves them unchanged.
        part.columns = [c.removeprefix(prefix) for c in part.columns]
        if ident:
            # Label the rows so their origin survives the row-wise stacking.
            part[ident] = prefix
        parts.append(part)

    if not parts:
        return table[nonmatch_cols].copy()
    return pd.concat(parts, axis=0, ignore_index=True)
def dropListEntries(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Remove every column that holds at least one ``list`` value.

    Parameters
    ----------
    dataframe : pd.DataFrame
        the dataframe to clean up

    Returns
    -------
    dataframe : pd.DataFrame
        a new dataframe without the list-carrying columns
    """
    logger.trace("dropped List entries of a dataframe.")

    def _has_list_value(column_name) -> bool:
        # True as soon as a single cell of the column is a Python list.
        is_list = dataframe[column_name].apply(lambda cell: isinstance(cell, list))
        return is_list.any()

    columns_to_drop = [name for name in dataframe.columns if _has_list_value(name)]
    return dataframe.drop(columns=columns_to_drop)
def indexByOneVariable(df: pd.DataFrame, var: str) -> pd.DataFrame:
    """Reshape a dataframe so that each distinct value of ``var`` becomes one row.

    Rows sharing the same value of ``var`` are numbered 0, 1, 2, ... in
    order of appearance and spread out into columns: every remaining
    column ``col`` becomes ``col_0``, ``col_1``, ... Groups with fewer rows
    than the largest group are left with missing values in the surplus
    columns. The resulting index is replaced by 0..n-1, so the values of
    ``var`` themselves are not part of the result.

    Parameters
    ----------
    df : pd.DataFrame
        the dataframe to reshape; it is not modified
    var : str
        the name of the feature/variable to group the rows by

    Returns
    -------
    dataframe : pd.DataFrame
        the reshaped dataframe

    Raises
    ------
    ValueError
        if ``var`` is not a column of ``df``
    """
    logger.trace(f"Starting index a dataframe via the feature {var}.")
    dataframe = df.copy()
    value_cols = list(dataframe.columns)
    value_cols.remove(var)

    # Helper column holding the per-group occurrence number. Pick a name
    # that is not already taken: a fixed "cc" would overwrite a user column
    # of that name and make the selection of value_cols below fail.
    counter_col = "cc"
    while counter_col in dataframe.columns:
        counter_col = f"_{counter_col}"
    dataframe[counter_col] = dataframe.groupby(var).cumcount()

    indexed_df = dataframe.set_index([var, counter_col])[value_cols].unstack(counter_col)
    # Flatten the (column, occurrence) MultiIndex into "column_occurrence".
    indexed_df.columns = [f"{a}_{b}" for a, b in indexed_df.columns]
    indexed_df.index = list(range(len(indexed_df.index)))
    return indexed_df