Source code for src.core.process.extract

"""This module mainly manages the data extraction process."""

import pandas as pd
from enum import Enum
from loguru import logger
from src.core.meta import GameTable, TimeTable, ImportType
from src.core.process.pipelines.client import ClientKeys, tableTypeForClient, needsAggClient, needsMetaDataClient
from src.core.process.pipelines.matchv5 import MatchV5Keys, tableTypeForMatchV5, needsAggMatchV5, needsMetaDataMatchV5
from src.utils.pandas import dropListEntries, mergeTables, indexByOneVariable, concatOnPrefixes


[docs]
class ImportPipeline(Enum):
    """Enumeration of the import pipelines that are currently in use.

    The value of every member is the key enum of the respective pipeline.

    Attributes
    ----------
    CLIENT : ClientKeys
        pipeline for the data dumped by the client
    MATCHV5 : MatchV5Keys
        pipeline for the data gathered via matchv5

    """
    CLIENT = ClientKeys
    MATCHV5 = MatchV5Keys


pipeToAgg: dict[ImportPipeline, dict] = {
    ImportPipeline.CLIENT: needsAggClient,
    ImportPipeline.MATCHV5: needsAggMatchV5,
}
"""maps every pipeline to its aggregation helper dict"""

pipeToMeta: dict[ImportPipeline, dict] = {
    ImportPipeline.CLIENT: needsMetaDataClient,
    ImportPipeline.MATCHV5: needsMetaDataMatchV5,
}
"""maps every pipeline to its metapath helper dict"""


[docs]
def classify(member: ClientKeys | MatchV5Keys) -> GameTable | TimeTable:
    """Look up the table type a pipeline key belongs to.

    Parameters
    ----------
    member : ClientKeys | MatchV5Keys
        a member of one of the two key enums

    Returns
    -------
    table : GameTable | TimeTable
        the table type registered for ``member``

    """
    logger.trace(f"classified the member {member.value}.")
    # client keys are resolved through the client mapping,
    # every other key is treated as a matchv5 key
    lookup = tableTypeForClient if isinstance(member, ClientKeys) else tableTypeForMatchV5
    return lookup[member]



[docs]
def getData(
    data: dict,
    pathKey: MatchV5Keys | ClientKeys,
    typ: ImportType,
    metaKey: list[list[str]] | None = None,
    agg: bool = False,
) -> pd.DataFrame:
    """Normalize the part of the raw data addressed by ``pathKey`` into a table.

    Parameters
    ----------
    data : dict
        the raw data, extracted from a .json gamefile
    pathKey : MatchV5Keys | ClientKeys
        the key, whose value is the record path into the json tree of the gamefile
    typ : ImportType
        the import type; it selects the aggregation strategy, if ``agg`` is set
    metaKey : list[list[str]] | None, optional
        the path to a meta variable, handed to ``pd.json_normalize`` as ``meta``.
        It must be given, if ``agg`` is set and ``typ`` is ``ImportType.GENERAL``,
        because its first path names the column used for the aggregation.
    agg : bool, optional
        if set, the table gets aggregated: with ``indexByOneVariable`` for
        ``ImportType.GENERAL`` and with ``concatOnPrefixes`` for
        ``ImportType.TIMELINE``

    Returns
    -------
    dataframe : pd.DataFrame
        the resulting table

    """
    logger.trace("Start normalizing the raw data dict via pandas.")
    flattened = pd.json_normalize(data, record_path=pathKey.value, meta=metaKey, sep="_")
    result = dropListEntries(flattened)

    # without aggregation the normalized table is already the final result
    if not agg:
        return result

    logger.debug("Start aggregating the resulting datatable.")
    if typ == ImportType.GENERAL:
        # json_normalize was called with sep="_", so the joined meta path
        # matches the name of the meta column
        indexColumn = "_".join(metaKey[0])
        return indexByOneVariable(result, indexColumn)
    if typ == ImportType.TIMELINE:
        # one column prefix per participant, numbered 1 to 10
        framePrefixes = [f"participantFrames_{number}_" for number in range(1, 11)]
        return concatOnPrefixes(result, framePrefixes, "participantid")

    # any other import type is returned without aggregation
    return result



[docs]
def extractRawTables(data: dict, pipe: ImportPipeline, typ: ImportType) -> dict[GameTable | TimeTable, pd.DataFrame]:
    """Extract all tables of the requested import type from a raw game dict.

    Iterates over every key of the chosen pipeline, skips the keys whose
    table type is not an instance of ``typ.value``, parses the remaining
    ones with ``getData`` and merges the parsed tables per table type.

    Parameters
    ----------
    data : dict
        the raw data, extracted from a .json gamefile
    pipe : ImportPipeline
        the pipeline, whose keys describe the paths into ``data``
    typ : ImportType
        the import type; it filters the keys by their table type and is
        forwarded to ``getData``

    Returns
    -------
    dataframes : dict[GameTable | TimeTable, pd.DataFrame]
        the merged table for each of META, TEAM, PLAYER, FRAME and EVENT.
        Table types without any extracted table are merged from an empty
        list; the outcome for those depends on ``mergeTables``.

    """
    logger.debug("Start the extraction process for the raw json dict.")
    # one bucket per table type; the insertion order fixes the order of the result
    collected: dict[GameTable | TimeTable, list[pd.DataFrame]] = {
        GameTable.META:   [],
        GameTable.TEAM:   [],
        GameTable.PLAYER: [],
        TimeTable.FRAME:  [],
        TimeTable.EVENT:  [],
    }
    aggPaths = pipeToAgg[pipe]
    metaPaths = pipeToMeta[pipe]

    for path in pipe.value:
        # classify once per key and reuse the result for filtering and bucketing
        tableType = classify(path)
        if not isinstance(tableType, typ.value):
            continue

        logger.debug(f"extracting the part of the json, according to {path.value}.")
        aggregation = path in aggPaths
        metaKey = None
        if aggregation:
            metaKey = [aggPaths[path]]
        # a registered metadata path takes precedence over the aggregation path
        # NOTE(review): for a key registered in both helpers, getData aggregates
        # on the metadata path instead of the aggregation path - confirm intended
        if path in metaPaths:
            metaKey = [metaPaths[path]]

        table = getData(
            data=data,
            pathKey=path,
            typ=typ,
            metaKey=metaKey,
            agg=aggregation,
        )

        # tables of a table type without a bucket are dropped silently
        bucket = collected.get(tableType)
        if bucket is not None:
            bucket.append(table)

    return {tableType: mergeTables(tables) for tableType, tables in collected.items()}