# Source code for data_pipeline.rplan_content_extraction.rplan_keyword_search

import json
import os.path

import pandas as pd
from loguru import logger

from features.textual_features.keyword_search.contextual_fuzzy_search import \
    search_df_for_best_matches_keyword_dict
from features.textual_features.keyword_search.exact_keyword_search import search_df_for_keywords
from utility.config_utils import get_data_path, get_source_path
from visualizations.rplan_visualization import plot_keyword_search_results

# JSON file with the extracted rplan content; read when this module is run as a script.
rplan_content_path = os.path.join(get_data_path(), "nrw", "extracted_rplan_content.json")

# Directory holding the keyword dictionaries. Built with os.path.join (like the
# other paths here) instead of concatenating a hard-coded "/"-separated string.
keyword_path = os.path.join(get_source_path(), "data_pipeline", "rplan_content_extraction", "keywords")
rplan_exact_keyword_dict_path = os.path.join(keyword_path, "exact_keyword_dict_rplans.json")
rplan_fuzzy_keyword_dict_path = os.path.join(keyword_path, "fuzzy_keyword_dict_rplans.json")
rplan_negate_keyword_dict_path = os.path.join(keyword_path, "negate_keyword_dict_rplans.json")










def _prepare_rplan_df(input_df: 'pd.DataFrame',
                      keyword_dict_path):
    """ Function to prepare the input df and load the keyword dict for the keyword search.

    The index of ``input_df`` is turned into a regular column via ``reset_index()``;
    the frame passed in by the caller is not modified.

    Args:
        input_df: Input df to be searched for keywords. Assumes an unnamed index
            (so the new column is called "index") and no existing "index" column;
            the text is expected in a "section" column (not checked here).
        keyword_dict_path: Path to the JSON file holding the keyword dict

    Returns:
        index_column_name: Name of the index column (always "index")
        input_df: Copy of the input df with its former index as a column
        rplan_keywords: Content of the keyword JSON file
        text_column_name: Name of the column in the input df holding the relevant text
            (always "section")

    """
    input_df = input_df.reset_index()  # add index column
    index_column_name = "index"
    text_column_name = "section"
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) would corrupt or reject non-ASCII keywords such as umlauts.
    with open(keyword_dict_path, encoding="utf-8") as f:
        rplan_keywords = json.load(f)
    return index_column_name, input_df, rplan_keywords, text_column_name


def save_rplan_keyword_search(input_df,
                              result_df,
                              drop_false_rows=False,
                              saving_filename: str = None):
    """ Function to merge the keyword search result with the input df and optionally save it as json.

    Neither ``input_df`` nor ``result_df`` is modified; the cast of the "index"
    column is done on a new frame.

    Args:
        input_df: Input df that was searched for keywords; needs an "index" column
        result_df: Result df of the keyword search; needs an "index" column whose
            values can be cast to int64
        drop_false_rows: defaults to False; if True, rows whose non-"index" columns
            are all falsy are dropped before merging
        saving_filename: defaults to None (nothing is saved); the merged result is
            written with ``DataFrame.to_json`` only if the name ends with ".json",
            any other filename is ignored

    Returns:
        pd.DataFrame: Result df of the keyword search with additional columns from the input df
        (inner join on "index", so only rows present in both frames are kept)

    """
    if drop_false_rows:
        has_any_hit = result_df.drop(columns=["index"]).any(axis=1)
        result_df = result_df[has_any_hit]
    # assign() returns a new frame, so the caller's result_df keeps its original
    # "index" dtype regardless of drop_false_rows.
    result_df = result_df.assign(index=result_df["index"].astype('int64'))
    # for every entry get the section and chapter from the input df and append to result df
    result_df = input_df.merge(result_df, on="index")
    if saving_filename and saving_filename.endswith(".json"):
        result_df.to_json(saving_filename)

    return result_df






if __name__ == '__main__':
    # Load the extracted rplan content and pass it through the negate keyword
    # search; the frame it returns feeds both searches below.
    input_df = negate_keyword_search(
        pd.read_json(rplan_content_path),
        negate_keyword_dict_path=rplan_negate_keyword_dict_path,
    )

    # Exact search first, then fuzzy search; each result is plotted right after
    # it has been computed.
    for keyword_search, plot_title in (
        (rplan_exact_keyword_search, "Exact Keyword Search Results"),
        (rplan_fuzzy_keyword_search, "Fuzzy Keyword Search Results"),
    ):
        search_result, found_keywords = keyword_search(input_df)
        plot_keyword_search_results(search_result, keyword_columns=found_keywords, title=plot_title)