Source code for data_pipeline.rplan_content_extraction.rplan_keyword_search

import json
import os.path

import pandas as pd
from loguru import logger

from features.textual_features.keyword_search.contextual_fuzzy_search import \
    search_df_for_best_matches_keyword_dict
from features.textual_features.keyword_search.exact_keyword_search import search_df_for_keywords
from utility.config_utils import get_data_path, get_source_path
from visualizations.rplan_visualization import plot_keyword_search_results

rplan_content_path = os.path.join(get_data_path(), "nrw", "extracted_rplan_content.json")

keyword_path = get_source_path() + "/data_pipeline/rplan_content_extraction/keywords"
rplan_exact_keyword_dict_path = os.path.join(keyword_path, "exact_keyword_dict_rplans.json")
rplan_fuzzy_keyword_dict_path = os.path.join(keyword_path, "fuzzy_keyword_dict_rplans.json")
rplan_negate_keyword_dict_path = os.path.join(keyword_path, "negate_keyword_dict_rplans.json")



[docs]
def rplan_exact_keyword_search(input_df: 'pd.DataFrame', save_path: str = None, drop_false_rows=False):
    """ Function to search for keywords in a df.

    This function uses excat matching to find the best matches for the keywords. It uses the extracted content from
    the rplan pdfs as input. The keywords are stored in a json file. It basically uses the search_df_for_keywords
    function from the contextual_exact_search module.


    Args:
        input_df: Input df to be searched for keywords
        save_path: defaults to None; if None, the result is not saved
        drop_false_rows: defaults to False; if True, rows with all False values are dropped

    Returns:
        pd.DataFrame: Result df of the keyword search
    """
    index_column_name, input_df, rplan_keywords, text_column_name = _prepare_rplan_df(input_df,
                                                                                      rplan_exact_keyword_dict_path)

    result_df = search_df_for_keywords(input_df,
                                       id_column_name=index_column_name,
                                       text_column_name=text_column_name,
                                       keyword_dict=rplan_keywords,
                                       boolean=True
                                       )

    result_df = save_rplan_keyword_search(input_df, result_df, drop_false_rows=drop_false_rows,
                                          saving_filename=save_path)
    return result_df, rplan_keywords.keys()




[docs]
def rplan_fuzzy_keyword_search(input_df: 'pd.DataFrame', save_path: str = None, drop_false_rows=False):
    """ Function to search for keywords in a df.

    This function uses fuzzy matching to find the best matches for the keywords. It uses the extracted content from
    the rplan pdfs as input. The keywords are stored in a json file. It basically uses the search_df_for_best_matches_keyword_dict
    function from the contextual_fuzzy_search module.

    Args:
        input_df: Input df to be searched for keywords
        save_path: defaults to None; if None, the result is not saved
        drop_false_rows: defaults to False; if True, rows with all False values are dropped

    Returns:
        pd.DataFrame: Result df of the keyword search
        rplan_keywords.keys(): List of keywords used for the search
    """
    index_column_name, input_df, rplan_keywords, text_column_name = _prepare_rplan_df(input_df,
                                                                                      rplan_fuzzy_keyword_dict_path)

    result_df = search_df_for_best_matches_keyword_dict(input_df=input_df,
                                                        id_column_name=index_column_name,
                                                        text_column_name=text_column_name,
                                                        keyword_dict=rplan_keywords,
                                                        default_threshold=70,
                                                        context_words=3)

    result_df = save_rplan_keyword_search(input_df, result_df,
                                          drop_false_rows=drop_false_rows,
                                          saving_filename=save_path)
    return result_df, rplan_keywords.keys()



def _prepare_rplan_df(input_df: 'pd.DataFrame',
                      keyword_dict_path):
    """ Function to prepare the input df for the keyword search.

    Args:
        input_df: Input df to be searched for keywords
        keyword_dict_path: Path to the keyword dict

    Returns:
        index_column_name: Name of the index column
        input_df: Input df to be searched for keywords
        rplan_keywords: Dict of relevant keywords
        text_column_name: Name of the column in the input df holding the relevant text

    """
    input_df = input_df.reset_index()  # add index column
    index_column_name = "index"
    text_column_name = "section"
    with open(keyword_dict_path) as f:
        rplan_keywords = json.load(f)
    return index_column_name, input_df, rplan_keywords, text_column_name


def save_rplan_keyword_search(input_df,
                              result_df,
                              drop_false_rows=False,
                              saving_filename: str = None):
    """ Function to save the result of the keyword search to a json file.


    Args:
        input_df: Input df to be searched for keywords
        result_df: Result df of the keyword search
        drop_false_rows: defaults to False; if True, rows with all False values are dropped
        saving_filename: defaults to "rplan_exact_keyword_search_result.json"; filename of the saved result

    Returns:
        pd.DataFrame: Result df of the keyword search with additional columns from the input df

    """
    if drop_false_rows:
        result_df = result_df[result_df.drop(columns=["index"]).any(axis=1)].copy()
    result_df["index"] = result_df["index"].astype('int64')
    # for every entry get the section and chapter from the input df and append to result df
    result_df = input_df.merge(result_df, on="index")
    if saving_filename and saving_filename.endswith(".json"):
        result_df.to_json(saving_filename)

    return result_df



[docs]
def negate_keyword_search(input_df: 'pd.DataFrame',
                          negate_keyword_dict_path: str,
                          keyword_column: str = 'section'):
    """ Function to negate the result of the keyword search.

    This function removes rows from the input df if the negate keywords are found in the text. It is a simple
    exact matching search.

    Args:
        input_df: Input df to be searched for keywords
        keyword_column: Name of the column in the input df holding the relevant text
        negate_keyword_dict_path: Path to the negate keyword dict

    Returns:
        pd.DataFrame: Result df of the keyword search with additional columns from the input df

    """
    with open(negate_keyword_dict_path) as f:
        negate_keywords = json.load(f)
    logger.info(f"Negate keywords: {negate_keywords}")
    tmp_len = len(input_df)
    for negate_keyword in negate_keywords:
        for index, row in input_df[[keyword_column]].iterrows():
            text = row["section"]
            if negate_keyword in text:
                # remove row
                input_df = input_df.drop(index)
    logger.info(f"Removed {tmp_len - len(input_df)} rows with negate keywords")
    return input_df



if __name__ == '__main__':
    input_df = pd.read_json(rplan_content_path)
    input_df = negate_keyword_search(input_df, negate_keyword_dict_path=rplan_negate_keyword_dict_path)

    exact_result, exact_keywords = rplan_exact_keyword_search(input_df)
    plot_keyword_search_results(exact_result, keyword_columns=exact_keywords, title="Exact Keyword Search Results")

    fuzzy_result, fuzzy_keywords = rplan_fuzzy_keyword_search(input_df)
    plot_keyword_search_results(fuzzy_result, keyword_columns=fuzzy_keywords, title="Fuzzy Keyword Search Results")