dssg_banner
# Import the data generation functions
from data_pipeline.rplan_content_extraction.rplan_utils import extract_text_and_save_to_txt_files
from data_pipeline.rplan_content_extraction.rplan_content_extractor import parse_rplan_directory
from data_pipeline.rplan_content_extraction.rplan_utils import parse_result_df

# Import the keyword search functions
from data_pipeline.rplan_content_extraction.rplan_keyword_search import rplan_exact_keyword_search
# Import the visualization function
from visualizations.rplan_visualization import plot_keyword_search_results

Regional plans

This notebook shows how to extract content from regional plans, i.e., how to parse the text from the PDFs and divide it into chapters/sections.

# Relative locations of the regional-plan (rplan) inputs and outputs used below.
RPLAN_PDF_DIR = "../data/nrw/rplan/raw/pdfs"  # directory with the source PDFs
RPLAN_TXT_DIR = "../data/nrw/rplan/raw/text"  # directory for the extracted TXT files
RPLAN_OUTPUT_PATH = "../data/nrw/rplan/features/rplan_content.json"  # JSON file with the parsed content

Step 1: Generate content

# Pull the text out of the PDFs in RPLAN_PDF_DIR and store it as TXT files
# in RPLAN_TXT_DIR, which the parsing step below reads from.
extract_text_and_save_to_txt_files(
    pdf_dir_path=RPLAN_PDF_DIR,
    txt_dir_path=RPLAN_TXT_DIR,
)

# Parse the extracted TXT files and post-process the result in one step;
# `input_df` ends up holding the output of parse_result_df.
input_df = parse_result_df(
    df=parse_rplan_directory(
        txt_dir_path=RPLAN_TXT_DIR,
        json_output_path=RPLAN_OUTPUT_PATH,
    )
)

# Persist the post-processed frame as JSON.
# NOTE(review): RPLAN_OUTPUT_PATH is also handed to parse_rplan_directory as
# json_output_path; if that function writes there, this call overwrites its
# output -- confirm that this is intended.
input_df.to_json(RPLAN_OUTPUT_PATH)