Source code for src.features.textual_features.document_texts_creation.create_document_texts

import pandas as pd


def enrich_extracts_with_metadata(info_df: pd.DataFrame, text_df: pd.DataFrame) -> pd.DataFrame:
    """Join BP-metadata onto BP-text to produce the document_texts table.

    Two id columns are derived from ``text_df['filename']``:

    * ``document_id``: the filename with a trailing ``.pdf`` removed.
    * ``land_parcel_id``: the first run of digits found in ``document_id``
      (missing when the id contains no digits).

    The metadata columns ``name``, ``scanurl`` and ``plantyp`` are then
    left-joined from ``info_df`` by matching ``document_id`` against
    ``objectid``, so every row of ``text_df`` is kept even when no metadata
    matches.

    Args:
        info_df: df containing metadata; must provide the columns
            ``objectid``, ``name``, ``scanurl`` and ``plantyp``.
        text_df: df containing extracted text; must provide a string column
            ``filename``. It is not modified by this function.

    Returns:
        final_df: merged df holding all columns of ``text_df``, the two
        derived id columns, and the metadata columns renamed to
        ``land_parcel_name``, ``land_parcel_scanurl`` and
        ``Document Type Code``.
    """
    # create id columns based on 'filename'
    document_ids = text_df['filename'].str.replace(r'\.pdf$', '', regex=True)
    # expand=False yields a Series (single capture group) rather than a
    # one-column DataFrame.
    land_parcel_ids = document_ids.str.extract(r'(\d+)', expand=False)

    # assign() returns a new frame, so the caller's text_df is left untouched.
    enriched_df = text_df.assign(
        document_id=document_ids,
        land_parcel_id=land_parcel_ids,
    )

    # merge BP-metadata into BP-text based on objectid
    metadata_df = info_df[['objectid', 'name', 'scanurl', 'plantyp']]
    final_df = pd.merge(
        enriched_df,
        metadata_df,
        left_on='document_id',
        right_on='objectid',
        how='left',
    )

    # drop and rename columns; 'objectid' duplicates 'document_id' after the join
    final_df = final_df.drop(columns=['objectid']).rename(
        columns={
            'name': 'land_parcel_name',
            'scanurl': 'land_parcel_scanurl',
            'plantyp': 'Document Type Code',
        }
    )

    return final_df