Source code for src.features.textual_features.document_texts_creation.create_document_texts

import pandas as pd



[docs]
def enrich_extracts_with_metadata(info_df: pd.DataFrame,
                                  text_df: pd.DataFrame) -> pd.DataFrame:
    """Join BP-metadata onto BP-text to produce the document_texts table.

    Two id columns are derived from ``text_df['filename']``:

    * ``document_id``: the filename with a trailing ``.pdf`` removed.
    * ``land_parcel_id``: the first run of digits found in ``document_id``
      (missing when the id contains no digit).

    The metadata columns ``name``, ``scanurl`` and ``plantyp`` are then
    left-joined on ``document_id == objectid`` and renamed to
    ``land_parcel_name``, ``land_parcel_scanurl`` and ``Document Type Code``.
    Rows of ``text_df`` without a matching ``objectid`` are kept, with
    missing values in the metadata columns.

    Neither input frame is modified.

    Args:
        info_df: df containing metadata; must provide the columns
            ``objectid``, ``name``, ``scanurl`` and ``plantyp``.
        text_df: df containing extracted text; must provide a string
            column ``filename``.

    Returns:
        final_df: merged df holding every column of ``text_df`` plus
        ``document_id``, ``land_parcel_id`` and the renamed metadata columns.

    Raises:
        KeyError: if a required column is missing from either input.
    """
    # create id columns based on 'filename'
    document_ids = text_df['filename'].str.replace(r'\.pdf$', '', regex=True)

    # `assign` returns a new frame, so the caller's text_df is left untouched;
    # expand=False yields a Series for the single capture group.
    enriched_text_df = text_df.assign(
        document_id=document_ids,
        land_parcel_id=document_ids.str.extract(r'(\d+)', expand=False),
    )

    # merge BP-metadata into BP-text based on objectid
    # NOTE(review): assumes info_df['objectid'] holds strings comparable to
    # document_id; confirm against the metadata loader.
    metadata_columns = ['objectid', 'name', 'scanurl', 'plantyp']
    final_df = pd.merge(enriched_text_df, info_df[metadata_columns],
                        left_on='document_id',
                        right_on='objectid',
                        how='left')

    # drop the redundant join key and rename the metadata columns
    final_df = final_df.drop(columns=['objectid']).rename(
        columns={
            'name': 'land_parcel_name',
            'scanurl': 'land_parcel_scanurl',
            'plantyp': 'Document Type Code',
        }
    )
    return final_df