Source code for src.data_pipeline.pdf_scraper.tika_pdf_scraper

import os
import pandas as pd
import random

from loguru import logger
from tika import parser


[docs] def pdf_parser_from_path(pdf_path: str) -> dict: """Parse pdf and extract content and metadata Args: pdf_path: path as string Returns: parsed: dictionary containing content and metadata """ # try parsing with extended timeout, extract and store relevant info try: parsed = parser.from_file(pdf_path, requestOptions={'timeout': 200}) return { "content": parsed["content"], "metadata": parsed["metadata"] } # store error message if unsuccessful except Exception as e: logger.info(f"An error occurred: {e}") return { "content": f"Error parsing PDF: {e}", "metadata": None }
[docs] def pdf_parser_from_folder(folder_path: str, sample_size: int =None) -> pd.DataFrame: """Apply pdf_parser_from_path function to full folder Args: folder_path: full input folder path as string Returns: df: df containing filename, content and metadata per pdf """ # get all filenames pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf') or file.lower().endswith('.png')] # if a sample_size is specified, get random sample of specified size if sample_size is not None: # only if enough samples are available if sample_size <= len(pdf_files): pdf_files = random.sample(pdf_files, sample_size) # otherwise raise value error else: raise ValueError(f"Sample of {sample_size} larger than folder contents of {len(pdf_files)}") # define empty df to store parsed result parsed_data = [] # iterate over all files in folder for pdf_file in pdf_files: logger.info(f"Parsing file: {pdf_file}") pdf_path = os.path.join(folder_path, pdf_file) # apply parser function parsed_info = pdf_parser_from_path(pdf_path) parsed_data.append({ "filename": pdf_file, "content": parsed_info["content"], "metadata": parsed_info["metadata"] }) logger.info("Parsing done.") # save as df and convert 'content' to string df = pd.DataFrame(parsed_data) return df