# necessary imports
import json
import os
import pandas as pd
from features.textual_features.keyword_search.exact_keyword_search import search_df_for_keywords
# Show full cell contents and every row when a data frame is displayed,
# so that long keyword lists are not truncated in the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
Exact keyword search for paragraphs from BauNVO & BauGB
Prepare data
Change the folder path in the code block below to read in the data.
Specify the relevant column names. The function used below expects the input data frame to have (at least) two columns: one id column and one content column. Here, the columns are called
`filename` and `content`. If they are named differently, change the column names in the code below.
# Locations of the raw text input and of the keyword-search output,
# both given relative to the current working directory.
INPUT_FILE_PATH = os.path.join(
    "..", "data", "nrw", "bplan",
    "raw", "text", "bp_text.csv",
)
OUTPUT_FILE_PATH = os.path.join(
    "..", "data", "nrw", "bplan",
    "features", "keywords", "exact_search", "exact_search.csv",
)
# Names of the id column and the text column of the input data frame.
ID_COLUMN = "filename"
TEXT_COLUMN = "content"
# Load the input table. The column names are supplied explicitly, so no line
# of the file is consumed as a header (header=None is what pandas applies
# whenever `names` is passed without `header`).
input_df = pd.read_csv(
    INPUT_FILE_PATH,
    header=None,
    names=[ID_COLUMN, TEXT_COLUMN],
)
Define keyword dictionary
Keywords are specified in a separate json file so that the exact keyword search can easily be applied to different sets of keywords, simply by reading in the relevant dictionary. The dictionary maps each keyword category (e.g. baunvo-1) to one or more keywords; a match on any of them counts as covering the category (e.g., “§1 baunvo”, “1 baunvo”, or “allgemeine vorschriften für bauflächen und baugebiete”).
# Load the category -> keywords dictionary used by the exact keyword search.
# The encoding is named explicitly: the keywords contain German umlauts, and
# without it open() falls back to the platform's locale encoding.
with open('features/textual_features/keyword_search/keyword_dict_exact.json', encoding='utf-8') as f:
    BAUNVO_KEYWORDS = json.load(f)
Apply function
Exact keyword matching based on input dictionary, returns df showing which keyword appeared in each pdf per category.
# Run the exact keyword search over all documents. Each category column holds
# the keywords found in that file (None where no keyword of the category hit).
result_df = search_df_for_keywords(
    input_df=input_df,
    id_column_name=ID_COLUMN,
    text_column_name=TEXT_COLUMN,
    keyword_dict=BAUNVO_KEYWORDS,
)
# Preview the first 30 rows.
result_df.head(30)
| | filename | baunvo-1 | baunvo-2 | baunvo-3 | baunvo-4 | baunvo-4a | baunvo-5 | baunvo-5a | baunvo-6 | baunvo-6a | ... | baunvo-17 | baunvo-18 | baunvo-19 | baunvo-20 | baunvo-21 | baunvo-21a | 13b | hq100 | hqhäufig | hqextrem |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 116995_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 1 | 116995_10.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 2 | 116995_2.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 3 | 116995_4.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | [grz] | None | None | None | None | None | None | None |
| 4 | 116995_6.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 5 | 116995_8.pdf | [1 baunvo] | [2 baunvo] | [3 baunvo, reine wohngebiete] | [4 baunvo, allgemeine wohngebiete] | [besondere wohngebiete] | [5 baunvo] | None | [6 baunvo] | None | ... | [17 baunvo] | [18 baunvo] | [19 baunvo, grundflächenzahl, grz] | [20 baunvo, vollgeschosse, gfz] | [21 baunvo] | None | None | None | None | None |
| 6 | 1423897.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 7 | 1427478.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 8 | 1427479.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 9 | 1427480.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 10 | 1427481.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 11 | 1427482.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 12 | 1427483.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 13 | 1434116.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 14 | 1691730_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 15 | 1691731_1.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 16 | 1691739_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 17 | 1691739_1.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 18 | 1691739_2.pdf | None | None | None | None | [besondere wohngebiete] | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 19 | 1691740_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 20 | 1691740_1.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 21 | 1691740_2.pdf | [1 baunvo] | [2 baunvo] | [3 baunvo] | [4 baunvo] | None | [5 baunvo] | None | [6 baunvo] | None | ... | [17 baunvo] | [18 baunvo] | [19 baunvo] | [20 baunvo] | [21 baunvo] | None | None | None | None | None |
| 22 | 1691744_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 23 | 1691744_1.pdf | [1 baunvo] | [2 baunvo] | [3 baunvo] | [4 baunvo] | None | [5 baunvo] | None | [6 baunvo] | None | ... | [17 baunvo] | [18 baunvo] | [19 baunvo] | [20 baunvo] | [21 baunvo] | None | None | None | None | None |
| 24 | 1691744_2.pdf | [1 baunvo] | [2 baunvo] | [3 baunvo] | [4 baunvo] | None | [5 baunvo] | None | [6 baunvo] | None | ... | None | None | None | None | None | None | None | None | None | None |
| 25 | 1691766_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 26 | 1691766_1.pdf | [1 baunvo] | [2 baunvo] | [3 baunvo] | [4 baunvo] | None | [5 baunvo] | None | [6 baunvo] | None | ... | [17 baunvo] | [18 baunvo] | [grundflächenzahl, grz] | [20 baunvo, vollgeschosse] | None | [stellplätze] | None | None | None | None |
| 27 | 1691782_0.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 28 | 1691782_1.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
| 29 | 1691782_2.pdf | None | None | None | None | None | None | None | None | None | ... | None | None | None | None | None | None | None | None | None | None |
30 rows × 31 columns
Check results
To inspect the keyword coverage across all files:
# Non-missing cells per column: for each category, the number of files in
# which at least one of its keywords was found.
result_df.count(axis="index")
filename 21573
baunvo-1 7086
baunvo-2 7217
baunvo-3 7259
baunvo-4 7542
baunvo-4a 2758
baunvo-5 7152
baunvo-5a 499
baunvo-6 7370
baunvo-6a 619
baunvo-7 6819
baunvo-8 7186
baunvo-9 7105
baunvo-10 6658
baunvo-11 6415
baunvo-12 6864
baunvo-13 5701
baunvo-13a 1985
baunvo-14 6182
baunvo-15 6167
baunvo-16 6239
baunvo-17 4991
baunvo-18 6409
baunvo-19 6920
baunvo-20 6897
baunvo-21 5249
baunvo-21a 6129
13b 224
hq100 379
hqhäufig 151
hqextrem 242
dtype: int64
Also, for a given pdf, one can extract all usage options listed in the bplan:
# Print the complete row labelled 5 (all columns) as a nested Python list.
print(result_df.loc[[5], :].values.tolist())
[['116995_8.pdf', ['1 baunvo'], ['2 baunvo'], ['3 baunvo', 'reine wohngebiete'], ['4 baunvo', 'allgemeine wohngebiete'], ['besondere wohngebiete'], ['5 baunvo'], None, ['6 baunvo'], None, ['7 baunvo'], ['8 baunvo'], ['9 baunvo'], ['10 baunvo'], ['11 baunvo'], ['12 baunvo'], ['13 baunvo'], None, ['14 baunvo'], ['15 baunvo'], ['16 baunvo'], ['17 baunvo'], ['18 baunvo'], ['19 baunvo', 'grundflächenzahl', 'grz'], ['20 baunvo', 'vollgeschosse', 'gfz'], ['21 baunvo'], None, None, None, None, None]]
Transform to Boolean
For subsequent analysis, Boolean values may be preferable. To get them, set the
optional argument `boolean=True`. Instead of an overview of
all keyword hits per category, a dataframe is then returned that shows
whether each category was covered or not.
# Same search as before, but with boolean=True to request the Boolean
# (covered / not covered) variant of the result.
boolean_result_df = search_df_for_keywords(
    input_df=input_df,
    id_column_name=ID_COLUMN,
    text_column_name=TEXT_COLUMN,
    keyword_dict=BAUNVO_KEYWORDS,
    boolean=True,
)
Write results to csv
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a bare file name has no directory part to create).
output_dir = os.path.dirname(OUTPUT_FILE_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# Write the keyword hits with a header row and without the index column.
result_df.to_csv(OUTPUT_FILE_PATH, header=True, index=False)