Sean Pedrick-Case committed
Commit 5bdafb4 · unverified · 2 Parent(s): 8fa83d7 9394293

Merge pull request #1 from seanpedrick-case/dev


Deals with multi-line addresses. Better progress tracking. Returns the original address in the results. Updated packages.

.dockerignore CHANGED
@@ -21,4 +21,7 @@ usage/
21
  logs/
22
  feedback/
23
  input/
24
- output/
 
 
 
 
21
  logs/
22
  feedback/
23
  input/
24
+ output/
25
+ cat_to_idx.txt
26
+ vocab.txt
27
+ word_to_index.txt
.gitignore CHANGED
@@ -21,4 +21,7 @@ usage/*
21
  logs/*
22
  feedback/*
23
  input/*
24
- output/*
 
 
 
 
21
  logs/*
22
  feedback/*
23
  input/*
24
+ output/*
25
+ cat_to_idx.txt
26
+ vocab.txt
27
+ word_to_index.txt
app.py CHANGED
@@ -55,45 +55,43 @@ with block:
55
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
56
  access_s3_logs_loc_state = gr.State(access_logs_folder)
57
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
58
- usage_s3_logs_loc_state = gr.State(usage_logs_folder)
59
 
60
  gr.Markdown(
61
  """
62
  # Address matcher
63
- Match single or multiple addresses to the reference address file of your choice. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already.The neural network component works with LLPG files in the LPI format.
64
 
65
- The tool can accept csv, xlsx (with one sheet), and parquet files. You
66
- need to specify the address columns of the file to match specifically in the address column area with postcode at the end.
67
 
68
- Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.
69
- """)
70
 
71
  with gr.Tab("Match addresses"):
72
-
73
- with gr.Accordion("Quick check - single address", open = True):
74
- in_text = gr.Textbox(label="Input a single address as text")
75
 
76
- with gr.Accordion("I have multiple addresses", open = False):
77
  in_file = gr.File(label="Input addresses from file", file_count= "multiple")
78
  in_colnames = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
79
  in_existing = gr.Dropdown(value=[], choices=[], multiselect=False, label="Select columns that indicate existing matches.")
 
 
 
80
 
81
 
82
  gr.Markdown(
83
  """
84
  ## Choose reference file / call API
85
- Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
86
- open 'Custom reference file format or join columns' below.
87
  """)
88
 
89
- with gr.Accordion("Use Addressbase API (instead of reference file)", open = True):
90
  in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
91
  in_api_key = gr.Textbox(label="Addressbase API key", type='password', value = ADDRESSBASE_API_KEY)
92
 
93
- with gr.Accordion("Match against reference file of addresses", open = False):
 
94
  in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
95
 
96
- with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
97
  in_refcol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
98
  in_joincol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")
99
 
 
55
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
56
  access_s3_logs_loc_state = gr.State(access_logs_folder)
57
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
58
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)
59
 
60
  gr.Markdown(
61
  """
62
  # Address matcher
63
+ Match single or multiple addresses to the reference address file of your choice. *Please note that a postcode column is required for matching*. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already. The neural network component works with LLPG files in the LPI format.
64
 
65
+ The tool can accept csv, xlsx (with one sheet), and parquet files. You need to specify the address columns of the file to match in the address column area, with the postcode column at the end.
 
66
 
67
+ Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.""")
 
68
 
69
  with gr.Tab("Match addresses"):
 
 
 
70
 
71
+ with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open = True):
72
  in_file = gr.File(label="Input addresses from file", file_count= "multiple")
73
  in_colnames = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
74
  in_existing = gr.Dropdown(value=[], choices=[], multiselect=False, label="Select columns that indicate existing matches.")
75
+
76
+ with gr.Accordion("Quick check - single address", open = False):
77
+ in_text = gr.Textbox(label="Input a single address as text")
78
 
79
 
80
  gr.Markdown(
81
  """
82
  ## Choose reference file / call API
83
+ Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc. This joins on the UPRN column. If any of these are different for you, open 'Custom reference file format or join columns' below.
 
84
  """)
85
 
86
+ with gr.Accordion("Use Addressbase API (instead of reference file)", open = False):
87
  in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
88
  in_api_key = gr.Textbox(label="Addressbase API key", type='password', value = ADDRESSBASE_API_KEY)
89
 
90
+
91
+ with gr.Accordion("Match against reference list of addresses in a CSV/XLSX/Parquet file", open = True):
92
  in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
93
 
94
+ with gr.Accordion("Custom reference file format or join columns (if not LLPG/Addressbase format with columns SaoText, SaoStartNumber etc.)", open = False):
95
  in_refcol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
96
  in_joincol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")
97
 
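The app.py changes above mostly reorder the input accordions: the multi-file upload is now the primary, open-by-default input, the single-address quick check starts collapsed, and the reference-file accordion opens by default instead of the API one. As a rough illustration of this Gradio pattern (a minimal sketch, not the app's actual layout or event wiring):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Match addresses"):
        # File-based input is the primary path, so it starts open
        with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open=True):
            in_file = gr.File(label="Input addresses from file", file_count="multiple")
            in_colnames = gr.Dropdown(value=[], choices=[], multiselect=True,
                                      label="Select columns that make up the address")
        # The single-address check is secondary, so it starts collapsed
        with gr.Accordion("Quick check - single address", open=False):
            in_text = gr.Textbox(label="Input a single address as text")

demo.launch()
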
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
  torch==2.7.1
2
  pandas==2.2.3
3
- rapidfuzz==3.8.1
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
- pyarrow==19.0.1
8
  openpyxl==3.1.2
9
- gradio==5.34.0
10
- boto3==1.38.37
11
  polars==0.20.19
12
  numpy==1.26.4
 
1
  torch==2.7.1
2
  pandas==2.2.3
3
+ rapidfuzz==3.13.0
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
+ pyarrow==21.0.0
8
  openpyxl==3.1.2
9
+ gradio==5.34.2
10
+ boto3==1.40.5
11
  polars==0.20.19
12
  numpy==1.26.4
requirements_aws.txt CHANGED
@@ -1,11 +1,11 @@
1
- pandas==2.2.3
2
- rapidfuzz==3.8.1
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
- pyarrow==19.0.0
7
  openpyxl==3.1.2
8
- gradio==5.34.0
9
- boto3==1.38.37
10
  polars==0.20.19
11
  numpy==1.26.4
 
1
+ pandas==2.3.0
2
+ rapidfuzz==3.13.0
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
+ pyarrow==21.0.0
7
  openpyxl==3.1.2
8
+ gradio==5.34.2
9
+ boto3==1.40.5
10
  polars==0.20.19
11
  numpy==1.26.4
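Both requirements files bump the same pins (rapidfuzz to 3.13.0, pyarrow to 21.0.0, gradio to 5.34.2, boto3 to 1.40.5; the AWS file also moves pandas to 2.3.0). A quick way to confirm an environment matches the pins is the standard library's importlib.metadata (a generic check, not part of this repo):

from importlib.metadata import version

# Pins taken from requirements.txt above; adjust if the file changes
expected = {"rapidfuzz": "3.13.0", "pyarrow": "21.0.0", "gradio": "5.34.2", "boto3": "1.40.5"}

for pkg, want in expected.items():
    got = version(pkg)  # raises PackageNotFoundError if the package is missing
    print(f"{pkg}=={got}", "OK" if got == want else f"MISMATCH (expected {want})")
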
tools/fuzzy_match.py CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Type
4
  from datetime import datetime
5
  from rapidfuzz import fuzz, process
6
  import gradio as gr
 
7
 
8
  PandasDataFrame = Type[pd.DataFrame]
9
  PandasSeries = Type[pd.Series]
@@ -51,7 +52,7 @@ def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:Panda
51
  return df
52
 
53
  def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
54
- search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
55
  '''
56
  Matches by Series values; for example, the index is the postcode and
57
  the values are addresses. The search space is reduced by comparing only addresses in reference_address_series that share the same postcode.
@@ -140,7 +141,7 @@ def string_match_by_post_code_multiple(match_address_series:PandasSeries, refere
140
 
141
  unique_postcodes = pd.unique(match_address_df['postcode_search'])
142
 
143
- for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
144
 
145
  postcode_match_list = [postcode_match]
146
  search_indexes = pd.Series()
@@ -177,7 +178,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
177
 
178
  ## Diagnostics
179
 
180
- diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
181
  matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
182
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
183
 
@@ -308,7 +309,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
308
  diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")
309
 
310
  if 'wratio_score' not in diag_shortlist.columns:
311
- diag_shortlist['wratio_score'] = ''
312
 
313
  # Order by best score
314
  diag_shortlist = diag_shortlist.sort_values([
@@ -317,7 +318,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
317
 
318
  return diag_shortlist
319
 
320
- def refine_export_results(results_df:PandasDataFrame,
321
  matched_df:PandasDataFrame,
322
  ref_list_df:PandasDataFrame,
323
  matched_col="fuzzy_match_search_address",
@@ -340,7 +341,10 @@ def refine_export_results(results_df:PandasDataFrame,
340
  results_df = results_df[results_df[matched_col] !=0 ]
341
 
342
  ### Join property number and flat/room number etc. onto results_df
343
- ref_list_df["ref_index"] = ref_list_df.index
 
 
 
344
  ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
345
  ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})
346
 
@@ -351,7 +355,7 @@ def refine_export_results(results_df:PandasDataFrame,
351
  matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
352
  matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
353
 
354
- results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
355
 
356
  # Choose your best matches from the list of options
357
  diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)
@@ -381,12 +385,15 @@ def refine_export_results(results_df:PandasDataFrame,
381
 
382
  diag_shortlist = diag_shortlist[match_results_cols]
383
 
 
 
 
384
  # Choose best match from the shortlist that has been ordered according to score descending
385
  diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
386
 
387
  return diag_shortlist, diag_best_match
388
 
389
- def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
390
  '''
391
  Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
392
  '''
@@ -402,7 +409,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
402
 
403
  ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
404
  ref_df_after_stand_cols.extend(new_join_col)
405
-
406
 
407
  if (search_df_key_field == "index"):
408
  # Check index is int
@@ -420,8 +426,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
420
  if "Matched with reference address_y" in results_for_orig_df_join.columns:
421
  results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
422
 
423
- #results_for_orig_df_join['Matched with reference address'] = results_for_orig_df_join['Matched with reference address'].fillna(results_for_orig_df_join['Matched with reference address_y']).infer_objects(copy=False)
424
-
425
  if "Reference file_y" in results_for_orig_df_join.columns:
426
  results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
427
 
@@ -429,8 +433,13 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
429
  results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)
430
 
431
  # Drop columns that aren't useful
432
- results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2", "full_address",
433
  "address_stand", "property_number","prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"], axis = 1, errors = "ignore")
 
 
 
 
 
434
 
435
  # Replace blanks with NA, fix UPRNs
436
  results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)
@@ -439,6 +448,7 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
439
 
440
  # Replace cells with only 'nan' with blank
441
  results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
442
-
 
443
 
444
  return results_for_orig_df_join
 
4
  from datetime import datetime
5
  from rapidfuzz import fuzz, process
6
  import gradio as gr
7
+ from tqdm import tqdm
8
 
9
  PandasDataFrame = Type[pd.DataFrame]
10
  PandasSeries = Type[pd.Series]
 
52
  return df
53
 
54
  def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
55
+ search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress(track_tqdm=True))-> MatchedResults:
56
  '''
57
  Matches by Series values; for example, the index is the postcode and
58
  the values are addresses. The search space is reduced by comparing only addresses in reference_address_series that share the same postcode.
 
141
 
142
  unique_postcodes = pd.unique(match_address_df['postcode_search'])
143
 
144
+ for postcode_match in tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
145
 
146
  postcode_match_list = [postcode_match]
147
  search_indexes = pd.Series()
 
178
 
179
  ## Diagnostics
180
 
181
+ diag_shortlist, diag_best_match = create_diagnostic_results(results_df=results,\
182
  matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
183
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
184
 
 
309
  diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")
310
 
311
  if 'wratio_score' not in diag_shortlist.columns:
312
+ diag_shortlist['wratio_score'] = None
313
 
314
  # Order by best score
315
  diag_shortlist = diag_shortlist.sort_values([
 
318
 
319
  return diag_shortlist
320
 
321
+ def create_diagnostic_results(results_df:PandasDataFrame,
322
  matched_df:PandasDataFrame,
323
  ref_list_df:PandasDataFrame,
324
  matched_col="fuzzy_match_search_address",
 
341
  results_df = results_df[results_df[matched_col] !=0 ]
342
 
343
  ### Join property number and flat/room number etc. onto results_df
344
+ if 'ref_index' not in ref_list_df.columns:
345
+ print("Existing ref_index column not found")
346
+ ref_list_df["ref_index"] = ref_list_df.index
347
+
348
  ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
349
  ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})
350
 
 
355
  matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
356
  matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
357
 
358
+ results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
359
 
360
  # Choose your best matches from the list of options
361
  diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)
 
385
 
386
  diag_shortlist = diag_shortlist[match_results_cols]
387
 
388
+ diag_shortlist["ref_index"] = diag_shortlist["ref_index"].astype(int, errors="ignore")
389
+ diag_shortlist["wratio_score"] = diag_shortlist["wratio_score"].astype(float, errors="ignore")
390
+
391
  # Choose best match from the shortlist that has been ordered according to score descending
392
  diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
393
 
394
  return diag_shortlist, diag_best_match
395
 
396
+ def create_results_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
397
  '''
398
  Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
399
  '''
 
409
 
410
  ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
411
  ref_df_after_stand_cols.extend(new_join_col)
 
412
 
413
  if (search_df_key_field == "index"):
414
  # Check index is int
 
426
  if "Matched with reference address_y" in results_for_orig_df_join.columns:
427
  results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
428
 
 
 
429
  if "Reference file_y" in results_for_orig_df_join.columns:
430
  results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
431
 
 
433
  results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)
434
 
435
  # Drop columns that aren't useful
436
+ results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2",
437
  "address_stand", "property_number","prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"], axis = 1, errors = "ignore")
438
+
439
+ results_for_orig_df_join.rename(columns={"full_address":"Search data address"}, inplace = True)
440
+
441
+ results_for_orig_df_join["index"] = results_for_orig_df_join["index"].astype(int, errors="ignore")
442
+ results_for_orig_df_join["ref_index"] = results_for_orig_df_join["ref_index"].astype(int, errors="ignore")
443
 
444
  # Replace blanks with NA, fix UPRNs
445
  results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)
 
448
 
449
  # Replace cells with only 'nan' with blank
450
  results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
451
+
452
+ results_for_orig_df_join.to_csv("output/results_for_orig_df_join.csv")
453
 
454
  return results_for_orig_df_join
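A recurring change in this file (and in tools/matcher_funcs.py below) is swapping Gradio's progress.tqdm(...) wrapper for a plain tqdm(...) loop, while declaring the progress argument as gr.Progress(track_tqdm=True). With track_tqdm enabled, Gradio intercepts tqdm iterations started inside the function and mirrors them in the UI, so the same loop reports progress both in the console and in the browser. A minimal, self-contained sketch of the pattern (illustrative function and event wiring, not the repo's):

import time
import gradio as gr
from tqdm import tqdm

def match_postcodes(n_postcodes: float, progress=gr.Progress(track_tqdm=True)) -> str:
    # Gradio picks up this plain tqdm loop because track_tqdm=True
    for _ in tqdm(range(int(n_postcodes)), desc="Fuzzy matching", unit="postcodes"):
        time.sleep(0.01)  # stand-in for per-postcode matching work
    return f"Matched {int(n_postcodes)} postcodes"

with gr.Blocks() as demo:
    n = gr.Number(value=100, label="Number of postcodes")
    out = gr.Textbox(label="Status")
    gr.Button("Run").click(match_postcodes, inputs=n, outputs=out)

demo.launch()
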
tools/matcher_funcs.py CHANGED
@@ -9,6 +9,7 @@ import math
9
  from datetime import datetime
10
  import copy
11
  import gradio as gr
 
12
 
13
  PandasDataFrame = Type[pd.DataFrame]
14
  PandasSeries = Type[pd.Series]
@@ -26,7 +27,7 @@ run_standardise = True
26
 
27
  from tools.constants import *
28
  from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
29
- from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
30
  from tools.standardise import standardise_wrapper_func
31
 
32
  # Neural network functions
@@ -69,14 +70,11 @@ def read_file(filename:str) -> PandasDataFrame:
69
  def get_file_name(in_name: str) -> str:
70
  """Get the name of a file from a string, handling both Windows and Unix paths."""
71
 
72
- print("in_name: ", in_name)
73
  match = re.search(rf'{re.escape(os.sep)}(?!.*{re.escape(os.sep)})(.*)', in_name)
74
  if match:
75
  matched_result = match.group(1)
76
  else:
77
  matched_result = None
78
-
79
- print("Matched result: ", matched_result)
80
 
81
  return matched_result
82
 
@@ -108,7 +106,7 @@ def filter_not_matched(
108
 
109
  return search_df.iloc[np.where(~matched)[0]]
110
 
111
- def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
112
 
113
  final_api_output_file_name = ""
114
 
@@ -204,7 +202,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
204
  loop_df = Matcher.ref_df
205
  loop_list = [Matcher.ref_df]
206
 
207
- for address in progress.tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
208
  print("Query number: " + str(i+1), "with address: ", address)
209
 
210
  api_search_index = api_search_df.index
@@ -368,7 +366,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
368
 
369
  return Matcher, final_api_output_file_name
370
 
371
- def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
372
  '''
373
  Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
374
  '''
@@ -577,7 +575,7 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
577
  Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
578
  Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
579
  Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
580
- Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
581
 
582
  Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
583
  Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
@@ -657,43 +655,91 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
657
 
658
  return Matcher
659
 
660
- def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
661
- '''
662
- Load in user inputs from the Gradio interface. Convert all input types (single address, or csv input) into standardised data format that can be used downstream for the fuzzy matching.
663
- '''
664
- final_api_output_file_name = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
- today_rev = datetime.now().strftime("%Y%m%d")
667
 
668
- # Abort flag for if it's not even possible to attempt the first stage of the match for some reason
669
- Matcher.abort_flag = False
670
 
671
- ### ref_df FILES ###
672
- # If not an API call, run this first
673
- if not in_api:
674
- Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
675
 
676
- ### MATCH/SEARCH FILES ###
677
- # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
678
- Matcher = load_match_data_and_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
 
679
 
680
- # If an API call, ref_df data is loaded after
681
- if in_api:
682
- Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
683
-
684
- print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
685
- print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
686
-
687
- Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
688
- Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
689
-
690
- Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
691
- Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
692
 
693
- return Matcher, final_api_output_file_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
 
695
  # Run whole matcher process
696
- def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
697
  '''
698
  Split search and reference data into batches. Loop and run through the match script for each batch of data.
699
  '''
@@ -722,7 +768,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
722
  # Polars implementation not yet finalised
723
  #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
724
  #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
725
-
726
 
727
  # Prepare all search addresses
728
  if type(InitMatch.search_df) == str:
@@ -739,7 +784,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
739
  # Initial preparation of reference addresses
740
  InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
741
 
742
-
743
  # Polars implementation - not finalised
744
  #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
745
  #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
@@ -747,8 +791,10 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
747
  # Standardise addresses
748
  # Standardise - minimal
749
 
750
-
751
  tic = time.perf_counter()
 
 
 
752
  InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
753
  InitMatch.search_df_cleaned.copy(),
754
  InitMatch.ref_df_cleaned.copy(),
@@ -759,6 +805,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
759
  toc = time.perf_counter()
760
  print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
761
 
 
 
762
  # Standardise - full
763
  tic = time.perf_counter()
764
  InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
@@ -784,8 +832,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
784
  n = 0
785
  number_of_batches = range_df.shape[0]
786
 
787
- for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
788
- print("Running batch ", str(n+1))
789
 
790
  search_range = range_df.iloc[row]['search_range']
791
  ref_range = range_df.iloc[row]['ref_range']
@@ -830,6 +878,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
830
  # Remove any duplicates from reference df, prioritise successful matches
831
  OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
832
 
 
 
833
  overall_toc = time.perf_counter()
834
  time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
835
 
@@ -851,14 +901,13 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
851
  nnet_std_output = OutputMatch.match_results_output.copy()
852
  nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
853
 
854
- final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
855
-
856
-
857
 
858
  estimate_total_processing_time = sum_numbers_before_seconds(time_out)
859
  print("Estimated total processing time:", str(estimate_total_processing_time))
860
 
861
  output_files.extend([OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name])
 
862
  return final_summary, output_files, estimate_total_processing_time
863
 
864
  # Run a match run for a single batch
@@ -985,7 +1034,7 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
985
 
986
  return lengths_df
987
 
988
- def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
989
  '''
990
  Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
991
  '''
@@ -1074,7 +1123,7 @@ def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches
1074
  return summary_of_summaries, FuzzyNNetStdMatch
1075
 
1076
  # Overarching functions
1077
- def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
1078
 
1079
  today_rev = datetime.now().strftime("%Y%m%d")
1080
 
@@ -1152,20 +1201,24 @@ def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, f
1152
  return Matcher
1153
  else:
1154
  Matcher.match_results_output = match_results_output
1155
- Matcher.predict_df_nnet = predict_df_nnet
1156
-
1157
  # Save to file
1158
  Matcher.results_on_orig_df = results_on_orig_df
1159
-
1160
- print("Results output in orchestrate match run shape: ", Matcher.results_on_orig_df.shape)
1161
-
1162
- Matcher.summary = summary
1163
-
1164
  Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
1165
 
1166
  Matcher.match_outputs_name = output_folder + "diagnostics_" + file_stub + today_rev + ".csv"
1167
  Matcher.results_orig_df_name = output_folder + "results_" + file_stub + today_rev + ".csv"
1168
-
 
 
 
 
 
 
 
 
1169
  Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
1170
  Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
1171
 
@@ -1248,7 +1301,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
1248
  summary = create_match_summary(match_results_output, df_name)
1249
 
1250
  if type(search_df) != str:
1251
- results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
1252
  else: results_on_orig_df = match_results_output
1253
 
1254
  print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1283,11 +1336,10 @@ def full_fuzzy_match(search_df:PandasDataFrame,
1283
  summary = create_match_summary(match_results_output, df_name)
1284
 
1285
  if type(search_df) != str:
1286
- results_on_orig_df = join_to_orig_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
1287
  else: results_on_orig_df = match_results_output
1288
 
1289
- return diag_shortlist, diag_best_match,\
1290
- match_results_output, results_on_orig_df, summary, search_address_cols
1291
 
1292
  print("Starting the fuzzy match with street as blocker")
1293
 
@@ -1314,7 +1366,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
1314
  ### Join URPN back onto orig df
1315
 
1316
  if type(search_df) != str:
1317
- results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
1318
  else: results_on_orig_df = match_results_output
1319
 
1320
  print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1480,7 +1532,7 @@ def full_nn_match(ref_address_cols:List[str],
1480
  ### Join URPN back onto orig df
1481
 
1482
  if type(search_df) != str:
1483
- results_on_orig_df = join_to_orig_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
1484
  else: results_on_orig_df = match_results_output_final_three
1485
 
1486
  return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
@@ -1495,18 +1547,28 @@ def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame,
1495
  # If one of the dataframes is empty, break
1496
  if (orig_df.empty) & (new_df.empty):
1497
  return orig_df
1498
-
1499
 
 
 
 
 
 
 
1500
 
1501
- combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
 
 
 
 
1502
 
 
1503
 
1504
  # If no results were combined
1505
  if combined_std_not_matches.empty:
1506
  combined_std_not_matches[match_address_series] = False
1507
 
1508
- if "full_address" in combined_std_not_matches.columns:
1509
- combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
1510
  combined_std_not_matches["fuzzy_score"] = 0
1511
  return combined_std_not_matches
1512
 
@@ -1540,6 +1602,7 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
1540
  found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
1541
 
1542
  key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
 
1543
  rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
1544
  NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
1545
 
@@ -1565,11 +1628,13 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
1565
  NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
1566
 
1567
  # Drop any duplicates, prioritise any matches
 
 
 
1568
  NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
1569
 
1570
  NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
1571
- print(NewMatchClass.output_summary)
1572
-
1573
 
1574
  NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
1575
 
@@ -1580,8 +1645,17 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
1580
  NewMatchClass.results_orig_df_name = output_folder + "results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
1581
 
1582
  # Only keep essential columns
1583
- essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
1584
- essential_results_cols.extend(NewMatchClass.new_join_col)
 
 
 
 
 
 
 
 
 
1585
 
1586
  NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
1587
  NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
 
9
  from datetime import datetime
10
  import copy
11
  import gradio as gr
12
+ from tqdm import tqdm
13
 
14
  PandasDataFrame = Type[pd.DataFrame]
15
  PandasSeries = Type[pd.Series]
 
27
 
28
  from tools.constants import *
29
  from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
30
+ from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, create_results_df
31
  from tools.standardise import standardise_wrapper_func
32
 
33
  # Neural network functions
 
70
  def get_file_name(in_name: str) -> str:
71
  """Get the name of a file from a string, handling both Windows and Unix paths."""
72
 
 
73
  match = re.search(rf'{re.escape(os.sep)}(?!.*{re.escape(os.sep)})(.*)', in_name)
74
  if match:
75
  matched_result = match.group(1)
76
  else:
77
  matched_result = None
 
 
78
 
79
  return matched_result
80
 
 
106
 
107
  return search_df.iloc[np.where(~matched)[0]]
108
 
109
+ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress(track_tqdm=True)):
110
 
111
  final_api_output_file_name = ""
112
 
 
202
  loop_df = Matcher.ref_df
203
  loop_list = [Matcher.ref_df]
204
 
205
+ for address in tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
206
  print("Query number: " + str(i+1), "with address: ", address)
207
 
208
  api_search_index = api_search_df.index
 
366
 
367
  return Matcher, final_api_output_file_name
368
 
369
+ def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress(track_tqdm=True)):
370
  '''
371
  Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
372
  '''
 
575
  Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
576
  Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
577
  Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
578
+ #Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
579
 
580
  Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
581
  Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
 
655
 
656
  return Matcher
657
 
658
+ def load_matcher_data(
659
+ in_text: str,
660
+ in_file: str,
661
+ in_ref: str,
662
+ data_state: PandasDataFrame,
663
+ results_data_state: PandasDataFrame,
664
+ ref_data_state: PandasDataFrame,
665
+ in_colnames: list,
666
+ in_refcol: list,
667
+ in_joincol: list,
668
+ in_existing: list,
669
+ Matcher: MatcherClass,
670
+ in_api:str,
671
+ in_api_key: str
672
+ ) -> tuple:
673
+ """
674
+ Load and preprocess user inputs from the Gradio interface for address matching.
675
+
676
+ This function standardises all input types (single address string, file uploads, etc.) into a consistent data format
677
+ suitable for downstream fuzzy matching. It handles both search and reference data, including API-based reference data retrieval
678
+ if requested.
679
+
680
+ Args:
681
+ in_text (str): Single address input as text, if provided.
682
+ in_file: Uploaded file(s) containing addresses to match.
683
+ in_ref: Uploaded reference file(s) or None if using API.
684
+ data_state (PandasDataFrame): Current state of the search data.
685
+ results_data_state (PandasDataFrame): Current state of the results data.
686
+ ref_data_state (PandasDataFrame): Current state of the reference data.
687
+ in_colnames (list): List of column names that make up the address in the search data.
688
+ in_refcol (list): List of column names that make up the address in the reference data.
689
+ in_joincol (list): List of columns to join on between search and reference data.
690
+ in_existing (list): List of columns indicating existing matches.
691
+ Matcher (MatcherClass): Matcher object to store and process data.
692
+ in_api: Flag or value indicating whether to use the API for reference data.
693
+ in_api_key (str): API key for reference data retrieval, if applicable.
694
+
695
+ Returns:
696
+ tuple: (Matcher, final_api_output_file_name)
697
+ Matcher: The updated Matcher object with loaded and preprocessed data.
698
+ final_api_output_file_name (str): The filename of the reference data if loaded from API, else empty string.
699
+ """
700
 
701
+ final_api_output_file_name = ""
702
 
703
+ today_rev = datetime.now().strftime("%Y%m%d")
 
704
 
705
+ # Abort flag for if it's not even possible to attempt the first stage of the match for some reason
706
+ Matcher.abort_flag = False
 
 
707
 
708
+ ### ref_df FILES ###
709
+ # If not an API call, run this first
710
+ if not in_api:
711
+ Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
712
 
713
+ ### MATCH/SEARCH FILES ###
714
+ # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
715
+ Matcher = load_match_data_and_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
716
+
717
+ # If an API call, ref_df data is loaded after
718
+ if in_api:
719
+ Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
 
 
 
 
 
720
 
721
+ print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
722
+ print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
723
+
724
+ Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
725
+ Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
726
+
727
+ if "fuzzy_score" in Matcher.match_results_output.columns:
728
+ Matcher.match_results_output["fuzzy_score"] = (
729
+ pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
730
+ )
731
+ if "wratio_score" in Matcher.match_results_output.columns:
732
+ Matcher.match_results_output["wratio_score"] = (
733
+ pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
734
+ )
735
+
736
+ Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
737
+ Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
738
+
739
+ return Matcher, final_api_output_file_name
740
 
741
  # Run whole matcher process
742
+ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress(track_tqdm=True)):
743
  '''
744
  Split search and reference data into batches. Loop and run through the match script for each batch of data.
745
  '''
 
768
  # Polars implementation not yet finalised
769
  #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
770
  #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
 
771
 
772
  # Prepare all search addresses
773
  if type(InitMatch.search_df) == str:
 
784
  # Initial preparation of reference addresses
785
  InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
786
 
 
787
  # Polars implementation - not finalised
788
  #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
789
  #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
 
791
  # Standardise addresses
792
  # Standardise - minimal
793
 
 
794
  tic = time.perf_counter()
795
+
796
+ progress(0.1, desc="Performing minimal standardisation")
797
+
798
  InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
799
  InitMatch.search_df_cleaned.copy(),
800
  InitMatch.ref_df_cleaned.copy(),
 
805
  toc = time.perf_counter()
806
  print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
807
 
808
+ progress(0.1, desc="Performing full standardisation")
809
+
810
  # Standardise - full
811
  tic = time.perf_counter()
812
  InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
 
832
  n = 0
833
  number_of_batches = range_df.shape[0]
834
 
835
+ for row in progress.tqdm(range(0,number_of_batches), desc= "Matching addresses in batches", unit="batches", total=number_of_batches):
836
+ print("Running batch", str(n+1))
837
 
838
  search_range = range_df.iloc[row]['search_range']
839
  ref_range = range_df.iloc[row]['ref_range']
 
878
  # Remove any duplicates from reference df, prioritise successful matches
879
  OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
880
 
881
+
882
+
883
  overall_toc = time.perf_counter()
884
  time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
885
 
 
901
  nnet_std_output = OutputMatch.match_results_output.copy()
902
  nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
903
 
904
+ final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
 
 
905
 
906
  estimate_total_processing_time = sum_numbers_before_seconds(time_out)
907
  print("Estimated total processing time:", str(estimate_total_processing_time))
908
 
909
  output_files.extend([OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name])
910
+
911
  return final_summary, output_files, estimate_total_processing_time
912
 
913
  # Run a match run for a single batch
 
1034
 
1035
  return lengths_df
1036
 
1037
+ def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress(track_tqdm=True)):
1038
  '''
1039
  Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
1040
  '''
 
1123
  return summary_of_summaries, FuzzyNNetStdMatch
1124
 
1125
  # Overarching functions
1126
+ def orchestrate_single_match_batch(Matcher:MatcherClass, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
1127
 
1128
  today_rev = datetime.now().strftime("%Y%m%d")
1129
 
 
1201
  return Matcher
1202
  else:
1203
  Matcher.match_results_output = match_results_output
1204
+ Matcher.predict_df_nnet = predict_df_nnet
1205
+
1206
  # Save to file
1207
  Matcher.results_on_orig_df = results_on_orig_df
1208
+ Matcher.summary = summary
 
 
 
 
1209
  Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
1210
 
1211
  Matcher.match_outputs_name = output_folder + "diagnostics_" + file_stub + today_rev + ".csv"
1212
  Matcher.results_orig_df_name = output_folder + "results_" + file_stub + today_rev + ".csv"
1213
+
1214
+ if "fuzzy_score" in Matcher.match_results_output.columns:
1215
+ Matcher.match_results_output["fuzzy_score"] = (
1216
+ pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
1217
+ )
1218
+ if "wratio_score" in Matcher.match_results_output.columns:
1219
+ Matcher.match_results_output["wratio_score"] = (
1220
+ pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
1221
+ )
1222
  Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
1223
  Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
1224
 
 
1301
  summary = create_match_summary(match_results_output, df_name)
1302
 
1303
  if type(search_df) != str:
1304
+ results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
1305
  else: results_on_orig_df = match_results_output
1306
 
1307
  print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
 
1336
  summary = create_match_summary(match_results_output, df_name)
1337
 
1338
  if type(search_df) != str:
1339
+ results_on_orig_df = create_results_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
1340
  else: results_on_orig_df = match_results_output
1341
 
1342
+ return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
 
1343
 
1344
  print("Starting the fuzzy match with street as blocker")
1345
 
 
1366
  ### Join URPN back onto orig df
1367
 
1368
  if type(search_df) != str:
1369
+ results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
1370
  else: results_on_orig_df = match_results_output
1371
 
1372
  print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
 
1532
  ### Join URPN back onto orig df
1533
 
1534
  if type(search_df) != str:
1535
+ results_on_orig_df = create_results_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
1536
  else: results_on_orig_df = match_results_output_final_three
1537
 
1538
  return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
 
1547
  # If one of the dataframes is empty, break
1548
  if (orig_df.empty) & (new_df.empty):
1549
  return orig_df
 
1550
 
1551
+ # Ensure that the original search address is returned in the results
1552
+ if "Search data address" not in orig_df.columns:
1553
+ if "search_orig_address" in orig_df.columns:
1554
+ orig_df["Search data address"] = orig_df["search_orig_address"]
1555
+ elif "address_cols_joined" in orig_df.columns:
1556
+ orig_df["Search data address"] = orig_df["address_cols_joined"]
1557
 
1558
+ if "Search data address" not in new_df.columns:
1559
+ if "search_orig_address" in new_df.columns:
1560
+ new_df["Search data address"] = new_df["search_orig_address"]
1561
+ elif "address_cols_joined" in new_df.columns:
1562
+ new_df["Search data address"] = new_df["address_cols_joined"]
1563
 
1564
+ combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
1565
 
1566
  # If no results were combined
1567
  if combined_std_not_matches.empty:
1568
  combined_std_not_matches[match_address_series] = False
1569
 
1570
+ #if "full_address" in combined_std_not_matches.columns:
1571
+ # combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
1572
  combined_std_not_matches["fuzzy_score"] = 0
1573
  return combined_std_not_matches
1574
 
 
1602
  found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
1603
 
1604
  key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
1605
+
1606
  rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
1607
  NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
1608
 
 
1628
  NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
1629
 
1630
  # Drop any duplicates, prioritise any matches
1631
+ NewMatchClass.results_on_orig_df["index"] = NewMatchClass.results_on_orig_df["index"].astype(int, errors="ignore")
1632
+ NewMatchClass.results_on_orig_df["ref_index"] = NewMatchClass.results_on_orig_df["ref_index"].astype(int, errors="ignore")
1633
+
1634
  NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
1635
 
1636
  NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
1637
+ print(NewMatchClass.output_summary)
 
1638
 
1639
  NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
1640
 
 
1645
  NewMatchClass.results_orig_df_name = output_folder + "results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
1646
 
1647
  # Only keep essential columns
1648
+ essential_results_cols = [NewMatchClass.search_df_key_field, "Search data address", "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
1649
+ essential_results_cols.extend(NewMatchClass.new_join_col)
1650
+
1651
+ if "fuzzy_score" in NewMatchClass.match_results_output.columns:
1652
+ NewMatchClass.match_results_output["fuzzy_score"] = (
1653
+ pd.to_numeric(NewMatchClass.match_results_output["fuzzy_score"], errors="coerce").round(2)
1654
+ )
1655
+ if "wratio_score" in NewMatchClass.match_results_output.columns:
1656
+ NewMatchClass.match_results_output["wratio_score"] = (
1657
+ pd.to_numeric(NewMatchClass.match_results_output["wratio_score"], errors="coerce").round(2)
1658
+ )
1659
 
1660
  NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
1661
  NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
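The same rounding block for fuzzy_score and wratio_score now appears in three places in this file (load_matcher_data, orchestrate_single_match_batch, and combine_two_matches). pd.to_numeric(..., errors="coerce") turns non-numeric leftovers (empty strings, None) into NaN before rounding, which is why it is used here instead of a plain astype(float). The repeated block could be factored into a small helper, e.g. (a hypothetical helper, not in the repo):

import pandas as pd

def round_score_cols(df: pd.DataFrame, cols=("fuzzy_score", "wratio_score"), decimals: int = 2) -> pd.DataFrame:
    # Coerce mixed/object score columns to floats (bad values become NaN), then round
    for col in cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").round(decimals)
    return df

# e.g. Matcher.match_results_output = round_score_cols(Matcher.match_results_output)

Relatedly, the "Search data address" fallback added to combine_dfs_and_remove_dups (copying from search_orig_address or address_cols_joined when the column is missing) is what implements the "returns the original address in the results" part of the commit message.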
tools/preparation.py CHANGED
@@ -3,6 +3,10 @@ from typing import Type, Dict, List, Tuple
  from datetime import datetime
  #import polars as pl
  import re
+ from tqdm import tqdm
+ from gradio import Progress
+
+ tqdm.pandas() # Registers the progress_apply method on pandas objects
 
  PandasDataFrame = Type[pd.DataFrame]
  PandasSeries = Type[pd.Series]
@@ -54,9 +58,12 @@ def prepare_search_address(
      search_df: pd.DataFrame,
      address_cols: list,
      postcode_col: list,
-     key_col: str
+     key_col: str,
+     progress = Progress(track_tqdm=True)
      ) -> Tuple[pd.DataFrame, str]:
 
+     progress(0, "Preparing search address column")
+
      # Validate inputs
      if not isinstance(search_df, pd.DataFrame):
          raise TypeError("search_df must be a Pandas DataFrame")
@@ -68,56 +75,64 @@ def prepare_search_address(
          raise TypeError("postcode_col must be a list")
 
      if not isinstance(key_col, str):
-         raise TypeError("key_col must be a string")
-
-     # Clean address columns
-     #search_df_polars = pl.from_dataframe(search_df)
-     clean_addresses = _clean_columns(search_df, address_cols)
+         raise TypeError("key_col must be a string")
 
      # If there is a full address and postcode column in the addresses, clean any postcodes from the first column
      if len(address_cols) == 2:
          # Remove postcode from address
-         address_series = remove_postcode(clean_addresses, address_cols[0])
-         clean_addresses[address_cols[0]] = address_series
+         search_df[address_cols[0]] = remove_postcode(search_df, address_cols[0])
 
      # Join address columns into one
-     full_addresses = _join_address(clean_addresses, address_cols)
+     full_addresses = _join_address(search_df, address_cols)
+
+     # Clean address columns
+     #search_df_polars = pl.from_dataframe(search_df)
+     clean_addresses = _clean_columns(full_addresses, ["full_address"])
 
      # Add postcode column
-     full_df = _add_postcode_column(full_addresses, postcode_col)
+     full_df = _add_postcode_column(clean_addresses, postcode_col)
 
      # Remove postcode from main address if there was only one column in the input
      if postcode_col == "full_address_postcode":
          # Remove postcode from address
-         address_series = remove_postcode(search_df, "full_address")
-         search_df["full_address"] == address_series
-
+         full_df["full_address"] = remove_postcode(full_df, "full_address")
+
      # Ensure index column
      final_df = _ensure_index(full_df, key_col)
-
-     #print(final_df)
-
 
      return final_df
 
  # Helper functions
- def _clean_columns(df, cols):
-     # Cleaning logic
-     def clean_col(col):
-         return col.astype(str).fillna("").infer_objects(copy=False).str.replace("nan","").str.replace("\s{2,}", " ", regex=True).str.replace(","," ").str.strip()
-
-     df[cols] = df[cols].apply(clean_col)
-
-     return df
+ def _clean_columns(df:PandasDataFrame, cols:List[str]):
+     # Cleaning logic
+     def clean_col(col):
+         return (
+             col.astype(str)
+             .fillna("")
+             .infer_objects(copy=False)
+             .str.replace(r"\bnan\b", "", regex=True) # Remove stray 'nan' tokens without clipping words like 'tenant'
+             .str.replace(r"\bNone\b", "", case=False, regex=True)
+             .str.replace(",", " ")
+             .str.replace(r"[\r\n]+", " ", regex=True) # Replace line breaks with spaces
+             .str.replace(r"\s{2,}", " ", regex=True) # Collapse whitespace runs, including any created above
+             .str.strip()
+             # Remove a duplicated final word pair if present
+             .str.replace(r'(\b\w+\b\s+\b\w+\b)\s+\1$', r'\1', regex=True)
+         )
+
+     for col in tqdm(cols, desc="Cleaning columns"):
+         df[col] = clean_col(df[col])
+
+     return df
 
- def _join_address(df, cols):
+ def _join_address(df:PandasDataFrame, cols:List[str]):
      # Joining logic
      full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
      df["full_address"] = full_address.str.replace(r"\s{2,}", " ", regex=True).str.strip()
 
      return df
 
- def _add_postcode_column(df, postcodes):
+ def _add_postcode_column(df:PandasDataFrame, postcodes:str):
      # Add postcode column
      if isinstance(postcodes, list):
          postcodes = postcodes[0]
@@ -133,7 +148,7 @@ def _add_postcode_column(df, postcodes):
 
      return df
 
- def _ensure_index(df, index_col):
+ def _ensure_index(df:PandasDataFrame, index_col:str):
      # Ensure index column exists
      if ((index_col == "index") & ~("index" in df.columns)):
          print("Resetting index in _ensure_index function")
@@ -143,7 +158,7 @@ def _ensure_index(df, index_col):
 
      return df
 
- def create_full_address(df):
+ def create_full_address(df:PandasDataFrame):
 
      df = df.fillna("").infer_objects(copy=False)
 
@@ -169,8 +184,10 @@ def create_full_address(df):
 
      return df["full_address"]
 
- def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
+ def prepare_ref_address(ref_df:PandasDataFrame, ref_address_cols:List[str], new_join_col = [], standard_cols = True, progress=Progress(track_tqdm=True)):
 
+     progress(0, "Preparing reference address")
+
      if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
      else: standard_cols = False
 
@@ -182,6 +199,8 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_co
      ref_address_cols_uprn_w_ref.extend(["Reference file"])
 
      ref_df_cleaned = ref_df.copy()
+
+     ref_df_cleaned["ref_index"] = ref_df_cleaned.index
 
      # In on-prem LPI db street has been excluded, so put this back in
      if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
@@ -218,13 +237,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_co
      full_address = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
      ref_df_cleaned["fulladdress"] = full_address
 
-     ref_df_cleaned["fulladdress"] = ref_df_cleaned["fulladdress"]\
-         .str.replace("-999","")\
-         .str.replace(" -"," ")\
-         .str.replace("- "," ")\
-         .str.replace(".0","", regex=False)\
-         .str.replace("\s{2,}", " ", regex=True)\
-         .str.strip()
+     ref_df_cleaned = _clean_columns(ref_df_cleaned, ["fulladdress"])
 
      # Create a street column if it doesn't exist by extracting street from the full address
@@ -232,6 +245,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_co
      ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)
 
      # Add index column
-     ref_df_cleaned['ref_index'] = ref_df_cleaned.index
+     if 'ref_index' not in ref_df_cleaned.columns:
+         ref_df_cleaned['ref_index'] = ref_df_cleaned.index
 
      return ref_df_cleaned
@@ -246,7 +260,7 @@ def extract_postcode(df, col:str) -> PandasSeries:
      return postcode_series
 
  # Remove addresses with no numbers in at all - too high a risk of badly assigning an address
- def check_no_number_addresses(df, in_address_series) -> PandasSeries:
+ def check_no_number_addresses(df:PandasDataFrame, in_address_series:str) -> PandasSeries:
      '''
      Highlight addresses from a pandas df where there are no numbers in the address.
      '''
@@ -262,15 +276,6 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:
 
      return df
 
- # def remove_postcode(df, col:str) -> PandasSeries:
- #     '''
- #     Remove a postcode from a string column in a dataframe
- #     '''
- #     address_series_no_pcode = df[col].str.upper().str.replace(\
- #     "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
- #
- #     return address_series_no_pcode
-
  def extract_street_name(address:str) -> str:
      """
      Extracts the street name from the given address.
@@ -342,7 +347,7 @@ def extract_street_name(address:str) -> str:
 
  # Exclude non-postal addresses
 
- def remove_non_postal(df, in_address_series):
+ def remove_non_postal(df:PandasDataFrame, in_address_series:str):
      '''
      Highlight non-postal addresses from a pandas df where a string series contains specific substrings
      indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
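The progress plumbing introduced in this file follows the usual tqdm/Gradio pattern: `tqdm.pandas()` registers `progress_apply` on pandas objects, and a `Progress(track_tqdm=True)` default argument lets Gradio mirror any tqdm loop inside the function in its own progress bar. A stripped-down sketch of that wiring; the function and column names here are invented, not the repo's:

```python
import pandas as pd
from tqdm import tqdm
import gradio as gr

tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply

def clean_addresses(file_path: str, progress=gr.Progress(track_tqdm=True)):
    progress(0, desc="Loading file")
    df = pd.read_csv(file_path)
    # Any tqdm-wrapped loop in here (including progress_apply) is picked up
    # by the Gradio progress bar because track_tqdm=True.
    df["clean"] = df["address"].progress_apply(str.strip)
    return df

# demo = gr.Interface(clean_addresses, gr.File(type="filepath"), gr.Dataframe())
# demo.launch()
```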
 
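The multi-line address handling this PR advertises lands in `_clean_columns`: commas and line breaks become spaces, whitespace runs collapse, and a word pair duplicated at the end of the address (a common artefact of concatenating overlapping columns) is dropped via the `\1` back-reference. Tracing the same transformations over one invented address:

```python
import pandas as pd

# An invented multi-line spreadsheet address whose last two words are duplicated.
addresses = pd.Series(["Flat 2,\r\n10 High Street\nMilton Keynes Milton Keynes"])

cleaned = (
    addresses.astype(str)
    .str.replace(",", " ")
    .str.replace(r"[\r\n]+", " ", regex=True)   # line breaks -> spaces
    .str.replace(r"\s{2,}", " ", regex=True)    # collapse whitespace runs
    .str.strip()
    # the back-reference \1 drops an exactly-repeated final word pair
    .str.replace(r"(\b\w+\b\s+\b\w+\b)\s+\1$", r"\1", regex=True)
)
print(cleaned[0])  # Flat 2 10 High Street Milton Keynes
```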
tools/standardise.py CHANGED
@@ -136,7 +136,9 @@ def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bo
          str.replace(r"\bmaisonette\b", "flat", regex=True).\
          str.replace(r"\bpt\b", "penthouse", regex=True).\
          str.replace(r"\bbst\b","basement", regex=True).\
-          str.replace(r"\bbsmt\b","basement", regex=True)
+          str.replace(r"\bbsmt\b","basement", regex=True).\
+          str.replace(r"\s{2,}", " ", regex=True).\
+          str.strip()
 
      df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)
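The standardise.py change simply appends a whitespace collapse and strip to the existing abbreviation chain, so substitutions cannot leave double spaces or trailing spaces behind; the `\b` anchors ensure only whole tokens are expanded. A toy check with invented addresses:

```python
import pandas as pd

addrs = pd.Series(["bst flat 1  gerbil house", "10 acacia avenue  bsmt "])

standardised = (
    addrs.str.replace(r"\bbst\b", "basement", regex=True)
         .str.replace(r"\bbsmt\b", "basement", regex=True)
         .str.replace(r"\s{2,}", " ", regex=True)  # collapse double spaces
         .str.strip()
)
print(list(standardised))
# ['basement flat 1 gerbil house', '10 acacia avenue basement']
```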