Merge pull request #1 from seanpedrick-case/dev
Deals with multi-line addresses, adds better progress tracking, returns the original address in the results, and updates packages.
- .dockerignore +4 -1
- .gitignore +4 -1
- app.py +13 -15
- requirements.txt +4 -4
- requirements_aws.txt +5 -5
- tools/fuzzy_match.py +23 -13
- tools/matcher_funcs.py +143 -69
- tools/preparation.py +51 -46
- tools/standardise.py +3 -1
.dockerignore
CHANGED
```diff
@@ -21,4 +21,7 @@ usage/
 logs/
 feedback/
 input/
-output/
+output/
+cat_to_idx.txt
+vocab.txt
+word_to_index.txt
```
.gitignore
CHANGED
```diff
@@ -21,4 +21,7 @@ usage/*
 logs/*
 feedback/*
 input/*
-output/*
+output/*
+cat_to_idx.txt
+vocab.txt
+word_to_index.txt
```
app.py
CHANGED
```diff
@@ -55,45 +55,43 @@ with block:
     access_logs_state = gr.State(access_logs_folder + 'log.csv')
     access_s3_logs_loc_state = gr.State(access_logs_folder)
     usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
-    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
     gr.Markdown(
     """
     # Address matcher
-    Match single or multiple addresses to the reference address file of your choice. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already.The neural network component works with LLPG files in the LPI format.
+    Match single or multiple addresses to the reference address file of your choice. *Please note that a postcode column is required for matching*. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already. The neural network component works with LLPG files in the LPI format.
 
-    The tool can accept csv, xlsx (with one sheet), and parquet files. You
-    need to specify the address columns of the file to match specifically in the address column area with postcode at the end.
+    The tool can accept csv, xlsx (with one sheet), and parquet files. You need to specify the address columns of the file to match specifically in the address column area with postcode at the end.
 
-    Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.
-    """)
+    Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.""")
 
     with gr.Tab("Match addresses"):
-
-        with gr.Accordion("Quick check - single address", open = True):
-            in_text = gr.Textbox(label="Input a single address as text")
-
-        with gr.Accordion("I have multiple addresses", open =
+        with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open = True):
             in_file = gr.File(label="Input addresses from file", file_count= "multiple")
             in_colnames = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
             in_existing = gr.Dropdown(value=[], choices=[], multiselect=False, label="Select columns that indicate existing matches.")
+
+        with gr.Accordion("Quick check - single address", open = False):
+            in_text = gr.Textbox(label="Input a single address as text")
 
 
         gr.Markdown(
         """
         ## Choose reference file / call API
-        Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
-        open 'Custom reference file format or join columns' below.
+        Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you, open 'Custom reference file format or join columns' below.
         """)
 
-        with gr.Accordion("Use Addressbase API (instead of reference file)", open =
+        with gr.Accordion("Use Addressbase API (instead of reference file)", open = False):
             in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
             in_api_key = gr.Textbox(label="Addressbase API key", type='password', value = ADDRESSBASE_API_KEY)
 
-
+
+        with gr.Accordion("Match against reference list of addresses in a CSV/XLSX/Parquet file", open = True):
             in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
 
-        with gr.Accordion("Custom reference file format or join columns (
+        with gr.Accordion("Custom reference file format or join columns (if not LLPG/Addressbase format with columns SaoText, SaoStartNumber etc.)", open = False):
            in_refcol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
            in_joincol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")
 
```
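The effect of the layout change is easier to see in isolation: the file-based workflow now leads and stays expanded, while the single-address quick check is collapsed by default. A minimal sketch of that Gradio nesting pattern (an illustration with abbreviated labels and no event wiring, not the app's exact code):

```python
# Sketch of the reorganised "Match addresses" tab layout.
import gradio as gr

with gr.Blocks() as block:
    with gr.Tab("Match addresses"):
        # File-based matching now comes first and is expanded by default.
        with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open=True):
            in_file = gr.File(label="Input addresses from file", file_count="multiple")
        # The single-address check is collapsed so it no longer dominates the tab.
        with gr.Accordion("Quick check - single address", open=False):
            in_text = gr.Textbox(label="Input a single address as text")

if __name__ == "__main__":
    block.launch()
```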
requirements.txt
CHANGED
```diff
@@ -1,12 +1,12 @@
 torch==2.7.1
 pandas==2.2.3
-rapidfuzz==3.
+rapidfuzz==3.13.0
 recordlinkage==0.16
 pyap==0.3.1
 pytest==7.4.3
-pyarrow==
+pyarrow==21.0.0
 openpyxl==3.1.2
-gradio==5.34.
-boto3==1.
+gradio==5.34.2
+boto3==1.40.5
 polars==0.20.19
 numpy==1.26.4
```
requirements_aws.txt
CHANGED
```diff
@@ -1,11 +1,11 @@
-pandas==2.
-rapidfuzz==3.
+pandas==2.3.0
+rapidfuzz==3.13.0
 recordlinkage==0.16
 pyap==0.3.1
 pytest==7.4.3
-pyarrow==
+pyarrow==21.0.0
 openpyxl==3.1.2
-gradio==5.34.
-boto3==1.
+gradio==5.34.2
+boto3==1.40.5
 polars==0.20.19
 numpy==1.26.4
```
tools/fuzzy_match.py
CHANGED
```diff
@@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Type
 from datetime import datetime
 from rapidfuzz import fuzz, process
 import gradio as gr
+from tqdm import tqdm
 
 PandasDataFrame = Type[pd.DataFrame]
 PandasSeries = Type[pd.Series]
@@ -51,7 +52,7 @@ def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:Panda
     return df
 
 def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
-                                       search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
+                                       search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress(track_tqdm=True))-> MatchedResults:
     '''
     Matches by Series values; for example idx is post code and
     values address. Search field is reduced by comparing same post codes address reference_address_series.
@@ -140,7 +141,7 @@ def string_match_by_post_code_multiple(match_address_series:PandasSeries, refere
 
     unique_postcodes = pd.unique(match_address_df['postcode_search'])
 
-    for postcode_match in
+    for postcode_match in tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
 
         postcode_match_list = [postcode_match]
         search_indexes = pd.Series()
@@ -177,7 +178,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
 
     ## Diagnostics
 
-    diag_shortlist, diag_best_match =
+    diag_shortlist, diag_best_match = create_diagnostic_results(results_df=results,\
         matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
         fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
@@ -308,7 +309,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
     diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")
 
     if 'wratio_score' not in diag_shortlist.columns:
-        diag_shortlist['wratio_score'] =
+        diag_shortlist['wratio_score'] = None
 
     # Order by best score
     diag_shortlist = diag_shortlist.sort_values([
@@ -317,7 +318,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
 
     return diag_shortlist
 
-def refine_export_results(results_df:PandasDataFrame,
+def create_diagnostic_results(results_df:PandasDataFrame,
                           matched_df:PandasDataFrame,
                           ref_list_df:PandasDataFrame,
                           matched_col="fuzzy_match_search_address",
@@ -340,7 +341,10 @@ def refine_export_results(results_df:PandasDataFrame,
     results_df = results_df[results_df[matched_col] !=0 ]
 
     ### Join property number and flat/room number etc. onto results_df
-
+    if 'ref_index' not in ref_list_df.columns:
+        print("Existing ref_index column not found")
+        ref_list_df["ref_index"] = ref_list_df.index
+
     ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
     ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})
 
@@ -351,7 +355,7 @@ def refine_export_results(results_df:PandasDataFrame,
     matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
     matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
 
-    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
+    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
 
     # Choose your best matches from the list of options
     diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)
@@ -381,12 +385,15 @@ def refine_export_results(results_df:PandasDataFrame,
 
     diag_shortlist = diag_shortlist[match_results_cols]
 
+    diag_shortlist["ref_index"] = diag_shortlist["ref_index"].astype(int, errors="ignore")
+    diag_shortlist["wratio_score"] = diag_shortlist["wratio_score"].astype(float, errors="ignore")
+
     # Choose best match from the shortlist that has been ordered according to score descending
     diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
 
     return diag_shortlist, diag_best_match
 
-def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
+def create_results_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
     '''
     Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
     '''
@@ -402,7 +409,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
 
     ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
     ref_df_after_stand_cols.extend(new_join_col)
-
 
     if (search_df_key_field == "index"):
         # Check index is int
@@ -420,8 +426,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
     if "Matched with reference address_y" in results_for_orig_df_join.columns:
         results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
 
-    #results_for_orig_df_join['Matched with reference address'] = results_for_orig_df_join['Matched with reference address'].fillna(results_for_orig_df_join['Matched with reference address_y']).infer_objects(copy=False)
-
     if "Reference file_y" in results_for_orig_df_join.columns:
         results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
 
@@ -429,8 +433,13 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
         results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)
 
     # Drop columns that aren't useful
-    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2",
+    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2",
     "address_stand", "property_number","prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"], axis = 1, errors = "ignore")
+
+    results_for_orig_df_join.rename(columns={"full_address":"Search data address"}, inplace = True)
+
+    results_for_orig_df_join["index"] = results_for_orig_df_join["index"].astype(int, errors="ignore")
+    results_for_orig_df_join["ref_index"] = results_for_orig_df_join["ref_index"].astype(int, errors="ignore")
 
     # Replace blanks with NA, fix UPRNs
     results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)
@@ -439,6 +448,7 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
 
     # Replace cells with only 'nan' with blank
     results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
-
+
+    results_for_orig_df_join.to_csv("output/results_for_orig_df_join.csv")
 
     return results_for_orig_df_join
```
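The progress-tracking upgrade in this file is a standard Gradio pattern: wrap the loop in tqdm and build the progress argument with gr.Progress(track_tqdm=True), so the console progress bar is mirrored in the web UI. A minimal sketch of the pattern, with a placeholder function name and loop body:

```python
# Sketch of the tqdm/Gradio progress pattern used above. The function and
# its inputs are placeholders, not code from the repository.
import gradio as gr
from tqdm import tqdm

def match_all_postcodes(postcodes: list, progress=gr.Progress(track_tqdm=True)) -> int:
    matched = 0
    # track_tqdm=True makes Gradio mirror this tqdm bar in the interface.
    for postcode in tqdm(postcodes, desc="Fuzzy matching", unit="postcodes"):
        matched += 1  # real work would fuzzy-match one postcode block here
    return matched
```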
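One pre-existing quirk is worth flagging in the drop list inside create_results_df (present on both sides of the diff, unchanged by this PR): several quoted column names are separated by spaces rather than commas, and adjacent Python string literals concatenate, so the call attempts to drop a single fused column name, which errors="ignore" then silently skips. A two-line demonstration:

```python
# Adjacent string literals concatenate: this list has two elements, not six,
# so drop(..., errors="ignore") silently skips the fused name.
cols = ["property_number", "prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"]
print(cols)  # ['property_number', 'prop_numberflat_numberapart_numberfirst_sec_numberroom_number']
```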
tools/matcher_funcs.py
CHANGED
```diff
@@ -9,6 +9,7 @@ import math
 from datetime import datetime
 import copy
 import gradio as gr
+from tqdm import tqdm
 
 PandasDataFrame = Type[pd.DataFrame]
 PandasSeries = Type[pd.Series]
@@ -26,7 +27,7 @@ run_standardise = True
 
 from tools.constants import *
 from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
-from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output,
+from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, create_results_df
 from tools.standardise import standardise_wrapper_func
 
 # Neural network functions
@@ -69,14 +70,11 @@ def read_file(filename:str) -> PandasDataFrame:
 def get_file_name(in_name: str) -> str:
     """Get the name of a file from a string, handling both Windows and Unix paths."""
 
-    print("in_name: ", in_name)
     match = re.search(rf'{re.escape(os.sep)}(?!.*{re.escape(os.sep)})(.*)', in_name)
     if match:
         matched_result = match.group(1)
     else:
         matched_result = None
-
-    print("Matched result: ", matched_result)
 
     return matched_result
 
@@ -108,7 +106,7 @@ def filter_not_matched(
 
     return search_df.iloc[np.where(~matched)[0]]
 
-def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
+def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress(track_tqdm=True)):
 
     final_api_output_file_name = ""
 
@@ -204,7 +202,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
     loop_df = Matcher.ref_df
     loop_list = [Matcher.ref_df]
 
-    for address in
+    for address in tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
         print("Query number: " + str(i+1), "with address: ", address)
 
         api_search_index = api_search_df.index
@@ -368,7 +366,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
 
     return Matcher, final_api_output_file_name
 
-def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
+def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress(track_tqdm=True)):
     '''
     Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
     '''
@@ -577,7 +575,7 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
     Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
     Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
     Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
-    Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
+    #Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
 
     Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
     Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
@@ -657,43 +655,91 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
 
     return Matcher
 
-def load_matcher_data(
-    Matcher.abort_flag = False
-    if not in_api:
-        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
-    Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
-    Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
-    Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
-    Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
+def load_matcher_data(
+    in_text: str,
+    in_file: str,
+    in_ref: str,
+    data_state: PandasDataFrame,
+    results_data_state: PandasDataFrame,
+    ref_data_state: PandasDataFrame,
+    in_colnames: list,
+    in_refcol: list,
+    in_joincol: list,
+    in_existing: list,
+    Matcher: MatcherClass,
+    in_api:str,
+    in_api_key: str
+) -> tuple:
+    """
+    Load and preprocess user inputs from the Gradio interface for address matching.
+
+    This function standardises all input types (single address string, file uploads, etc.) into a consistent data format
+    suitable for downstream fuzzy matching. It handles both search and reference data, including API-based reference data retrieval
+    if requested.
+
+    Args:
+        in_text (str): Single address input as text, if provided.
+        in_file: Uploaded file(s) containing addresses to match.
+        in_ref: Uploaded reference file(s) or None if using API.
+        data_state (PandasDataFrame): Current state of the search data.
+        results_data_state (PandasDataFrame): Current state of the results data.
+        ref_data_state (PandasDataFrame): Current state of the reference data.
+        in_colnames (list): List of column names that make up the address in the search data.
+        in_refcol (list): List of column names that make up the address in the reference data.
+        in_joincol (list): List of columns to join on between search and reference data.
+        in_existing (list): List of columns indicating existing matches.
+        Matcher (MatcherClass): Matcher object to store and process data.
+        in_api: Flag or value indicating whether to use the API for reference data.
+        in_api_key (str): API key for reference data retrieval, if applicable.
+
+    Returns:
+        tuple: (Matcher, final_api_output_file_name)
+            Matcher: The updated Matcher object with loaded and preprocessed data.
+            final_api_output_file_name (str): The filename of the reference data if loaded from API, else empty string.
+    """
+
+    final_api_output_file_name = ""
+
+    today_rev = datetime.now().strftime("%Y%m%d")
+
+    # Abort flag for if it's not even possible to attempt the first stage of the match for some reason
+    Matcher.abort_flag = False
+
+    ### ref_df FILES ###
+    # If not an API call, run this first
+    if not in_api:
+        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+
+    ### MATCH/SEARCH FILES ###
+    # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
+    Matcher = load_match_data_and_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
+
+    # If an API call, ref_df data is loaded after
+    if in_api:
+        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+
+    print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
+    print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
+
+    Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
+    Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
+
+    if "fuzzy_score" in Matcher.match_results_output.columns:
+        Matcher.match_results_output["fuzzy_score"] = (
+            pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
+        )
+    if "wratio_score" in Matcher.match_results_output.columns:
+        Matcher.match_results_output["wratio_score"] = (
+            pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
+        )
+
+    Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
+    Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
+
+    return Matcher, final_api_output_file_name
 
 # Run whole matcher process
-def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
+def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress(track_tqdm=True)):
     '''
     Split search and reference data into batches. Loop and run through the match script for each batch of data.
     '''
@@ -722,7 +768,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Polars implementation not yet finalised
     #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
     #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
-
 
     # Prepare all search addresses
     if type(InitMatch.search_df) == str:
@@ -739,7 +784,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Initial preparation of reference addresses
     InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
 
-
     # Polars implementation - not finalised
     #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
     #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
@@ -747,8 +791,10 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Standardise addresses
     # Standardise - minimal
 
-
     tic = time.perf_counter()
+
+    progress(0.1, desc="Performing minimal standardisation")
+
     InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
         InitMatch.search_df_cleaned.copy(),
         InitMatch.ref_df_cleaned.copy(),
@@ -759,6 +805,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     toc = time.perf_counter()
     print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
 
+    progress(0.1, desc="Performing full standardisation")
+
     # Standardise - full
     tic = time.perf_counter()
     InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
@@ -784,8 +832,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     n = 0
     number_of_batches = range_df.shape[0]
 
-    for row in progress.tqdm(range(0,
-        print("Running batch
+    for row in progress.tqdm(range(0,number_of_batches), desc= "Matching addresses in batches", unit="batches", total=number_of_batches):
+        print("Running batch", str(n+1))
 
         search_range = range_df.iloc[row]['search_range']
         ref_range = range_df.iloc[row]['ref_range']
@@ -830,6 +878,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Remove any duplicates from reference df, prioritise successful matches
     OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
 
+
+
     overall_toc = time.perf_counter()
     time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
 
@@ -851,14 +901,13 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     nnet_std_output = OutputMatch.match_results_output.copy()
     nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
 
-    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
-
-
+    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
 
     estimate_total_processing_time = sum_numbers_before_seconds(time_out)
     print("Estimated total processing time:", str(estimate_total_processing_time))
 
     output_files.extend([OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name])
+
     return final_summary, output_files, estimate_total_processing_time
 
 # Run a match run for a single batch
@@ -985,7 +1034,7 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
 
     return lengths_df
 
-def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
+def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
     '''
@@ -1074,7 +1123,7 @@ def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches
     return summary_of_summaries, FuzzyNNetStdMatch
 
 # Overarching functions
-def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
+def orchestrate_single_match_batch(Matcher:MatcherClass, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
 
     today_rev = datetime.now().strftime("%Y%m%d")
 
@@ -1152,20 +1201,24 @@ def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, f
         return Matcher
     else:
         Matcher.match_results_output = match_results_output
-        Matcher.predict_df_nnet = predict_df_nnet
-
+        Matcher.predict_df_nnet = predict_df_nnet
+
         # Save to file
         Matcher.results_on_orig_df = results_on_orig_df
-
-        print("Results output in orchestrate match run shape: ", Matcher.results_on_orig_df.shape)
-
-        Matcher.summary = summary
-
+        Matcher.summary = summary
         Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
 
         Matcher.match_outputs_name = output_folder + "diagnostics_" + file_stub + today_rev + ".csv"
         Matcher.results_orig_df_name = output_folder + "results_" + file_stub + today_rev + ".csv"
-
+
+        if "fuzzy_score" in Matcher.match_results_output.columns:
+            Matcher.match_results_output["fuzzy_score"] = (
+                pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
+            )
+        if "wratio_score" in Matcher.match_results_output.columns:
+            Matcher.match_results_output["wratio_score"] = (
+                pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
+            )
         Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
         Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
 
@@ -1248,7 +1301,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     summary = create_match_summary(match_results_output, df_name)
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
 
     print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1283,11 +1336,10 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     summary = create_match_summary(match_results_output, df_name)
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
 
-    return diag_shortlist, diag_best_match
-        match_results_output, results_on_orig_df, summary, search_address_cols
+    return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
 
     print("Starting the fuzzy match with street as blocker")
 
@@ -1314,7 +1366,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     ### Join URPN back onto orig df
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
    else: results_on_orig_df = match_results_output
 
    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1480,7 +1532,7 @@ def full_nn_match(ref_address_cols:List[str],
    ### Join URPN back onto orig df
 
    if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
    else: results_on_orig_df = match_results_output_final_three
 
    return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
@@ -1495,18 +1547,28 @@ def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame,
    # If one of the dataframes is empty, break
    if (orig_df.empty) & (new_df.empty):
        return orig_df
-
 
+    # Ensure that the original search result is returned
+    if "Search data address" not in orig_df.columns:
+        if "search_orig_address" in orig_df.columns:
+            orig_df["Search data address"] = orig_df["search_orig_address"]
+        elif "address_cols_joined" in orig_df.columns:
+            orig_df["Search data address"] = orig_df["address_cols_joined"]
 
+    if "Search data address" not in new_df.columns:
+        if "search_orig_address" in new_df.columns:
+            new_df["Search data address"] = new_df["search_orig_address"]
+        elif "address_cols_joined" in new_df.columns:
+            new_df["Search data address"] = new_df["address_cols_joined"]
 
+    combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
 
    # If no results were combined
    if combined_std_not_matches.empty:
        combined_std_not_matches[match_address_series] = False
 
-        if "full_address" in combined_std_not_matches.columns:
-            combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
+        #if "full_address" in combined_std_not_matches.columns:
+        #    combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
        combined_std_not_matches["fuzzy_score"] = 0
        return combined_std_not_matches
 
@@ -1540,6 +1602,7 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
    found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
 
    key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
+
    rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
    NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
 
@@ -1565,11 +1628,13 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
        NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
 
    # Drop any duplicates, prioritise any matches
+    NewMatchClass.results_on_orig_df["index"] = NewMatchClass.results_on_orig_df["index"].astype(int, errors="ignore")
+    NewMatchClass.results_on_orig_df["ref_index"] = NewMatchClass.results_on_orig_df["ref_index"].astype(int, errors="ignore")
+
    NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
 
    NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
-    print(NewMatchClass.output_summary)
-
+    print(NewMatchClass.output_summary)
 
    NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
 
@@ -1580,8 +1645,17 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
    NewMatchClass.results_orig_df_name = output_folder + "results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
 
    # Only keep essential columns
-    essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
-    essential_results_cols.extend(NewMatchClass.new_join_col)
+    essential_results_cols = [NewMatchClass.search_df_key_field, "Search data address", "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
+    essential_results_cols.extend(NewMatchClass.new_join_col)
+
+    if "fuzzy_score" in NewMatchClass.match_results_output.columns:
+        NewMatchClass.match_results_output["fuzzy_score"] = (
+            pd.to_numeric(NewMatchClass.match_results_output["fuzzy_score"], errors="coerce").round(2)
+        )
+    if "wratio_score" in NewMatchClass.match_results_output.columns:
+        NewMatchClass.match_results_output["wratio_score"] = (
+            pd.to_numeric(NewMatchClass.match_results_output["wratio_score"], errors="coerce").round(2)
+        )
 
    NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
    NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
```
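The score-tidying block that now appears in load_matcher_data, orchestrate_single_match_batch and combine_two_matches leans on pd.to_numeric with errors="coerce": unparseable scores become NaN instead of raising, and round(2) keeps the CSV diagnostics readable. A minimal sketch with invented data:

```python
# Sketch of the repeated score-tidying step; the data frame is invented.
import pandas as pd

df = pd.DataFrame({"fuzzy_score": ["87.3333", "not_scored", 91.257]})
# errors="coerce" maps unparseable values to NaN rather than raising.
df["fuzzy_score"] = pd.to_numeric(df["fuzzy_score"], errors="coerce").round(2)
print(df["fuzzy_score"].tolist())  # [87.33, nan, 91.26]
```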
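The "returns original address in results" part of the PR lands in combine_dfs_and_remove_dups: before the two result frames are concatenated, each gets a "Search data address" column backfilled from whichever source column it still carries. A compact sketch of that backfill (the frames are invented; the column names come from the diff above):

```python
# Sketch of the "Search data address" backfill added in combine_dfs_and_remove_dups.
import pandas as pd

def ensure_search_address(df: pd.DataFrame) -> pd.DataFrame:
    # Keep an existing column, else fall back to the first surviving source column.
    if "Search data address" not in df.columns:
        for source in ("search_orig_address", "address_cols_joined"):
            if source in df.columns:
                df["Search data address"] = df[source]
                break
    return df

orig = ensure_search_address(pd.DataFrame({"search_orig_address": ["1 High St"]}))
new = ensure_search_address(pd.DataFrame({"address_cols_joined": ["2 Low Rd"]}))
print(pd.concat([orig, new])["Search data address"].tolist())  # ['1 High St', '2 Low Rd']
```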
tools/preparation.py
CHANGED
@@ -3,6 +3,10 @@ from typing import Type, Dict, List, Tuple
|
|
3 |
from datetime import datetime
|
4 |
#import polars as pl
|
5 |
import re
|
|
|
|
|
|
|
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
PandasSeries = Type[pd.Series]
|
@@ -54,9 +58,12 @@ def prepare_search_address(
|
|
54 |
search_df: pd.DataFrame,
|
55 |
address_cols: list,
|
56 |
postcode_col: list,
|
57 |
-
key_col: str
|
|
|
58 |
) -> Tuple[pd.DataFrame, str]:
|
59 |
|
|
|
|
|
60 |
# Validate inputs
|
61 |
if not isinstance(search_df, pd.DataFrame):
|
62 |
raise TypeError("search_df must be a Pandas DataFrame")
|
@@ -68,56 +75,64 @@ def prepare_search_address(
         raise TypeError("postcode_col must be a list")
 
     if not isinstance(key_col, str):
         raise TypeError("key_col must be a string")
 
-    # Clean address columns
-    #search_df_polars = pl.from_dataframe(search_df)
-    clean_addresses = _clean_columns(search_df, address_cols)
 
     # If there is a full address and postcode column in the addresses, clean any postcodes from the first column
     if len(address_cols) == 2:
         # Remove postcode from address
-        ...
-        clean_addresses[address_cols[0]] = address_series
+        search_df[address_cols[0]] = remove_postcode(search_df, address_cols[0])
 
     # Join address columns into one
-    full_addresses = _join_address(...)
+    full_addresses = _join_address(search_df, address_cols)
+
+    # Clean address columns
+    #search_df_polars = pl.from_dataframe(search_df)
+    clean_addresses = _clean_columns(full_addresses, ["full_address"])
 
     # Add postcode column
-    full_df = _add_postcode_column(...)
+    full_df = _add_postcode_column(clean_addresses, postcode_col)
 
     # Remove postcode from main address if there was only one column in the input
     if postcode_col == "full_address_postcode":
         # Remove postcode from address
-        ...
+        full_df["full_address"] = remove_postcode(full_df, "full_address")
+
     # Ensure index column
     final_df = _ensure_index(full_df, key_col)
-
-    #print(final_df)
-
     return final_df
 
 # Helper functions
-def _clean_columns(df, cols):
-    ...
-    return df
+def _clean_columns(df:PandasDataFrame, cols:List[str]):
+    # Cleaning logic
+    def clean_col(col):
+        return (
+            col.astype(str)
+            .fillna("")
+            .infer_objects(copy=False)
+            .str.replace("nan", "")
+            .str.replace(r"\bNone\b", "", case=False, regex=True)
+            .str.replace(r"\s{2,}", " ", regex=True)
+            .str.replace(",", " ")
+            .str.replace(r"[\r\n]+", " ", regex=True) # Replace line breaks with spaces
+            .str.strip()
+            # Remove duplicate two words at the end if present
+            .str.replace(r'(\b\w+\b\s+\b\w+\b)\s+\1$', r'\1', regex=True)
+        )
+
+    for col in tqdm(cols, desc="Cleaning columns"):
+        df[col] = clean_col(df[col])
+
+    return df
 
-def _join_address(df, cols):
+def _join_address(df:PandasDataFrame, cols:List[str]):
     # Joining logic
     full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
     df["full_address"] = full_address.str.replace("\s{2,}", " ", regex=True).str.strip()
 
     return df
 
-def _add_postcode_column(df, postcodes):
+def _add_postcode_column(df:PandasDataFrame, postcodes:str):
     # Add postcode column
     if isinstance(postcodes, list):
         postcodes = postcodes[0]
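Note on _clean_columns above: the helper normalises stray 'nan'/'None' tokens, commas, line breaks (the multi-line address case this PR targets), repeated whitespace, and a duplicated final word pair. The same operations on sample values, reordered slightly so whitespace collapsing runs last (standalone sketch, not the committed chain order):

    import pandas as pd

    addresses = pd.Series([
        "FLAT 1,\n10 HIGH STREET",     # comma and line break (multi-line address)
        "10 HIGH STREET nan",          # stray 'nan' left by a missing column
        "10 HIGH STREET HIGH STREET",  # duplicated final two words
    ])

    cleaned = (addresses.astype(str)
               .str.replace("nan", "")
               .str.replace(r"\bNone\b", "", case=False, regex=True)
               .str.replace(",", " ")
               .str.replace(r"[\r\n]+", " ", regex=True)  # line breaks -> spaces
               .str.replace(r"\s{2,}", " ", regex=True)   # collapse repeated whitespace
               .str.strip()
               .str.replace(r"(\b\w+\b\s+\b\w+\b)\s+\1$", r"\1", regex=True))

    print(cleaned.tolist())
    # ['FLAT 1 10 HIGH STREET', '10 HIGH STREET', '10 HIGH STREET']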
@@ -133,7 +148,7 @@ def _add_postcode_column(df, postcodes):
 
     return df
 
-def _ensure_index(df, index_col):
+def _ensure_index(df:PandasDataFrame, index_col:str):
     # Ensure index column exists
     if ((index_col == "index") & ~("index" in df.columns)):
         print("Resetting index in _ensure_index function")
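Note on the _ensure_index condition above: (index_col == "index") & ~("index" in df.columns) leans on Python's integer semantics for & and ~ on booleans (1 & -1 == 1, 1 & -2 == 0), so it does evaluate as intended. An equivalent plain-boolean spelling, with the body guessed from the print message (illustrative only, not the committed code):

    import pandas as pd

    def ensure_index(df: pd.DataFrame, index_col: str) -> pd.DataFrame:
        # Plain-boolean equivalent of the committed condition
        if index_col == "index" and "index" not in df.columns:
            print("Resetting index")
            df = df.reset_index()  # assumption: the helper materialises the index as a column
        return df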
@@ -143,7 +158,7 @@ def _ensure_index(df, index_col):
 
     return df
 
-def create_full_address(df):
+def create_full_address(df:PandasDataFrame):
 
     df = df.fillna("").infer_objects(copy=False)
 
@@ -169,8 +184,10 @@ def create_full_address(df):
 
     return df["full_address"]
 
-def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
+def prepare_ref_address(ref_df:PandasDataFrame, ref_address_cols:List[str], new_join_col = [], standard_cols = True, progress=Progress(track_tqdm=True)):
 
+    progress(0, "Preparing reference address")
+
     if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
     else: standard_cols = False
 
@@ -182,6 +199,8 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     ref_address_cols_uprn_w_ref.extend(["Reference file"])
 
     ref_df_cleaned = ref_df.copy()
+
+    ref_df_cleaned["ref_index"] = ref_df_cleaned.index
 
     # In on-prem LPI db street has been excluded, so put this back in
     if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
@@ -218,13 +237,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     full_address = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
     ref_df_cleaned["fulladdress"] = full_address
 
-    ref_df_cleaned ...\
-        .str.replace("-999","")\
-        .str.replace(" -"," ")\
-        .str.replace("- "," ")\
-        .str.replace(".0","", regex=False)\
-        .str.replace("\s{2,}", " ", regex=True)\
-        .str.strip()
+    ref_df_cleaned = _clean_columns(ref_df_cleaned, ["fulladdress"])
 
     # Create a street column if it doesn't exist by extracting street from the full address
 
@@ -232,6 +245,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)
 
     # Add index column
+    if 'ref_index' not in ref_df_cleaned.columns:
         ref_df_cleaned['ref_index'] = ref_df_cleaned.index
 
     return ref_df_cleaned
@@ -246,7 +260,7 @@ def extract_postcode(df, col:str) -> PandasSeries:
     return postcode_series
 
 # Remove addresses with no numbers in at all - too high a risk of badly assigning an address
-def check_no_number_addresses(df, in_address_series) -> PandasSeries:
+def check_no_number_addresses(df:PandasDataFrame, in_address_series:str) -> PandasSeries:
     '''
     Highlight addresses from a pandas df where there are no numbers in the address.
     '''
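The body of check_no_number_addresses sits outside this hunk; a minimal sketch consistent with its docstring, using a hypothetical exclusion label, might look like:

    import pandas as pd

    def flag_no_number_addresses(df: pd.DataFrame, address_col: str) -> pd.DataFrame:
        # Flag addresses containing no digits at all (column value and label are
        # hypothetical; the committed function body is not shown in this hunk)
        no_number = ~df[address_col].str.contains(r"\d", regex=True, na=False)
        df.loc[no_number, "Excluded from search"] = "Excluded - no number in address"
        return df

    example = pd.DataFrame({"full_address": ["10 HIGH STREET", "THE OLD BARN HIGH STREET"]})
    print(flag_no_number_addresses(example, "full_address"))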
@@ -262,15 +276,6 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:
 
     return df
 
-# def remove_postcode(df, col:str) -> PandasSeries:
-#     '''
-#     Remove a postcode from a string column in a dataframe
-#     '''
-#     address_series_no_pcode = df[col].str.upper().str.replace(\
-#     "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
-#
-#     return address_series_no_pcode
-
 def extract_street_name(address:str) -> str:
     """
     Extracts the street name from the given address.
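The commented-out remove_postcode block deleted above carried the UK-postcode regex that strips a trailing full or partial postcode from an address string. A quick standalone demonstration of that regex:

    import re

    # Trailing full or partial UK postcode pattern from the deleted comment block
    pcode_pattern = (r"\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$"
                     r"|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$"
                     r"|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$")

    for addr in ["10 HIGH STREET AB1 2CD", "10 HIGH STREET AB1", "10 HIGH STREET"]:
        print(re.sub(pcode_pattern, "", addr.upper()).strip())
    # each prints: 10 HIGH STREET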
@@ -342,7 +347,7 @@ def extract_street_name(address:str) -> str:
 
 # Exclude non-postal addresses
 
-def remove_non_postal(df, in_address_series):
+def remove_non_postal(df:PandasDataFrame, in_address_series:str):
     '''
     Highlight non-postal addresses from a pandas df where a string series contains specific substrings
     indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
tools/standardise.py
CHANGED
@@ -136,7 +136,9 @@ def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bool):
     str.replace(r"\bmaisonette\b", "flat", regex=True).\
     str.replace(r"\bpt\b", "penthouse", regex=True).\
     str.replace(r"\bbst\b","basement", regex=True).\
-    str.replace(r"\bbsmt\b","basement", regex=True)
+    str.replace(r"\bbsmt\b","basement", regex=True).\
+    str.replace(r"\s{2,}", " ", regex=True).\
+    str.strip()
 
     df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)
 
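The extended chain in standardise_address now also collapses repeated whitespace and strips the result after the abbreviation substitutions. The tail of the same chain on a sample value (standalone sketch):

    import pandas as pd

    s = pd.Series(["bsmt  flat 10 high street "])
    out = (s.str.replace(r"\bbsmt\b", "basement", regex=True)
            .str.replace(r"\s{2,}", " ", regex=True)
            .str.strip())
    print(out[0])  # basement flat 10 high street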