Merge pull request #1 from seanpedrick-case/dev
Deals with multi-line addresses, adds better progress tracking, returns the original address in the results, and updates packages.
- .dockerignore +4 -1
- .gitignore +4 -1
- app.py +13 -15
- requirements.txt +4 -4
- requirements_aws.txt +5 -5
- tools/fuzzy_match.py +23 -13
- tools/matcher_funcs.py +143 -69
- tools/preparation.py +51 -46
- tools/standardise.py +3 -1
.dockerignore
CHANGED
```diff
@@ -21,4 +21,7 @@ usage/
 logs/
 feedback/
 input/
-output/
+output/
+cat_to_idx.txt
+vocab.txt
+word_to_index.txt
```
.gitignore
CHANGED
```diff
@@ -21,4 +21,7 @@ usage/*
 logs/*
 feedback/*
 input/*
-output/*
+output/*
+cat_to_idx.txt
+vocab.txt
+word_to_index.txt
```
app.py
CHANGED
```diff
@@ -55,45 +55,43 @@ with block:
     access_logs_state = gr.State(access_logs_folder + 'log.csv')
     access_s3_logs_loc_state = gr.State(access_logs_folder)
     usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
-    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
     gr.Markdown(
     """
     # Address matcher
-    Match single or multiple addresses to the reference address file of your choice. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already.The neural network component works with LLPG files in the LPI format.
+    Match single or multiple addresses to the reference address file of your choice. *Please note that a postcode column is required for matching*. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already. The neural network component works with LLPG files in the LPI format.
 
-    The tool can accept csv, xlsx (with one sheet), and parquet files. You
-    need to specify the address columns of the file to match specifically in the address column area with postcode at the end.
+    The tool can accept csv, xlsx (with one sheet), and parquet files. You need to specify the address columns of the file to match specifically in the address column area with postcode at the end.
 
-    Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.
-    """)
+    Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.""")
 
     with gr.Tab("Match addresses"):
-
-        with gr.Accordion("Quick check - single address", open = True):
-            in_text = gr.Textbox(label="Input a single address as text")
-
-        with gr.Accordion("I have multiple addresses", open =
+        with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open = True):
             in_file = gr.File(label="Input addresses from file", file_count= "multiple")
             in_colnames = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
             in_existing = gr.Dropdown(value=[], choices=[], multiselect=False, label="Select columns that indicate existing matches.")
+
+        with gr.Accordion("Quick check - single address", open = False):
+            in_text = gr.Textbox(label="Input a single address as text")
 
 
         gr.Markdown(
         """
         ## Choose reference file / call API
-        Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
-        open 'Custom reference file format or join columns' below.
+        Upload a reference file to match against, or alternatively call the Addressbase API (requires API key). Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you, open 'Custom reference file format or join columns' below.
         """)
 
-        with gr.Accordion("Use Addressbase API (instead of reference file)", open =
+        with gr.Accordion("Use Addressbase API (instead of reference file)", open = False):
             in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
             in_api_key = gr.Textbox(label="Addressbase API key", type='password', value = ADDRESSBASE_API_KEY)
 
-
+
+        with gr.Accordion("Match against reference list of addresses in a CSV/XLSX/Parquet file", open = True):
             in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
 
-        with gr.Accordion("Custom reference file format or join columns (
+        with gr.Accordion("Custom reference file format or join columns (if not LLPG/Addressbase format with columns SaoText, SaoStartNumber etc.)", open = False):
            in_refcol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
            in_joincol = gr.Dropdown(value=[], choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")
 
```
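The effect of the layout change is easier to see in isolation: the file-based workflow now leads and stays expanded, while the single-address quick check is collapsed by default. A minimal sketch of that Gradio nesting pattern (an illustration with abbreviated labels and no event wiring, not the app's exact code):

```python
# Sketch of the reorganised "Match addresses" tab layout.
import gradio as gr

with gr.Blocks() as block:
    with gr.Tab("Match addresses"):
        # File-based matching now comes first and is expanded by default.
        with gr.Accordion("I have multiple addresses in a CSV/XLSX/Parquet file", open=True):
            in_file = gr.File(label="Input addresses from file", file_count="multiple")
        # The single-address check is collapsed so it no longer dominates the tab.
        with gr.Accordion("Quick check - single address", open=False):
            in_text = gr.Textbox(label="Input a single address as text")

if __name__ == "__main__":
    block.launch()
```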
requirements.txt
CHANGED
```diff
@@ -1,12 +1,12 @@
 torch==2.7.1
 pandas==2.2.3
-rapidfuzz==3.
+rapidfuzz==3.13.0
 recordlinkage==0.16
 pyap==0.3.1
 pytest==7.4.3
-pyarrow==
+pyarrow==21.0.0
 openpyxl==3.1.2
-gradio==5.34.
-boto3==1.
+gradio==5.34.2
+boto3==1.40.5
 polars==0.20.19
 numpy==1.26.4
```
requirements_aws.txt
CHANGED
```diff
@@ -1,11 +1,11 @@
-pandas==2.
-rapidfuzz==3.
+pandas==2.3.0
+rapidfuzz==3.13.0
 recordlinkage==0.16
 pyap==0.3.1
 pytest==7.4.3
-pyarrow==
+pyarrow==21.0.0
 openpyxl==3.1.2
-gradio==5.34.
-boto3==1.
+gradio==5.34.2
+boto3==1.40.5
 polars==0.20.19
 numpy==1.26.4
```
tools/fuzzy_match.py
CHANGED
```diff
@@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Type
 from datetime import datetime
 from rapidfuzz import fuzz, process
 import gradio as gr
+from tqdm import tqdm
 
 PandasDataFrame = Type[pd.DataFrame]
 PandasSeries = Type[pd.Series]
@@ -51,7 +52,7 @@ def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:Panda
     return df
 
 def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
-                                       search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
+                                       search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress(track_tqdm=True))-> MatchedResults:
     '''
     Matches by Series values; for example idx is post code and
     values address. Search field is reduced by comparing same post codes address reference_address_series.
@@ -140,7 +141,7 @@ def string_match_by_post_code_multiple(match_address_series:PandasSeries, refere
 
     unique_postcodes = pd.unique(match_address_df['postcode_search'])
 
-    for postcode_match in
+    for postcode_match in tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
 
         postcode_match_list = [postcode_match]
         search_indexes = pd.Series()
@@ -177,7 +178,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
 
     ## Diagnostics
 
-    diag_shortlist, diag_best_match =
+    diag_shortlist, diag_best_match = create_diagnostic_results(results_df=results,\
         matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
         fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
@@ -308,7 +309,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
     diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")
 
     if 'wratio_score' not in diag_shortlist.columns:
-        diag_shortlist['wratio_score'] =
+        diag_shortlist['wratio_score'] = None
 
     # Order by best score
     diag_shortlist = diag_shortlist.sort_values([
@@ -317,7 +318,7 @@ def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_mat
 
     return diag_shortlist
 
-def refine_export_results(results_df:PandasDataFrame,
+def create_diagnostic_results(results_df:PandasDataFrame,
                           matched_df:PandasDataFrame,
                           ref_list_df:PandasDataFrame,
                           matched_col="fuzzy_match_search_address",
@@ -340,7 +341,10 @@ def refine_export_results(results_df:PandasDataFrame,
     results_df = results_df[results_df[matched_col] !=0 ]
 
     ### Join property number and flat/room number etc. onto results_df
-
+    if 'ref_index' not in ref_list_df.columns:
+        print("Existing ref_index column not found")
+        ref_list_df["ref_index"] = ref_list_df.index
+
     ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
     ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})
 
@@ -351,7 +355,7 @@ def refine_export_results(results_df:PandasDataFrame,
     matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
     matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
 
-    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
+    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
 
     # Choose your best matches from the list of options
     diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)
@@ -381,12 +385,15 @@ def refine_export_results(results_df:PandasDataFrame,
 
     diag_shortlist = diag_shortlist[match_results_cols]
 
+    diag_shortlist["ref_index"] = diag_shortlist["ref_index"].astype(int, errors="ignore")
+    diag_shortlist["wratio_score"] = diag_shortlist["wratio_score"].astype(float, errors="ignore")
+
     # Choose best match from the shortlist that has been ordered according to score descending
     diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
 
     return diag_shortlist, diag_best_match
 
-def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
+def create_results_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
     '''
     Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
     '''
@@ -402,7 +409,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
 
     ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
     ref_df_after_stand_cols.extend(new_join_col)
-
 
     if (search_df_key_field == "index"):
         # Check index is int
@@ -420,8 +426,6 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
     if "Matched with reference address_y" in results_for_orig_df_join.columns:
         results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
 
-    #results_for_orig_df_join['Matched with reference address'] = results_for_orig_df_join['Matched with reference address'].fillna(results_for_orig_df_join['Matched with reference address_y']).infer_objects(copy=False)
-
     if "Reference file_y" in results_for_orig_df_join.columns:
         results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
 
@@ -429,8 +433,13 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
         results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)
 
     # Drop columns that aren't useful
-    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2",
+    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2",
     "address_stand", "property_number","prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"], axis = 1, errors = "ignore")
+
+    results_for_orig_df_join.rename(columns={"full_address":"Search data address"}, inplace = True)
+
+    results_for_orig_df_join["index"] = results_for_orig_df_join["index"].astype(int, errors="ignore")
+    results_for_orig_df_join["ref_index"] = results_for_orig_df_join["ref_index"].astype(int, errors="ignore")
 
     # Replace blanks with NA, fix UPRNs
     results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)
@@ -439,6 +448,7 @@ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFr
 
     # Replace cells with only 'nan' with blank
     results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
-
+
+    results_for_orig_df_join.to_csv("output/results_for_orig_df_join.csv")
 
     return results_for_orig_df_join
```
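The progress-tracking upgrade in this file is a standard Gradio pattern: wrap the loop in tqdm and build the progress argument with gr.Progress(track_tqdm=True), so the console progress bar is mirrored in the web UI. A minimal sketch of the pattern, with a placeholder function name and loop body:

```python
# Sketch of the tqdm/Gradio progress pattern used above. The function and
# its inputs are placeholders, not code from the repository.
import gradio as gr
from tqdm import tqdm

def match_all_postcodes(postcodes: list, progress=gr.Progress(track_tqdm=True)) -> int:
    matched = 0
    # track_tqdm=True makes Gradio mirror this tqdm bar in the interface.
    for postcode in tqdm(postcodes, desc="Fuzzy matching", unit="postcodes"):
        matched += 1  # real work would fuzzy-match one postcode block here
    return matched
```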
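One pre-existing quirk is worth flagging in the drop list inside create_results_df (present on both sides of the diff, unchanged by this PR): several quoted column names are separated by spaces rather than commas, and adjacent Python string literals concatenate, so the call attempts to drop a single fused column name, which errors="ignore" then silently skips. A two-line demonstration:

```python
# Adjacent string literals concatenate: this list has two elements, not six,
# so drop(..., errors="ignore") silently skips the fused name.
cols = ["property_number", "prop_number" "flat_number" "apart_number" "first_sec_number" "room_number"]
print(cols)  # ['property_number', 'prop_numberflat_numberapart_numberfirst_sec_numberroom_number']
```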
tools/matcher_funcs.py
CHANGED
```diff
@@ -9,6 +9,7 @@ import math
 from datetime import datetime
 import copy
 import gradio as gr
+from tqdm import tqdm
 
 PandasDataFrame = Type[pd.DataFrame]
 PandasSeries = Type[pd.Series]
@@ -26,7 +27,7 @@ run_standardise = True
 
 from tools.constants import *
 from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
-from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output,
+from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, create_results_df
 from tools.standardise import standardise_wrapper_func
 
 # Neural network functions
@@ -69,14 +70,11 @@ def read_file(filename:str) -> PandasDataFrame:
 def get_file_name(in_name: str) -> str:
     """Get the name of a file from a string, handling both Windows and Unix paths."""
 
-    print("in_name: ", in_name)
     match = re.search(rf'{re.escape(os.sep)}(?!.*{re.escape(os.sep)})(.*)', in_name)
     if match:
         matched_result = match.group(1)
     else:
         matched_result = None
-
-    print("Matched result: ", matched_result)
 
     return matched_result
 
@@ -108,7 +106,7 @@ def filter_not_matched(
 
     return search_df.iloc[np.where(~matched)[0]]
 
-def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
+def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress(track_tqdm=True)):
 
     final_api_output_file_name = ""
 
@@ -204,7 +202,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
     loop_df = Matcher.ref_df
     loop_list = [Matcher.ref_df]
 
-    for address in
+    for address in tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
         print("Query number: " + str(i+1), "with address: ", address)
 
         api_search_index = api_search_df.index
@@ -368,7 +366,7 @@ def query_addressbase_api(in_api_key:str, Matcher:MatcherClass, query_type:str,
 
     return Matcher, final_api_output_file_name
 
-def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
+def load_ref_data(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress(track_tqdm=True)):
     '''
     Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
     '''
@@ -577,7 +575,7 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
     Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
     Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
     Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
-    Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
+    #Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
 
     Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
     Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
@@ -657,43 +655,91 @@ def load_match_data_and_filter(Matcher:MatcherClass, data_state:PandasDataFrame,
 
     return Matcher
 
-def load_matcher_data(
-    Matcher.abort_flag = False
-    if not in_api:
-        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
-    Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
-    Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
-    Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
-    Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
+def load_matcher_data(
+    in_text: str,
+    in_file: str,
+    in_ref: str,
+    data_state: PandasDataFrame,
+    results_data_state: PandasDataFrame,
+    ref_data_state: PandasDataFrame,
+    in_colnames: list,
+    in_refcol: list,
+    in_joincol: list,
+    in_existing: list,
+    Matcher: MatcherClass,
+    in_api:str,
+    in_api_key: str
+) -> tuple:
+    """
+    Load and preprocess user inputs from the Gradio interface for address matching.
+
+    This function standardises all input types (single address string, file uploads, etc.) into a consistent data format
+    suitable for downstream fuzzy matching. It handles both search and reference data, including API-based reference data retrieval
+    if requested.
+
+    Args:
+        in_text (str): Single address input as text, if provided.
+        in_file: Uploaded file(s) containing addresses to match.
+        in_ref: Uploaded reference file(s) or None if using API.
+        data_state (PandasDataFrame): Current state of the search data.
+        results_data_state (PandasDataFrame): Current state of the results data.
+        ref_data_state (PandasDataFrame): Current state of the reference data.
+        in_colnames (list): List of column names that make up the address in the search data.
+        in_refcol (list): List of column names that make up the address in the reference data.
+        in_joincol (list): List of columns to join on between search and reference data.
+        in_existing (list): List of columns indicating existing matches.
+        Matcher (MatcherClass): Matcher object to store and process data.
+        in_api: Flag or value indicating whether to use the API for reference data.
+        in_api_key (str): API key for reference data retrieval, if applicable.
+
+    Returns:
+        tuple: (Matcher, final_api_output_file_name)
+            Matcher: The updated Matcher object with loaded and preprocessed data.
+            final_api_output_file_name (str): The filename of the reference data if loaded from API, else empty string.
+    """
+
+    final_api_output_file_name = ""
+
+    today_rev = datetime.now().strftime("%Y%m%d")
+
+    # Abort flag for if it's not even possible to attempt the first stage of the match for some reason
+    Matcher.abort_flag = False
+
+    ### ref_df FILES ###
+    # If not an API call, run this first
+    if not in_api:
+        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+
+    ### MATCH/SEARCH FILES ###
+    # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
+    Matcher = load_match_data_and_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
+
+    # If an API call, ref_df data is loaded after
+    if in_api:
+        Matcher, final_api_output_file_name = load_ref_data(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
+
+    print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
+    print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
+
+    Matcher.match_outputs_name = output_folder + "diagnostics_initial_" + today_rev + ".csv"
+    Matcher.results_orig_df_name = output_folder + "results_initial_" + today_rev + ".csv"
+
+    if "fuzzy_score" in Matcher.match_results_output.columns:
+        Matcher.match_results_output["fuzzy_score"] = (
+            pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
+        )
+    if "wratio_score" in Matcher.match_results_output.columns:
+        Matcher.match_results_output["wratio_score"] = (
+            pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
+        )
+
+    Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
+    Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
+
+    return Matcher, final_api_output_file_name
 
 # Run whole matcher process
-def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
+def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress(track_tqdm=True)):
     '''
     Split search and reference data into batches. Loop and run through the match script for each batch of data.
     '''
@@ -722,7 +768,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Polars implementation not yet finalised
     #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
     #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
-
 
     # Prepare all search addresses
     if type(InitMatch.search_df) == str:
@@ -739,7 +784,6 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Initial preparation of reference addresses
     InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
 
-
     # Polars implementation - not finalised
     #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
     #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
@@ -747,8 +791,10 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Standardise addresses
     # Standardise - minimal
 
-
     tic = time.perf_counter()
+
+    progress(0.1, desc="Performing minimal standardisation")
+
     InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
         InitMatch.search_df_cleaned.copy(),
         InitMatch.ref_df_cleaned.copy(),
@@ -759,6 +805,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     toc = time.perf_counter()
     print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
 
+    progress(0.1, desc="Performing full standardisation")
+
     # Standardise - full
     tic = time.perf_counter()
     InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
@@ -784,8 +832,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     n = 0
     number_of_batches = range_df.shape[0]
 
-    for row in progress.tqdm(range(0,
-        print("Running batch
+    for row in progress.tqdm(range(0,number_of_batches), desc= "Matching addresses in batches", unit="batches", total=number_of_batches):
+        print("Running batch", str(n+1))
 
         search_range = range_df.iloc[row]['search_range']
         ref_range = range_df.iloc[row]['ref_range']
@@ -830,6 +878,8 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     # Remove any duplicates from reference df, prioritise successful matches
     OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
 
+
+
     overall_toc = time.perf_counter()
     time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
 
@@ -851,14 +901,13 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
     nnet_std_output = OutputMatch.match_results_output.copy()
     nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
 
-    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
-
-
+    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
 
     estimate_total_processing_time = sum_numbers_before_seconds(time_out)
     print("Estimated total processing time:", str(estimate_total_processing_time))
 
     output_files.extend([OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name])
+
     return final_summary, output_files, estimate_total_processing_time
 
 # Run a match run for a single batch
@@ -985,7 +1034,7 @@ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:i
 
     return lengths_df
 
-def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
+def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
     '''
@@ -1074,7 +1123,7 @@ def run_single_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches
     return summary_of_summaries, FuzzyNNetStdMatch
 
 # Overarching functions
-def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
+def orchestrate_single_match_batch(Matcher:MatcherClass, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
 
     today_rev = datetime.now().strftime("%Y%m%d")
 
@@ -1152,20 +1201,24 @@ def orchestrate_single_match_batch(Matcher, standardise = False, nnet = False, f
         return Matcher
     else:
         Matcher.match_results_output = match_results_output
-        Matcher.predict_df_nnet = predict_df_nnet
-
+        Matcher.predict_df_nnet = predict_df_nnet
+
         # Save to file
         Matcher.results_on_orig_df = results_on_orig_df
-
-        print("Results output in orchestrate match run shape: ", Matcher.results_on_orig_df.shape)
-
-        Matcher.summary = summary
-
+        Matcher.summary = summary
         Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
 
         Matcher.match_outputs_name = output_folder + "diagnostics_" + file_stub + today_rev + ".csv"
         Matcher.results_orig_df_name = output_folder + "results_" + file_stub + today_rev + ".csv"
-
+
+        if "fuzzy_score" in Matcher.match_results_output.columns:
+            Matcher.match_results_output["fuzzy_score"] = (
+                pd.to_numeric(Matcher.match_results_output["fuzzy_score"], errors="coerce").round(2)
+            )
+        if "wratio_score" in Matcher.match_results_output.columns:
+            Matcher.match_results_output["wratio_score"] = (
+                pd.to_numeric(Matcher.match_results_output["wratio_score"], errors="coerce").round(2)
+            )
         Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
         Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
 
@@ -1248,7 +1301,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     summary = create_match_summary(match_results_output, df_name)
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
 
     print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1283,11 +1336,10 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     summary = create_match_summary(match_results_output, df_name)
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
     else: results_on_orig_df = match_results_output
 
-    return diag_shortlist, diag_best_match
-        match_results_output, results_on_orig_df, summary, search_address_cols
+    return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
 
     print("Starting the fuzzy match with street as blocker")
 
@@ -1314,7 +1366,7 @@ def full_fuzzy_match(search_df:PandasDataFrame,
     ### Join URPN back onto orig df
 
     if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
    else: results_on_orig_df = match_results_output
 
    print("results_on_orig_df in fuzzy_match shape: ", results_on_orig_df.shape)
@@ -1480,7 +1532,7 @@ def full_nn_match(ref_address_cols:List[str],
    ### Join URPN back onto orig df
 
    if type(search_df) != str:
-        results_on_orig_df =
+        results_on_orig_df = create_results_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
    else: results_on_orig_df = match_results_output_final_three
 
    return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
@@ -1495,18 +1547,28 @@ def combine_dfs_and_remove_dups(orig_df:PandasDataFrame, new_df:PandasDataFrame,
    # If one of the dataframes is empty, break
    if (orig_df.empty) & (new_df.empty):
        return orig_df
-
 
+    # Ensure that the original search result is returned
+    if "Search data address" not in orig_df.columns:
+        if "search_orig_address" in orig_df.columns:
+            orig_df["Search data address"] = orig_df["search_orig_address"]
+        elif "address_cols_joined" in orig_df.columns:
+            orig_df["Search data address"] = orig_df["address_cols_joined"]
 
+    if "Search data address" not in new_df.columns:
+        if "search_orig_address" in new_df.columns:
+            new_df["Search data address"] = new_df["search_orig_address"]
+        elif "address_cols_joined" in new_df.columns:
+            new_df["Search data address"] = new_df["address_cols_joined"]
 
+    combined_std_not_matches = pd.concat([orig_df, new_df])#, ignore_index=True)
 
    # If no results were combined
    if combined_std_not_matches.empty:
        combined_std_not_matches[match_address_series] = False
 
-        if "full_address" in combined_std_not_matches.columns:
-            combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
+        #if "full_address" in combined_std_not_matches.columns:
+        #    combined_std_not_matches[index_col] = combined_std_not_matches["full_address"]
        combined_std_not_matches["fuzzy_score"] = 0
        return combined_std_not_matches
 
@@ -1540,6 +1602,7 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
    found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
 
    key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
+
    rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
    NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
 
@@ -1565,11 +1628,13 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
        NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
 
    # Drop any duplicates, prioritise any matches
+    NewMatchClass.results_on_orig_df["index"] = NewMatchClass.results_on_orig_df["index"].astype(int, errors="ignore")
+    NewMatchClass.results_on_orig_df["ref_index"] = NewMatchClass.results_on_orig_df["ref_index"].astype(int, errors="ignore")
+
    NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
 
    NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
-    print(NewMatchClass.output_summary)
-
+    print(NewMatchClass.output_summary)
 
    NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
 
@@ -1580,8 +1645,17 @@ def combine_two_matches(OrigMatchClass:MatcherClass, NewMatchClass:MatcherClass,
    NewMatchClass.results_orig_df_name = output_folder + "results_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
 
    # Only keep essential columns
-    essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
-    essential_results_cols.extend(NewMatchClass.new_join_col)
+    essential_results_cols = [NewMatchClass.search_df_key_field, "Search data address", "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
+    essential_results_cols.extend(NewMatchClass.new_join_col)
+
+    if "fuzzy_score" in NewMatchClass.match_results_output.columns:
+        NewMatchClass.match_results_output["fuzzy_score"] = (
+            pd.to_numeric(NewMatchClass.match_results_output["fuzzy_score"], errors="coerce").round(2)
+        )
+    if "wratio_score" in NewMatchClass.match_results_output.columns:
+        NewMatchClass.match_results_output["wratio_score"] = (
+            pd.to_numeric(NewMatchClass.match_results_output["wratio_score"], errors="coerce").round(2)
+        )
 
    NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
    NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
```
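The score-tidying block that now appears in load_matcher_data, orchestrate_single_match_batch and combine_two_matches leans on pd.to_numeric with errors="coerce": unparseable scores become NaN instead of raising, and round(2) keeps the CSV diagnostics readable. A minimal sketch with invented data:

```python
# Sketch of the repeated score-tidying step; the data frame is invented.
import pandas as pd

df = pd.DataFrame({"fuzzy_score": ["87.3333", "not_scored", 91.257]})
# errors="coerce" maps unparseable values to NaN rather than raising.
df["fuzzy_score"] = pd.to_numeric(df["fuzzy_score"], errors="coerce").round(2)
print(df["fuzzy_score"].tolist())  # [87.33, nan, 91.26]
```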
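The "returns original address in results" part of the PR lands in combine_dfs_and_remove_dups: before the two result frames are concatenated, each gets a "Search data address" column backfilled from whichever source column it still carries. A compact sketch of that backfill (the frames are invented; the column names come from the diff above):

```python
# Sketch of the "Search data address" backfill added in combine_dfs_and_remove_dups.
import pandas as pd

def ensure_search_address(df: pd.DataFrame) -> pd.DataFrame:
    # Keep an existing column, else fall back to the first surviving source column.
    if "Search data address" not in df.columns:
        for source in ("search_orig_address", "address_cols_joined"):
            if source in df.columns:
                df["Search data address"] = df[source]
                break
    return df

orig = ensure_search_address(pd.DataFrame({"search_orig_address": ["1 High St"]}))
new = ensure_search_address(pd.DataFrame({"address_cols_joined": ["2 Low Rd"]}))
print(pd.concat([orig, new])["Search data address"].tolist())  # ['1 High St', '2 Low Rd']
```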
tools/preparation.py
CHANGED
@@ -3,6 +3,10 @@ from typing import Type, Dict, List, Tuple
|
|
3 |
from datetime import datetime
|
4 |
#import polars as pl
|
5 |
import re
|
|
|
|
|
|
|
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
PandasSeries = Type[pd.Series]
|
@@ -54,9 +58,12 @@ def prepare_search_address(
|
|
54 |
search_df: pd.DataFrame,
|
55 |
address_cols: list,
|
56 |
postcode_col: list,
|
57 |
-
key_col: str
|
|
|
58 |
) -> Tuple[pd.DataFrame, str]:
|
59 |
|
|
|
|
|
60 |
# Validate inputs
|
61 |
if not isinstance(search_df, pd.DataFrame):
|
62 |
raise TypeError("search_df must be a Pandas DataFrame")
|
@@ -68,56 +75,64 @@ def prepare_search_address(
         raise TypeError("postcode_col must be a list")
 
     if not isinstance(key_col, str):
         raise TypeError("key_col must be a string")
 
-    # Clean address columns
-    #search_df_polars = pl.from_dataframe(search_df)
-    clean_addresses = _clean_columns(search_df, address_cols)
 
     # If there is a full address and postcode column in the addresses, clean any postcodes from the first column
     if len(address_cols) == 2:
         # Remove postcode from address
-        ...
-        clean_addresses[address_cols[0]] = address_series
+        search_df[address_cols[0]] = remove_postcode(search_df, address_cols[0])
 
     # Join address columns into one
-    full_addresses = _join_address(...)
+    full_addresses = _join_address(search_df, address_cols)
+
+    # Clean address columns
+    #search_df_polars = pl.from_dataframe(search_df)
+    clean_addresses = _clean_columns(full_addresses, ["full_address"])
 
     # Add postcode column
-    full_df = _add_postcode_column(...)
+    full_df = _add_postcode_column(clean_addresses, postcode_col)
 
     # Remove postcode from main address if there was only one column in the input
     if postcode_col == "full_address_postcode":
         # Remove postcode from address
-        ...
+        full_df["full_address"] = remove_postcode(full_df, "full_address")
+
     # Ensure index column
     final_df = _ensure_index(full_df, key_col)
-
-    #print(final_df)
-
     return final_df
 
 # Helper functions
-def _clean_columns(df, cols):
-    ...
-    return df
+def _clean_columns(df:PandasDataFrame, cols:List[str]):
+    # Cleaning logic
+    def clean_col(col):
+        return (
+            col.astype(str)
+            .fillna("")
+            .infer_objects(copy=False)
+            .str.replace("nan", "")
+            .str.replace(r"\bNone\b", "", case=False, regex=True)
+            .str.replace(r"\s{2,}", " ", regex=True)
+            .str.replace(",", " ")
+            .str.replace(r"[\r\n]+", " ", regex=True) # Replace line breaks with spaces
+            .str.strip()
+            # Remove duplicate two words at the end if present
+            .str.replace(r'(\b\w+\b\s+\b\w+\b)\s+\1$', r'\1', regex=True)
+        )
+
+    for col in tqdm(cols, desc="Cleaning columns"):
+        df[col] = clean_col(df[col])
+
+    return df
 
-def _join_address(df, cols):
+def _join_address(df:PandasDataFrame, cols:List[str]):
     # Joining logic
     full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
     df["full_address"] = full_address.str.replace("\s{2,}", " ", regex=True).str.strip()
 
     return df
 
-def _add_postcode_column(df, postcodes):
+def _add_postcode_column(df:PandasDataFrame, postcodes:str):
     # Add postcode column
     if isinstance(postcodes, list):
         postcodes = postcodes[0]
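Note on _clean_columns above: the helper normalises stray 'nan'/'None' tokens, commas, line breaks (the multi-line address case this PR targets), repeated whitespace, and a duplicated final word pair. The same operations on sample values, reordered slightly so whitespace collapsing runs last (standalone sketch, not the committed chain order):

    import pandas as pd

    addresses = pd.Series([
        "FLAT 1,\n10 HIGH STREET",     # comma and line break (multi-line address)
        "10 HIGH STREET nan",          # stray 'nan' left by a missing column
        "10 HIGH STREET HIGH STREET",  # duplicated final two words
    ])

    cleaned = (addresses.astype(str)
               .str.replace("nan", "")
               .str.replace(r"\bNone\b", "", case=False, regex=True)
               .str.replace(",", " ")
               .str.replace(r"[\r\n]+", " ", regex=True)  # line breaks -> spaces
               .str.replace(r"\s{2,}", " ", regex=True)   # collapse repeated whitespace
               .str.strip()
               .str.replace(r"(\b\w+\b\s+\b\w+\b)\s+\1$", r"\1", regex=True))

    print(cleaned.tolist())
    # ['FLAT 1 10 HIGH STREET', '10 HIGH STREET', '10 HIGH STREET']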
@@ -133,7 +148,7 @@ def _add_postcode_column(df, postcodes):
 
     return df
 
-def _ensure_index(df, index_col):
+def _ensure_index(df:PandasDataFrame, index_col:str):
     # Ensure index column exists
     if ((index_col == "index") & ~("index" in df.columns)):
         print("Resetting index in _ensure_index function")
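Note on the _ensure_index condition above: (index_col == "index") & ~("index" in df.columns) leans on Python's integer semantics for & and ~ on booleans (1 & -1 == 1, 1 & -2 == 0), so it does evaluate as intended. An equivalent plain-boolean spelling, with the body guessed from the print message (illustrative only, not the committed code):

    import pandas as pd

    def ensure_index(df: pd.DataFrame, index_col: str) -> pd.DataFrame:
        # Plain-boolean equivalent of the committed condition
        if index_col == "index" and "index" not in df.columns:
            print("Resetting index")
            df = df.reset_index()  # assumption: the helper materialises the index as a column
        return df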
@@ -143,7 +158,7 @@ def _ensure_index(df, index_col):
 
     return df
 
-def create_full_address(df):
+def create_full_address(df:PandasDataFrame):
 
     df = df.fillna("").infer_objects(copy=False)
 
@@ -169,8 +184,10 @@ def create_full_address(df):
 
     return df["full_address"]
 
-def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
+def prepare_ref_address(ref_df:PandasDataFrame, ref_address_cols:List[str], new_join_col = [], standard_cols = True, progress=Progress(track_tqdm=True)):
 
+    progress(0, "Preparing reference address")
+
     if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
     else: standard_cols = False
 
@@ -182,6 +199,8 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     ref_address_cols_uprn_w_ref.extend(["Reference file"])
 
     ref_df_cleaned = ref_df.copy()
+
+    ref_df_cleaned["ref_index"] = ref_df_cleaned.index
 
     # In on-prem LPI db street has been excluded, so put this back in
     if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
@@ -218,13 +237,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     full_address = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
     ref_df_cleaned["fulladdress"] = full_address
 
-    ref_df_cleaned ...\
-        .str.replace("-999","")\
-        .str.replace(" -"," ")\
-        .str.replace("- "," ")\
-        .str.replace(".0","", regex=False)\
-        .str.replace("\s{2,}", " ", regex=True)\
-        .str.strip()
+    ref_df_cleaned = _clean_columns(ref_df_cleaned, ["fulladdress"])
 
     # Create a street column if it doesn't exist by extracting street from the full address
 
@@ -232,6 +245,7 @@ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = [], standard_cols = True):
     ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)
 
     # Add index column
+    if 'ref_index' not in ref_df_cleaned.columns:
         ref_df_cleaned['ref_index'] = ref_df_cleaned.index
 
     return ref_df_cleaned
@@ -246,7 +260,7 @@ def extract_postcode(df, col:str) -> PandasSeries:
     return postcode_series
 
 # Remove addresses with no numbers in at all - too high a risk of badly assigning an address
-def check_no_number_addresses(df, in_address_series) -> PandasSeries:
+def check_no_number_addresses(df:PandasDataFrame, in_address_series:str) -> PandasSeries:
     '''
     Highlight addresses from a pandas df where there are no numbers in the address.
     '''
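The body of check_no_number_addresses sits outside this hunk; a minimal sketch consistent with its docstring, using a hypothetical exclusion label, might look like:

    import pandas as pd

    def flag_no_number_addresses(df: pd.DataFrame, address_col: str) -> pd.DataFrame:
        # Flag addresses containing no digits at all (column value and label are
        # hypothetical; the committed function body is not shown in this hunk)
        no_number = ~df[address_col].str.contains(r"\d", regex=True, na=False)
        df.loc[no_number, "Excluded from search"] = "Excluded - no number in address"
        return df

    example = pd.DataFrame({"full_address": ["10 HIGH STREET", "THE OLD BARN HIGH STREET"]})
    print(flag_no_number_addresses(example, "full_address"))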
@@ -262,15 +276,6 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:
 
     return df
 
-# def remove_postcode(df, col:str) -> PandasSeries:
-#     '''
-#     Remove a postcode from a string column in a dataframe
-#     '''
-#     address_series_no_pcode = df[col].str.upper().str.replace(\
-#     "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
-#
-#     return address_series_no_pcode
-
 def extract_street_name(address:str) -> str:
     """
     Extracts the street name from the given address.
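The commented-out remove_postcode block deleted above carried the UK-postcode regex that strips a trailing full or partial postcode from an address string. A quick standalone demonstration of that regex:

    import re

    # Trailing full or partial UK postcode pattern from the deleted comment block
    pcode_pattern = (r"\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$"
                     r"|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$"
                     r"|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$")

    for addr in ["10 HIGH STREET AB1 2CD", "10 HIGH STREET AB1", "10 HIGH STREET"]:
        print(re.sub(pcode_pattern, "", addr.upper()).strip())
    # each prints: 10 HIGH STREET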
@@ -342,7 +347,7 @@ def extract_street_name(address:str) -> str:
 
 # Exclude non-postal addresses
 
-def remove_non_postal(df, in_address_series):
+def remove_non_postal(df:PandasDataFrame, in_address_series:str):
     '''
     Highlight non-postal addresses from a pandas df where a string series contains specific substrings
     indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
tools/standardise.py
CHANGED
@@ -136,7 +136,9 @@ def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bool):
     str.replace(r"\bmaisonette\b", "flat", regex=True).\
     str.replace(r"\bpt\b", "penthouse", regex=True).\
     str.replace(r"\bbst\b","basement", regex=True).\
-    str.replace(r"\bbsmt\b","basement", regex=True)
+    str.replace(r"\bbsmt\b","basement", regex=True).\
+    str.replace(r"\s{2,}", " ", regex=True).\
+    str.strip()
 
     df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)
 
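The extended chain in standardise_address now also collapses repeated whitespace and strips the result after the abbreviation substitutions. The tail of the same chain on a sample value (standalone sketch):

    import pandas as pd

    s = pd.Series(["bsmt  flat 10 high street "])
    out = (s.str.replace(r"\bbsmt\b", "basement", regex=True)
            .str.replace(r"\s{2,}", " ", regex=True)
            .str.strip())
    print(out[0])  # basement flat 10 high street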