import spacy
from spacy.matcher import Matcher
import numpy as np
import gradio as gr
import pandas as pd
from datetime import datetime
from typing import List, Type

PandasDataFrame = Type[pd.DataFrame]

# Date stamp used in the output file name (assumed format; the original
# script referenced an undefined `today_rev`)
today_rev = datetime.now().strftime("%Y%m%d")

nlp = spacy.load("en_core_web_sm")
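# Fuzzy matching via the FUZZY attribute requires spaCy >= 3.5. If the model
# is missing, install it with: python -m spacy download en_core_web_sm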

string_query = "knife attack run fast"
df_list = ["Last week someone was grievously injured in a knife attack on Exmoor road. Running away. They ran as fast as possible. I run.","This is the 3rd knifing in the area in as many weeks; knives everywhere.", "attacks of this kind have been increasing for years. Knife attack or knife attack.", "Nothing happened here"]


def spacy_fuzzy_search(string_query: str, df_list: List[str], original_data: PandasDataFrame, search_df_join_column: str, in_join_column: str, in_join_file: PandasDataFrame = pd.DataFrame(), no_spelling_mistakes: int = 1, progress=gr.Progress(track_tqdm=True)):
    '''Conduct a fuzzy match on a list of texts and score each text by its number of matches relative to its length.'''

    query = nlp(string_query)
    tokenised_query = [token.text for token in query]
    print(tokenised_query)

    spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
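    # e.g. no_spelling_mistakes=1 gives "FUZZY1": each matched token may differ
    # from the query token by an edit distance of up to 1, so "knfe" would
    # still match "knife"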

    # %%
    # Build one pattern that fuzzy-matches the raw token text and one that
    # matches on the lemma, so inflected forms (ran/running) are also found
    if len(tokenised_query) > 1:
        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
    elif len(tokenised_query) == 1:
        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
    else:
        # The original fell through with undefined patterns here; fail early instead
        raise ValueError("Search query is empty - nothing to search for.")
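    # Illustration: for the module-level query above, pattern_fuzz is
    # [{"TEXT": {"FUZZY1": {"IN": ["knife", "attack", "run", "fast"]}}}]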

  
    # %%
    matcher = Matcher(nlp.vocab)

    # %% [markdown]
    # from spacy.tokens import Span
    # from spacy import displacy
    # 
    # def add_event_ent(matcher, doc, i, matches):
    #     # Get the current match and create tuple of entity label, start and end.
    #     # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    #     match_id, start, end = matches[i]
    #     entity = Span(doc, start, end, label="EVENT")
    #     doc.ents += (entity,)
    #     print(entity.text)

    # %% [markdown]
    # matched_sents = []  # Collect data of matched sentences to be visualized
    # 
    # def collect_sents(matcher, doc, i, matches):
    #     match_id, start, end = matches[i]
    #     span = doc[start:end]  # Matched span
    #     sent = span.sent  # Sentence containing matched span
    #     # Append mock entity for match in displaCy style to matched_sents
    #     # get the match span by offsetting the start and end of the span with the
    #     # start and end of the sentence in the doc
    #     match_ents = [{
    #         "start": span.start_char - sent.start_char,
    #         "end": span.end_char - sent.start_char,
    #         "label": "MATCH",
    #     }]
    #     matched_sents.append({"text": sent.text, "ents": match_ents})

    # %%
    # Register both patterns under the same key; Matcher.add extends the
    # pattern list when the key already exists, so fuzzy and lemma matches
    # are both collected in a single pass
    matcher.add(string_query, [pattern_fuzz, pattern_lemma])  # , on_match=add_event_ent

    # %%
    batch_size = 256
    docs = nlp.pipe(df_list, batch_size=batch_size)
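    # (Assumed optimisation, not in the original script) pipeline components
    # the patterns don't use can be disabled for extra speed, e.g.:
    # docs = nlp.pipe(df_list, batch_size=batch_size, disable=["parser", "ner"])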

    # %%
    all_matches = []

    # Count the number of matches in each document
    for doc in progress.tqdm(docs, desc="Searching text", unit="rows"):
        matches = matcher(doc)
        all_matches.append(len(matches))

    print("Search complete")

    # Score is the number of matches divided by the character length of the document
    lengths = [len(element) for element in df_list]
    match_scores = (np.array(all_matches) / np.array(lengths)).tolist()
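    # Note: a zero-length text would trigger a numpy divide-by-zero warning and
    # score as inf here; input texts are assumed to be non-empty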

    # Prepare results and export
    results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
                                    "search_text": df_list,
                                    "search_score_abs": match_scores})
    results_df['search_score_abs'] = results_df['search_score_abs'].abs().round(2)
    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)

    # Join on additional files
    if not in_join_file.empty:
        progress(0.5, desc = "Joining on additional data file")
        join_df = in_join_file
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)
        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

    # Reorder results by score
    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

    # Construct the output file name from the query tokens and the date stamp
    query_str_file = "_".join(tokenised_query)
    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search file output")
    progress(0.7, desc = "Saving search output to file")

    # Writing .xlsx requires an Excel backend such as openpyxl to be installed
    results_df_out.to_excel(results_df_name, index=None)
    results_first_text = results_df_out["search_text"].iloc[0]

    print("Returning results")

    return results_first_text, results_df_name


# Example call: original_data here is a minimal DataFrame built from the search
# texts; the join columns are only used when a join file is supplied
original_data = pd.DataFrame(data={"source_text": df_list})
results_first_text, results_df_name = spacy_fuzzy_search(string_query, df_list, original_data, "index", "index")
print(results_first_text)
print(results_df_name)
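# A minimal sketch of the optional join path (illustrative data, not from the
# original script): extra columns are joined onto the results via a shared key.
# join_df = pd.DataFrame({"id": ["0", "1", "2", "3"], "category": ["crime", "crime", "crime", "other"]})
# results_first_text, results_df_name = spacy_fuzzy_search(
#     string_query, df_list, original_data, search_df_join_column="index",
#     in_join_column="id", in_join_file=join_df)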