sibthinon committed on
Commit
374e7c8
·
verified ·
1 Parent(s): d0b76d3

update fuzzy method

Browse files
Files changed (1) hide show
  1. app.py +16 -1
app.py CHANGED
@@ -37,7 +37,12 @@ with open("keyword_whitelist.pkl", "rb") as f:
37
  keyword_whitelist = pickle.load(f)
38
 
39
  # Utils
 
 
 
40
  def normalize(text: str) -> str:
 
 
41
  text = unicodedata.normalize("NFC", text)
42
  return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
43
 
@@ -54,7 +59,17 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
54
  matched = False
55
  for n in range(min(max_ngram, len(tokens) - i), 0, -1):
56
  phrase = "".join(tokens[i:i+n])
57
- match, score, _ = process.extractOne(phrase, whitelist, scorer=fuzz.token_sort_ratio)
 
 
 
 
 
 
 
 
 
 
58
  if score >= threshold:
59
  corrected.append(match)
60
  i += n
 
37
  keyword_whitelist = pickle.load(f)
38
 
39
  # Utils
40
+ def is_non_thai(text):
41
+ return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
42
+
43
  def normalize(text: str) -> str:
44
+ if is_non_thai(text):
45
+ return text.strip()
46
  text = unicodedata.normalize("NFC", text)
47
  return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
48
 
 
59
  matched = False
60
  for n in range(min(max_ngram, len(tokens) - i), 0, -1):
61
  phrase = "".join(tokens[i:i+n])
62
+ if phrase in whitelist:
63
+ corrected.append(phrase)
64
+ i += n
65
+ matched = True
66
+ break
67
+ match, score, _ = process.extractOne(
68
+ phrase,
69
+ whitelist,
70
+ scorer=fuzz.token_sort_ratio,
71
+ processor=lambda x: x.lower()
72
+ )
73
  if score >= threshold:
74
  corrected.append(match)
75
  i += n