Spaces:

sibthinon
/

environment

Running

sibthinon committed on 20 days ago

Commit

374e7c8

verified ·

1 Parent(s): d0b76d3

update fuzzy method

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,7 +37,12 @@ with open("keyword_whitelist.pkl", "rb") as f:
     keyword_whitelist = pickle.load(f)
 # Utils
 def normalize(text: str) -> str:
     text = unicodedata.normalize("NFC", text)
     return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
@@ -54,7 +59,17 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
         matched = False
         for n in range(min(max_ngram, len(tokens) - i), 0, -1):
             phrase = "".join(tokens[i:i+n])
-            match, score, _ = process.extractOne(phrase, whitelist, scorer=fuzz.token_sort_ratio)
             if score >= threshold:
                 corrected.append(match)
                 i += n

     keyword_whitelist = pickle.load(f)
 # Utils
+def is_non_thai(text):
+    return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
 def normalize(text: str) -> str:
+    if is_non_thai(text):
+        return text.strip()
     text = unicodedata.normalize("NFC", text)
     return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
         matched = False
         for n in range(min(max_ngram, len(tokens) - i), 0, -1):
             phrase = "".join(tokens[i:i+n])
+            if phrase in whitelist:
+                corrected.append(phrase)
+                i += n
+                matched = True
+                break
+            match, score, _ = process.extractOne(
+                phrase,
+                whitelist,
+                scorer=fuzz.token_sort_ratio,
+                processor=lambda x: x.lower()
+            )
             if score >= threshold:
                 corrected.append(match)
                 i += n