delete fuzzy
app.py CHANGED
@@ -29,10 +29,6 @@ TABLE_NAME = "Feedback_search"
 api = Api(AIRTABLE_API_KEY)
 table = api.table(BASE_ID, TABLE_NAME)
 
-# Load whitelist
-with open("keyword_whitelist.pkl", "rb") as f:
-    keyword_whitelist = pickle.load(f)
-
 # Preload Models
 model = SentenceTransformer("BAAI/bge-m3")
 collection_name = "product_bge-m3"
@@ -43,53 +39,13 @@ reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
 # Utils
 def is_non_thai(text):
     return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
-
-def join_corrected_tokens(corrected: list) -> str:
-    if corrected and is_non_thai("".join(corrected)):
-        return " ".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
-    else:
-        return "".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
-
+
 def normalize(text: str) -> str:
     if is_non_thai(text):
         return text.strip()
     text = unicodedata.normalize("NFC", text)
     return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
 
-def smart_tokenize(text: str) -> list:
-    tokens = word_tokenize(text.strip(), engine="newmm")
-    return tokens if tokens and len("".join(tokens)) >= len(text.strip()) * 0.5 else [text.strip()]
-
-def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
-    query_norm = normalize(query)
-    tokens = smart_tokenize(query_norm)
-    corrected = []
-    i = 0
-    while i < len(tokens):
-        matched = False
-        for n in range(min(max_ngram, len(tokens) - i), 0, -1):
-            phrase = "".join(tokens[i:i+n])
-            if phrase in whitelist:
-                corrected.append(phrase)
-                i += n
-                matched = True
-                break
-            match, score, _ = process.extractOne(
-                phrase,
-                whitelist,
-                scorer=fuzz.token_sort_ratio,
-                processor=lambda x: x.lower()
-            )
-            if score >= threshold:
-                corrected.append(match)
-                i += n
-                matched = True
-                break
-        if not matched:
-            corrected.append(tokens[i])
-            i += 1
-    return join_corrected_tokens(corrected)
-
 # Global state
 latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
 
@@ -100,7 +56,7 @@ def search_product(query):
     start_time = time.time()
     latest_query_result["raw_query"] = query
 
-    corrected_query = correct_query_merge_phrases(query, keyword_whitelist)
+    corrected_query = normalize(query)
     query_embed = model.encode(corrected_query)
 
     try:
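For context: the deleted helpers spell-corrected queries against the pickled keyword whitelist before embedding. The removed call site in search_product is truncated in the diff view and is assumed here to have been correct_query_merge_phrases(query, keyword_whitelist). Below is a minimal sketch of the core matching step, assuming the imports came from rapidfuzz (consistent with the three-value unpacking of process.extractOne above) and using a made-up whitelist in place of keyword_whitelist.pkl:

from rapidfuzz import process, fuzz

# Illustrative stand-in for the real whitelist loaded from keyword_whitelist.pkl.
whitelist = ["toothbrush", "toothpaste", "shampoo"]

# Same call shape as the removed correct_query_merge_phrases loop.
match, score, _ = process.extractOne(
    "tothbrush",                    # misspelled query token
    whitelist,
    scorer=fuzz.token_sort_ratio,
    processor=lambda x: x.lower(),
)
print(match, round(score, 1))       # toothbrush 94.7 -- accepted at the old threshold of 80

With this commit that correction step is gone, so misspelled keywords are embedded exactly as typed.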
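What survives is plain normalization before encoding: search_product now passes normalize(query) straight to model.encode(). A self-contained sketch of that surviving behavior (sample strings are illustrative):

import re
import unicodedata

def is_non_thai(text):
    # Latin letters, digits, '&', '-', and whitespace only.
    return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None

def normalize(text: str) -> str:
    if is_non_thai(text):
        return text.strip()                   # non-Thai input: trim only, keep case
    text = unicodedata.normalize("NFC", text)
    # Collapse the common Thai typo of two Sara E (เเ) typed in place of Sara Ae (แ).
    return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()

print(normalize("  BGE-M3  "))   # -> "BGE-M3"
print(normalize("เเปรงสีฟัน"))    # -> "แปรงสีฟัน"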