sibthinon committed · verified
Commit cd3f6c0 · 1 Parent(s): eadc64f

delete fuzzy

Files changed (1):
  1. app.py +2 -46
app.py CHANGED
@@ -29,10 +29,6 @@ TABLE_NAME = "Feedback_search"
 api = Api(AIRTABLE_API_KEY)
 table = api.table(BASE_ID, TABLE_NAME)
 
-# Load whitelist
-with open("keyword_whitelist.pkl", "rb") as f:
-    keyword_whitelist = pickle.load(f)
-
 # Preload Models
 model = SentenceTransformer("BAAI/bge-m3")
 collection_name = "product_bge-m3"
@@ -43,53 +39,13 @@ reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
 # Utils
 def is_non_thai(text):
     return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
-
-def join_corrected_tokens(corrected: list) -> str:
-    if corrected and is_non_thai("".join(corrected)):
-        return " ".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
-    else:
-        return "".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
-
+
 def normalize(text: str) -> str:
     if is_non_thai(text):
         return text.strip()
     text = unicodedata.normalize("NFC", text)
     return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
 
-def smart_tokenize(text: str) -> list:
-    tokens = word_tokenize(text.strip(), engine="newmm")
-    return tokens if tokens and len("".join(tokens)) >= len(text.strip()) * 0.5 else [text.strip()]
-
-def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
-    query_norm = normalize(query)
-    tokens = smart_tokenize(query_norm)
-    corrected = []
-    i = 0
-    while i < len(tokens):
-        matched = False
-        for n in range(min(max_ngram, len(tokens) - i), 0, -1):
-            phrase = "".join(tokens[i:i+n])
-            if phrase in whitelist:
-                corrected.append(phrase)
-                i += n
-                matched = True
-                break
-            match, score, _ = process.extractOne(
-                phrase,
-                whitelist,
-                scorer=fuzz.token_sort_ratio,
-                processor=lambda x: x.lower()
-            )
-            if score >= threshold:
-                corrected.append(match)
-                i += n
-                matched = True
-                break
-        if not matched:
-            corrected.append(tokens[i])
-            i += 1
-    return join_corrected_tokens(corrected)
-
 # Global state
 latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
 
@@ -100,7 +56,7 @@ def search_product(query):
     start_time = time.time()
     latest_query_result["raw_query"] = query
 
-    corrected_query = correct_query_merge_phrases(query, keyword_whitelist)
+    corrected_query = normalize(query)
    query_embed = model.encode(corrected_query)
 
    try:
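For context, the "fuzzy" logic deleted above snapped each tokenized n-gram onto the pickled keyword whitelist via process.extractOne with fuzz.token_sort_ratio. Below is a minimal standalone sketch of that core matching call; the whitelist entries and the misspelled query are hypothetical stand-ins for keyword_whitelist.pkl, and it assumes the app imported rapidfuzz as from rapidfuzz import process, fuzz (the three-value unpack in the removed code matches rapidfuzz's return shape):

from rapidfuzz import process, fuzz

# Hypothetical whitelist; the real one was loaded from keyword_whitelist.pkl
keyword_whitelist = ["keyboard", "notebook", "monitor"]

# Same call shape the removed correct_query_merge_phrases used per n-gram
match, score, _ = process.extractOne(
    "keybord",                      # misspelled token
    keyword_whitelist,
    scorer=fuzz.token_sort_ratio,
    processor=lambda x: x.lower(),
)
if score >= 80:                     # threshold=80 in the removed code
    print(match)                    # -> "keyboard"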
 
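After this commit, the only query preprocessing left before embedding is normalize(). A self-contained sketch of that retained path, with the two helper functions copied from the diff above; the sample queries are made up for illustration, and the downstream Qdrant search and reranking in app.py are omitted:

import re
import unicodedata

def is_non_thai(text):
    # Purely Latin/numeric queries (e.g. brand names) skip Thai-specific cleanup
    return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None

def normalize(text: str) -> str:
    if is_non_thai(text):
        return text.strip()
    text = unicodedata.normalize("NFC", text)
    # Collapse the common double sara e typo "เเ" into "แ" before lowercasing
    return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()

# Hypothetical queries for illustration
print(normalize("  Samsung Galaxy  "))  # -> "Samsung Galaxy" (non-Thai, only stripped)
print(normalize("เเป้นพิมพ์"))          # -> "แป้นพิมพ์" (double sara e collapsed)

The normalized string is then passed straight to model.encode(), so any remaining misspellings now reach the bge-m3 embedding as-is instead of being corrected against the whitelist.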