sibthinon committed on
Commit
ccd63be
·
verified ·
1 Parent(s): 999314e

remove fuzzy

Browse files
Files changed (1) hide show
  1. app.py +1 -40
app.py CHANGED
@@ -6,13 +6,9 @@ from sentence_transformers import SentenceTransformer
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
9
- from rapidfuzz import process, fuzz
10
  from pythainlp.tokenize import word_tokenize
11
  from pyairtable import Table
12
  from pyairtable import Api
13
- import pickle
14
- import re
15
- import unicodedata
16
 
17
 
18
  qdrant_client = QdrantClient(
@@ -52,41 +48,6 @@ model_config = {
52
  # Global memory to hold feedback state
53
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
54
 
55
- with open("keyword_whitelist.pkl", "rb") as f:
56
- keyword_whitelist = pickle.load(f)
57
-
58
- def smart_tokenize(query: str) -> list:
59
- tokens = word_tokenize(query, engine="newmm")
60
- if len("".join(tokens)) < len(query) * 0.7: # ตัดคำขาดเกินไป
61
- return query.strip().split()
62
- return tokens
63
-
64
- def normalize_and_clean_thai(text: str) -> str:
65
- text = unicodedata.normalize("NFC", text)
66
- text = text.replace("เแ", "แ").replace("เเ", "แ")
67
- return text
68
-
69
- def correct_query_smart(query: str, whitelist=None, threshold=70) -> str:
70
- query_norm = normalize_and_clean_thai(query)
71
- tokens = query_norm.strip().split()
72
-
73
- # ถ้า token เดียว → fuzzy ตรงไปที่คำเต็มเลย
74
- if len(tokens) == 1:
75
- match, score, _ = process.extractOne(tokens[0].lower(), whitelist, scorer=fuzz.token_sort_ratio)
76
- return match if score >= threshold else query_norm
77
-
78
- # token หลายคำ → ลองแก้ทีละคำ
79
- corrected = []
80
- for word in tokens:
81
- word_lower = word.lower()
82
- if word_lower in whitelist:
83
- corrected.append(word)
84
- else:
85
- match, score, _ = process.extractOne(word_lower, whitelist, scorer=fuzz.token_sort_ratio)
86
- corrected.append(match if score >= threshold else word)
87
-
88
- return " ".join(corrected)
89
-
90
  # 🌟 Main search function
91
  def search_product(query, model_name):
92
  start_time = time.time()
@@ -94,7 +55,7 @@ def search_product(query, model_name):
94
  return "<p>❌ ไม่พบโมเดล</p>"
95
 
96
  latest_query_result["raw_query"] = query
97
- corrected_query = correct_query_smart(query,keyword_whitelist)
98
 
99
  query_embed = model_config[model_name]["func"](corrected_query)
100
  collection_name = model_config[model_name]["collection"]
 
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
 
9
  from pythainlp.tokenize import word_tokenize
10
  from pyairtable import Table
11
  from pyairtable import Api
 
 
 
12
 
13
 
14
  qdrant_client = QdrantClient(
 
48
  # Global memory to hold feedback state
49
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # 🌟 Main search function
52
  def search_product(query, model_name):
53
  start_time = time.time()
 
55
  return "<p>❌ ไม่พบโมเดล</p>"
56
 
57
  latest_query_result["raw_query"] = query
58
+ corrected_query = query
59
 
60
  query_embed = model_config[model_name]["func"](corrected_query)
61
  collection_name = model_config[model_name]["collection"]