sibthinon committed on
Commit
9ddaa27
·
verified ·
1 Parent(s): 05c34ec

new fuzzy method

Browse files
Files changed (1) hide show
  1. app.py +51 -9
app.py CHANGED
@@ -6,19 +6,26 @@ from sentence_transformers import SentenceTransformer
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
 
9
  from pythainlp.tokenize import word_tokenize
10
  from pyairtable import Table
11
  from pyairtable import Api
 
 
12
  import unicodedata
13
 
14
 
15
  qdrant_client = QdrantClient(
16
- url=os.environ.get("Qdrant_url"),
17
- api_key=os.environ.get("Qdrant_api"),
 
 
18
  )
19
 
20
- AIRTABLE_API_KEY = os.environ.get("airtable_api")
21
- BASE_ID = os.environ.get("airtable_baseid")
 
 
22
  TABLE_NAME = "Feedback_search" # หรือเปลี่ยนชื่อให้ชัดเช่น 'Feedback'
23
  api = Api(AIRTABLE_API_KEY)
24
  table = api.table(BASE_ID, TABLE_NAME)
@@ -49,10 +56,45 @@ model_config = {
49
  # Global memory to hold feedback state
50
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
51
 
52
def fix_common_thai_typos(text: str) -> str:
    """Repair the common Thai typo where the vowel "แ" is typed as two characters.

    The text is first NFC-normalized, then the two mistyped spellings of "แ"
    ("เแ" and "เเ") are rewritten to the single correct character.

    Args:
        text: Raw query text.

    Returns:
        The normalized text with the mistyped vowel sequences corrected.
    """
    text = unicodedata.normalize("NFC", text)
    # "เแ" must become "แ" (it was previously replaced with "", which deleted
    # the vowel altogether). Handle the mixed form first, then the doubled "เ".
    return text.replace("เแ", "แ").replace("เเ", "แ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  # 🌟 Main search function
58
  def search_product(query, model_name):
@@ -61,7 +103,7 @@ def search_product(query, model_name):
61
  return "<p>❌ ไม่พบโมเดล</p>"
62
 
63
  latest_query_result["raw_query"] = query
64
- corrected_query = fix_common_thai_typos(query)
65
 
66
  query_embed = model_config[model_name]["func"](corrected_query)
67
  collection_name = model_config[model_name]["collection"]
 
6
  from qdrant_client import QdrantClient
7
  from qdrant_client.models import Filter, FieldCondition, MatchValue
8
  import os
9
+ from rapidfuzz import process, fuzz
10
  from pythainlp.tokenize import word_tokenize
11
  from pyairtable import Table
12
  from pyairtable import Api
13
+ import pickle
14
+ import re
15
  import unicodedata
16
 
17
 
18
  qdrant_client = QdrantClient(
19
+ #url=os.environ.get("Qdrant_url"),
20
+ #api_key=os.environ.get("Qdrant_api"),
21
+ url=userdata.get("Qdrant_url"),
22
+ api_key=userdata.get("Qdrant_api"),
23
  )
24
 
25
# Airtable credentials are read from the environment and must never be
# committed to source control.
# NOTE(security): an access token was hard-coded here in an earlier revision;
# treat it as leaked and revoke/rotate it in Airtable.
AIRTABLE_API_KEY = os.environ.get("airtable_api")
BASE_ID = os.environ.get("airtable_baseid")
29
  TABLE_NAME = "Feedback_search" # หรือเปลี่ยนชื่อให้ชัดเช่น 'Feedback'
30
  api = Api(AIRTABLE_API_KEY)
31
  table = api.table(BASE_ID, TABLE_NAME)
 
56
  # Global memory to hold feedback state
57
  latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
58
 
59
# Location of the pickled keyword whitelist. The default is the Google Drive
# path used so far; set KEYWORD_WHITELIST_PATH (e.g. to "keyword_whitelist.pkl")
# to load the file from somewhere else without editing the code.
KEYWORD_WHITELIST_PATH = os.environ.get(
    "KEYWORD_WHITELIST_PATH",
    "/content/drive/MyDrive/mypinmall/data/keyword_whitelist.pkl",
)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point this at a whitelist file you produced yourself.
with open(KEYWORD_WHITELIST_PATH, "rb") as f:
    keyword_whitelist = pickle.load(f)
63
+
64
def normalize(text: str) -> str:
    """Return *text* in a canonical form for matching.

    Applies Unicode NFC normalization, rewrites the two mistyped spellings of
    the Thai vowel "แ" ("เแ", then "เเ") to "แ", trims surrounding whitespace
    and lowercases the result.
    """
    canonical = unicodedata.normalize("NFC", text)
    # Order matters: the mixed form is rewritten before the doubled "เ".
    for typo in ("เแ", "เเ"):
        canonical = canonical.replace(typo, "แ")
    trimmed = canonical.strip()
    return trimmed.lower()
68
+
69
def smart_tokenize(text: str) -> list:
    """Tokenize *text* with the "newmm" engine, falling back to the whole string.

    The input is stripped before tokenizing. If the tokenizer returns no
    tokens, or the tokens together cover less than half of the stripped
    input's length, the stripped input is returned as a single token instead.
    """
    stripped = text.strip()
    pieces = word_tokenize(stripped, engine="newmm")
    covered_chars = len("".join(pieces))
    # Accept the tokenization only when it is non-empty and keeps at least
    # half of the characters of the input.
    if pieces and covered_chars >= len(stripped) * 0.5:
        return pieces
    return [stripped]
74
+
75
def correct_query_merge_phrases(query: str, whitelist, threshold=75, max_ngram=3) -> str:
    """Fuzzy-correct *query* against a keyword whitelist, merging tokens into phrases.

    The query is normalized and tokenized, then scanned left to right. At each
    position the longest window (up to ``max_ngram`` tokens) is tried first:
    its tokens are concatenated and compared with the whitelist entries using
    ``fuzz.ratio``. The first window whose best score reaches ``threshold`` is
    replaced by the matching whitelist entry and the scan jumps past it. When
    no window matches, the single token is kept unchanged.

    Args:
        query: Raw user query.
        whitelist: Collection of known-good keywords to correct towards.
        threshold: Minimum ``fuzz.ratio`` score for a window to be replaced.
        max_ngram: Largest number of consecutive tokens merged into one phrase.

    Returns:
        The corrected words joined by single spaces. Words that are one
        character long are dropped unless they appear in ``whitelist``.
    """
    tokens = smart_tokenize(normalize(query))
    corrected = []
    i = 0

    while i < len(tokens):
        for n in range(min(max_ngram, len(tokens) - i), 0, -1):
            phrase = "".join(tokens[i:i + n])
            # extractOne returns None when the whitelist is empty or nothing
            # reaches score_cutoff; unpacking its result directly would raise
            # TypeError in that case, so check for None explicitly.
            best = process.extractOne(
                phrase, whitelist, scorer=fuzz.ratio, score_cutoff=threshold
            )
            if best is not None:
                corrected.append(best[0])
                i += n
                break
        else:
            # No window starting at this token matched: keep the token as is.
            corrected.append(tokens[i])
            i += 1

    # Drop words that are a single character long and not in the whitelist.
    cleaned = [word for word in corrected if len(word) > 1 or word in whitelist]
    return " ".join(cleaned)
98
 
99
  # 🌟 Main search function
100
  def search_product(query, model_name):
 
103
  return "<p>❌ ไม่พบโมเดล</p>"
104
 
105
  latest_query_result["raw_query"] = query
106
+ corrected_query = correct_query_merge_phrases(query,keyword_whitelist)
107
 
108
  query_embed = model_config[model_name]["func"](corrected_query)
109
  collection_name = model_config[model_name]["collection"]