import gradio as gr
import time
from datetime import datetime
import pandas as pd
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
import os
from rapidfuzz import process, fuzz
from pythainlp.tokenize import word_tokenize
from pyairtable import Api
import pickle
import re
import unicodedata
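# Qdrant vector database client; the URL and API key are read from the
# "Qdrant_url" and "Qdrant_api" environment variables (e.g. Space secrets).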
qdrant_client = QdrantClient(
url=os.environ.get("Qdrant_url"),
api_key=os.environ.get("Qdrant_api"),
)
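# Airtable is used as a lightweight log store for search feedback; credentials
# come from the "airtable_api" and "airtable_baseid" environment variables.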
AIRTABLE_API_KEY = os.environ.get("airtable_api")
BASE_ID = os.environ.get("airtable_baseid")
TABLE_NAME = "Feedback_search"  # or rename for clarity, e.g. "Feedback"
api = Api(AIRTABLE_API_KEY)
table = api.table(BASE_ID, TABLE_NAME)
# Preloaded embedding models
models = {
"E5 (intfloat/multilingual-e5-small)": SentenceTransformer('intfloat/multilingual-e5-small'),
"E5 large instruct (multilingual-e5-large-instruct)": SentenceTransformer("intfloat/multilingual-e5-large-instruct"),
"Kalm (KaLM-embedding-multilingual-mini-v1)": SentenceTransformer('HIT-TMG/KaLM-embedding-multilingual-mini-v1')
}
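# Per-model query handling: how to embed the query (the E5 models expect a
# "query:" / instruction prefix) and which Qdrant collection to search.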
model_config = {
"E5 (intfloat/multilingual-e5-small)": {
"func": lambda query: models["E5 (intfloat/multilingual-e5-small)"].encode("query: " + query),
"collection": "product_E5",
},
"E5 large instruct (multilingual-e5-large-instruct)": {
"func": lambda query: models["E5 large instruct (multilingual-e5-large-instruct)"].encode(
"Instruct: Given a product search query, retrieve relevant product listings\nQuery: " + query, convert_to_tensor=False, normalize_embeddings=True),
"collection": "product_E5_large_instruct",
},
"Kalm (KaLM-embedding-multilingual-mini-v1)": {
"func": lambda query: models["Kalm (KaLM-embedding-multilingual-mini-v1)"].encode(query, normalize_embeddings=True),
"collection": "product_kalm",
}
}
# Global memory to hold feedback state
latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
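# Whitelist of known product keywords (assumed to be built offline and shipped
# with the Space), used below for fuzzy spelling correction of user queries.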
with open("keyword_whitelist.pkl", "rb") as f:
keyword_whitelist = pickle.load(f)
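# Normalize Unicode to NFC and fix common Thai typing slips (เแ / เเ typed instead of แ).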
def normalize(text: str) -> str:
text = unicodedata.normalize("NFC", text)
text = text.replace("เแ", "แ").replace("เเ", "แ")
return text.strip().lower()
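# Tokenize Thai text with newmm; if tokenization loses too many characters
# (less than half survive), fall back to treating the whole query as one token.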
def smart_tokenize(text: str) -> list:
tokens = word_tokenize(text.strip(), engine="newmm")
if not tokens or len("".join(tokens)) < len(text.strip()) * 0.5:
return [text.strip()]
return tokens
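# Fuzzy-correct the query against the keyword whitelist: longer n-grams of
# adjacent tokens (up to max_ngram) are tried first, so phrases split by the
# tokenizer can be merged back into a single whitelist keyword.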
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
query_norm = normalize(query)
tokens = smart_tokenize(query_norm)
corrected = []
i = 0
while i < len(tokens):
matched = False
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
phrase = "".join(tokens[i:i+n])
match, score, _ = process.extractOne(phrase, whitelist, scorer=fuzz.token_sort_ratio)
if score >= threshold:
corrected.append(match)
i += n
matched = True
break
if not matched:
corrected.append(tokens[i])
i += 1
    # ✅ Drop single-character tokens unless they appear in the whitelist
cleaned = [word for word in corrected if len(word) > 1 or word in whitelist]
return "".join(cleaned)
# 🌟 Main search function
def search_product(query, model_name):
start_time = time.time()
if model_name not in model_config:
return "<p>❌ ไม่พบโมเดล</p>"
latest_query_result["raw_query"] = query
    corrected_query = correct_query_merge_phrases(query, keyword_whitelist)
query_embed = model_config[model_name]["func"](corrected_query)
collection_name = model_config[model_name]["collection"]
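    # Vector search: fetch the 50 nearest neighbours from the model's collection,
    # restricted to payloads whose "type" field equals "product".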
try:
result = qdrant_client.query_points(
collection_name=collection_name,
query=query_embed.tolist(),
with_payload=True,
query_filter=Filter(must=[FieldCondition(key="type", match=MatchValue(value="product"))]),
limit=50
).points
except Exception as e:
return f"<p>❌ Qdrant error: {str(e)}</p>"
elapsed = time.time() - start_time
html_output = f"<p>⏱ <strong>{elapsed:.2f} วินาที</strong></p>"
if corrected_query != query:
html_output += f"<p>🔧 แก้คำค้นจาก: <code>{query}</code> → <code>{corrected_query}</code></p>"
html_output += """
<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); gap: 20px;">
"""
result_summary = ""
found = False
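    # Render only hits above a similarity score of 0.83 as product cards.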
for res in result:
if res.score > 0.83:
found = True
name = res.payload.get("name", "ไม่ทราบชื่อสินค้า")
score = f"{res.score:.4f}"
img_url = res.payload.get("imageUrl", "")
price = res.payload.get("price", "ไม่ระบุ")
brand = res.payload.get("brand", "")
html_output += f"""
<div style="border: 1px solid #ddd; border-radius: 8px; padding: 10px; text-align: center; box-shadow: 1px 1px 5px rgba(0,0,0,0.1); background: #fff;">
<img src="{img_url}" style="width: 100%; max-height: 150px; object-fit: contain; border-radius: 4px;">
<div style="margin-top: 10px;">
<div style="font-weight: bold; font-size: 14px;">{name}</div>
<div style="color: gray; font-size: 12px;">{brand}</div>
<div style="color: green; margin: 4px 0;">฿{price}</div>
<div style="font-size: 12px; color: #555;">score: {score}</div>
</div>
</div>
"""
result_summary += f"{name} (score: {score}) | "
html_output += "</div>"
if not found:
html_output += """
<div style="text-align: center; font-size: 18px; color: #a00; padding: 30px;">
❌ ไม่พบสินค้าที่เกี่ยวข้องกับคำค้นนี้
</div>
"""
return html_output
latest_query_result["query"] = corrected_query
latest_query_result["result"] = result_summary.strip()
latest_query_result["model"] = model_name
latest_query_result["time"] = elapsed
return html_output
# 📝 Logging feedback
def log_feedback(feedback):
try:
now = datetime.now().strftime("%Y-%m-%d")
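        # One Airtable record per feedback click; the keys below are expected to
        # match the column names of the Feedback_search table.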
table.create({
"timestamp": now,
"raw_query": latest_query_result["raw_query"],
"model": latest_query_result["model"],
"query": latest_query_result["query"],
"result": latest_query_result["result"],
"time(second)": latest_query_result["time"],
"feedback": feedback
})
return "✅ Feedback saved to Airtable!"
except Exception as e:
return f"❌ Failed to save feedback: {str(e)}"
# 🎨 Gradio UI
with gr.Blocks() as demo:
gr.Markdown("## 🔎 Product Semantic Search (Vector Search + Qdrant)")
with gr.Row():
model_selector = gr.Dropdown(
choices=list(models.keys()),
label="เลือกโมเดล",
value="E5 (intfloat/multilingual-e5-small)"
)
query_input = gr.Textbox(label="พิมพ์คำค้นหา")
    result_output = gr.HTML(label="📋 ผลลัพธ์")  # HTML output so results render with product images
with gr.Row():
match_btn = gr.Button("✅ ตรง")
not_match_btn = gr.Button("❌ ไม่ตรง")
feedback_status = gr.Textbox(label="📬 สถานะ Feedback")
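    # Wire up events: pressing Enter in the textbox runs the search; the two
    # buttons log "match" / "not_match" feedback for the latest search to Airtable.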
query_input.submit(search_product, inputs=[query_input, model_selector], outputs=result_output)
match_btn.click(lambda: log_feedback("match"), outputs=feedback_status)
not_match_btn.click(lambda: log_feedback("not_match"), outputs=feedback_status)
# Run app
demo.launch(share=True)
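# Note: share=True requests a public Gradio link; when the app is hosted on
# Hugging Face Spaces, demo.launch() without share is typically sufficient.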