pcreem committed on
Commit
ed14d2a
·
1 Parent(s): 854f8ed
Files changed (2) hide show
  1. app.py +87 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import faiss
4
+ import pickle
5
+ from datasets import load_dataset
6
+ from sentence_transformers import SentenceTransformer
7
+ from llama_cpp import Llama
8
+ import gradio as gr
9
+
10
# =========================
# STEP 1: Load the Hugging Face dataset
# =========================
# Fetch the "train" split of the pcreem/37 dataset and convert it to a
# pandas DataFrame for row-wise processing.
dataset = load_dataset(
    "pcreem/37",
    split="train",
)
df = dataset.to_pandas()

# Strip surrounding whitespace from the column headers so that the
# row[...] lookups used later match the header names exactly.
cleaned_columns = df.columns.str.strip()
df.columns = cleaned_columns
16
+
17
# (label shown in the passage, source column in the DataFrame), in output order.
_PASSAGE_FIELDS = (
    ("藥品名稱", "中文品名"),
    ("英文品名", "英文品名"),
    ("主成分", "主成分略述"),
    ("劑型", "劑型"),
    ("適應症", "適應症"),
    ("用法用量", "用法用量"),
    ("申請商", "申請商名稱"),
    ("製造商", "製造商名稱"),
    ("製造廠地址", "製造廠廠址"),
    ("包裝", "包裝"),
    ("有效日期", "有效日期"),
    ("許可證字號", "許可證字號"),
)


def _field_text(value):
    """Return *value* as text, rendering a missing scalar as an empty string."""
    # Without this guard a missing cell is interpolated as the literal text
    # "None" / "nan", which then ends up in the embedded passage and in the
    # context handed to the LLM.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def make_passage(row):
    """Build the retrieval passage for one drug record.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) that provides
            every source column listed in ``_PASSAGE_FIELDS``.

    Returns:
        A multi-line string with one ``label:value`` line per field, in the
        order of ``_PASSAGE_FIELDS``. Missing values (None / NaN / NaT /
        pd.NA) are rendered as empty text.

    Raises:
        KeyError: If ``row`` lacks one of the expected columns.
    """
    return "\n".join(
        f"{label}:{_field_text(row[column])}"
        for label, column in _PASSAGE_FIELDS
    )
30
+
31
# Render every row into its passage text. `passages` is a plain list in
# DataFrame row order, which is also the order the vectors are added to the
# index below, so a hit position from the index addresses `passages` directly.
df["retrieval_passage"] = df.apply(make_passage, axis=1)
passages = df["retrieval_passage"].tolist()

# =========================
# STEP 2: Build the FAISS retrieval index
# =========================
embedding_model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# Encode all passages once at start-up.
embeddings = embedding_model.encode(passages, show_progress_bar=True)

# The index is created with the embedding width and fed float32 vectors.
dimension = embeddings.shape[1]
passage_vectors = np.array(embeddings).astype("float32")
index = faiss.IndexFlatL2(dimension)
index.add(passage_vectors)
42
+
43
# =========================
# STEP 3: Load the Llama model
# =========================
from huggingface_hub import hf_hub_download

# Resolve the GGUF weights on the Hugging Face Hub to a local file path.
model_path = hf_hub_download(
    filename="llama-3-taiwan-8B-instruct-q5_1.gguf",
    repo_id="chienweichang/Llama-3-Taiwan-8B-Instruct-GGUF",
)

# Constructor settings kept together in one place.
# NOTE(review): n_gpu_layers / n_ctx semantics are defined by llama-cpp-python;
# confirm these values suit the deployment hardware.
llama_settings = {
    "model_path": model_path,
    "n_gpu_layers": 35,
    "n_ctx": 2048,
    "seed": 42,
    "verbose": False,
}
llm = Llama(**llama_settings)
60
+
61
+ # =========================
62
+ # STEP 4: 定義查詢函式
63
+ # =========================
64
def rag_qa(query, k=3):
    """Answer a question using retrieved drug passages as context.

    The query is embedded with ``embedding_model``, the ``k`` nearest
    passages are looked up in ``index``, and the joined passages plus the
    question are sent to ``llm`` as a single user turn.

    Args:
        query: The user's question text.
        k: Number of nearest passages to request from the index.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    query_vector = np.array(embedding_model.encode([query])).astype("float32")
    _, neighbour_ids = index.search(query_vector, k=k)

    # FAISS fills unused result slots with -1 when fewer than k vectors are
    # available. Indexing `passages` with -1 would silently pull in the LAST
    # passage as bogus context, so drop those slots.
    top_passages = [passages[idx] for idx in neighbour_ids[0] if idx >= 0]

    context = "\n\n---\n\n".join(top_passages)
    system_prompt = "你是一位專業藥師,根據以下藥品資料,回答使用者的問題,請用簡潔中文說明並避免虛構資訊。\n"
    user_prompt = f"{system_prompt}\n以下是參考資料:\n\n{context}\n\n使用者問題:{query}"
    # The instructions, context and question all travel in one user turn;
    # generation stops at the end-of-turn marker.
    chat_prompt = f"<|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"

    output = llm(
        chat_prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>"],
    )
    answer = output["choices"][0]["text"]
    return answer.strip()
77
+
78
# =========================
# STEP 5: Gradio interface
# =========================
# One text box in, one text box out; rag_qa receives only the question, so
# its `k` parameter keeps its default.
question_box = gr.Textbox(label="請輸入問題", placeholder="例如:感冒藥有什麼選擇?")
answer_box = gr.Textbox(label="藥師回答")

demo = gr.Interface(
    fn=rag_qa,
    inputs=question_box,
    outputs=answer_box,
    title="台灣藥品問答系統",
    description="輸入藥品相關問題,我會根據台灣合法藥品資料庫回答你!",
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ sentence-transformers
2
+ faiss-cpu
3
+ llama-cpp-python
4
+ datasets
5
+ gradio
6
+ huggingface_hub