Spaces:
Sleeping
Sleeping
File size: 14,118 Bytes
e322e6b 81054dc e322e6b 81054dc e322e6b 81054dc e322e6b 81054dc e322e6b 81054dc e322e6b 81054dc e322e6b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 |
import ast
import concurrent.futures
import math
import os
import sys

import gradio as gr
import pandas as pd
import requests
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI
from ragas import evaluate, SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
    ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
)
from tqdm import tqdm
# Force UTF-8 on stdout so the Chinese log messages below print correctly.
# Guarded because a replaced stdout object may not offer reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Download the ground-truth CSV once at start-up. The URL comes from the
# GT_URL environment variable (the original comment says it points at Google
# Drive); nothing is downloaded when the file is already present.
gt_url = os.environ.get("GT_URL")
gt_path = "tender_groundtruth.csv"
if gt_url and not os.path.exists(gt_path):
    print("嘗試下載 Ground Truth...")
    try:
        # A timeout keeps a stalled connection from hanging app start-up.
        r = requests.get(gt_url, timeout=60)
    except requests.RequestException as e:
        # Best effort: a failed download is reported here and again at
        # evaluation time, when the missing file is detected.
        print("Ground Truth 下載失敗:", str(e))
    else:
        print("HTTP 狀態碼:", r.status_code)
        if r.status_code != 200:
            print("下載失敗內容預覽:", r.text[:500])
        else:
            with open(gt_path, "wb") as f:
                f.write(r.content)
# Evaluation logs are recorded in a Google Sheet (per the original comment,
# one bound to the lab's Google account) through the endpoint in G_SHEET_URL.
def log_to_google_sheet(question, answer, contexts, scores):
    """Post one evaluated sample and its scores to the logging endpoint.

    Logging is best effort: every failure is printed and swallowed so that a
    logging problem never interrupts an evaluation run.

    Args:
        question: The question text of the sample.
        answer: The answer text that was evaluated.
        contexts: The retrieved contexts of the sample.
        scores: Mapping of display names ("Faithfulness", "Answer Relevancy",
            ...) to score values; missing names are sent as ``None``.

    Returns:
        None.
    """
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL 未設定,略過記錄")
        return
    try:
        payload = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "faithfulness": scores.get("Faithfulness"),
            "answer_relevancy": scores.get("Answer Relevancy"),
            "semantic_similarity": scores.get("Semantic Similarity"),
            "context_precision": scores.get("Context Precision"),
            "context_recall": scores.get("Context Recall"),
            "context_entity_recall": scores.get("Context Entity Recall"),
        }
        # Bounded wait: this runs once per evaluated row, so a hung endpoint
        # must not stall the whole evaluation.
        response = requests.post(url, json=payload, timeout=30)
        if response.ok:
            print("成功寫入 Google Sheet:", response.status_code)
        else:
            # An HTTP error status is a failure, not a successful write.
            print("寫入 Google Sheet 失敗:", response.status_code)
    except Exception as e:
        print("寫入 Google Sheet 失敗:", str(e))
def fetch_announcement_from_sheet():
    """Return the announcement text to show in the UI.

    Reads the CSV located by the ANNOUNCEMENT_URL environment variable and
    takes the first row of its ``Announcement`` column. Literal ``\\n``
    sequences and real newlines are both converted to ``<br>``.

    Returns:
        The announcement string, or the default "尚無公告" message when the
        variable is unset, the column is missing, the sheet has no rows, the
        first cell is empty, or loading fails for any reason.
    """
    DEFAULT_MESSAGE = "尚無公告"
    url = os.environ.get("ANNOUNCEMENT_URL")
    if not url:
        print("Warning: 環境變數 'ANNOUNCEMENT_URL' 未設定")
        return DEFAULT_MESSAGE
    try:
        df = pd.read_csv(url)
        if "Announcement" not in df.columns:
            print("Error: CSV 檔案中無 'Announcement' 欄位")
            return DEFAULT_MESSAGE
        if df.empty:
            return DEFAULT_MESSAGE
        first_cell = df["Announcement"].iloc[0]
        # An empty cell is read as NaN; str(NaN) would otherwise be displayed
        # to users as the announcement "nan".
        if pd.isna(first_cell):
            return DEFAULT_MESSAGE
        content = str(first_cell).strip()
        content = content.replace("\\n", "<br>").replace("\n", "<br>")
        return content if content else DEFAULT_MESSAGE
    except Exception as e:
        # The announcement is optional; never let it break UI construction.
        print(f"Error: 載入公告失敗:{e}")
        return DEFAULT_MESSAGE
def validate_openai_key(api_key):
    """Probe the OpenAI API with a one-token request to check *api_key*.

    Args:
        api_key: The OpenAI API key to try.

    Returns:
        ``None`` when the probe request succeeds. Otherwise a tuple
        ``(DataFrame, None)`` whose single "錯誤訊息" cell explains the
        failure, in the same shape the evaluation functions return.
    """
    # Substrings of the exception text mapped to user-facing messages; the
    # first marker found wins, in this order.
    known_failures = (
        ("Incorrect API key provided", " 您輸入的 OpenAI API Key 有誤,請確認是否貼錯、字數不符或格式異常。"),
        ("exceeded your current quota", "您的 OpenAI 帳戶額度已用盡,請前往帳戶頁面檢查餘額。"),
        ("Rate limit", "OpenAI 請求頻率過高,請稍後再試"),
    )
    try:
        OpenAI(api_key=api_key).chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
        )
    except Exception as exc:
        detail = str(exc)
        user_message = next(
            (text for marker, text in known_failures if marker in detail),
            f"API Key 錯誤:{detail}",
        )
        return pd.DataFrame([{"錯誤訊息": user_message}]), None
    return None
def _parse_context(raw):
    """Parse one raw ``Context`` CSV cell into a Python object without executing it."""
    if not isinstance(raw, str):
        return raw
    try:
        # SECURITY: the cell comes from a user-uploaded file. The previous
        # implementation used eval(), which runs arbitrary code; literal_eval
        # only accepts Python literals such as ['A', 'B'].
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Leave unparsable cells as-is; the caller skips non-list contexts.
        return raw


def RAG_evaluation(uploaded_file, user_api_key):
    """Score an uploaded RAG result file with RAGAS against the ground truth.

    The upload is joined with the ground-truth CSV on ``Question`` and
    evaluated in batches of 10. Each scored row is also sent to
    ``log_to_google_sheet``. An "Average" row is appended and the table is
    written to ``<upload name>_result.csv`` in the working directory.

    Args:
        uploaded_file: Object whose ``.name`` is the path of the uploaded CSV
            (columns Question, Context, Answer).
        user_api_key: OpenAI API key used for the evaluation.

    Returns:
        ``(score_df, output_path)`` on success. On failure,
        ``(DataFrame with a single "錯誤訊息" cell, None)``. This function
        does not raise; unexpected errors are reported the same way.
    """
    try:
        # Keys pasted with stray whitespace or newlines would fail validation.
        user_api_key = (user_api_key or "").strip()

        # Check that the OpenAI API key works before doing anything costly.
        validation_result = validate_openai_key(user_api_key)
        if validation_result:
            return validation_result
        # NOTE(review): this is process-wide state shared by all requests.
        os.environ["OPENAI_API_KEY"] = user_api_key
        print("評估開始")

        if not os.path.exists(gt_path):
            print("找不到 Ground Truth!")
            # Tell the user why the table is empty instead of returning a
            # blank DataFrame.
            return pd.DataFrame([{"錯誤訊息": "找不到 Ground Truth!"}]), None

        gt_df = pd.read_csv(gt_path)
        # Read the upload the same way check_csv_and_run validated it
        # (utf-8-sig, stripped column names) so both see the same columns.
        df = pd.read_csv(uploaded_file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        df["Context"] = df["Context"].apply(_parse_context)
        print(f"上傳檔案筆數:{len(df)},GT 檔案筆數:{len(gt_df)}")

        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"成功合併筆數:{len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("未合併題目:", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"錯誤訊息": "合併後無資料,請確認題目與 GT 是否對應"}]), None

        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))

        batch_size = 10
        records = []
        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]

            # Keep each sample next to its source row. Rows may be skipped, so
            # result row i is NOT batch_df row i in general.
            samples = []
            sample_rows = []
            for _, row in batch_df.iterrows():
                if not isinstance(row["Context"], list):
                    print(f"Context 非 list,跳過。值:{row['Question']}")
                    continue
                try:
                    sample = SingleTurnSample(
                        user_input=row["Question"],
                        response=row["Answer"],
                        retrieved_contexts=row["Context"],
                        reference=row["GroundTruth"],
                    )
                except Exception as e:
                    # One malformed row must not abort a long-running job.
                    print(f"樣本建立失敗,跳過。題目:{row['Question']}:{e}")
                    continue
                samples.append(sample)
                sample_rows.append(row)

            if not samples:
                continue

            try:
                dataset = Dataset.from_list([s.to_dict() for s in samples])
                # NoiseSensitivity and FactualCorrectness are currently disabled.
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(),  # context precision
                        LLMContextRecall(),  # context recall
                        ContextEntityRecall(),
                        Faithfulness(),  # faithfulness
                        ResponseRelevancy(),  # answer relevancy
                        SemanticSimilarity(),  # semantic similarity
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=True
                )
                result_df = result.to_pandas()
                for input_row, scored in zip(sample_rows, result_df.itertuples()):
                    raw_record = {
                        "Question": input_row["Question"],
                        "Faithfulness": getattr(scored, "faithfulness", None),
                        "Answer Relevancy": getattr(scored, "answer_relevancy", None),
                        "Semantic Similarity": getattr(scored, "semantic_similarity", None),
                        "Context Precision": getattr(scored, "llm_context_precision_with_reference", None),
                        "Context Recall": getattr(scored, "context_recall", None),
                        "Context Entity Recall": getattr(scored, "context_entity_recall", None),
                    }
                    # NaN/inf scores are blanked out before the record is
                    # logged and stored.
                    record = {
                        key: "" if isinstance(val, float) and not math.isfinite(val) else val
                        for key, val in raw_record.items()
                    }
                    records.append(record)
                    log_to_google_sheet(
                        question=input_row["Question"],
                        answer=input_row["Answer"],
                        contexts=input_row["Context"],
                        scores=record
                    )
            except Exception as e:
                print(f"批次評估失敗(第 {batch_start+1} 筆起):{e}")
                continue

        if not records:
            # Without this guard the averaging below fails with a KeyError on
            # the missing "Question" column.
            return pd.DataFrame([{"錯誤訊息": "沒有任何資料完成評估,請確認 Context 格式、API 額度或稍後再試"}]), None

        score_df = pd.DataFrame(records)
        print("完成評估筆數:", len(score_df))

        # Average every metric over its valid scores. Coercing to numeric keeps
        # a column in the average even when some of its rows are blank.
        metric_cols = [col for col in score_df.columns if col != "Question"]
        averages = score_df[metric_cols].apply(pd.to_numeric, errors="coerce").mean().dropna()
        if not averages.empty:
            avg_row = averages.to_dict()
            avg_row["Question"] = "Average"
            score_df = pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)
        score_df = score_df.fillna("")

        original_name = os.path.basename(uploaded_file.name)
        filename = os.path.splitext(original_name)[0]
        output_path = f"{filename}_result.csv"
        # utf-8-sig so spreadsheet tools open the Chinese text correctly.
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("評估結果已儲存:", output_path)
        return score_df, output_path
    except Exception as e:
        print("評估函式整體錯誤:", str(e))
        return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
# Validate the upload and the API key, then run the RAG evaluation.
def check_csv_and_run(file, key):
    """Validate the uploaded CSV and the API key, then run ``RAG_evaluation``.

    Args:
        file: Uploaded file object exposing the CSV path as ``.name``, or
            ``None`` when nothing was uploaded.
        key: OpenAI API key typed by the user.

    Returns:
        Whatever ``RAG_evaluation`` returns when validation passes; otherwise
        ``(DataFrame with a single "錯誤訊息" cell, None)`` describing the
        first problem found. This function does not raise.
    """
    if file is None:
        return pd.DataFrame([{"錯誤訊息": "請上傳檔案!"}]), None
    if not key or not key.strip():
        return pd.DataFrame([{"錯誤訊息": "請輸入 OpenAI API Key"}]), None
    try:
        df = pd.read_csv(file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]

        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)
        if actual_columns != required_columns:
            return pd.DataFrame([{"錯誤訊息": f"欄位錯誤:應包含欄位 {required_columns},實際為 {actual_columns}"}]), None
        if df.shape[0] == 0:
            return pd.DataFrame([{"錯誤訊息": "檔案中沒有資料列!"}]), None

        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return pd.DataFrame([{"錯誤訊息": f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空:\n{missing_questions}"}]), None

        # Every Context cell must be the text of a Python list literal.
        # SECURITY: the file is user-supplied. The previous implementation
        # called eval() on each cell, which runs arbitrary code;
        # ast.literal_eval only accepts literals such as ['A', 'B'].
        try:
            for i, val in df["Context"].dropna().items():
                parsed = ast.literal_eval(val) if isinstance(val, str) else val
                if not isinstance(parsed, list):
                    return pd.DataFrame([{"錯誤訊息": f"第 {i + 1} 筆 Context 欄格式錯誤,請確認其內容應為 list"}]), None
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            return pd.DataFrame([{"錯誤訊息": f"Context 欄格式解析錯誤,請確認其為有效的 list 格式,例如 ['A', 'B']:{str(e)}"}]), None

        # The upload is valid: run the evaluation. The key is passed without
        # the surrounding whitespace that the emptiness check above ignored.
        try:
            return RAG_evaluation(file, key.strip())
        except Exception as e:
            error_message = str(e)
            return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{error_message}"}]), None
    except Exception as e:
        return pd.DataFrame([{"錯誤訊息": f"評估失敗:{str(e)}"}]), None
# Gradio UI: usage notes, announcement, upload + key inputs, result outputs.
with gr.Blocks() as demo:
    gr.Markdown("""
# 📐 RAG系統評估工具 (分流A)
### 📄 使用說明
- 請上傳您 RAG 系統產出的結果檔案(需包含欄位:Question、Context、Answer),並填入您的 OpenAI API Key,以進行評估。
- ⏳ 完整評估**通常需耗時 1 小時以上**,若無即時回應,請**耐心等候**,系統並未當機,謝謝您的理解。
### 🚦 分流措施
本工具部署於 Hugging Face Public Space,若同時有多位使用者使用,系統會將您的評估請求**排入佇列**。
為避免長時間等待,建議您**先僅送出 1 筆資料進行測試**,若進度條顯示之預估**等待時間超過 2 小時(7000 秒以上),可能是其他使用者正在使用**。
本頁為**分流 A**,您可以考慮改用其他分流或稍後再試,感謝您的耐心與配合!
- 🔁 [主頁面 (Main)](https://huggingface.co/spaces/KSLab/RAG_Evaluator)
- 🔁 [分流 A](https://huggingface.co/spaces/KSLab/RAG_Evaluator_A)
- 🔁 [分流 C](https://huggingface.co/spaces/KSLab/RAG_Evaluator_C)
### 📢 系統公告
""")
    # The announcement is fetched once, when the UI is built.
    gr.Markdown(fetch_announcement_from_sheet())

    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("開始評估")
    result_output = gr.Dataframe(label="評估結果")
    download_link = gr.File(label="下載評估結果(CSV)")

    # FAQ text
    gr.Markdown("""
---
### ❓ 常見問題 & 解答
**Q: 什麼是「指令集」?**
A: 「指令集」是我們用來描述老師在課堂上所設計的各種學習活動操作流程。在與教學系統互動時,老師通常會透過一系列結構化的指令來引導學生完成任務,因此我們將這些可重複使用的操作流程統稱為「指令集」。
指令集也如同RESTful API一樣,我們有先盡力的與老師們溝通他們的需求,不過這些需求都只能視為一個草案,最終仍需要仰賴得標業者與老師們收斂,並且確定最終的版本來加以實作。
""")

    # NOTE(review): this wrapper is not wired to any event in this file; the
    # button below calls check_csv_and_run, which validates before evaluating.
    def wrapped_fn(file, key):
        return RAG_evaluation(file, key)

    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link],
    )

demo.launch()