# NOTE: page-scrape residue, not part of the program ("Spaces: Sleeping").
import ast
import concurrent.futures
import math
import os
import sys

import gradio as gr
import pandas as pd
import requests
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI
from ragas import evaluate, SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
    ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
)
from tqdm import tqdm
# Force UTF-8 on stdout so the Chinese log messages print correctly.
# Guarded because a replaced stdout object may not offer reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Download the Ground Truth CSV (URL supplied through the GT_URL environment
# variable) unless a local copy already exists.
gt_url = os.environ.get("GT_URL")
gt_path = "tender_groundtruth.csv"
if gt_url and not os.path.exists(gt_path):
    print("嘗試下載 Ground Truth...")
    try:
        # A timeout keeps a stalled connection from hanging start-up forever.
        response = requests.get(gt_url, timeout=60)
    except requests.RequestException as download_error:
        # Keep the app bootable; RAG_evaluation checks for the file itself.
        print("下載 Ground Truth 失敗:", download_error)
    else:
        print("HTTP 狀態碼:", response.status_code)
        if response.status_code != 200:
            print("下載失敗內容預覽:", response.text[:500])
        else:
            with open(gt_path, "wb") as gt_file:
                gt_file.write(response.content)
# Evaluation logs are posted to the lab's Google Sheet endpoint (G_SHEET_URL).
def log_to_google_sheet(question, answer, contexts, scores):
    """Post one evaluated sample and its scores to the logging endpoint.

    Logging is best-effort: any failure is printed and swallowed so that it
    can never interrupt an evaluation run.

    Args:
        question: The question text of the evaluated sample.
        answer: The answer text of the evaluated sample.
        contexts: The retrieved contexts of the evaluated sample.
        scores: Mapping read with ``.get`` for the keys "Faithfulness",
            "Answer Relevancy", "Semantic Similarity", "Context Precision",
            "Context Recall" and "Context Entity Recall".

    Returns:
        None.
    """
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL 未設定,略過記錄")
        return
    try:
        payload = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "faithfulness": scores.get("Faithfulness"),
            "answer_relevancy": scores.get("Answer Relevancy"),
            "semantic_similarity": scores.get("Semantic Similarity"),
            "context_precision": scores.get("Context Precision"),
            "context_recall": scores.get("Context Recall"),
            "context_entity_recall": scores.get("Context Entity Recall"),
        }
        # Without a timeout a stalled endpoint would block the whole batch loop.
        response = requests.post(url, json=payload, timeout=30)
        if response.ok:
            print("成功寫入 Google Sheet:", response.status_code)
        else:
            # A 4xx/5xx reply means the row was not recorded; do not report success.
            print("寫入 Google Sheet 失敗:", response.status_code)
    except Exception as e:
        # Deliberately broad: this is a best-effort side channel.
        print("寫入 Google Sheet 失敗:", str(e))
def fetch_sheet_content():
    """Load the announcement and FAQ texts from the CSV at ANNOUNCEMENT_URL.

    The first row of the "Announcement" and "FAQ" columns is used. Literal
    ``\\n`` sequences and real newlines are both converted to ``<br>``.
    Each value falls back to its own default independently, so an empty or
    non-text cell in one column no longer discards the other column.

    Returns:
        A ``(announcement, faq)`` tuple of strings. Both defaults are
        returned when the URL is unset or the sheet cannot be read.
    """
    DEFAULT_ANNOUNCEMENT = "尚無公告"
    DEFAULT_FAQ = "尚無常見問題"

    def first_cell_as_html(df, column, default):
        # Missing column, no rows, or an empty (NaN) cell: use the default.
        if column not in df.columns or df.empty:
            return default
        value = df[column].iloc[0]
        if pd.isna(value):
            return default
        # str() guards against numeric cells, which have no .strip().
        text = str(value).strip()
        text = text.replace("\\n", "<br>").replace("\n", "<br>")
        return text or default

    try:
        url = os.environ.get("ANNOUNCEMENT_URL")
        if not url:
            print("Warning: 未設定 ANNOUNCEMENT_URL")
            return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ
        df = pd.read_csv(url)
        return (
            first_cell_as_html(df, "Announcement", DEFAULT_ANNOUNCEMENT),
            first_cell_as_html(df, "FAQ", DEFAULT_FAQ),
        )
    except Exception as e:
        # Page load must never fail because of the announcement sheet.
        print("載入 Sheet 錯誤:", e)
        return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ
def validate_openai_key(api_key):
    """Check *api_key* by sending a one-token chat completion request.

    Returns:
        None when the request succeeds. On any exception, a
        ``(DataFrame, None)`` pair whose single-row frame holds a
        user-facing message in the "錯誤訊息" column.
    """
    # Checked in order; the first marker found in the error text wins.
    known_failures = (
        ("Incorrect API key provided", " 您輸入的 OpenAI API Key 有誤,請確認是否貼錯、字數不符或格式異常。"),
        ("exceeded your current quota", "您的 OpenAI 帳戶額度已用盡,請前往帳戶頁面檢查餘額。"),
        ("Rate limit", "OpenAI 請求頻率過高,請稍後再試"),
    )
    try:
        OpenAI(api_key=api_key).chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
        )
    except Exception as exc:
        detail = str(exc)
        message = next(
            (text for marker, text in known_failures if marker in detail),
            f"API Key 錯誤:{detail}",
        )
        return pd.DataFrame([{"錯誤訊息": message}]), None
    return None
# Display name of each reported score -> column name in the RAGAS result frame.
_SCORE_COLUMNS = {
    "Faithfulness": "faithfulness",
    "Answer Relevancy": "answer_relevancy",
    "Semantic Similarity": "semantic_similarity",
    "Context Precision": "llm_context_precision_with_reference",
    "Context Recall": "context_recall",
    "Context Entity Recall": "context_entity_recall",
}


def _parse_context(raw):
    """Parse one Context cell as a Python literal; return None if it is not one."""
    # SECURITY: the CSV is user-uploaded, so it must never reach eval().
    # ast.literal_eval only accepts literals and cannot execute code.
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def _build_samples(batch_df):
    """Return (kept_rows, samples) for the rows whose Context is a list."""
    kept_rows, samples = [], []
    for _, row in batch_df.iterrows():
        if not isinstance(row["Context"], list):
            print(f"Context 非 list,跳過。值:{row['Question']}")
            continue
        kept_rows.append(row)
        samples.append(
            SingleTurnSample(
                user_input=row["Question"],
                response=row["Answer"],
                retrieved_contexts=row["Context"],
                reference=row["GroundTruth"],
            )
        )
    return kept_rows, samples


def _score_record(question, result_row):
    """Build one output record, blanking scores that are NaN or infinite."""
    record = {"Question": question}
    for label, column in _SCORE_COLUMNS.items():
        value = result_row.get(column)
        if isinstance(value, float) and not math.isfinite(value):
            value = ""
        record[label] = value
    return record


def _append_average_row(score_df):
    """Append an "Average" row holding the mean of each metric's numeric values."""
    # Blank cells make a column object-typed; coerce so one missing score
    # does not remove the whole metric from the averages.
    metrics = score_df.drop(columns=["Question"]).apply(pd.to_numeric, errors="coerce")
    means = metrics.mean()
    if not means.notna().any():
        return score_df
    avg_row = {"Question": "Average"}
    avg_row.update({name: ("" if pd.isna(mean) else mean) for name, mean in means.items()})
    return pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)


def RAG_evaluation(uploaded_file, user_api_key):
    """Score an uploaded RAG result CSV against the Ground Truth with RAGAS.

    The upload is merged with the Ground Truth on "Question", evaluated in
    batches of 10, logged per sample via ``log_to_google_sheet`` and written
    to ``<upload name>_result.csv`` in the working directory.

    Args:
        uploaded_file: Object whose ``.name`` is the path of the uploaded CSV
            (columns Question, Context, Answer).
        user_api_key: OpenAI API key used for the evaluation.

    Returns:
        ``(score_df, output_path)`` on success, or ``(error_df, None)`` where
        ``error_df`` carries a message in the "錯誤訊息" column.
    """
    try:
        # Reject an unusable API key before doing any work.
        validation_result = validate_openai_key(user_api_key)
        if validation_result:
            return validation_result
        # NOTE(review): process-wide setting; requests appear to be handled
        # one at a time through the queue — confirm before raising concurrency.
        os.environ["OPENAI_API_KEY"] = user_api_key
        print("評估開始")
        if not os.path.exists(gt_path):
            print("找不到 Ground Truth!")
            return pd.DataFrame([{"錯誤訊息": "找不到 Ground Truth 檔案,無法進行評估"}]), None
        gt_df = pd.read_csv(gt_path)
        # Read the upload the same way check_csv_and_run validates it
        # (BOM-tolerant, headers stripped) so validated files also load here.
        df = pd.read_csv(uploaded_file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        df["Context"] = df["Context"].map(_parse_context)
        print(f"上傳檔案筆數:{len(df)},GT 檔案筆數:{len(gt_df)}")
        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"成功合併筆數:{len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("未合併題目:", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"錯誤訊息": "合併後無資料,請確認題目與 GT 是否對應"}]), None
        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
        batch_size = 10
        records = []
        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]
            # kept_rows stays aligned with samples even when rows are skipped,
            # so every score is attributed to the question that produced it.
            kept_rows, samples = _build_samples(batch_df)
            if not samples:
                continue
            try:
                dataset = Dataset.from_list([s.to_dict() for s in samples])
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(),  # context precision
                        LLMContextRecall(),  # context recall
                        ContextEntityRecall(),
                        Faithfulness(),  # faithfulness
                        ResponseRelevancy(),  # answer relevancy
                        SemanticSimilarity(),  # semantic similarity
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=True
                )
                result_rows = result.to_pandas().to_dict("records")
                for input_row, result_row in zip(kept_rows, result_rows):
                    record = _score_record(input_row["Question"], result_row)
                    records.append(record)
                    log_to_google_sheet(
                        question=input_row["Question"],
                        answer=input_row["Answer"],
                        contexts=input_row["Context"],
                        scores=record
                    )
            except Exception as e:
                # One failed batch must not discard the batches already scored.
                print(f"批次評估失敗(第 {batch_start+1} 筆起):{e}")
                continue
        if not records:
            # Nothing was scored (all rows skipped or every batch failed).
            return pd.DataFrame([{"錯誤訊息": "評估未產生任何結果,請確認 Context 欄位格式與 API 狀態"}]), None
        score_df = pd.DataFrame(records).fillna("")
        print("完成評估筆數:", len(score_df))
        score_df = _append_average_row(score_df)
        original_name = os.path.basename(uploaded_file.name)
        filename = os.path.splitext(original_name)[0]
        output_path = f"{filename}_result.csv"
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("評估結果已儲存:", output_path)
        return score_df, output_path
    except Exception as e:
        print("評估函式整體錯誤:", str(e))
        return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
def _error_result(message):
    """Return the (DataFrame, None) pair used to show *message* in the UI."""
    return pd.DataFrame([{"錯誤訊息": message}]), None


# Validate the upload and the API key, then run the RAG evaluation.
def check_csv_and_run(file, key):
    """Validate the uploaded CSV and API key, then call ``RAG_evaluation``.

    Args:
        file: Uploaded file object (its ``.name`` is the CSV path), or None.
        key: OpenAI API key as typed by the user.

    Returns:
        Whatever ``RAG_evaluation`` returns when validation passes, otherwise
        ``(error_df, None)`` with a message in the "錯誤訊息" column.
    """
    if file is None:
        return _error_result("請上傳檔案!")
    if not key or key.strip() == "":
        return _error_result("請輸入 OpenAI API Key")
    try:
        df = pd.read_csv(file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)
        if actual_columns != required_columns:
            return _error_result(f"欄位錯誤:應包含欄位 {required_columns},實際為 {actual_columns}")
        if df.shape[0] == 0:
            return _error_result("檔案中沒有資料列!")
        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return _error_result(f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空:\n{missing_questions}")
        # Every Context cell must be a list literal.
        # SECURITY: cells come from an untrusted upload, so they are parsed
        # with ast.literal_eval (literals only) and never with eval().
        try:
            for i, val in df["Context"].dropna().items():
                if not isinstance(ast.literal_eval(val), list):
                    return _error_result(f"第 {i + 1} 筆 Context 欄格式錯誤,請確認其內容應為 list")
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            return _error_result(f"Context 欄格式解析錯誤,請確認其為有效的 list 格式,例如 ['A', 'B']:{str(e)}")
        # The upload is valid: run the evaluation.
        try:
            # Strip stray whitespace/newlines that come with a pasted key.
            return RAG_evaluation(file, key.strip())
        except Exception as e:
            error_message = str(e)
            return _error_result(f"系統錯誤:{error_message}")
    except Exception as e:
        return _error_result(f"評估失敗:{str(e)}")
# Gradio interface.
# Static page text is kept in constants so the layout code below reads as a
# plain list of components, created in display order.
_INTRO_MARKDOWN = """
# 📐 RAG系統評估工具 (分流C)
### 📄 使用說明
- 請上傳您 RAG 系統產出的結果檔案(需包含欄位:Question、Context、Answer),並填入您的 OpenAI API Key,以進行評估。
- ⏳ 完整評估**通常需耗時 1 小時以上**,若無即時回應,請**耐心等候**,系統並未當機,謝謝您的理解。
### 🚦 分流措施
本工具部署於 Hugging Face Public Space,若同時有多位使用者使用,系統會將您的評估請求**排入佇列**。
為避免長時間等待,建議您**先僅送出 1 筆資料進行測試**,若進度條顯示之預估**等待時間超過 2 小時(7000 秒以上),可能是其他使用者正在使用**。
本頁為**分流 C**,您可以考慮改用其他分流或稍後再試,感謝您的耐心與配合!
- 🔁 [主頁面 (Main)](https://huggingface.co/spaces/KSLab/RAG_Evaluator)
- 🔁 [分流 A](https://huggingface.co/spaces/KSLab/RAG_Evaluator_A)
- 🔁 [分流 B](https://huggingface.co/spaces/KSLab/RAG_Evaluator_B)
### 📢 系統公告
"""

_FAQ_HEADING_MARKDOWN = """
---
### ❓ 常見問題 & 解答
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    # Filled on page load with the announcement text.
    announcement_display = gr.Markdown()

    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("開始評估")
    result_output = gr.Dataframe(label="評估結果")
    download_link = gr.File(label="下載評估結果(CSV)")

    # Frequently asked questions section.
    gr.Markdown(_FAQ_HEADING_MARKDOWN)
    faq_display = gr.Markdown()

    # Load the announcement and the FAQ when the page opens.
    def load_sheet():
        return fetch_sheet_content()

    demo.load(
        fn=load_sheet,
        inputs=[],
        outputs=[announcement_display, faq_display],
    )

    def wrapped_fn(file, key):
        return RAG_evaluation(file, key)

    # The button goes through validation first, which then runs the evaluation.
    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link],
    )

demo.launch()