import ast
import os
import sys
import requests
import gradio as gr
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from ragas import evaluate, SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import (
    ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
    ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
)

sys.stdout.reconfigure(encoding="utf-8")
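
# Both external integrations are configured through environment variables
# (Space secrets in this deployment). A minimal local sketch, with placeholder
# URLs that are assumptions, not real endpoints:
#
#   os.environ["GT_URL"] = "https://example.com/ragas_groundtruth.csv"         # hypothetical: ground-truth CSV
#   os.environ["G_SHEET_URL"] = "https://script.google.com/macros/s/.../exec"  # hypothetical: score-logging webhook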

# Try to download the ground truth from Google Drive
gt_url = os.environ.get("GT_URL")
gt_path = "ragas_groundtruth.csv"
if gt_url and not os.path.exists(gt_path):
    print("Attempting to download the ground truth...")
    r = requests.get(gt_url, timeout=30)
    print("HTTP status code:", r.status_code)
    if r.status_code != 200:
        print("Download failed; response preview:", r.text[:500])
    else:
        with open(gt_path, "wb") as f:
            f.write(r.content)
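
# Note: for Google Drive, GT_URL is assumed to be a direct-download link of the
# form https://drive.google.com/uc?export=download&id=<FILE_ID>; a plain share
# link returns an HTML page instead of the CSV, which is why the status code
# and a preview of the response body are logged above.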


def RAG_evaluation(uploaded_file, user_api_key):
    try:
        os.environ["OPENAI_API_KEY"] = user_api_key
        print("Evaluation started")
        if not os.path.exists(gt_path):
            print("Ground truth not found!")
            return pd.DataFrame(), None
        gt_df = pd.read_csv(gt_path)
        # Parse the Context column with ast.literal_eval rather than eval:
        # it accepts list literals but refuses arbitrary expressions.
        df = pd.read_csv(uploaded_file.name, converters={"Context": ast.literal_eval})
        print(f"Uploaded rows: {len(df)}, ground-truth rows: {len(gt_df)}")
        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"Rows merged successfully: {len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("Unmerged questions:", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"Error": "No rows left after merging; check that the questions match the ground truth"}]), None
        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
        records = []
        for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Evaluating"):
            try:
                if not isinstance(row["Context"], list):
                    print(f"Row {idx + 1}: Context is not a list; skipping. Value: {row['Context']}")
                    continue
                # Wrap one QA pair as a Ragas single-turn sample
                sample = SingleTurnSample(
                    user_input=row["Question"],
                    response=row["Answer"],
                    retrieved_contexts=row["Context"],
                    reference=row["GroundTruth"]
                )
                dataset = Dataset.from_list([sample.to_dict()])
                # Evaluate each sample individually so one failure does not abort the batch
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(), LLMContextRecall(), ContextEntityRecall(),
                        NoiseSensitivity(), Faithfulness(), ResponseRelevancy(),
                        SemanticSimilarity(), FactualCorrectness()
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=False
                )
                score_row = result.to_pandas().iloc[0].to_dict()
                records.append({
                    "Question": row["Question"],
                    "Faithfulness": score_row.get("faithfulness"),
                    "Answer Relevancy": score_row.get("answer_relevancy"),
                    "Semantic Similarity": score_row.get("semantic_similarity"),
                    # "Factual Correctness": score_row.get("factual_correctness"),
                    "Context Precision": score_row.get("llm_context_precision_with_reference"),
                    "Context Recall": score_row.get("context_recall"),
                    "Context Entity Recall": score_row.get("context_entity_recall"),
                    # "noise_sensitivity_relevant": score_row.get("noise_sensitivity_relevant")
                })
                log_to_google_sheet(
                    question=row["Question"],
                    answer=row["Answer"],
                    contexts=row["Context"],
                    scores=score_row
                )
            except Exception as e:
                print(f"Row {idx + 1}: evaluation failed: {e}")
                continue
        score_df = pd.DataFrame(records)
        print("Rows evaluated:", len(score_df))
        # Compute the averages before fillna(""): filling NaN with "" would turn
        # partially failed metric columns into object dtype and drop them here.
        numeric_cols = score_df.drop(columns=["Question"]).select_dtypes(include="number")
        if not numeric_cols.empty:
            avg_row = numeric_cols.mean().to_dict()
            avg_row["Question"] = "Average"
            score_df = pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)
        score_df = score_df.fillna("")
        output_path = "result_output.csv"
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("Evaluation results saved as CSV:", output_path)
        return score_df, output_path
    except Exception as e:
        print("Evaluation function failed:", str(e))
        return pd.DataFrame([{"Error": f"System error: {str(e)}"}]), None


def check_csv_and_run(file, key):
    print("Validating the CSV format before running the evaluation")
    if file is None:
        return pd.DataFrame([{"Error": "Please upload a file!"}]), None
    if not key or key.strip() == "":
        return pd.DataFrame([{"Error": "Please enter an OpenAI API key"}]), None
    try:
        df = pd.read_csv(file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)
        if actual_columns != required_columns:
            return pd.DataFrame([{"Error": f"Column mismatch: expected {required_columns}, got {actual_columns}"}]), None
        if df.shape[0] == 0:
            return pd.DataFrame([{"Error": "The file contains no data rows!"}]), None
        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return pd.DataFrame([{"Error": f"Found {len(invalid_rows)} rows with an empty Answer or Context:\n{missing_questions}"}]), None
        try:
            # literal_eval only parses Python literals, so a malformed Context
            # raises here instead of executing arbitrary code as eval would
            for i, val in df["Context"].dropna().items():
                if not isinstance(ast.literal_eval(val), list):
                    return pd.DataFrame([{"Error": f"Row {i + 1}: the Context column must contain a list"}]), None
        except Exception as e:
            return pd.DataFrame([{"Error": f"Failed to parse the Context column; it must be a valid list such as ['A', 'B']: {str(e)}"}]), None
    except Exception as e:
        return pd.DataFrame([{"Error": f"An error occurred: {str(e)}"}]), None
    # If the uploaded file passes all checks, run the evaluation
    try:
        return RAG_evaluation(file, key)
    except Exception as e:
        return pd.DataFrame([{"Error": f"RAG evaluation failed: {str(e)}"}]), None
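
# A minimal example of a valid input row (hypothetical values), assuming the
# Context column stores a Python list literal as a string:
#
#   Question: "What is RAG?"
#   Context:  "['RAG pairs a retriever with a generator.', 'Answers are grounded in retrieved text.']"
#   Answer:   "RAG augments an LLM with retrieved passages before generating."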


def log_to_google_sheet(question, answer, contexts, scores):
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL is not set; skipping logging")
        return
    try:
        payload = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "scores": scores
        }
        response = requests.post(url, json=payload, timeout=30)
        print("Logged to Google Sheet:", response.status_code)
    except Exception as e:
        print("Failed to write to Google Sheet:", str(e))


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 📐 RAG System Evaluation Tool")
    gr.Markdown("""
    ### 📄 Instructions
    Upload the results file produced by your RAG system (with Question, Context, and Answer columns) and enter your OpenAI API key to evaluate the system.
    #### ⏳ Evaluation takes time; please be patient.
    """)
    file_input = gr.File(label="Upload Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("Start Evaluation")
    result_output = gr.Dataframe(label="Evaluation Results")
    download_link = gr.File(label="Download Results (CSV)")
    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link]
    )

demo.launch()
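
# A sketch of running this locally, assuming the file is saved as app.py:
#   pip install gradio pandas requests datasets ragas langchain-openai tqdm
#   export GT_URL=...  # optional here, but evaluation requires the ground-truth CSV
#   python app.py
# then open the local URL that Gradio prints.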