MonicaChen0330 committed on
Commit
f65d205
·
verified ·
1 Parent(s): 6add0a1

first commit

Browse files
Files changed (1) hide show
  1. app.py +184 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import requests
4
+ import gradio as gr
5
+ import pandas as pd
6
+ from datasets import Dataset
7
+ from tqdm import tqdm
8
+ from ragas import evaluate, SingleTurnSample
9
+ from ragas.llms import LangchainLLMWrapper
10
+ from ragas.embeddings import LangchainEmbeddingsWrapper
11
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
12
+ from ragas.metrics import (
13
+ ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
14
+ ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
15
+ )
16
+
# Switch stdout to UTF-8 (the log messages in this file are non-ASCII).
# Guarded because a replaced stdout may not provide ``reconfigure``.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Try to download the ground truth from Google Drive.
# The URL is taken from the GT_URL environment variable; the download is
# skipped when the variable is unset or the file already exists locally.
gt_url = os.environ.get("GT_URL")
gt_path = "ragas_groundtruth.csv"

if gt_url and not os.path.exists(gt_path):
    print("嘗試下載 Ground Truth...")
    try:
        # A timeout keeps a stalled connection from blocking startup forever.
        r = requests.get(gt_url, timeout=60)
    except requests.RequestException as e:
        # Best effort: the evaluation function checks for the file itself,
        # so a failed download must not crash the module at import time.
        print("下載失敗:", e)
    else:
        print("HTTP 狀態碼:", r.status_code)
        if r.status_code != 200:
            print("下載失敗內容預覽:", r.text[:500])
        else:
            with open(gt_path, "wb") as f:
                f.write(r.content)
32
+
def RAG_evaluation(uploaded_file, user_api_key):
    """Score an uploaded RAG result file against the ground truth with Ragas.

    The uploaded CSV is inner-joined with the ground-truth CSV on
    ``Question``; each merged row is evaluated on its own, and the per-row
    scores plus an ``Average`` row are written to ``result_output.csv``.

    Args:
        uploaded_file: Object whose ``name`` attribute is the path of the
            uploaded CSV (columns ``Question``, ``Context``, ``Answer``).
        user_api_key: OpenAI API key; exported as ``OPENAI_API_KEY``.

    Returns:
        A ``(DataFrame, path)`` tuple. On success the DataFrame holds the
        scores and ``path`` is the CSV that was written. On failure ``path``
        is ``None`` and the DataFrame is empty (ground truth missing) or has a
        single ``錯誤訊息`` column describing the problem.
    """
    import ast  # local import so this block does not depend on file-level edits

    def _parse_context(raw):
        """Parse a Context cell as a Python literal without executing code."""
        if not isinstance(raw, str):
            return raw
        try:
            # SECURITY: the original used eval() on user-uploaded text, which
            # allows arbitrary code execution. literal_eval only accepts
            # literals such as ['A', 'B'].
            return ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Leave the raw value in place; the per-row check below skips
            # anything that is not a list.
            return raw

    try:
        # NOTE(review): this sets the key for the whole process, so concurrent
        # users of the app share/overwrite each other's key.
        os.environ["OPENAI_API_KEY"] = user_api_key
        print("評估開始")

        if not os.path.exists(gt_path):
            print("找不到 Ground Truth!")
            return pd.DataFrame(), None

        gt_df = pd.read_csv(gt_path)
        # Read the upload the same way the validator does (BOM-tolerant,
        # stripped headers) so a file that passed validation also loads here.
        df = pd.read_csv(uploaded_file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        df["Context"] = df["Context"].apply(_parse_context)
        print(f"上傳檔案筆數:{len(df)},GT 檔案筆數:{len(gt_df)}")

        # The uploaded Answer keeps its name; the ground-truth Answer gets the
        # suffix and is then renamed to GroundTruth.
        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"成功合併筆數:{len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("未合併題目:", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"錯誤訊息": "合併後無資料,請確認題目與 GT 是否對應"}]), None

        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))

        records = []
        for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Evaluating"):
            try:
                if not isinstance(row["Context"], list):
                    print(f"第 {idx + 1} 筆 Context 非 list,跳過。值:{row['Context']}")
                    continue

                sample = SingleTurnSample(
                    user_input=row["Question"],
                    response=row["Answer"],
                    retrieved_contexts=row["Context"],
                    reference=row["GroundTruth"],
                )
                dataset = Dataset.from_list([sample.to_dict()])
                # NOTE(review): NoiseSensitivity and FactualCorrectness are
                # computed here but their scores are not reported below.
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(), LLMContextRecall(), ContextEntityRecall(),
                        NoiseSensitivity(), Faithfulness(), ResponseRelevancy(),
                        SemanticSimilarity(), FactualCorrectness(),
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=False,
                )

                score_row = result.to_pandas().iloc[0].to_dict()
                records.append({
                    "Question": row["Question"],
                    "Faithfulness": score_row.get("faithfulness"),
                    "Answer Relevancy": score_row.get("answer_relevancy"),
                    "Semantic Similarity": score_row.get("semantic_similarity"),
                    # "Factual Correctness": score_row.get("factual_correctness"),
                    "Context Precision": score_row.get("llm_context_precision_with_reference"),
                    "Context Recall": score_row.get("context_recall"),
                    "Context Entity Recall": score_row.get("context_entity_recall"),
                    # "noise_sensitivity_relevant": score_row.get("noise_sensitivity_relevant")
                })

            except Exception as e:
                # One failing row must not abort the whole run.
                print(f"第 {idx + 1} 筆評估失敗:{e}")
                continue

        if not records:
            # Without this guard the "Question" column does not exist and the
            # drop() below would surface as an opaque KeyError.
            print("完成評估筆數:", 0)
            return pd.DataFrame([{"錯誤訊息": "所有資料皆評估失敗或被跳過,請確認檔案內容與 API Key"}]), None

        score_df = pd.DataFrame(records)
        print("完成評估筆數:", len(score_df))

        # Average BEFORE blanking missing values: fillna("") turns any column
        # containing NaN into object dtype, which select_dtypes would then
        # exclude from the average.
        numeric_cols = score_df.drop(columns=["Question"]).select_dtypes(include="number")
        if not numeric_cols.empty:
            avg_row = numeric_cols.mean().to_dict()
            avg_row["Question"] = "Average"
            score_df = pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)
        score_df = score_df.fillna("")

        # NOTE(review): fixed file name, so concurrent runs overwrite each other.
        output_path = "result_output.csv"
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("評估結果已儲存為 CSV:", output_path)

        return score_df, output_path

    except Exception as e:
        # Top-level boundary: report the failure in the UI instead of raising.
        print("評估函式整體錯誤:", str(e))
        return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
119
+
def check_csv_and_run(file, key):
    """Validate the uploaded CSV and API key, then run the RAG evaluation.

    Args:
        file: Uploaded file object whose ``name`` attribute is the CSV path,
            or ``None`` when nothing was uploaded.
        key: OpenAI API key typed by the user.

    Returns:
        A ``(DataFrame, path)`` tuple. When validation fails, the DataFrame
        has a single ``錯誤訊息`` column with the reason and ``path`` is
        ``None``; otherwise the result of ``RAG_evaluation`` is returned.
    """
    import ast  # local import so this block does not depend on file-level edits

    print("開始檢查CSV檔案格式並執行評估")
    if file is None:
        return pd.DataFrame([{"錯誤訊息": "請上傳檔案!"}]), None

    if not key or not key.strip():
        return pd.DataFrame([{"錯誤訊息": "請輸入 OpenAI API Key"}]), None

    try:
        df = pd.read_csv(file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]

        # The column set must match exactly; extra columns are rejected too.
        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)

        if actual_columns != required_columns:
            return pd.DataFrame([{"錯誤訊息": f"欄位錯誤:應包含欄位 {required_columns},實際為 {actual_columns}"}]), None

        if df.shape[0] == 0:
            return pd.DataFrame([{"錯誤訊息": "檔案中沒有資料列!"}]), None

        # Rows that have a question but lack an answer or a context.
        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return pd.DataFrame([{"錯誤訊息": f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空:\n{missing_questions}"}]), None

        try:
            for i, val in df["Context"].dropna().items():
                # SECURITY: the original called eval() on user-uploaded text,
                # which allows arbitrary code execution. literal_eval only
                # accepts Python literals such as ['A', 'B'].
                if not isinstance(ast.literal_eval(val), list):
                    return pd.DataFrame([{"錯誤訊息": f"第 {i + 1} 筆 Context 欄格式錯誤,請確認其內容應為 list"}]), None
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            return pd.DataFrame([{"錯誤訊息": f"Context 欄格式解析錯誤,請確認其為有效的 list 格式,例如 ['A', 'B']:{str(e)}"}]), None

    except Exception as e:
        # Anything else (unreadable file, bad encoding, ...) is shown in the UI.
        return pd.DataFrame([{"錯誤訊息": f"發生錯誤:{str(e)}"}]), None

    # The uploaded file passed validation: run the evaluation.
    try:
        # Strip the key so stray whitespace from copy/paste is not sent along.
        return RAG_evaluation(file, key.strip())
    except Exception as e:
        return pd.DataFrame([{"錯誤訊息": f"RAG 評估失敗:{str(e)}"}]), None
161
+
# Gradio interface: one page with a file upload, an API-key field and a
# button that runs check_csv_and_run and shows its two return values.
with gr.Blocks() as demo:
    gr.Markdown("## 📐 RAG系統評估工具")
    gr.Markdown("""
    ### 📄 使用說明
    請上傳您RAG系統產出的結果檔案(包含 Question, Context, Answer 欄位),並填入您的OpenAI API Key,以評估您的RAG系統。
    #### ⏳ 評估需要時間,請耐心等候。
    """)

    # Inputs: the CSV to evaluate and the key (masked as a password field).
    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("開始評估")

    # Outputs: the score (or error) table and the CSV path for download.
    result_output = gr.Dataframe(label="評估結果")
    download_link = gr.File(label="下載結果檔案(CSV)")

    # check_csv_and_run returns (DataFrame, path-or-None), matching the
    # order of the two output components.
    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link]
    )

# Starts the app as soon as the module is executed.
demo.launch()