MonicaChen0330 committed on
Commit
995479c
·
verified ·
1 Parent(s): 5b51eba

Optimized: batch evaluation

Browse files
Files changed (1) hide show
  1. app.py +65 -50
app.py CHANGED
@@ -14,11 +14,12 @@ from ragas.metrics import (
14
  ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
15
  )
16
 
 
17
  sys.stdout.reconfigure(encoding="utf-8")
18
 
19
- # 嘗試從Google Drive下載 Ground Truth
20
  gt_url = os.environ.get("GT_URL")
21
- gt_path = "ragas_groundtruth.csv"
22
 
23
  if gt_url and not os.path.exists(gt_path):
24
  print("嘗試下載 Ground Truth...")
@@ -30,6 +31,24 @@ if gt_url and not os.path.exists(gt_path):
30
  with open(gt_path, "wb") as f:
31
  f.write(r.content)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def RAG_evaluation(uploaded_file, user_api_key):
34
  try:
35
  os.environ["OPENAI_API_KEY"] = user_api_key
@@ -55,20 +74,27 @@ def RAG_evaluation(uploaded_file, user_api_key):
55
  llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
56
  embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
57
 
 
58
  records = []
59
- for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Evaluating"):
60
- try:
 
 
 
61
  if not isinstance(row["Context"], list):
62
- print(f"第 {idx + 1} 筆 Context 非 list,跳過。值:{row['Context']}")
63
  continue
64
-
65
  sample = SingleTurnSample(
66
  user_input=row["Question"],
67
  response=row["Answer"],
68
  retrieved_contexts=row["Context"],
69
- reference=row["GroundTruth"]
70
  )
71
- dataset = Dataset.from_list([sample.to_dict()])
 
 
 
72
  result = evaluate(
73
  dataset=dataset,
74
  metrics=[
@@ -81,28 +107,32 @@ def RAG_evaluation(uploaded_file, user_api_key):
81
  show_progress=False
82
  )
83
 
84
- score_row = result.to_pandas().iloc[0].to_dict()
85
- records.append({
86
- "Question": row["Question"],
87
- "Faithfulness": score_row.get("faithfulness"),
88
- "Answer Relevancy": score_row.get("answer_relevancy"),
89
- "Semantic Similarity": score_row.get("semantic_similarity"),
90
- # "Factual Correctness": score_row.get("factual_correctness"),
91
- "Context Precision": score_row.get("llm_context_precision_with_reference"),
92
- "Context Recall": score_row.get("context_recall"),
93
- "Context Entity Recall": score_row.get("context_entity_recall"),
94
- # "noise_sensitivity_relevant": score_row.get("noise_sensitivity_relevant")
95
- })
96
-
97
- log_to_google_sheet(
98
- question=row["Question"],
99
- answer=row["Answer"],
100
- contexts=row["Context"],
101
- scores=score_row
102
- )
 
 
 
 
103
 
104
  except Exception as e:
105
- print(f" {idx + 1} 筆評估失敗:{e}")
106
  continue
107
 
108
  score_df = pd.DataFrame(records).fillna("")
@@ -123,9 +153,9 @@ def RAG_evaluation(uploaded_file, user_api_key):
123
  except Exception as e:
124
  print("評估函式整體錯誤:", str(e))
125
  return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
126
-
 
127
  def check_csv_and_run(file, key):
128
- print("開始檢查CSV檔案格式並執行評估")
129
  if file is None:
130
  return pd.DataFrame([{"錯誤訊息": "請上傳檔案!"}]), None
131
 
@@ -166,31 +196,13 @@ def check_csv_and_run(file, key):
166
  except Exception as e:
167
  return pd.DataFrame([{"錯誤訊息": f"RAG 評估失敗:{str(e)}"}]), None
168
 
169
- def log_to_google_sheet(question, answer, contexts, scores):
170
- url = os.environ.get("G_SHEET_URL")
171
- if not url:
172
- print("G_SHEET_URL 未設定,略過記錄")
173
- return
174
- try:
175
- payload = {
176
- "question": question,
177
- "answer": answer,
178
- "contexts": contexts,
179
- "scores": scores
180
- }
181
- response = requests.post(url, json=payload)
182
- print("成功寫入 Google Sheet:", response.status_code)
183
- except Exception as e:
184
- print("寫入 Google Sheet 失敗:", str(e))
185
-
186
-
187
  # Gradio 介面
188
  with gr.Blocks() as demo:
189
  gr.Markdown("## 📐 RAG系統評估工具")
190
  gr.Markdown("""
191
  ### 📄 使用說明
192
  請上傳您RAG系統產出的結果檔案(包含 Question, Context, Answer 欄位),並填入您的OpenAI API Key,以評估您的RAG系統。
193
- #### ⏳ 評估需要時間,請耐心等候。
194
  """)
195
 
196
  file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
@@ -198,7 +210,10 @@ with gr.Blocks() as demo:
198
  submit_btn = gr.Button("開始評估")
199
 
200
  result_output = gr.Dataframe(label="評估結果")
201
- download_link = gr.File(label="下載結果檔案(CSV)")
 
 
 
202
 
203
  submit_btn.click(
204
  fn=check_csv_and_run,
 
14
  ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
15
  )
16
 
17
+ # 設定輸出編碼為 UTF-8(解決中文顯示問題)
18
  sys.stdout.reconfigure(encoding="utf-8")
19
 
20
+ # 支援從Google Drive下載 Ground Truth
21
  gt_url = os.environ.get("GT_URL")
22
+ gt_path = "tender_groundtruth.csv"
23
 
24
  if gt_url and not os.path.exists(gt_path):
25
  print("嘗試下載 Ground Truth...")
 
31
  with open(gt_path, "wb") as f:
32
  f.write(r.content)
33
 
34
+ # 綁定實驗室Google帳號(Python TA)Google Sheet,以記錄評估logs
35
def log_to_google_sheet(question, answer, contexts, scores, timeout=10):
    """Send one evaluation record to the endpoint named by ``G_SHEET_URL``.

    Logging is best-effort: every failure is reported with ``print`` and the
    function always returns ``None``, so a logging problem cannot interrupt
    an evaluation run.

    Args:
        question: Value sent under the ``"question"`` key of the JSON payload.
        answer: Value sent under the ``"answer"`` key.
        contexts: Value sent under the ``"contexts"`` key.
        scores: Value sent under the ``"scores"`` key.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL 未設定,略過記錄")
        return

    payload = {
        "question": question,
        "answer": answer,
        "contexts": contexts,
        "scores": scores,
    }
    try:
        # Without an explicit timeout, requests waits indefinitely on an
        # unresponsive endpoint, which would stall the caller.
        response = requests.post(url, json=payload, timeout=timeout)
        if response.ok:
            print("成功寫入 Google Sheet:", response.status_code)
        else:
            # A 4xx/5xx reply means the record was not stored; do not report
            # it as a success.
            print("寫入 Google Sheet 失敗:", response.status_code)
    except Exception as e:
        # Deliberately broad: logging must never raise into the caller.
        print("寫入 Google Sheet 失敗:", str(e))
51
+
52
  def RAG_evaluation(uploaded_file, user_api_key):
53
  try:
54
  os.environ["OPENAI_API_KEY"] = user_api_key
 
74
  llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
75
  embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
76
 
77
+ batch_size = 10
78
  records = []
79
+ for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
80
+ batch_df = merged_df.iloc[batch_start:batch_start + batch_size]
81
+
82
+ samples = []
83
+ for _, row in batch_df.iterrows():
84
  if not isinstance(row["Context"], list):
85
+ print(f"Context 非 list,跳過。值:{row['Question']}")
86
  continue
87
+
88
  sample = SingleTurnSample(
89
  user_input=row["Question"],
90
  response=row["Answer"],
91
  retrieved_contexts=row["Context"],
92
+ reference=row["GroundTruth"],
93
  )
94
+ samples.append(sample)
95
+
96
+ try:
97
+ dataset = Dataset.from_list([s.to_dict() for s in samples])
98
  result = evaluate(
99
  dataset=dataset,
100
  metrics=[
 
107
  show_progress=False
108
  )
109
 
110
+ result_df = result.to_pandas()
111
+
112
+ for i, row in enumerate(result_df.itertuples()):
113
+ input_row = batch_df.iloc[i]
114
+ record = {
115
+ "Question": input_row["Question"],
116
+ "Faithfulness": getattr(row, "faithfulness", None),
117
+ "Answer Relevancy": getattr(row, "answer_relevancy", None),
118
+ "Semantic Similarity": getattr(row, "semantic_similarity", None),
119
+ # "Factual Correctness": getattr(row, "factual_correctness", None),
120
+ "Context Precision": getattr(row, "llm_context_precision_with_reference", None),
121
+ "Context Recall": getattr(row, "context_recall", None),
122
+ "Context Entity Recall": getattr(row, "context_entity_recall", None),
123
+ # "Noise Sensitivity": getattr(row, "noise_sensitivity_relevant", None)
124
+ }
125
+ records.append(record)
126
+
127
+ log_to_google_sheet(
128
+ question=input_row["Question"],
129
+ answer=input_row["Answer"],
130
+ contexts=input_row["Context"],
131
+ scores=record
132
+ )
133
 
134
  except Exception as e:
135
+ print(f"批次評估失敗(第 {batch_start+1} 筆起):{e}")
136
  continue
137
 
138
  score_df = pd.DataFrame(records).fillna("")
 
153
  except Exception as e:
154
  print("評估函式整體錯誤:", str(e))
155
  return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
156
+
157
+ # handle exception並執行RAG評估
158
  def check_csv_and_run(file, key):
 
159
  if file is None:
160
  return pd.DataFrame([{"錯誤訊息": "請上傳檔案!"}]), None
161
 
 
196
  except Exception as e:
197
  return pd.DataFrame([{"錯誤訊息": f"RAG 評估失敗:{str(e)}"}]), None
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # Gradio 介面
200
  with gr.Blocks() as demo:
201
  gr.Markdown("## 📐 RAG系統評估工具")
202
  gr.Markdown("""
203
  ### 📄 使用說明
204
  請上傳您RAG系統產出的結果檔案(包含 Question, Context, Answer 欄位),並填入您的OpenAI API Key,以評估您的RAG系統。
205
+ #### ⏳ 完整評估需要數小時,無即時回應並不是當機,請耐心等候。
206
  """)
207
 
208
  file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
 
210
  submit_btn = gr.Button("開始評估")
211
 
212
  result_output = gr.Dataframe(label="評估結果")
213
+ download_link = gr.File(label="下載評估結果(CSV)")
214
+
215
def wrapped_fn(file, key):
    """Forward the uploaded file and API key to ``RAG_evaluation``.

    Returns whatever ``RAG_evaluation`` returns, unchanged.
    """
    outcome = RAG_evaluation(file, key)
    return outcome
217
 
218
  submit_btn.click(
219
  fn=check_csv_and_run,