yoshizen committed on
Commit
f7cf33f
·
verified ·
1 Parent(s): f0bb83e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -152
app.py CHANGED
@@ -1,20 +1,16 @@
1
- import os
2
  import json
3
  import time
4
- import torch
5
  import requests
6
  import gradio as gr
7
  import pandas as pd
8
- from typing import List, Dict, Any, Optional, Union, Callable, Tuple
9
- from agent_gaia import GAIAExpertAgent as OptimizedGAIAAgent
10
 
11
  # Constants
12
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
13
- MAX_RETRIES = 3
14
- RETRY_DELAY = 5
15
 
16
  class EvaluationRunner:
17
- """Обрабатывает процесс оценки: получение вопросов, запуск агента, отправку ответов"""
18
 
19
  def __init__(self, api_url=DEFAULT_API_URL):
20
  self.api_url = api_url
@@ -24,213 +20,132 @@ class EvaluationRunner:
24
  self.correct_answers = 0
25
  self.total_questions = 0
26
 
27
- def run_evaluation(self,
28
- agent: Callable[[str], str],
29
- username: str,
30
- agent_code: str) -> tuple[str, pd.DataFrame]:
31
- # Получаем вопросы
32
  questions_data = self._fetch_questions()
33
- if isinstance(questions_data, str): # Сообщение об ошибке
34
- return questions_data, None
35
 
36
- # Запускаем агента на всех вопросах
37
  results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
38
  if not answers_payload:
39
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
40
 
41
- # Отправляем ответы
42
  submission_result = self._submit_answers(username, agent_code, answers_payload)
43
-
44
- # Проверяем результаты
45
- self._check_results(username)
46
- self.print_evaluation_summary(username)
47
-
48
  return submission_result, pd.DataFrame(results_log)
49
 
50
- def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
51
  try:
52
- response = requests.get(self.questions_url, timeout=15)
53
  response.raise_for_status()
54
  questions_data = response.json()
55
-
56
- if not questions_data:
57
- return "Fetched questions list is empty or invalid format."
58
-
59
  self.total_questions = len(questions_data)
60
- print(f"Successfully fetched {self.total_questions} questions.")
61
  return questions_data
62
-
63
  except Exception as e:
64
- return f"Error fetching questions: {e}"
65
 
66
- def _run_agent_on_questions(self,
67
- agent: Any,
68
- questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
69
  results_log = []
70
  answers_payload = []
71
 
72
- print(f"Running agent on {len(questions_data)} questions...")
73
- for item in questions_data:
74
  task_id = item.get("task_id")
75
  question_text = item.get("question")
76
 
77
- if not task_id or question_text is None:
78
  continue
79
 
80
  try:
81
  json_response = agent(question_text, task_id)
82
  response_obj = json.loads(json_response)
83
- submitted_answer = response_obj.get("final_answer", "")
84
-
85
- answers_payload.append({
86
- "task_id": task_id,
87
- "submitted_answer": submitted_answer
88
- })
89
 
 
90
  results_log.append({
91
- "Task ID": task_id,
92
- "Question": question_text,
93
- "Submitted Answer": submitted_answer,
94
- "Full Response": json_response
95
  })
96
  except Exception as e:
 
97
  results_log.append({
98
- "Task ID": task_id,
99
- "Question": question_text,
100
- "Submitted Answer": f"AGENT ERROR: {e}"
101
  })
102
 
103
  return results_log, answers_payload
104
 
105
- def _submit_answers(self,
106
- username: str,
107
- agent_code: str,
108
- answers_payload: List[Dict[str, Any]]) -> str:
109
  submission_data = {
110
  "username": username.strip(),
111
- "agent_code": agent_code.strip(), # Ключевое исправление: agent_code вместо agent_code_url
112
  "answers": answers_payload
113
  }
114
 
115
- print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
116
- print("Submission data:", json.dumps(submission_data, indent=2))
117
-
118
- for attempt in range(1, MAX_RETRIES + 1):
119
- try:
120
- response = requests.post(
121
- self.submit_url,
122
- json=submission_data,
123
- headers={"Content-Type": "application/json"},
124
- timeout=30
125
- )
126
- response.raise_for_status()
127
-
128
- try:
129
- result = response.json()
130
- if "message" in result:
131
- return result["message"]
132
- return "Evaluation submitted successfully"
133
- except:
134
- return f"Submission successful, but response was not JSON: {response.text}"
135
-
136
- except Exception as e:
137
- print(f"Submission attempt {attempt} failed: {e}")
138
- time.sleep(RETRY_DELAY)
139
-
140
- return "Error submitting answers after multiple attempts"
141
-
142
- def _check_results(self, username: str) -> None:
143
  try:
144
- results_url = f"{self.results_url}?username={username}"
145
- response = requests.get(results_url, timeout=15)
146
- if response.status_code == 200:
147
- data = response.json()
148
- if isinstance(data, dict) and "score" in data:
149
- self.correct_answers = int(data["score"])
 
 
150
  except Exception as e:
151
- print(f"Error checking results: {e}")
152
-
153
- def get_correct_answers_count(self) -> int:
154
- return self.correct_answers
155
-
156
- def get_total_questions_count(self) -> int:
157
- return self.total_questions
158
-
159
- def print_evaluation_summary(self, username: str) -> None:
160
- print("\n===== EVALUATION SUMMARY =====")
161
- print(f"User: {username}")
162
- print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
163
- print("=============================\n")
164
 
165
 
166
- def run_evaluation(username: str,
167
- agent_code: str, # Исправлено имя параметра
168
- model_name: str = "google/flan-t5-base",
169
- use_cache: bool = False) -> Tuple[str, int, int, str, str, str]: # Кэш отключен по умолчанию
170
- start_time = time.time()
171
-
172
- # Инициализируем агента
173
- agent = EnhancedGAIAAgent(model_name=model_name, use_cache=use_cache)
174
-
175
- # Инициализируем runner
176
- runner = EvaluationRunner(api_url=DEFAULT_API_URL)
177
 
178
- # Запускаем оценку
179
- result, results_log = runner.run_evaluation(agent, username, agent_code)
 
180
 
181
- # Вычисляем время выполнения
182
- elapsed_time = time.time() - start_time
183
- elapsed_time_str = f"{elapsed_time:.2f} seconds"
184
 
185
- # Формируем URL результатов
186
- results_url = f"{DEFAULT_API_URL}/results?username={username}"
187
- cache_status = "Cache enabled and used" if use_cache else "Cache disabled"
188
-
189
- return (
190
- result,
191
- runner.get_correct_answers_count(),
192
- runner.get_total_questions_count(),
193
- elapsed_time_str,
194
- results_url,
195
- cache_status
196
- )
197
 
198
 
199
  def create_gradio_interface():
200
- with gr.Blocks(title="GAIA Agent Evaluation") as demo:
201
- gr.Markdown("# GAIA Agent Evaluation")
202
 
203
  with gr.Row():
204
  with gr.Column():
205
- username = gr.Textbox(label="Hugging Face Username")
206
- agent_code = gr.Textbox(label="Agent Code", lines=2, placeholder="Your agent code here")
 
 
 
 
207
  model_name = gr.Dropdown(
208
  label="Model",
209
- choices=["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"],
210
- value="google/flan-t5-base"
 
 
 
 
211
  )
212
- use_cache = gr.Checkbox(label="Use Answer Cache", value=False)
213
- run_button = gr.Button("Run Evaluation & Submit All Answers")
214
 
215
  with gr.Column():
216
- result_text = gr.Textbox(label="Result", lines=2)
 
217
  correct_answers = gr.Number(label="Correct Answers")
218
  total_questions = gr.Number(label="Total Questions")
219
- elapsed_time = gr.Textbox(label="Elapsed Time")
220
- results_url = gr.Textbox(label="Results URL")
221
- cache_status = gr.Textbox(label="Cache Status")
222
 
223
  run_button.click(
224
  fn=run_evaluation,
225
- inputs=[username, agent_code, model_name, use_cache],
226
- outputs=[
227
- result_text,
228
- correct_answers,
229
- total_questions,
230
- elapsed_time,
231
- results_url,
232
- cache_status
233
- ]
234
  )
235
 
236
  return demo
@@ -238,4 +153,4 @@ def create_gradio_interface():
238
 
239
  if __name__ == "__main__":
240
  demo = create_gradio_interface()
241
- demo.launch(share=True)
 
 
1
  import json
2
  import time
 
3
  import requests
4
  import gradio as gr
5
  import pandas as pd
6
+ from tqdm import tqdm
7
+ from agent import GAIAExpertAgent
8
 
9
  # Constants
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
11
 
12
  class EvaluationRunner:
13
+ """Оптимизированный обработчик оценки"""
14
 
15
  def __init__(self, api_url=DEFAULT_API_URL):
16
  self.api_url = api_url
 
20
  self.correct_answers = 0
21
  self.total_questions = 0
22
 
23
def run_evaluation(self, agent, username: str, agent_code: str) -> tuple[str, pd.DataFrame]:
    """Run one evaluation cycle: fetch questions, run the agent, submit answers.

    Args:
        agent: Callable that ``_run_agent_on_questions`` invokes as
            ``agent(question_text, task_id)``.
        username: User name sent with the submission.
        agent_code: Agent code reference sent with the submission.

    Returns:
        A ``(status, table)`` pair. ``status`` is the submission result or a
        failure message; ``table`` is the per-question log, or an empty
        DataFrame when nothing was submitted.
    """
    # NOTE: the builtin generic ``tuple[...]`` is used in the annotation
    # because ``typing.Tuple`` is not imported by this module; referencing it
    # would raise NameError as soon as the class body is executed.
    questions_data = self._fetch_questions()
    # _fetch_questions signals failure by returning a message, not a list.
    if not isinstance(questions_data, list):
        return questions_data, pd.DataFrame()

    results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
    if not answers_payload:
        return "No answers generated", pd.DataFrame()

    submission_result = self._submit_answers(username, agent_code, answers_payload)
    return submission_result, pd.DataFrame(results_log)
34
 
35
def _fetch_questions(self):
    """Download the question list from ``self.questions_url``.

    On success the decoded JSON payload is returned and its length is stored
    in ``self.total_questions``. Any failure is reported by returning an
    ``"Error: ..."`` string instead of raising.
    """
    try:
        reply = requests.get(self.questions_url, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
        self.total_questions = len(payload)
        print(f"Fetched {self.total_questions} questions")
    except Exception as exc:
        return f"Error: {str(exc)}"
    return payload
45
 
46
def _run_agent_on_questions(self, agent, questions_data):
    """Run the agent on every question and collect the answers to submit.

    Args:
        agent: Callable invoked as ``agent(question_text, task_id)``. It is
            expected to return a JSON string whose ``"final_answer"`` field
            becomes the submitted answer.
        questions_data: List of dicts carrying ``"task_id"`` and
            ``"question"`` keys; entries missing either one are skipped.

    Returns:
        ``(results_log, answers_payload)``. ``results_log`` holds display
        rows with shortened question/answer text; ``answers_payload`` holds
        the ``{"task_id", "submitted_answer"}`` dicts for submission. Each
        processed question contributes exactly one entry to each list; an
        agent failure is recorded as ``"ERROR: ..."``.
    """

    def shorten(text, limit):
        # Display-only truncation. str() keeps non-string JSON values
        # (numbers, null, lists) from raising while the preview is built.
        text = str(text)
        return text[:limit] + "..." if len(text) > limit else text

    results_log = []
    answers_payload = []

    print(f"Processing {len(questions_data)} questions...")
    for item in tqdm(questions_data, desc="Questions"):
        task_id = item.get("task_id")
        question_text = item.get("question")

        if not task_id or not question_text:
            continue

        try:
            json_response = agent(question_text, task_id)
            response_obj = json.loads(json_response)
            answer = response_obj.get("final_answer", "")
            shown_answer = shorten(answer, 50)
        except Exception as e:
            answer = shown_answer = f"ERROR: {str(e)}"

        # Record the outcome once, after the try block, so a late failure can
        # never leave two payload entries for the same task.
        answers_payload.append({"task_id": task_id, "submitted_answer": answer})
        results_log.append({
            "Task ID": task_id,
            "Question": shorten(question_text, 100),
            "Answer": shown_answer,
        })

    return results_log, answers_payload
78
 
79
def _submit_answers(self, username: str, agent_code: str, answers_payload):
    """POST the collected answers to ``self.submit_url``.

    Returns the ``"message"`` field of the JSON reply, or a default success
    text when that field is absent. A failed request or undecodable reply is
    reported as a ``"Submission failed: ..."`` string rather than raised.
    """
    body = {
        "username": username.strip(),
        "agent_code": agent_code.strip(),
        "answers": answers_payload,
    }
    print("Submitting answers...")

    try:
        reply = requests.post(
            self.submit_url,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=60,
        )
        reply.raise_for_status()
        message = reply.json().get("message", "Answers submitted successfully")
    except Exception as exc:
        return f"Submission failed: {str(exc)}"
    return message
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
 
100
def run_evaluation(username: str, agent_code: str, model_name: str):
    """Gradio callback: build the agent, run the evaluation, report the outcome.

    Returns a 4-tuple ``(status, correct_answers, total_questions, table)``,
    matching the order of the ``outputs`` wired to the run button.
    """
    print("Initializing GAIA Expert Agent...")
    expert = GAIAExpertAgent(model_name=model_name)

    print("Starting evaluation...")
    evaluator = EvaluationRunner()
    status, table = evaluator.run_evaluation(expert, username, agent_code)

    # Question counters are read back from the runner.
    question_count = evaluator.total_questions
    correct_count = getattr(evaluator, "correct_answers", 0)

    return status, correct_count, question_count, table
 
 
 
 
 
 
 
 
 
 
 
113
 
114
 
115
  def create_gradio_interface():
116
+ with gr.Blocks(title="GAIA Expert Agent") as demo:
117
+ gr.Markdown("# 🧠 GAIA Expert Agent Evaluation")
118
 
119
  with gr.Row():
120
  with gr.Column():
121
+ gr.Markdown("### Configuration")
122
+ username = gr.Textbox(label="Hugging Face Username", value="yoshizen")
123
+ agent_code = gr.Textbox(
124
+ label="Agent Code",
125
+ value="https://huggingface.co/spaces/yoshizen/FinalTest"
126
+ )
127
  model_name = gr.Dropdown(
128
  label="Model",
129
+ choices=[
130
+ "google/flan-t5-small",
131
+ "google/flan-t5-base",
132
+ "google/flan-t5-large"
133
+ ],
134
+ value="google/flan-t5-large"
135
  )
136
+ run_button = gr.Button("🚀 Run Evaluation", variant="primary")
 
137
 
138
  with gr.Column():
139
+ gr.Markdown("### Results")
140
+ result_text = gr.Textbox(label="Submission Status")
141
  correct_answers = gr.Number(label="Correct Answers")
142
  total_questions = gr.Number(label="Total Questions")
143
+ results_table = gr.Dataframe(label="Processed Questions", interactive=False)
 
 
144
 
145
  run_button.click(
146
  fn=run_evaluation,
147
+ inputs=[username, agent_code, model_name],
148
+ outputs=[result_text, correct_answers, total_questions, results_table]
 
 
 
 
 
 
 
149
  )
150
 
151
  return demo
 
153
 
154
  if __name__ == "__main__":
155
  demo = create_gradio_interface()
156
+ demo.launch(server_name="0.0.0.0", server_port=7860)