|
import re |
|
import requests |
|
import pandas as pd |
|
import torch |
|
import gradio as gr |
|
from tqdm import tqdm |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline |
|
from typing import List, Dict, Any, Tuple, Optional |
|
import json |
|
import ast |
|
import numpy as np |
|
from PIL import Image, UnidentifiedImageError |
|
import io |
|
import base64 |
|
import logging |
|
import time |
|
import sys |
|
|
|
|
|
# Route INFO-and-above records through the root handler; every message in this
# module is emitted via the named logger below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("GAIA-Mastermind")
|
|
|
|
|
# Base URL of the scoring service; "/questions" and "/submit" are appended to it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Hugging Face model identifier loaded by the agent.
MODEL_NAME = "google/flan-t5-xxl"
# Maximum number of attempts for each API call.
API_RETRIES = 3
# Per-request timeout in seconds (submission uses twice this value).
API_TIMEOUT = 45
|
|
|
|
|
class GAIAThoughtProcessor:
    """Multi-step question solver built around a seq2seq language model.

    A question is handled in three phases: the model is asked to decompose it
    into ``[tool_name] arguments`` steps, each step is dispatched to one of the
    local tools (math, table, text, image), and the model is finally asked to
    synthesise an answer from the tool outputs.
    """

    # Names a math expression may reference; anything else is rejected.
    _MATH_CONSTANTS = {"pi": np.pi, "e": np.e}
    _MATH_FUNCTIONS = {
        "sqrt": np.sqrt,
        "log": np.log,
        "log10": np.log10,
        "sin": np.sin,
        "cos": np.cos,
        "tan": np.tan,
    }
    _MATH_BINARY_OPS = {
        ast.Add: lambda a, b: a + b,
        ast.Sub: lambda a, b: a - b,
        ast.Mult: lambda a, b: a * b,
        ast.Div: lambda a, b: a / b,
        ast.Pow: lambda a, b: a ** b,
    }
    _MATH_UNARY_OPS = {
        ast.UAdd: lambda a: +a,
        ast.USub: lambda a: -a,
    }
    # Upper bound on exponents so a model-produced "9**9**9" cannot stall the app.
    _MATH_MAX_EXPONENT = 1000
    # Everything that is not one of these tokens is dropped from the expression.
    _MATH_TOKEN_RE = re.compile(
        r"\d+\.?\d*|\.\d+"
        r"|\*\*|[-+*/()^]"
        r"|\b(?:sqrt|log10|log|sin|cos|tan|pi|e)\b"
    )

    def __init__(self):
        """Load the tokenizer, the model and the text2text-generation pipeline."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"⚡ Инициализация GAIAThoughtProcessor на {self.device.upper()}")

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.float16 if "cuda" in self.device else torch.float32,
            low_cpu_mem_usage=True,
        ).eval()

        # No `device=` here: the model was placed by `device_map="auto"`, and
        # recent transformers releases refuse to move such a model to an
        # explicit device when the pipeline is created.
        self.text_generator = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=512,
        )

        logger.info("✅ GAIAThoughtProcessor готов")

    def _eval_math_node(self, node):
        """Recursively evaluate a whitelisted arithmetic AST node.

        Only numeric literals, the names in ``_MATH_CONSTANTS``, calls to the
        functions in ``_MATH_FUNCTIONS`` and ``+ - * / **`` are accepted.

        Raises:
            ValueError: for any other syntax element or an oversized exponent.
        """
        if isinstance(node, ast.Constant):
            value = node.value
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                return value
            raise ValueError(f"Unsupported literal: {value!r}")

        if isinstance(node, ast.Name):
            if node.id in self._MATH_CONSTANTS:
                return self._MATH_CONSTANTS[node.id]
            raise ValueError(f"Unknown name: {node.id}")

        if isinstance(node, ast.UnaryOp) and type(node.op) in self._MATH_UNARY_OPS:
            return self._MATH_UNARY_OPS[type(node.op)](self._eval_math_node(node.operand))

        if isinstance(node, ast.BinOp) and type(node.op) in self._MATH_BINARY_OPS:
            left = self._eval_math_node(node.left)
            right = self._eval_math_node(node.right)
            if isinstance(node.op, ast.Pow) and abs(right) > self._MATH_MAX_EXPONENT:
                raise ValueError("Exponent too large")
            return self._MATH_BINARY_OPS[type(node.op)](left, right)

        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id in self._MATH_FUNCTIONS
            and not node.keywords
        ):
            call_args = [self._eval_math_node(arg) for arg in node.args]
            return self._MATH_FUNCTIONS[node.func.id](*call_args)

        raise ValueError(f"Unsupported expression element: {type(node).__name__}")

    def _math_solver(self, expression: str) -> str:
        """Safely evaluate an arithmetic expression.

        The input is reduced to numbers, operators, parentheses and the
        whitelisted names (``sqrt``, ``log``, ``log10``, ``sin``, ``cos``,
        ``tan``, ``pi``, ``e``); ``^`` is treated as exponentiation, ``√`` as
        ``sqrt`` and ``π`` as ``pi``. The expression is evaluated by walking
        its AST instead of calling ``eval``.

        Returns:
            The result converted with ``str``, or ``"Math Error: ..."`` when
            the expression cannot be evaluated.
        """
        try:
            normalized = expression.lower().replace("π", " pi ")
            # "√16" -> "sqrt(16)"; any remaining "√" is expected to precede "(".
            normalized = re.sub(r"√\s*(\d+\.?\d*|\.\d+)", r" sqrt(\1) ", normalized)
            normalized = normalized.replace("√", " sqrt ")

            tokens = self._MATH_TOKEN_RE.findall(normalized)
            clean_expr = "".join("**" if tok == "^" else tok for tok in tokens)
            if not clean_expr:
                raise ValueError("no evaluable expression found")

            tree = ast.parse(clean_expr, mode="eval")
            return str(self._eval_math_node(tree.body))
        except Exception as e:
            logger.error(f"Math error: {e}")
            return f"Math Error: {str(e)}"

    def _table_analyzer(self, table_data: str, query: str) -> str:
        """Parse tabular text and answer a simple aggregate query.

        The separator is chosen by content: tab-separated if the data contains
        a tab, comma-separated if it contains a comma, fixed-width otherwise.
        The first of ``sum``, ``mean``, ``max``, ``min``, ``count`` found in
        the lower-cased query selects the aggregate; with none of them present
        the ``describe()`` summary is returned.

        Returns:
            The aggregate as text, or ``"Table Error: ..."`` on failure.
        """
        try:
            buffer = io.StringIO(table_data)
            if "\t" in table_data:
                df = pd.read_csv(buffer, sep="\t")
            elif "," in table_data:
                df = pd.read_csv(buffer)
            else:
                df = pd.read_fwf(buffer)

            query = query.lower()
            # Checked in this order, so "sum" wins over "mean", and so on.
            numeric_aggregates = (
                ("sum", df.sum),
                ("mean", df.mean),
                ("max", df.max),
                ("min", df.min),
            )
            for keyword, aggregate in numeric_aggregates:
                if keyword in query:
                    return str(aggregate(numeric_only=True).to_dict())
            if "count" in query:
                return str(df.count().to_dict())
            return df.describe().to_string()
        except Exception as e:
            logger.error(f"Table error: {e}")
            return f"Table Error: {str(e)}"

    def _text_processor(self, text: str, operation: str) -> str:
        """Apply a named text operation.

        Supported operations: ``reverse``, ``count_words``,
        ``extract_numbers``, ``uppercase``, ``lowercase``. Any other name
        yields ``"Unsupported operation: <name>"``.
        """
        operation = operation.strip().lower()
        if operation == "reverse":
            return text[::-1]
        if operation == "count_words":
            return str(len(text.split()))
        if operation == "extract_numbers":
            # The optional sign applies to integers and decimals alike.
            return ", ".join(re.findall(r"[-+]?(?:\d*\.\d+|\d+)", text))
        if operation == "uppercase":
            return text.upper()
        if operation == "lowercase":
            return text.lower()
        return f"Unsupported operation: {operation}"

    def _image_processor(self, image_input: str) -> str:
        """Describe an image given as an HTTP(S) URL or a ``data:image`` URI.

        Returns:
            A one-line summary (format, size, mode, number of distinct pixel
            values), ``"Invalid image format"`` for any other kind of input,
            or an ``"... Error: ..."`` string when loading fails.
        """
        try:
            if image_input.startswith("http"):
                response = requests.get(image_input, timeout=30)
                response.raise_for_status()
                img_data = response.content
            elif image_input.startswith("data:image"):
                # Everything after the first comma is the base64 payload.
                _, encoded = image_input.split(",", 1)
                img_data = base64.b64decode(encoded)
            else:
                return "Invalid image format"

            img = Image.open(io.BytesIO(img_data))
            return (
                f"Format: {img.format}, Size: {img.size}, "
                f"Mode: {img.mode}, Colors: {len(set(img.getdata()))}"
            )
        except (UnidentifiedImageError, requests.exceptions.RequestException) as e:
            logger.error(f"Image processing error: {e}")
            return f"Image Error: {str(e)}"
        except Exception as e:
            logger.exception("Unexpected image error")
            return f"Unexpected Error: {str(e)}"

    def _call_tool(self, tool_name: str, arguments: str) -> str:
        """Dispatch one decomposition step to the named tool.

        Single-argument tools (``math_solver``, ``image_processor``) receive
        the whole argument string. Two-argument tools (``table_analyzer``,
        ``text_processor``) are split on the LAST comma, so the data part may
        itself contain commas (CSV rows, ordinary sentences).

        Returns:
            The tool output, ``"Unknown tool: <name>"``, or
            ``"Tool Error: ..."`` when the arguments are unusable.
        """
        try:
            payload = arguments.strip()

            if tool_name == "math_solver":
                return self._math_solver(payload)
            if tool_name == "image_processor":
                return self._image_processor(payload)

            if tool_name in ("table_analyzer", "text_processor"):
                data, separator, option = payload.rpartition(",")
                if not separator:
                    return f"Tool Error: {tool_name} expects '<data>, <option>'"
                data, option = data.strip(), option.strip()
                if tool_name == "table_analyzer":
                    return self._table_analyzer(data, option)
                return self._text_processor(data, option)

            return f"Unknown tool: {tool_name}"
        except Exception as e:
            return f"Tool Error: {str(e)}"

    def _generate_response(self, prompt: str) -> str:
        """Run the model on ``prompt`` and return the generated text.

        Returns ``"Generation Error: ..."`` instead of raising when the
        pipeline fails. The CUDA cache is released after every call when
        running on GPU.
        """
        try:
            # Beam search without sampling; a `temperature` value would be
            # ignored in this mode, so none is passed.
            result = self.text_generator(
                prompt,
                max_new_tokens=256,
                num_beams=3,
                early_stopping=True,
            )
            return result[0]['generated_text']
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return f"Generation Error: {str(e)}"
        finally:
            if "cuda" in self.device:
                torch.cuda.empty_cache()

    @staticmethod
    def _extract_final_answer(raw_response: str):
        """Pull the answer value out of the model's synthesis output.

        Tries, in order: a JSON object containing ``final_answer``; a loose
        ``final_answer: value`` / ``final_answer = value`` pattern; and
        finally the stripped response text itself.
        """
        text = raw_response.strip()

        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            try:
                parsed = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict) and "final_answer" in parsed:
                return parsed["final_answer"]

        loose_match = re.search(r"final_answer[\"']?\s*[:=]\s*(.+)", text, re.DOTALL)
        if loose_match:
            value = loose_match.group(1).strip().rstrip("}").strip().strip("\"'").strip()
            if value:
                return value

        return text

    def process_question(self, question: str, task_id: str) -> str:
        """Answer a question by decomposing it into tool steps.

        Args:
            question: The task text.
            task_id: Task identifier, included in the prompts and in the
                error payload.

        Returns:
            A JSON string with a ``final_answer`` field. On an unexpected
            failure the JSON also carries ``task_id`` and ``error``.
        """
        try:
            decomposition_prompt = (
                f"Декомпозируй задачу GAIA ({task_id}) на шаги. "
                f"Используй инструменты: math_solver, table_analyzer, text_processor, image_processor.\n\n"
                f"Задача: {question}\n\n"
                "Шаги (формат: [tool_name] arguments):"
            )
            steps_response = self._generate_response(decomposition_prompt)
            steps = [line.strip() for line in steps_response.split("\n") if line.strip()]

            results = []
            for step in steps:
                try:
                    match = re.match(r"\[(\w+)\]\s*(.+)", step)
                    if match is None:
                        results.append(f"{step} -> ERROR: Invalid format")
                        continue
                    outcome = self._call_tool(match.group(1), match.group(2))
                    results.append(f"{step} -> {outcome}")
                except Exception as e:
                    results.append(f"{step} -> ERROR: {str(e)}")

            synthesis_prompt = (
                f"Задача GAIA {task_id}:\n{question}\n\n"
                "Выполненные шаги:\n" + "\n".join(results) +
                "\n\nФинальный ответ в формате JSON (только поле final_answer):"
            )
            final_response = self._generate_response(synthesis_prompt)

            # Always hand back well-formed JSON holding just the answer value,
            # never the raw model output wrapped a second time.
            return json.dumps({"final_answer": self._extract_final_answer(final_response)})
        except Exception as e:
            logger.exception("Processing failed")
            return json.dumps({
                "task_id": task_id,
                "error": str(e),
                "final_answer": f"SYSTEM ERROR: {str(e)}"
            })
|
|
|
|
|
class GAIAEvaluationRunner:
    """Fetches questions from the scoring API, runs an agent over them and
    submits the collected answers."""

    def __init__(self, api_url: str = DEFAULT_API_URL):
        """Prepare the endpoint URLs and a shared HTTP session.

        Args:
            api_url: Base URL of the scoring service; ``/questions`` and
                ``/submit`` are appended to it.
        """
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": "GAIA-Mastermind/1.0",
            "Content-Type": "application/json"
        })
        logger.info(f"🌐 Инициализирован GAIAEvaluationRunner для {api_url}")

    @staticmethod
    def _extract_answer(raw_response) -> str:
        """Return the plain-text answer contained in an agent response.

        A JSON object contributes its ``final_answer`` field (empty string if
        absent), any other JSON value is stringified, and a response that is
        not valid JSON is used verbatim.
        """
        try:
            parsed = json.loads(raw_response)
        except (json.JSONDecodeError, TypeError):
            return str(raw_response)
        if isinstance(parsed, dict):
            return str(parsed.get("final_answer", ""))
        return str(parsed)

    @staticmethod
    def _to_submission_item(answer: dict) -> dict:
        """Convert an internal answer record into a submission payload item.

        Callers build records as ``{"task_id", "answer"}``; records that
        already carry ``submitted_answer`` are passed through unchanged.
        """
        # NOTE(review): the public course template for this scoring API posts
        # the answer under "submitted_answer" - confirm against the service's
        # OpenAPI schema.
        return {
            "task_id": answer.get("task_id"),
            "submitted_answer": answer.get("submitted_answer", answer.get("answer", "")),
        }

    def run_evaluation(self, agent, username: str, agent_code: str, progress=tqdm):
        """Run ``agent`` over every fetched question and submit the answers.

        Args:
            agent: Object exposing ``process_question(question, task_id)``.
            username: Account name sent with the submission.
            agent_code: Agent URL/identifier sent with the submission.
            progress: Callable wrapping the question iterable (tqdm-style,
                called as ``progress(iterable, desc=...)``).

        Returns:
            ``(status message, score, number of questions, results DataFrame)``.
            When fetching fails the message is the fetch status and the other
            values are ``0, 0`` and an empty DataFrame.
        """
        questions, status = self._fetch_questions()
        if status != "success":
            return status, 0, 0, pd.DataFrame()

        results = []
        answers = []
        for i, q in enumerate(progress(questions, desc="🧠 Processing GAIA")):
            # Resolved before the try block so the error branch never sees an
            # unbound or stale task id.
            task_id = q.get("task_id", f"unknown_{i}") if isinstance(q, dict) else f"unknown_{i}"
            try:
                question_text = q["question"]
                final_answer = self._extract_answer(
                    agent.process_question(question_text, task_id)
                )

                answers.append({
                    "task_id": task_id,
                    "answer": final_answer[:500]
                })
                results.append({
                    "Task ID": task_id,
                    "Question": question_text[:150] + "..." if len(question_text) > 150 else question_text,
                    "Answer": final_answer[:200],
                    "Status": "Processed"
                })
            except Exception as e:
                logger.error(f"Task {task_id} failed: {e}")
                answers.append({
                    "task_id": task_id,
                    "answer": f"ERROR: {str(e)}"
                })
                results.append({
                    "Task ID": task_id,
                    "Question": "Error",
                    "Answer": f"ERROR: {str(e)}",
                    "Status": "Failed"
                })

        submission_result, score = self._submit_answers(username, agent_code, answers)
        return submission_result, score, len(questions), pd.DataFrame(results)

    def _fetch_questions(self) -> Tuple[list, str]:
        """Download the question list.

        Rate-limit responses (HTTP 429) and network failures are retried up
        to ``API_RETRIES`` attempts in total; any other non-200 status ends
        the call immediately.

        Returns:
            ``(questions, "success")`` or ``([], error message)``.
        """
        failure = "API unavailable after retries"
        for attempt in range(1, API_RETRIES + 1):
            is_last_attempt = attempt == API_RETRIES
            try:
                response = self.session.get(self.questions_url, timeout=API_TIMEOUT)
            except requests.exceptions.RequestException as e:
                logger.error(f"Fetch error: {e}")
                failure = f"Connection error: {str(e)}"
                if not is_last_attempt:
                    time.sleep(2)
                continue

            if response.status_code == 429:
                logger.warning("Rate limited, retrying...")
                failure = "API unavailable after retries"
                if not is_last_attempt:
                    time.sleep(5)
                continue

            if response.status_code != 200:
                return [], f"API error: HTTP {response.status_code}"

            try:
                questions = response.json()
            except ValueError as e:
                logger.error(f"Fetch error: {e}")
                return [], f"Invalid response format: {str(e)}"

            if not isinstance(questions, list):
                return [], "Invalid response format: expected list"

            # Give id-less questions a deterministic placeholder id. The index
            # is used because str hashes differ between interpreter runs.
            for index, q in enumerate(questions):
                if isinstance(q, dict) and "task_id" not in q:
                    q["task_id"] = f"id_{index}"
            return questions, "success"

        return [], failure

    def _submit_answers(self, username: str, agent_code: str, answers: list) -> Tuple[str, int]:
        """Post the collected answers to the scoring service.

        Args:
            username: Account name; surrounding whitespace is removed.
            agent_code: Agent URL/identifier; surrounding whitespace is removed.
            answers: Records with ``task_id`` and ``answer`` (or
                ``submitted_answer``).

        Returns:
            ``(message, score)``; the score is ``0`` on any failure. Only
            rate-limit responses (HTTP 429) are retried - a failed POST is not
            re-sent automatically.
        """
        payload = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": [self._to_submission_item(item) for item in answers]
        }

        for attempt in range(1, API_RETRIES + 1):
            try:
                response = self.session.post(
                    self.submit_url,
                    json=payload,
                    timeout=API_TIMEOUT * 2
                )

                if response.status_code == 200:
                    result = response.json()
                    score = result.get("score", 0)
                    return result.get("message", "Answers submitted"), score

                if response.status_code in (400, 422):
                    # The error body may be non-JSON or use either key;
                    # presumably "detail" for schema-validation failures.
                    try:
                        body = response.json()
                    except ValueError:
                        body = {}
                    if isinstance(body, dict):
                        error = body.get("error") or body.get("detail") or "Invalid request"
                    else:
                        error = body or "Invalid request"
                    logger.error(f"Validation error: {error}")
                    return f"Validation Error: {error}", 0

                if response.status_code == 429:
                    logger.warning("Rate limited, retrying...")
                    if attempt < API_RETRIES:
                        time.sleep(10)
                    continue

                return f"HTTP Error {response.status_code}", 0

            except Exception as e:
                logger.error(f"Submit error: {e}")
                return f"Connection Error: {str(e)}", 0

        return "Submission failed after retries", 0
|
|
|
|
|
def run_evaluation(username: str, agent_code: str, progress=gr.Progress()):
    """Gradio click handler: build the agent, answer every question, submit.

    Args:
        username: Account name sent with the submission.
        agent_code: Agent URL/identifier sent with the submission.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio injects the tracker into an event handler.

    Returns:
        ``(status message, score, number of questions, results DataFrame)``,
        matching the four output components wired to the button. If the agent
        cannot be created or the questions cannot be fetched, the message
        describes the failure and the remaining values are ``0, 0`` and an
        empty DataFrame.
    """
    progress(0, desc="⚡ Инициализация GAIA Mastermind...")
    # NOTE: a new agent - and therefore a fresh model load - is created on
    # every invocation.
    try:
        agent = GAIAThoughtProcessor()
    except Exception as e:
        logger.exception("Agent initialization failed")
        return f"Agent Error: {str(e)}", 0, 0, pd.DataFrame()

    progress(0.1, desc="🌐 Подключение к GAIA API...")
    runner = GAIAEvaluationRunner()

    questions, status = runner._fetch_questions()
    if status != "success":
        return status, 0, 0, pd.DataFrame()

    results = []
    answers = []
    total = len(questions)

    for i, q in enumerate(questions):
        progress(i / total, desc=f"🧠 Обработка задач ({i+1}/{total})")
        # Resolved before the try block so the error branch never sees an
        # unbound or stale task id.
        task_id = q.get("task_id", f"unknown_{i}") if isinstance(q, dict) else f"unknown_{i}"
        try:
            question_text = q["question"]
            json_response = agent.process_question(question_text, task_id)

            # Use the "final_answer" field of a JSON object; anything else
            # (invalid JSON, or JSON that is not an object) is used verbatim.
            try:
                response_obj = json.loads(json_response)
            except (json.JSONDecodeError, TypeError):
                final_answer = json_response
            else:
                if isinstance(response_obj, dict):
                    final_answer = response_obj.get("final_answer", "")
                else:
                    final_answer = json_response

            answers.append({
                "task_id": task_id,
                "answer": str(final_answer)[:500]
            })
            results.append({
                "Task ID": task_id,
                "Question": question_text[:150] + "..." if len(question_text) > 150 else question_text,
                "Answer": str(final_answer)[:200],
                "Status": "Processed"
            })
        except Exception as e:
            logger.error(f"Task {task_id} failed: {e}")
            answers.append({
                "task_id": task_id,
                "answer": f"ERROR: {str(e)}"
            })
            results.append({
                "Task ID": task_id,
                "Question": "Error",
                "Answer": f"ERROR: {str(e)}",
                "Status": "Failed"
            })

    submission_result, score = runner._submit_answers(username, agent_code, answers)
    return submission_result, score, total, pd.DataFrame(results)
|
|
|
|
|
# Gradio interface. Components are laid out in creation order, so the order of
# the statements below is the on-screen order.
with gr.Blocks(
    title="🧠 GAIA Mastermind",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {background: linear-gradient(135deg, #1a2a6c, #2c5364)}
    .dark {color: #f0f0f0}
    """
) as demo:
    # Page banner.
    gr.Markdown("""
    <div style="text-align:center; background: linear-gradient(135deg, #0f2027, #203a43);
                padding: 20px; border-radius: 15px; color: white; box-shadow: 0 10px 20px rgba(0,0,0,0.3);">
        <h1>🧠 GAIA Mastermind</h1>
        <h3>Многошаговое решение задач с декомпозицией</h3>
        <p>Соответствует спецификации GAIA API</p>
    </div>
    """)

    with gr.Row():
        # Left column: credentials, the run button and the system status line.
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 Авторизация")
            username = gr.Textbox(
                label="HF Username",
                value="yoshizen",
                info="Ваше имя пользователя Hugging Face"
            )
            agent_code = gr.Textbox(
                label="Agent Code",
                value="https://huggingface.co/spaces/yoshizen/FinalTest",
                info="URL вашего агента"
            )
            run_btn = gr.Button("🚀 Запустить оценку", variant="primary", scale=1)

            gr.Markdown("### ⚙️ Статус системы")
            sys_info = gr.Textbox(label="Системная информация", interactive=False, value="")

        # Right column: submission status, counters and the per-task table.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Результаты GAIA")
            with gr.Row():
                result_output = gr.Textbox(
                    label="Статус отправки",
                    interactive=False,
                    max_lines=3
                )
                correct_output = gr.Number(
                    label="✅ Правильные ответы",
                    interactive=False
                )
                total_output = gr.Number(
                    label="📚 Всего вопросов",
                    interactive=False
                )

            results_table = gr.Dataframe(
                label="🔍 Детализация ответов",
                headers=["Task ID", "Question", "Answer", "Status"],
                interactive=False
            )

    def get_system_info():
        """Return the one-line summary of device, model name and API URL."""
        if torch.cuda.is_available():
            device = "GPU ✅"
        else:
            device = "CPU ⚠️"
        return f"Device: {device} | Model: {MODEL_NAME} | API: {DEFAULT_API_URL}"

    # Fill the status box once when the page loads.
    demo.load(get_system_info, inputs=None, outputs=sys_info)

    # The four outputs receive the handler's four return values, in order.
    run_btn.click(
        fn=run_evaluation,
        inputs=[username, agent_code],
        outputs=[result_output, correct_output, total_output, results_table],
        concurrency_limit=1,
        show_progress="minimal"
    )
|
|
|
if __name__ == "__main__":
    # Enable the request queue (at most five waiting requests), then serve on
    # all interfaces at port 7860 without a public share link.
    queued_app = demo.queue(max_size=5)
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        debug=False,
    )