import ast
import base64
import io
import json
import logging
import re
import time
import zlib
from typing import List, Dict, Any, Tuple, Optional

import gradio as gr
import numpy as np
import pandas as pd
import requests
import torch
from llama_index.core import Settings
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.llms.huggingface import HuggingFaceLLM
from PIL import Image, UnidentifiedImageError
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
# Configure root logging once at import time and create the module logger
# that every class and function in this file writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("GAIA-Mastermind")
|
|
|
|
|
# Base URL of the scoring service; "/questions" and "/submit" are appended to it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Hugging Face model identifier used for both the model and its tokenizer.
MODEL_NAME = "google/flan-t5-xxl"
# Maximum number of attempts for each HTTP call to the scoring service.
API_RETRIES = 3
# Per-request timeout in seconds (doubled for the submit call).
API_TIMEOUT = 45
|
|
|
|
|
class GAIAThoughtProcessor:
    """Answer a question by decomposing it, running each step through a
    tool-using ReAct agent, and synthesising a JSON answer with the LLM.

    The four tools exposed to the agent (math, table, text, image) are plain
    methods of this class and can also be called directly.
    """

    # Names an expression handed to ``_math_solver`` is allowed to reference.
    _MATH_CONTEXT = {
        "sqrt": np.sqrt,
        "log": np.log,
        "log10": np.log10,
        "pi": np.pi,
        "e": np.e,
        "sin": np.sin,
        "cos": np.cos,
        "tan": np.tan,
    }

    # Lexer for math expressions: numbers, identifiers, operators, parentheses.
    # Anything that does not match (punctuation, non-ASCII text, ...) is dropped.
    _MATH_TOKEN = re.compile(
        r"\d+(?:\.\d+)?(?:[eE][-+]?\d+)?"
        r"|\.\d+"
        r"|[A-Za-z_][A-Za-z_0-9]*"
        r"|\*\*|[-+*/()]"
    )

    # Signed integers and decimals; the sign applies to both alternatives.
    _NUMBER = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")

    def __init__(self):
        """Load the LLM, build the tool list and create the ReAct agent."""
        # NOTE(review): MODEL_NAME is a T5 (seq2seq) checkpoint while the file
        # imports AutoModelForSeq2SeqLM without using it - confirm that
        # HuggingFaceLLM can load and decode this architecture.
        self.llm = HuggingFaceLLM(
            model_name=MODEL_NAME,
            tokenizer_name=MODEL_NAME,
            context_window=2048,
            max_new_tokens=512,
            device_map="auto",
            # device_map is deliberately NOT repeated here: it is already
            # passed above, and supplying it twice yields a duplicate keyword
            # argument when the model is instantiated.
            model_kwargs={
                "torch_dtype": torch.float16,
                "load_in_4bit": True,
            },
            generate_kwargs={"temperature": 0.01, "do_sample": False},
        )
        self.tools = self._create_gaia_tools()
        # NOTE(review): react_mode is forwarded as an extra keyword; verify
        # that the installed ReActAgent.from_tools actually honours it.
        self.agent = ReActAgent.from_tools(
            self.tools,
            llm=self.llm,
            verbose=True,
            max_iterations=10,
            react_mode="plan_and_solve",
        )
        logger.info("⚙️ Инициализирован GAIAThoughtProcessor с %d инструментами", len(self.tools))

    def _create_gaia_tools(self) -> "List[FunctionTool]":
        """Wrap the four helper methods as agent tools and return them."""
        return [
            FunctionTool.from_defaults(
                fn=self._math_solver,
                name="math_solver",
                description="Вычисляет математические выражения. Ввод: строка с выражением (например, '2+2*3')",
            ),
            FunctionTool.from_defaults(
                fn=self._table_analyzer,
                name="table_analyzer",
                description="Анализирует табличные данные. Ввод: (table_data:str, query:str)",
            ),
            FunctionTool.from_defaults(
                fn=self._text_processor,
                name="text_processor",
                description="Операции с текстом: reverse, count_words, extract_numbers. Ввод: (text:str, operation:str)",
            ),
            FunctionTool.from_defaults(
                fn=self._image_processor,
                name="image_processor",
                description="Анализирует изображения. Ввод: base64 изображения или URL",
            ),
        ]

    def _math_solver(self, expression: str) -> str:
        """Evaluate an arithmetic expression and return the result as text.

        Supports ``+ - * / ( )``, ``^`` or ``**`` for powers, ``√`` / ``π`` and
        the names in ``_MATH_CONTEXT``. Unknown words and characters are
        dropped, so a prefix such as "Calculate" does not break evaluation.

        Returns:
            The stringified result, or ``"Math Error: ..."`` on any failure.
        """
        try:
            # "1,000" -> "1000": drop thousands separators before tokenising.
            expr = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", expression)
            # "√16" -> "sqrt(16)"; any remaining "√" precedes a parenthesis.
            expr = re.sub(r"√\s*(\d+(?:\.\d+)?)", r"sqrt(\1)", expr)
            # In Python "^" is XOR, so map it to the power operator.
            expr = expr.replace("√", "sqrt").replace("π", "pi").replace("^", "**")

            tokens = [
                tok
                for tok in self._MATH_TOKEN.findall(expr)
                # Identifiers survive only when they are whitelisted names.
                if not (tok[0].isalpha() or tok[0] == "_") or tok in self._MATH_CONTEXT
            ]
            if not tokens:
                raise ValueError("empty expression")
            # Join with spaces so dropped words never fuse two numbers together.
            clean_expr = " ".join(tokens)

            # SECURITY: eval() runs on agent-supplied text. The token filter
            # above leaves only numbers, operators and whitelisted names (no
            # attribute access, no builtins), but expressions such as huge
            # exponents can still be expensive to compute.
            return str(eval(clean_expr, {"__builtins__": {}}, dict(self._MATH_CONTEXT)))
        except Exception as e:
            logger.error("Math error: %s", e)
            return f"Math Error: {str(e)}"

    def _table_analyzer(self, table_data: str, query: str) -> str:
        """Parse tabular text and answer an aggregate or filter query.

        The delimiter is sniffed (tab, then comma, else fixed-width). If the
        lower-cased query contains ``sum``, ``mean``, ``max``, ``min`` or
        ``count`` (checked in that order, as substrings) the matching column
        aggregate is returned as a dict string. Otherwise the query is passed
        to ``DataFrame.query``; if that fails, ``describe()`` is returned.

        Returns:
            The result as text, or ``"Table Error: ..."`` if parsing fails.
        """
        try:
            if "\t" in table_data:
                df = pd.read_csv(io.StringIO(table_data), sep="\t")
            elif "," in table_data:
                df = pd.read_csv(io.StringIO(table_data))
            else:
                df = pd.read_fwf(io.StringIO(table_data))

            aggregations = (
                ("sum", lambda frame: frame.sum(numeric_only=True)),
                ("mean", lambda frame: frame.mean(numeric_only=True)),
                ("max", lambda frame: frame.max(numeric_only=True)),
                ("min", lambda frame: frame.min(numeric_only=True)),
                ("count", lambda frame: frame.count()),
            )
            lowered = query.lower()
            for keyword, aggregate in aggregations:
                if keyword in lowered:
                    return str(aggregate(df).to_dict())

            try:
                return df.query(query).to_string()
            except Exception as query_error:
                # Best effort: a free-form query that pandas cannot evaluate
                # falls back to summary statistics instead of failing.
                logger.warning("Table query failed, returning describe(): %s", query_error)
                return df.describe().to_string()
        except Exception as e:
            logger.error("Table error: %s", e)
            return f"Table Error: {str(e)}"

    def _text_processor(self, text: str, operation: str) -> str:
        """Apply a named text operation.

        Supported operations (case-insensitive, surrounding whitespace
        ignored): ``reverse``, ``count_words``, ``extract_numbers``,
        ``uppercase``, ``lowercase``.

        Returns:
            The transformed text, or ``"Unsupported operation: <name>"``.
        """
        operation = operation.strip().lower()
        if operation == "reverse":
            return text[::-1]
        if operation == "count_words":
            return str(len(text.split()))
        if operation == "extract_numbers":
            return ", ".join(self._NUMBER.findall(text))
        if operation == "uppercase":
            return text.upper()
        if operation == "lowercase":
            return text.lower()
        return f"Unsupported operation: {operation}"

    def _image_processor(self, image_input: str) -> str:
        """Describe an image given as an http(s) URL or a ``data:image`` URI.

        Returns:
            A line with format, size, mode and number of distinct pixel
            values; ``"Invalid image format"`` for any other input; or an
            ``"Image Error: ..."`` / ``"Unexpected Error: ..."`` message.
        """
        try:
            if image_input.startswith("http"):
                # NOTE(review): the URL comes from the agent and is fetched
                # as-is - consider restricting the hosts that may be reached.
                response = requests.get(image_input, timeout=30)
                response.raise_for_status()
                img = Image.open(io.BytesIO(response.content))
            elif image_input.startswith("data:image"):
                # Everything after the first comma is the base64 payload.
                _, data = image_input.split(",", 1)
                img = Image.open(io.BytesIO(base64.b64decode(data)))
            else:
                return "Invalid image format"

            return (
                f"Format: {img.format}, Size: {img.size}, "
                f"Mode: {img.mode}, Colors: {len(set(img.getdata()))}"
            )
        except (UnidentifiedImageError, requests.exceptions.RequestException) as e:
            logger.error("Image processing error: %s", e)
            return f"Image Error: {str(e)}"
        except Exception as e:
            logger.exception("Unexpected image error")
            return f"Unexpected Error: {str(e)}"

    def process_question(self, question: str, task_id: str) -> str:
        """Solve one question and return the answer as a JSON string.

        The LLM first splits the question into semicolon-separated steps, the
        agent executes each step (a failing step is recorded, not raised), and
        the LLM is then asked for a final JSON answer.

        Returns:
            A JSON document. It is the object emitted by the LLM when that
            parses, otherwise an object with ``final_answer``, ``task_id`` and
            ``reasoning_steps``. On an unexpected failure the object carries
            ``task_id``, ``error`` and ``final_answer``.
        """
        try:
            decomposition_prompt = (
                f"Декомпозируй задачу GAIA ({task_id}) на шаги:\n{question}\n\n"
                "Шаги (разделены точкой с запятой):"
            )
            steps_response = self.llm.complete(decomposition_prompt)
            steps = [s.strip() for s in steps_response.text.split(";") if s.strip()]

            results = []
            for step in steps:
                try:
                    result = self.agent.chat(step)
                    results.append(f"{step}: {result}")
                except Exception as e:
                    results.append(f"{step}: ERROR - {str(e)}")

            synthesis_prompt = (
                f"Задача GAIA {task_id}:\n{question}\n\n"
                "Выполненные шаги:\n" + "\n".join(results) +
                "\n\nФинальный ответ в формате JSON:"
            )
            final_response = self.llm.complete(synthesis_prompt)

            answer_match = re.search(r"\{.*\}", final_response.text, re.DOTALL)
            if answer_match:
                candidate = answer_match.group(0)
                try:
                    # The greedy match may span unrelated braces; only hand it
                    # back when it really is a JSON document.
                    json.loads(candidate)
                except ValueError:
                    logger.warning("Task %s: model output is not valid JSON, wrapping raw text", task_id)
                else:
                    return candidate

            return json.dumps({
                "final_answer": final_response.text.strip(),
                "task_id": task_id,
                "reasoning_steps": results,
            })
        except Exception as e:
            logger.exception("Processing failed")
            return json.dumps({
                "task_id": task_id,
                "error": str(e),
                "final_answer": f"SYSTEM ERROR: {str(e)}",
            })
|
|
|
|
|
class GAIAEvaluationRunner:
    """Fetch questions from the scoring API, run an agent on each one and
    submit the collected answers."""

    def __init__(self, api_url: str = DEFAULT_API_URL):
        """Create an HTTP session for ``api_url`` and derive the endpoints."""
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": "GAIA-Mastermind/1.0",
            "Content-Type": "application/json",
        })
        logger.info("🌐 Инициализирован GAIAEvaluationRunner для %s", api_url)

    def run_evaluation(self, agent, username: str, agent_code: str, progress=tqdm):
        """Run ``agent`` over every fetched question and submit the answers.

        Args:
            agent: object exposing ``process_question(question, task_id)``
                that returns a JSON string.
            username: account name sent with the submission.
            agent_code: agent URL sent with the submission.
            progress: callable invoked as ``progress(iterable, desc=...)``
                that returns an iterable over the same items (tqdm-style).

        Returns:
            ``(status message, score, number of questions, results DataFrame)``.
            When fetching fails, the status is the fetch error, both numbers
            are 0 and the DataFrame is empty.
        """
        questions, status = self._fetch_questions()
        if status != "success":
            return status, 0, 0, pd.DataFrame()

        results = []
        answers = []
        for i, q in enumerate(progress(questions, desc="🧠 Processing GAIA")):
            # Resolve the id before entering the try block so the error
            # handler never sees an unbound or stale task_id.
            fallback_id = f"unknown_{i}"
            task_id = q.get("task_id", fallback_id) if isinstance(q, dict) else fallback_id
            try:
                question_text = q["question"]
                json_response = agent.process_question(question_text, task_id)

                try:
                    response_obj = json.loads(json_response)
                except json.JSONDecodeError:
                    final_answer = json_response
                else:
                    if isinstance(response_obj, dict):
                        final_answer = response_obj.get("final_answer", "")
                        if not isinstance(final_answer, str):
                            final_answer = str(final_answer)
                    else:
                        # Valid JSON but not an object: keep the raw text.
                        final_answer = json_response

                answers.append({
                    "task_id": task_id,
                    "answer": final_answer[:500],
                })
                results.append({
                    "Task ID": task_id,
                    "Question": question_text[:150] + "..." if len(question_text) > 150 else question_text,
                    "Answer": final_answer[:200],
                    "Status": "Processed",
                })
            except Exception as e:
                logger.error("Task %s failed: %s", task_id, e)
                answers.append({
                    "task_id": task_id,
                    "answer": f"ERROR: {str(e)}",
                })
                results.append({
                    "Task ID": task_id,
                    "Question": "Error",
                    "Answer": f"ERROR: {str(e)}",
                    "Status": "Failed",
                })

        submission_result, score = self._submit_answers(username, agent_code, answers)
        return submission_result, score, len(questions), pd.DataFrame(results)

    def _fetch_questions(self) -> Tuple[list, str]:
        """Download the question list.

        Only HTTP 429 is retried (up to ``API_RETRIES`` attempts, 5 s apart).
        Questions without a ``task_id`` get a synthetic one, and questions
        carrying an ``image`` key get an ``[IMAGE]`` prefix.

        Returns:
            ``(questions, "success")``, or ``([], error message)``.
        """
        for _ in range(API_RETRIES):
            try:
                response = self.session.get(self.questions_url, timeout=API_TIMEOUT)

                if response.status_code == 200:
                    questions = response.json()
                    if not isinstance(questions, list):
                        return [], "Invalid response format: expected list"

                    for q in questions:
                        if not isinstance(q, dict) or "question" not in q:
                            return [], "Invalid response format: each item needs a 'question' field"
                        if "task_id" not in q:
                            # crc32 is stable across interpreter runs, unlike
                            # the per-process salted built-in hash().
                            digest = zlib.crc32(str(q["question"]).encode("utf-8"))
                            q["task_id"] = f"id_{digest % 100000}"
                        if "image" in q:
                            q["question"] = f"[IMAGE] {q['question']}"
                    return questions, "success"

                if response.status_code == 429:
                    logger.warning("Rate limited, retrying...")
                    time.sleep(5)
                    continue

                if response.status_code == 404:
                    return [], "API endpoint not found"

                return [], f"API error: HTTP {response.status_code}"
            except Exception as e:
                logger.error("Fetch error: %s", e)
                return [], f"Connection error: {str(e)}"

        return [], "API unavailable after retries"

    def _submit_answers(self, username: str, agent_code: str, answers: list) -> Tuple[str, int]:
        """POST the answers to the scoring API.

        Only HTTP 429 is retried (up to ``API_RETRIES`` attempts, 10 s apart).

        Returns:
            ``(message, score)``; the score is 0 whenever submission fails.
        """
        payload = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": answers,
        }

        for _ in range(API_RETRIES):
            try:
                response = self.session.post(
                    self.submit_url,
                    json=payload,
                    timeout=API_TIMEOUT * 2,
                )

                if response.status_code == 200:
                    result = response.json()
                    score = result.get("score", 0)
                    return result.get("message", "Answers submitted"), score

                if response.status_code == 400:
                    try:
                        error = response.json().get("error", "Invalid request")
                    except (ValueError, AttributeError):
                        # Body is not a JSON object; still report it as a
                        # validation failure rather than a connection error.
                        error = "Invalid request"
                    logger.error("Validation error: %s", error)
                    return f"Validation Error: {error}", 0

                if response.status_code == 429:
                    logger.warning("Rate limited, retrying...")
                    time.sleep(10)
                    continue

                return f"HTTP Error {response.status_code}", 0
            except Exception as e:
                logger.error("Submit error: %s", e)
                return f"Connection Error: {str(e)}", 0

        return "Submission failed after retries", 0
|
|
|
|
|
def run_evaluation(username: str, agent_code: str, progress=gr.Progress()):
    """Gradio click handler: build the agent, run the evaluation, return results.

    Args:
        username: Hugging Face user name sent with the submission.
        agent_code: URL of the agent sent with the submission.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio injects a tracker into an event handler.

    Returns:
        ``(status message, score, number of questions, results DataFrame)``,
        matching the four output components wired to the button.
    """
    progress(0, desc="⚡ Инициализация GAIA Mastermind...")
    try:
        agent = GAIAThoughtProcessor()
    except Exception as e:
        logger.exception("Agent initialization failed")
        return f"Agent Error: {str(e)}", 0, 0, pd.DataFrame()

    progress(0.1, desc="🌐 Подключение к GAIA API...")
    runner = GAIAEvaluationRunner()

    # The runner calls ``progress(questions, desc=...)`` and iterates over the
    # result. ``progress.tqdm`` has exactly that shape and reports each step to
    # the UI; the previous hand-written wrapper class took different
    # constructor arguments and raised TypeError on that call.
    return runner.run_evaluation(
        agent,
        username,
        agent_code,
        progress=progress.tqdm,
    )
|
|
|
|
|
# Gradio UI: credentials and system status on the left, submission results on
# the right. The layout is built at import time so ``demo`` exists for both
# the __main__ launcher and external hosts that import this module.
with gr.Blocks(
    title="🧠 GAIA Mastermind",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {background: linear-gradient(135deg, #1a2a6c, #2c5364)}
    .dark {color: #f0f0f0}
    """
) as demo:
    gr.Markdown("""
    <div style="text-align:center; background: linear-gradient(135deg, #0f2027, #203a43);
    padding: 20px; border-radius: 15px; color: white; box-shadow: 0 10px 20px rgba(0,0,0,0.3);">
    <h1>🧠 GAIA Mastermind</h1>
    <h3>Многошаговое решение задач с Tree-of-Thought</h3>
    <p>Соответствует спецификации GAIA API v1.2</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 Авторизация")
            username = gr.Textbox(
                label="HF Username",
                value="yoshizen",
                info="Ваше имя пользователя Hugging Face"
            )
            agent_code = gr.Textbox(
                label="Agent Code",
                value="https://huggingface.co/spaces/yoshizen/FinalTest",
                info="URL вашего агента"
            )
            run_btn = gr.Button("🚀 Запустить оценку", variant="primary", scale=1)

            gr.Markdown("### ⚙️ Статус системы")
            sys_info = gr.Textbox(label="Системная информация", interactive=False)

        with gr.Column(scale=2):
            gr.Markdown("### 📊 Результаты GAIA")
            with gr.Row():
                result_output = gr.Textbox(
                    label="Статус отправки",
                    interactive=False,
                    max_lines=3
                )
                correct_output = gr.Number(
                    label="✅ Правильные ответы",
                    interactive=False
                )
                total_output = gr.Number(
                    label="📚 Всего вопросов",
                    interactive=False
                )

            with gr.Row():
                # ``overflow_row_behaviour`` was dropped: it is a Gradio 3
                # argument, while ``concurrency_limit`` on the click handler
                # below requires Gradio 4+, whose Dataframe rejects it.
                results_table = gr.Dataframe(
                    label="🔍 Детализация ответов",
                    headers=["Task ID", "Question", "Answer", "Status"],
                    interactive=False,
                    wrap=True,
                    height=400,
                    column_widths=["15%", "35%", "40%", "10%"]
                )

    def get_system_info():
        """Return a one-line summary of device, model name and API URL."""
        return (
            f"Device: {'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'}, "
            f"Model: {MODEL_NAME}, "
            f"API: {DEFAULT_API_URL}"
        )

    # Fill the status box once, when the page loads.
    demo.load(get_system_info, inputs=None, outputs=sys_info)

    # Only one evaluation may run at a time (concurrency_limit=1).
    run_btn.click(
        fn=run_evaluation,
        inputs=[username, agent_code],
        outputs=[result_output, correct_output, total_output, results_table],
        concurrency_limit=1,
        show_progress="minimal",
        api_name="run_evaluation"
    )
|
|
|
if __name__ == "__main__":
    # Queue at most five pending requests, keep the queue API closed, and
    # serve on all interfaces at port 7860.
    demo.queue(
        max_size=5,
        api_open=False
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        debug=False
    )