|
import re |
|
import requests |
|
import pandas as pd |
|
import torch |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import json |
|
import logging |
|
import time |
|
import sys |
|
import os |
|
from functools import lru_cache |
|
|
|
|
|
# Process-wide logging: INFO and above, timestamped, tagged with logger name.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("GAIA-Mastermind")

# Base URL of the scoring service; "/questions" and "/submit" are appended to it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Checkpoint identifier passed to the tokenizer/model `from_pretrained` calls.
MODEL_NAME = "google/flan-t5-small"
# `max_retries` value for the HTTPS transport adapter mounted on the session.
API_RETRIES = 3
# Request timeout in seconds for fetching questions (doubled for submission).
API_TIMEOUT = 30

# Point the Hugging Face caches at /tmp.
# NOTE(review): these assignments execute after `transformers` has already
# been imported at the top of the file; if the libraries resolve their cache
# paths at import time these values may have no effect -- confirm, and move
# the assignments above the import if so.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
        "HF_HOME": "/tmp/hf_home",
    }
)
|
|
|
class GAIAExpert:
    """Process-wide singleton that answers questions with a seq2seq model.

    The tokenizer is loaded on first construction; the model itself is loaded
    lazily on the first call to :meth:`process_question`. Successful answers
    are memoised per question in a small bounded LRU cache.
    """

    _instance = None
    _is_initialized = False
    # Maximum number of memoised answers kept per instance.
    _CACHE_SIZE = 100

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Pick the device and load the tokenizer (first construction only).

        ``__init__`` runs on every ``GAIAExpert()`` call even though
        ``__new__`` returns the same object, so the body is guarded by a
        class-level flag to avoid reloading the tokenizer.
        """
        if GAIAExpert._is_initialized:
            return

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Инициализация модели на {self.device.upper()}")

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = None  # loaded lazily by _ensure_model_loaded()
        # question -> JSON answer string; insertion order doubles as LRU order.
        self._answer_cache = {}
        GAIAExpert._is_initialized = True

    def _ensure_model_loaded(self):
        """Load the model on first use.

        Raises:
            RuntimeError: if the model cannot be loaded; the original
                exception is chained as the cause.
        """
        if self.model is not None:
            return

        try:
            logger.info("Загрузка модели...")
            # Half precision on GPU, full precision on CPU.
            dtype = torch.float16 if self.device == "cuda" else torch.float32
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
                device_map="auto",
            ).eval()
            logger.info("Модель успешно загружена")
        except Exception as e:
            logger.exception("Ошибка загрузки модели")
            raise RuntimeError(f"Ошибка инициализации: {str(e)}") from e

    def _generate_answer(self, question: str) -> str:
        """Run the model on *question* and return the stripped decoded text.

        Any exception from loading, tokenising or generating propagates to
        the caller.
        """
        self._ensure_model_loaded()

        inputs = self.tokenizer(
            f"Вопрос: {question}\nОтвет:",
            return_tensors="pt",
            max_length=256,
            truncation=True,
            padding="max_length",
        )

        if self.device == "cuda":
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            # Greedy decoding. `early_stopping` is intentionally not passed:
            # it only applies to beam search and num_beams is 1 here.
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=1,
                do_sample=False,
            )

        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return decoded.strip()

    def process_question(self, question: str) -> str:
        """Answer *question* and return a JSON string.

        Args:
            question: The question text; also used as the cache key.

        Returns:
            ``'{"final_answer": "<text>"}'`` on success, or
            ``'{"final_answer": "ERROR: <message>"}'`` if anything fails.
            Only successful results are cached, so a failed question is
            attempted again on the next call instead of replaying the error.
        """
        cached = self._answer_cache.pop(question, None)
        if cached is not None:
            # Re-insert so the entry becomes the most recently used one.
            self._answer_cache[question] = cached
            return cached

        try:
            answer = self._generate_answer(question)
        except Exception as e:
            # Best effort: report the failure in-band so the caller can
            # still submit something for this task, but leave a trace.
            logger.exception("Ошибка генерации ответа")
            return json.dumps({"final_answer": f"ERROR: {str(e)}"})

        result = json.dumps({"final_answer": answer})

        if len(self._answer_cache) >= self._CACHE_SIZE:
            # Evict the least recently used entry (first key in the dict).
            del self._answer_cache[next(iter(self._answer_cache))]
        self._answer_cache[question] = result
        return result
|
|
|
class GAIAEvaluator:
    """Console driver: fetch questions, answer them, submit the answers."""

    def __init__(self, api_url: str = DEFAULT_API_URL):
        """Prepare endpoint URLs and a reusable HTTP session.

        Args:
            api_url: Base URL of the scoring service; ``/questions`` and
                ``/submit`` are appended to it.
        """
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.session = requests.Session()
        self.session.headers.update({"Content-Type": "application/json"})

        # Transport-level retries, applied to https:// URLs only; the
        # *_with_retry methods add their own retry loop on top of this.
        self.session.mount('https://', requests.adapters.HTTPAdapter(max_retries=API_RETRIES))

    def run_evaluation(self, username: str, agent_code: str):
        """Run the full evaluation without any UI.

        Args:
            username: Submitter name sent with the answers.
            agent_code: Agent code reference sent with the answers.

        Returns:
            ``(score, total)`` as produced by the submission step, or
            ``(0, 0)`` when no questions could be obtained.
        """
        agent = GAIAExpert()

        questions = self._fetch_questions_with_retry()
        if not isinstance(questions, list):
            # The fetch helper signals failure by returning a message string.
            logger.error(f"Ошибка получения вопросов: {questions}")
            return 0, 0
        if not questions:
            # Nothing to answer: skip the pointless empty submission.
            logger.error("Получен пустой список вопросов")
            return 0, 0

        total = len(questions)
        answers = []

        for i, q in enumerate(questions):
            task_id = f"task_{i}"  # fallback if the entry has no usable id
            try:
                # Everything that touches the entry is inside the try, so a
                # single malformed entry becomes one ERROR answer instead of
                # aborting the whole run.
                task_id = q.get("task_id", task_id)
                question = q["question"]
                logger.info(f"Обработка задачи {i+1}/{total}: {question[:50]}...")

                json_response = agent.process_question(question)
                response_obj = json.loads(json_response)
                answer = response_obj.get("final_answer", "")
                answer_text = str(answer)[:300]
            except Exception as e:
                logger.error(f"Ошибка обработки: {str(e)}")
                answer_text = f"ERROR: {str(e)}"

            answers.append({"task_id": task_id, "answer": answer_text})

        return self._submit_answers_with_retry(username, agent_code, answers)

    @staticmethod
    def _backoff(attempt: int, max_retries: int) -> None:
        """Sleep ``2 ** attempt`` seconds unless this was the last attempt."""
        if attempt + 1 < max_retries:
            time.sleep(2 ** attempt)

    def _fetch_questions_with_retry(self, max_retries=3):
        """Fetch the question list, retrying with exponential backoff.

        Args:
            max_retries: Maximum number of GET attempts.

        Returns:
            The decoded JSON body of the first HTTP 200 response, or a
            ``"Failed after N attempts"`` string when every attempt failed
            (callers distinguish the two by type).
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(self.questions_url, timeout=API_TIMEOUT)
                if response.status_code == 200:
                    return response.json()
                logger.warning(f"HTTP error {response.status_code}, попытка {attempt+1}/{max_retries}")
            except Exception as e:
                # Network boundary: any failure counts as a failed attempt.
                logger.warning(f"Connection error: {str(e)}, попытка {attempt+1}/{max_retries}")
            self._backoff(attempt, max_retries)

        return f"Failed after {max_retries} attempts"

    def _submit_answers_with_retry(self, username: str, agent_code: str, answers: list, max_retries=3):
        """Submit the answers, retrying with exponential backoff.

        Args:
            username: Submitter name; surrounding whitespace is stripped.
            agent_code: Agent code reference; surrounding whitespace is stripped.
            answers: List of ``{"task_id": ..., "answer": ...}`` dicts.
            max_retries: Maximum number of POST attempts.

        Returns:
            ``(score, len(answers))`` where ``score`` is the ``"score"`` field
            of the first HTTP 200 response (0 if absent), or
            ``(0, len(answers))`` when every attempt failed.
        """
        # The payload does not change between attempts, so build it once.
        payload = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": answers,
        }

        for attempt in range(max_retries):
            try:
                response = self.session.post(
                    self.submit_url,
                    json=payload,
                    timeout=API_TIMEOUT * 2,
                )

                if response.status_code == 200:
                    result = response.json()
                    # NOTE(review): the unit of "score" (count vs. percentage)
                    # is not visible here -- confirm against the API schema.
                    score = result.get("score", 0)
                    return score, len(answers)

                logger.warning(f"HTTP error {response.status_code}, попытка {attempt+1}/{max_retries}")
            except Exception as e:
                # Network boundary: any failure counts as a failed attempt.
                logger.error(f"Ошибка отправки: {str(e)}, попытка {attempt+1}/{max_retries}")
            self._backoff(attempt, max_retries)

        return 0, len(answers)
|
|
|
if __name__ == "__main__":
    # Submission identity used for this console run.
    USERNAME = "yoshizen"
    AGENT_CODE = "https://huggingface.co/spaces/yoshizen/FinalTest"

    logger.info(f"Запуск оценки для {USERNAME}")

    # Time the whole run: fetching, answering and submitting.
    started_at = time.time()
    score, total = GAIAEvaluator().run_evaluation(USERNAME, AGENT_CODE)
    elapsed = time.time() - started_at

    logger.info(f"Оценка завершена за {elapsed:.1f} сек")
    # NOTE(review): the two result lines below treat `score` as a count of
    # correct answers out of `total` -- confirm against what the scoring
    # service actually returns in its "score" field.
    logger.info(f"Результат: {score}/{total} правильных ответов")

    if total <= 0:
        # run_evaluation reports total == 0 when no questions were processed.
        logger.error("Не удалось обработать ни одного вопроса")
    else:
        logger.info(f"Точность: {score/total*100:.1f}%")
|
|