import base64
import io
import os

import requests
from openai import OpenAI


class GaiaAgent:
    def __init__(self):
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.instructions = (
            "You are a multimodal GAIA assistant capable of understanding text, images, audio, and code. "
            "Use file context if provided, think step by step, and respond with the exact answer only."
        )
        self.api_url = "https://agents-course-unit4-scoring.hf.space"

    def fetch_file(self, task_id: str) -> tuple[str | None, bytes | None, str]:
        """Download the file attached to a GAIA task.

        Returns (url, content, content_type); on failure the first two
        elements are None and the third carries the error message."""
        try:
            url = f"{self.api_url}/files/{task_id}"
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            content_type = response.headers.get("Content-Type", "")
            return url, response.content, content_type
        except Exception as e:
            return None, None, f"[Fetch error: {e}]"

    def __call__(self, question: str, task_id: str | None = None) -> str:
        image = None
        image_mime = "image/png"
        audio = None
        tool_context = ""

        if task_id:
            url, file_bytes, file_type = self.fetch_file(task_id)
            if file_bytes is None:
                # fetch_file signals failure by putting its error message
                # in the third slot of the tuple.
                tool_context = file_type
            elif "image" in file_type:
                # Remember the real MIME type so the data URL below matches the payload.
                image_mime = file_type.split(";")[0]
                image = base64.b64encode(file_bytes).decode("utf-8")
            elif "audio" in file_type:
                audio = file_bytes
            elif file_type.endswith("python"):
                # WARNING: runs untrusted task code in-process; sandbox this outside a demo.
                try:
                    exec_env = {}
                    exec(file_bytes.decode("utf-8"), {}, exec_env)
                    result = exec_env.get("result", "[Executed. Check code return value manually if needed.]")
                    tool_context = f"Python result: {result}"
                except Exception as e:
                    tool_context = f"[Python execution error: {e}]"
            elif "text" in file_type or "csv" in file_type:
                # Truncate so large files do not blow up the prompt.
                tool_context = file_bytes.decode("utf-8")[:2000]
            elif "pdf" in file_type:
                tool_context = "[PDF file detected. OCR not yet implemented.]"

        messages = [
            {"role": "system", "content": self.instructions},
            {"role": "user", "content": f"{tool_context}\n\nQUESTION: {question}\nANSWER:"},
        ]

        try:
            if image:
                # Vision questions go to gpt-4o with the file inlined as a data URL.
                response = self.client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": self.instructions},
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": question},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{image_mime};base64,{image}",
                                        "detail": "auto",
                                    },
                                },
                            ],
                        },
                    ],
                )
            elif audio:
                # Whisper infers the container format from the filename, so the
                # in-memory buffer needs a name (mp3 assumed here) before upload.
                audio_file = io.BytesIO(audio)
                audio_file.name = "audio.mp3"
                transcript = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text",
                )
                messages.append({"role": "user", "content": f"Transcript: {transcript.strip()}"})
                response = self.client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=messages,
                    temperature=0.0,
                )
            else:
                response = self.client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=messages,
                    temperature=0.0,
                )

            return response.choices[0].message.content.strip()

        except Exception as e:
            return f"[Agent error: {e}]"
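

# Example usage, as a sketch: assumes OPENAI_API_KEY is set in the environment
# and that "some-task-id" is replaced with a real task id from the scoring API.
if __name__ == "__main__":
    agent = GaiaAgent()
    print(agent("What is the capital of France?"))  # plain text question
    print(agent("Describe the attached file.", task_id="some-task-id"))  # with file context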