File size: 4,081 Bytes
332e48b
2693f75
9eb69da
eb7cc40
9eb69da
eb7cc40
332e48b
 
 
 
 
9eb69da
 
332e48b
eb7cc40
 
9eb69da
eb7cc40
 
9eb69da
eb7cc40
 
9eb69da
eb7cc40
9eb69da
eb7cc40
 
9eb69da
 
 
d48b3cc
eb7cc40
9eb69da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb7cc40
9eb69da
 
 
 
332e48b
d48b3cc
9eb69da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d48b3cc
9eb69da
d48b3cc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import io
import base64
import requests
import pandas as pd
from openai import OpenAI

class GaiaAgent:
    """Answer GAIA questions with OpenAI models, optionally using a task file.

    Calling the agent with a ``task_id`` downloads the file attached to that
    task from ``api_url`` and routes it by media type: images go to a
    vision-capable chat model, audio is transcribed first, Python sources are
    executed, and text/CSV content is inlined (truncated) into the prompt.
    """

    # Filename suffix per audio media type. The uploaded file object is given
    # a name with one of these suffixes so the transcription endpoint has a
    # filename extension to identify the audio format from.
    _AUDIO_EXTENSIONS = {
        "audio/mpeg": ".mp3",
        "audio/mp3": ".mp3",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/wave": ".wav",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/ogg": ".ogg",
        "audio/webm": ".webm",
        "audio/flac": ".flac",
        "audio/x-flac": ".flac",
    }

    def __init__(self):
        """Create the OpenAI client (key read from ``OPENAI_API_KEY``)."""
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.instructions = (
            "You are a multimodal GAIA assistant capable of understanding text, images, audio, and code. "
            "Use file context if provided, think step by step, and respond with the exact answer only."
        )
        self.api_url = "https://agents-course-unit4-scoring.hf.space"

    def fetch_file(self, task_id: str) -> "tuple[str | None, bytes | None, str]":
        """Download the file attached to ``task_id``.

        Returns:
            ``(url, content, content_type)`` on success. On any failure the
            first two items are ``None`` and the third is a bracketed
            ``[Fetch error: ...]`` message instead of a content type.
        """
        try:
            url = f"{self.api_url}/files/{task_id}"
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            content_type = response.headers.get("Content-Type", "")
            return url, response.content, content_type
        except Exception as e:
            # Best-effort: the caller folds the message into the prompt.
            return None, None, f"[Fetch error: {e}]"

    @staticmethod
    def _media_type(content_type: str) -> str:
        """Return the bare, lower-cased media type of a Content-Type header.

        Parameters such as ``; charset=utf-8`` are dropped so that suffix and
        substring checks see only ``type/subtype``.
        """
        return (content_type or "").split(";", 1)[0].strip().lower()

    @classmethod
    def _audio_filename(cls, media_type: str) -> str:
        """Return an upload filename whose extension matches ``media_type``.

        Unknown audio types fall back to ``.mp3``.
        """
        return "audio" + cls._AUDIO_EXTENSIONS.get(media_type, ".mp3")

    @staticmethod
    def _run_python(file_bytes: bytes) -> str:
        """Execute a Python attachment and report its ``result`` variable.

        Returns ``"Python result: <value>"`` where the value is the script's
        top-level ``result`` name (or a placeholder when it defines none), or
        ``"[Python execution error: ...]"`` if decoding or execution fails.
        """
        # SECURITY: this runs code downloaded from a remote service with the
        # agent's full privileges. Only use against a trusted endpoint, or
        # move execution into a sandboxed subprocess.
        try:
            # A single dict serves as both globals and locals. With separate
            # dicts, functions defined by the script cannot see each other,
            # because their global scope would be the empty globals dict.
            namespace = {}
            exec(file_bytes.decode("utf-8"), namespace)
            result = namespace.get("result", "[Executed. Check code return value manually if needed.]")
            return f"Python result: {result}"
        except (Exception, SystemExit) as e:
            # SystemExit is caught too so a script calling sys.exit() cannot
            # terminate the agent process.
            return f"[Python execution error: {e}]"

    @classmethod
    def _file_context(cls, file_bytes: bytes, media_type: str) -> str:
        """Turn a non-image, non-audio attachment into prompt context text.

        ``media_type`` must already be normalised by ``_media_type``. Returns
        an empty string for media types that are not handled.
        """
        # Python is checked before the generic text branch: "text/x-python"
        # would otherwise be inlined instead of executed.
        if media_type.endswith("python"):
            return cls._run_python(file_bytes)
        if "text" in media_type or "csv" in media_type:
            # Undecodable bytes are replaced rather than raising, so a
            # mis-labelled or non-UTF-8 file cannot crash the call.
            return file_bytes.decode("utf-8", errors="replace")[:2000]
        if "pdf" in media_type:
            return "[PDF file detected. OCR not yet implemented.]"
        return ""

    def __call__(self, question: str, task_id: "str | None" = None) -> str:
        """Answer ``question``, using the file attached to ``task_id`` if any.

        Args:
            question: The question text to answer.
            task_id: Optional task identifier whose attached file is fetched
                and used as context.

        Returns:
            The model's stripped reply, or ``"[Agent error: ...]"`` if any
            OpenAI call fails.
        """
        image = None
        audio = None
        media_type = ""
        tool_context = ""

        if task_id:
            _url, file_bytes, file_type = self.fetch_file(task_id)
            if file_bytes is None:
                tool_context = file_type  # error message
            else:
                media_type = self._media_type(file_type)
                if "image" in media_type:
                    image = base64.b64encode(file_bytes).decode("utf-8")
                elif "audio" in media_type:
                    audio = file_bytes
                else:
                    tool_context = self._file_context(file_bytes, media_type)

        messages = [
            {"role": "system", "content": self.instructions},
            {"role": "user", "content": f"{tool_context}\n\nQUESTION: {question}\nANSWER:"},
        ]

        try:
            if image:
                # Label the data URL with the served image type; fall back to
                # PNG when the header is not a plain "image/<subtype>".
                image_mime = media_type if media_type.startswith("image/") else "image/png"
                response = self.client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": self.instructions},
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": question},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{image_mime};base64,{image}",
                                        "detail": "auto",
                                    },
                                },
                            ],
                        },
                    ],
                )
            else:
                if audio:
                    # Name the in-memory file so the upload carries a filename
                    # with an extension matching the audio format.
                    audio_file = io.BytesIO(audio)
                    audio_file.name = self._audio_filename(media_type)
                    transcript = self.client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text",
                    )
                    messages.append({"role": "user", "content": f"Transcript: {str(transcript).strip()}"})
                response = self.client.chat.completions.create(
                    model="gpt-4-turbo",
                    messages=messages,
                    temperature=0.0,
                )

            # The reply content may be None; treat that as an empty answer.
            content = response.choices[0].message.content
            return (content or "").strip()

        except Exception as e:
            return f"[Agent error: {e}]"