File size: 4,036 Bytes
332e48b
8dcca97
5fffd11
6a05ca9
eb7cc40
332e48b
 
 
 
5fffd11
75e40db
8dcca97
6a05ca9
5fffd11
75e40db
8dcca97
6a05ca9
8dcca97
 
 
 
 
6a05ca9
8dcca97
6a05ca9
5fffd11
 
 
 
 
 
6a05ca9
 
5fffd11
 
392825a
6a05ca9
 
 
 
 
 
 
 
 
 
8dcca97
6a05ca9
5fffd11
8dcca97
6a05ca9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fffd11
 
 
6a05ca9
 
 
8dcca97
6a05ca9
 
 
 
 
 
 
 
 
5fffd11
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import requests
import re
import base64
from openai import OpenAI

class GaiaAgent:
    """Answer GAIA benchmark questions through the OpenAI chat-completions API.

    An instance is callable: ``agent(question, task_id)`` optionally downloads
    the attachment registered for ``task_id`` from the scoring service, builds
    a prompt from the attachment, any YouTube link found in the question and
    the question itself, and returns the model's reply as a stripped string.
    Failures are reported as bracketed strings (``"[Answer error: ...]"``)
    rather than raised.
    """

    # Model used for text-only prompts and for prompts carrying an image.
    TEXT_MODEL = "gpt-4-turbo"
    VISION_MODEL = "gpt-4o"

    # Maximum number of characters of a textual attachment put in the prompt.
    MAX_FILE_CHARS = 3000

    # Matches watch URLs (www./m./bare host, http or https, ``v=`` anywhere in
    # the query string) and youtu.be short links; group 1 is the video id.
    _YOUTUBE_RE = re.compile(
        r"https?://(?:(?:www\.|m\.)?youtube\.com/watch\?(?:[^\s#]*?&)?v="
        r"|youtu\.be/)([\w-]+)"
    )

    def __init__(self):
        """Create the OpenAI client (key read from ``OPENAI_API_KEY``)."""
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.api_url = "https://agents-course-unit4-scoring.hf.space"
        self.instructions = (
            "You are a highly skilled and concise research assistant solving GAIA benchmark questions.\n"
            "Analyze attached files, video links, and images. Reason step-by-step internally.\n"
            "Return only the final factual answer. Do not explain."
        )

    def fetch_file(self, task_id: str):
        """Download the attachment for ``task_id`` from the scoring service.

        Returns:
            ``(content_bytes, content_type)`` on success, where
            ``content_type`` is the response's ``Content-Type`` header
            (``""`` when absent).  On any failure returns
            ``(None, "[File error: ...]")``; callers must check the first
            element for ``None`` before treating the second as a MIME type.
        """
        try:
            url = f"{self.api_url}/files/{task_id}"
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            content_type = response.headers.get("Content-Type", "")
            return response.content, content_type
        except Exception as e:
            # Deliberately broad: a missing or unreachable attachment must
            # never abort answering the question.
            return None, f"[File error: {e}]"

    def extract_youtube_context(self, question: str) -> str:
        """Return a prompt fragment naming the YouTube video in ``question``.

        Returns an empty string when the question contains no recognised
        YouTube URL.  Only the first URL found is used.
        """
        match = self._YOUTUBE_RE.search(question)
        if match is None:
            return ""
        video_id = match.group(1)
        return (
            f"This question refers to a YouTube video with ID: {video_id}.\n"
            f"Assume the video contains relevant visual or auditory cues.\n"
        )

    def extract_image_prompt(self, image_bytes: bytes, mime_type: str = "image/png") -> dict:
        """Build a user message embedding ``image_bytes`` as a base64 data URL.

        Args:
            image_bytes: Raw bytes of the image.
            mime_type: Media type written into the data URL.  Values that do
                not start with ``image/`` fall back to ``image/png``.

        Returns:
            A chat message dict with one text part and one ``image_url`` part.
        """
        if not mime_type.startswith("image/"):
            mime_type = "image/png"
        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
        return {
            "role": "user",
            "content": [
                # NOTE(review): this instruction is chess-specific yet is sent
                # for every image attachment; kept unchanged — confirm intent.
                {"type": "text", "text": "Please analyze the image and answer the chess question accurately. Provide only the move in algebraic notation."},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_b64}"}}
            ]
        }

    def _complete(self, model: str, messages: list, error_label: str) -> str:
        """Send ``messages`` to ``model`` and return the stripped reply.

        A reply whose content is ``None`` yields ``""``.  Any exception is
        converted to ``"[<error_label>: <exception>]"``.
        """
        try:
            response = self.client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
            )
            content = response.choices[0].message.content
            # The API may return no text content; avoid calling .strip() on None.
            return (content or "").strip()
        except Exception as e:
            return f"[{error_label}: {e}]"

    def __call__(self, question: str, task_id: str = None) -> str:
        """Answer ``question``, using the attachment of ``task_id`` if any.

        Args:
            question: The question text; may contain a YouTube URL.
            task_id: Identifier of the task whose attachment should be
                fetched.  Falsy values skip the download.

        Returns:
            The model's answer, or a bracketed error string if the
            completion request fails.
        """
        messages = [{"role": "system", "content": self.instructions}]
        context_parts = []
        model = self.TEXT_MODEL
        error_label = "Answer error"

        if task_id:
            file_data, content_type = self.fetch_file(task_id)

            # On failure fetch_file returns (None, "[File error: ...]"); that
            # message must not be interpreted as a Content-Type.
            if file_data is not None and isinstance(content_type, str):
                kind = content_type.lower()
                if "image" in kind:
                    mime_type = kind.split(";", 1)[0].strip()
                    messages.append(self.extract_image_prompt(file_data, mime_type))
                    model = self.VISION_MODEL
                    error_label = "Image answer error"
                elif "text" in kind or "csv" in kind or "json" in kind:
                    text = file_data.decode(errors="ignore")[:self.MAX_FILE_CHARS]
                    context_parts.append(f"File Content:\n{text}")
                elif "pdf" in kind:
                    context_parts.append("[PDF content detected]")
                elif "audio" in kind:
                    context_parts.append("[Audio content detected]")

        video_context = self.extract_youtube_context(question)
        if video_context:
            context_parts.append(video_context)

        # Exactly one user message carries the question, preceded by whatever
        # attachment/video context was gathered.
        if context_parts:
            user_content = "\n\n".join(context_parts + [f"Question: {question}"])
        else:
            user_content = question
        messages.append({"role": "user", "content": user_content})

        return self._complete(model, messages, error_label)