File size: 4,771 Bytes
332e48b
8dcca97
5fffd11
6a05ca9
6acc56a
 
eb7cc40
332e48b
 
 
 
5fffd11
75e40db
8dcca97
6a05ca9
5fffd11
75e40db
8dcca97
6a05ca9
8dcca97
 
 
 
 
6a05ca9
8dcca97
6a05ca9
5fffd11
 
 
 
 
 
6a05ca9
 
5fffd11
 
392825a
6a05ca9
 
 
 
 
6acc56a
6a05ca9
 
 
 
6acc56a
 
 
 
 
 
 
 
 
 
 
8dcca97
6a05ca9
5fffd11
8dcca97
6a05ca9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fffd11
6acc56a
 
 
5fffd11
 
6a05ca9
 
 
8dcca97
6a05ca9
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import requests
import re
import base64
import pandas as pd
import io
from openai import OpenAI

class GaiaAgent:
    """Answer GAIA benchmark questions with OpenAI chat models.

    The agent is callable: ``agent(question, task_id)``. When a task id is
    given, the task's attachment is downloaded from ``api_url`` and handled
    according to its Content-Type (image, text-like, PDF, audio, Excel).
    Every failure is reported as a bracketed string rather than raised, so a
    call always returns a ``str``.
    """

    # Maximum number of characters of a text-like attachment sent to the model.
    MAX_CONTEXT_CHARS = 3000

    def __init__(self):
        """Create the OpenAI client (key read from ``OPENAI_API_KEY``) and set the system prompt."""
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.api_url = "https://agents-course-unit4-scoring.hf.space"
        self.instructions = (
            "You are a highly skilled and concise research assistant solving GAIA benchmark questions.\n"
            "Analyze attached files, video links, and images. Reason step-by-step internally.\n"
            "Return only the final factual answer. Do not explain."
        )

    def fetch_file(self, task_id: str):
        """Download the attachment for *task_id*.

        Returns:
            ``(content_bytes, content_type)`` on success, where
            ``content_type`` is the response's Content-Type header (``""``
            if absent). On any failure returns ``(None, "[File error: ...]")``;
            callers must check the first element for ``None`` before using
            the second as a media type.
        """
        try:
            url = f"{self.api_url}/files/{task_id}"
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            content_type = response.headers.get("Content-Type", "")
            return response.content, content_type
        except Exception as e:
            # Best-effort: a missing or unreachable file must not abort the question.
            return None, f"[File error: {e}]"

    def extract_youtube_context(self, question: str) -> str:
        """Return a short context note if *question* contains a YouTube watch URL.

        Only URLs of the form ``https://www.youtube.com/watch?v=<id>`` are
        recognised. Returns ``""`` when no such URL is present.
        """
        match = re.search(r"https://www\.youtube\.com/watch\?v=([\w-]+)", question)
        if match:
            video_id = match.group(1)
            return (
                f"This question refers to a YouTube video with ID: {video_id}.\n"
                f"Assume the video contains relevant visual or auditory cues.\n"
            )
        return ""

    def extract_image_prompt(self, image_bytes: bytes, mime_type: str = "image/png") -> dict:
        """Build a user message embedding *image_bytes* as a base64 data URL.

        Args:
            image_bytes: Raw image content.
            mime_type: Media type written into the data URL. Defaults to
                ``"image/png"``, the value previously hard-coded for every image.

        Returns:
            A chat message dict with one text part and one ``image_url`` part.
        """
        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
        return {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please analyze the image and answer the question accurately."},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_b64}"}}
            ]
        }

    def handle_excel_sales_question(self, excel_bytes: bytes, question: str) -> str:
        """Sum the ``sales`` column over rows whose ``category`` is "food".

        The workbook is parsed with ``pandas.read_excel``. *question* is
        accepted for interface symmetry but is not consulted: the computation
        is fixed.

        Returns:
            The total formatted as ``"$<amount>"`` with two decimals, a
            ``[SKIPPED: ...]`` note when the required columns are missing, or
            an ``[Excel processing error: ...]`` note if parsing fails.
        """
        try:
            df = pd.read_excel(io.BytesIO(excel_bytes))
            if 'category' in df.columns and 'sales' in df.columns:
                food_only = df[df['category'].str.lower() == 'food']
                total = food_only['sales'].sum()
                return f"${total:.2f}"
            return "[SKIPPED: Required columns not found in Excel]"
        except Exception as e:
            return f"[Excel processing error: {e}]"

    @staticmethod
    def _classify_attachment(file_data, content_type) -> str:
        """Map a fetched attachment to a handler kind.

        Returns one of ``"image"``, ``"text"``, ``"pdf"``, ``"audio"``,
        ``"excel"``, or ``""`` when there is no usable attachment. A ``None``
        payload (failed download) always yields ``""`` so that the error text
        returned in place of the Content-Type is never mistaken for a media type.
        """
        if file_data is None or not isinstance(content_type, str):
            return ""
        # Media types are case-insensitive, so compare in lowercase.
        media = content_type.lower()
        if "image" in media:
            return "image"
        if any(tag in media for tag in ("text", "csv", "json")):
            return "text"
        if "pdf" in media:
            return "pdf"
        if "audio" in media:
            return "audio"
        if "spreadsheet" in media or media.endswith(("excel", "xlsx")):
            return "excel"
        return ""

    @staticmethod
    def _image_mime_type(content_type: str) -> str:
        """Extract the bare ``image/...`` media type from a Content-Type header.

        Header parameters (anything after ``;``) are dropped. Falls back to
        ``"image/png"`` when the header does not start with ``image/``.
        """
        media = content_type.split(";", 1)[0].strip().lower()
        return media if media.startswith("image/") else "image/png"

    def _complete(self, messages: list, model: str, error_label: str, **params) -> str:
        """Run one chat completion and return the stripped reply text.

        Any exception is converted to ``"[<error_label>: <exception>]"``.
        """
        try:
            response = self.client.chat.completions.create(
                model=model,
                messages=messages,
                **params
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"[{error_label}: {e}]"

    def __call__(self, question: str, task_id: str = None) -> str:
        """Answer *question*, using the attachment of *task_id* when available.

        Images are answered immediately with ``gpt-4o``; Excel files are
        answered locally by ``handle_excel_sales_question``. Text-like files
        contribute their first ``MAX_CONTEXT_CHARS`` characters as context;
        PDF and audio files contribute only a placeholder note. Everything
        else (including a failed download) falls through to a plain
        ``gpt-4-turbo`` completion of the question.

        Returns:
            The model's answer, or a bracketed error/skip string.
        """
        messages = [{"role": "system", "content": self.instructions}]

        if task_id:
            file_data, content_type = self.fetch_file(task_id)
            kind = self._classify_attachment(file_data, content_type)

            if kind == "image":
                messages.append(
                    self.extract_image_prompt(file_data, self._image_mime_type(content_type))
                )
                messages.append({"role": "user", "content": question})
                return self._complete(messages, model="gpt-4o", error_label="Image answer error")

            if kind == "excel":
                return self.handle_excel_sales_question(file_data, question)

            if kind == "text":
                context = file_data.decode(errors="ignore")[:self.MAX_CONTEXT_CHARS]
                messages.append({"role": "user", "content": f"File Content:\n{context}\n\nQuestion: {question}"})
            elif kind == "pdf":
                messages.append({"role": "user", "content": f"[PDF content detected]\n\nQuestion: {question}"})
            elif kind == "audio":
                messages.append({"role": "user", "content": f"[Audio content detected]\n\nQuestion: {question}"})

        video_context = self.extract_youtube_context(question)
        if video_context:
            messages.append({"role": "user", "content": f"{video_context}\n\nQuestion: {question}"})
        elif not any(m["role"] == "user" for m in messages):
            # No attachment message was added, so send the bare question.
            messages.append({"role": "user", "content": question})

        return self._complete(messages, model="gpt-4-turbo", error_label="Answer error", temperature=0.0)