# Import Modules
import os
import pandas as pd
import yt_dlp
import re

# Smolagents
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from smolagents import tool, Tool
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np

# Markdown shown at the top of the Gradio app: setup steps plus a note about
# the (intentionally slow) synchronous submission flow.
gradio_main_instructions = """
**Instructions:**

1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

---
**Disclaimers:**
Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, for the delay of the submit button, a solution could be to cache the answers and submit in a separate action, or even to answer the questions in async.
"""

def get_manager_agent_prompt(question_text: str, file_prompt: str) -> str:
    """Build the top-level prompt handed to the manager agent.

    Args:
        question_text: The raw question the agent must answer.
        file_prompt: Pre-built text describing any attached file (may simply
            state that no file is available) — see
            check_for_file_name_and_return_prompt.

    Returns:
        The full prompt string: objective, file information, the question,
        and the GAIA-style answer-formatting rules.
    """
    return f"""
# Objective:  
Your task is to analyze the following question and to provide a final answer.

{file_prompt}

# Question:                         
{question_text}

# Final Answer requirements:
The final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. 
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. 
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. 
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.

!! Note !! If the question itself mentions specific instructions for how the answer should be formatted than make absolutely sure those are also applied to the answer!!
"""

def check_for_file_name_and_return_prompt(file_name: str) -> str:
    """Return a prompt fragment describing the attached file (if any).

    Args:
        file_name: Name of the downloaded attachment, or '' when the question
            has no attachment.

    Returns:
        A prompt fragment telling the agent whether a file exists, its type,
        and how to load it. For '.py' files the script's source is inlined.

    Note:
        Previously an unrecognized extension fell through the if/elif chain
        and implicitly returned None, which broke callers that interpolate
        the result into the manager prompt; a generic fallback is returned
        instead.
    """
    if file_name == '':
        return 'For this question there is no file with additional information available.'
    else:
        # Detect File Type
        if '.xlsx' in file_name:
            file_type = 'Excel Sheet'
            return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use.
Load the file based on the file name with the pandas python library or use the read_excel_tool. Choose what works best for you.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
        elif '.csv' in file_name:
            file_type = 'CSV File'
            return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use.
Load the file based on the file name with the pandas python library.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
        elif '.mp3' in file_name:
            file_type = 'MP3 Audio File'
            return f"""
# File Information
For this question there is a file named '{file_name}' with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use with the available tools to load the specific file.
Carefully load the file and use its content in the best and correct way possible to help you answer the question.
If the file name mentioned specifically in the question is different from the following file name '{file_name}' then keep using the following file name: '{file_name}'.
"""
        elif '.png' in file_name:
            file_type = 'PNG Image File'
            return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use. Use the 'vision_agent' to load the file and answer the question.
Make sure to pass the file name and question!!"""
        elif '.py' in file_name:
            file_type = 'Python Script File'
            # Inline the script so the agent can read it without extra tooling.
            with open(file_name, "r", encoding="utf-8") as pyfile:
                python_script_contents = pyfile.read()
            return f"""
# File Information
For this question there is a file named '{file_name}' with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use with the available tools to load the specific file.

As an extra service below is the content of the Python Script File also visible.

# Python Script File Content
```
{python_script_contents}
```
"""
        else:
            # Fallback for any extension not handled above.
            return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The file is already downloaded and available for use.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""

# Create Models for Vision Tool.
# Fall back to CPU so the module still imports on machines without a CUDA
# device (the hard-coded "cuda" previously crashed `.to(device)` there).
device = "cuda" if torch.cuda.is_available() else "cpu"
vision_model_path = "ibm-granite/granite-vision-3.2-2b"
vision_processor = AutoProcessor.from_pretrained(vision_model_path)
vision_model = AutoModelForVision2Seq.from_pretrained(vision_model_path,
                                                      torch_dtype = torch.bfloat16).to(device)

@tool
def vision_language_tool(question: str, file_name: str) -> str:
    """    
    This vision language tool will load any image based on the provided file_name and will answer the question that is provided.
    Args:
        question: A string that contains the question that we need to answer about the image.
        file_name: A string containing the image file name.
    Returns:
        A string containing the answer to the question.
    """

    prompt = f"""
# Objective:
You are provided with an image.

Answer the following question about the image very specifically and in detail. Think step by step.

# Question:
{question}
"""
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image", "url": file_name}, {"type": "text", "text": prompt}],
        },
    ]
    inputs = vision_processor.apply_chat_template(conversation,
                                                  add_generation_prompt = True,
                                                  tokenize = True,
                                                  return_dict = True,
                                                  return_tensors = "pt").to(device)

    # Generate
    model_output = vision_model.generate(**inputs, 
                                         max_new_tokens = 2048,
                                         temperature = 0.5,
                                         do_sample = True,
                                         top_p = 0.98,
                                         top_k = 80,
                                         min_p = 0.05,
                                         repetition_penalty = 1.15)
    # Decode ONLY the newly generated tokens: decoding model_output[0] from
    # position 0 would echo the chat template and the full prompt back into
    # the returned answer.
    generated_tokens = model_output[0][inputs["input_ids"].shape[1]:]
    answer = vision_processor.decode(generated_tokens, skip_special_tokens = True)

    return answer

@tool
def speech_to_text_tool(file_name: str) -> str:
    """    
    This speech to text tool will use the provided file name to load an mp3 audio file and output a transcription of the audio file as a text string.
    Args:
        file_name: A string containing the audio file name.
    Returns:
        A string containing the transcribed text of the audio file.
    """
    
    # Load model and processor
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name).to('cpu')
    model.config.forced_decoder_ids = None

    # Load and resample audio to 16kHz mono (Whisper's expected input format)
    speech_array, sampling_rate = librosa.load(file_name, sr = 16000, mono=True)

    # Guard: an empty/unreadable audio file would otherwise produce zero
    # chunks and crash on chunks[-1] below.
    if len(speech_array) == 0:
        return ""

    # Define chunk size: Whisper works on 30-second windows;
    # 30 s at 16 kHz = 480000 samples
    chunk_size = 30 * 16000  # 480000

    # Split into chunks
    chunks = [
        speech_array[i:i+chunk_size] 
        for i in range(0, len(speech_array), chunk_size)
    ]

    # Pad last chunk if it's shorter
    if len(chunks[-1]) < chunk_size:
        chunks[-1] = np.pad(chunks[-1], (0, chunk_size - len(chunks[-1])))

    # Prepare input features in batch
    input_features = processor(chunks, sampling_rate=16000, return_tensors="pt").input_features

    # Generate predictions in batch
    predicted_ids = model.generate(input_features)

    # Decode all chunks and concatenate
    transcribed_texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    full_transcription = " ".join([t.strip() for t in transcribed_texts])

    return full_transcription

def _clean_vtt_captions(vtt_content: str) -> str:
    """Strip WebVTT headers, timestamps, cue metadata, and markup from a
    caption file's text, returning only the deduplicated spoken lines."""
    # Remove headers and unnecessary metadata
    vtt_content = re.sub(r'WEBVTT.*?\n', '', vtt_content, flags=re.DOTALL)
    vtt_content = re.sub(r'^Kind:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^Language:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^NOTE.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'X-TIMESTAMP.*', '', vtt_content)
    vtt_content = re.sub(r'\[.*?\]', '', vtt_content)
    vtt_content = re.sub(r'<.*?>', '', vtt_content)  # Remove tags like <c> and <00:00:01.000>

    # Split by lines, remove lines that are timestamps, metadata, or blank
    cleaned_lines = []
    last_line = None
    for line in vtt_content.splitlines():
        line = line.strip()
        if not line:
            continue  # Skip blank lines
        if re.match(r'^\d{2}:\d{2}:\d{2}\.\d{3} -->', line):
            continue  # Skip timestamps
        if re.match(r'^\d+$', line):
            continue  # Skip sequence numbers
        if 'align:' in line or 'position:' in line:
            # Remove align/position metadata but keep the actual text
            line = re.sub(r'align:[^\s]+', '', line)
            line = re.sub(r'position:[^\s]+', '', line)
            line = line.strip()
        if not line:
            continue
        if line == last_line:
            continue  # Deduplicate consecutive lines
        cleaned_lines.append(line)
        last_line = line
    return '\n'.join(cleaned_lines).strip()

@tool
def youtube_captions_tool(youtube_video_url: str) -> str:
    """    
    This youtube captions tool will use a youtube video url to retrieve the captions and output them as a string containing the conversations in the video.
    Args:
        youtube_video_url: A string containing the url for a youtube video from which the captions will be retrieved.
    Returns:
        A string containing the captions of the youtube video url.
    """
    
    # Download only the English subtitle track (manual or auto-generated),
    # never the video itself.
    outtmpl = "caption.%(ext)s"
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': outtmpl,
        'quiet': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.extract_info(youtube_video_url, download=True)
    # yt-dlp names the file after the language tag; accept both variants.
    vtt_filename = None
    for ext in ('en.vtt', 'en-US.vtt'):
        if os.path.isfile(f'caption.{ext}'):
            vtt_filename = f'caption.{ext}'
            break
    if not vtt_filename:
        raise FileNotFoundError("Could not find English captions (.vtt) after download.")
    with open(vtt_filename, encoding='utf-8') as f:
        vtt_content = f.read()
    os.remove(vtt_filename)

    return _clean_vtt_captions(vtt_content)

@tool
def read_excel_tool(file_name: str) -> str:
    """    
    This read excel tool will use the provided file name to load an Excel file into a Pandas DataFrame and output the various information as a text string.
    Args:
        file_name: A string containing the Excel file name.
    Returns:
        A string containing the structured output from a Pandas DataFrame after reading the Excel file.
    """
    # Read Excel File
    df = pd.read_excel(file_name)

    # Excel String
    # NOTE: df.describe must be CALLED — interpolating the bound method
    # (`{df.describe}`) would render its repr instead of the statistics table.
    excel_string = f"""
# Summary
The text below contains the information from the Excel File that has been loaded into a Pandas DataFrame.

## DataFrame Shape
{df.shape}

## DataFrame Columns
{df.columns}

## DataFrame Describe
{df.describe()}

## DataFrame Head
{df.head(25)}
"""
    
    return excel_string

# Added as a fallback... if Google doesn't work, maybe DuckDuckGo does.
class DuckDuckGoSearchTool(Tool):
    name = "alternative_web_search"
    description = """Use this as an alternative to perform a duckduckgo web search based on your query (think a Google search) then returns the top search results."""
    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
    output_type = "string"

    def __init__(self, max_results=10, **kwargs):
        """Initialize the DuckDuckGo client.

        Args:
            max_results: Maximum number of results returned per search.
            **kwargs: Forwarded to the ``duckduckgo_search.DDGS`` constructor.

        Raises:
            ImportError: If the optional `duckduckgo_search` package is not installed.
        """
        super().__init__()
        self.max_results = max_results
        try:
            from duckduckgo_search import DDGS
        except ImportError as e:
            raise ImportError(
                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
            ) from e
        self.ddgs = DDGS(**kwargs)

    def forward(self, query: str) -> str:
        """Run the search and return the top hits as a markdown list."""
        results = self.ddgs.text(query, max_results=self.max_results)
        # Truthiness check also covers a None return, which `len(...)` would not.
        if not results:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)