# Page-status residue captured together with this file (not module code): "Spaces: Runtime error".
# https://github.com/huggingface/smolagents/blob/v1.17.0/src/smolagents/default_tools.py#L479 | |
# Import Modules | |
import os | |
import pandas as pd | |
import yt_dlp | |
import re | |
# Smolagents | |
import torch | |
from transformers import AutoProcessor, AutoModelForVision2Seq | |
from smolagents import tool, Tool | |
from smolagents.tools import PipelineTool | |
from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
import librosa | |
import numpy as np | |
# Instruction and disclaimer text in Markdown format.
# NOTE(review): presumably rendered at the top of the Gradio UI - confirm at the usage site.
# The wording is user-facing runtime text and is kept exactly as written.
gradio_main_instructions = """
**Instructions:**
1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
---
**Disclaimers:**
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
"""
def get_manager_agent_prompt(question_text, file_prompt):
    """
    Assemble the prompt text for a single question.
    Args:
        question_text: The question that has to be answered.
        file_prompt: Text describing the attached file (or the absence of one); it is placed between the objective and the question.
    Returns:
        A string containing the objective, the file information, the question and the formatting rules for the final answer.
    """
    manager_prompt = f"""
# Objective:
Your task is to analyze the following question and to provide a final answer.
{file_prompt}
# Question:
{question_text}
# Final Answer requirements:
The final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
!! Note !! If the question itself mentions specific instructions for how the answer should be formatted than make absolutely sure those are also applied to the answer!!
"""
    return manager_prompt
def check_for_file_name_and_return_prompt(file_name):
    """
    Build the "file information" part of the prompt for a question.
    Args:
        file_name: Name/path of the file that belongs to the question. An empty string (or None) means there is no file.
    Returns:
        A string describing the file and how to use it. Unsupported file types get a generic description instead of None.
    """
    # No attachment. None is treated like the empty string so the caller never crashes here.
    if not file_name:
        return 'For this question there is no file with additional information available.'

    # Decide on the real, case-insensitive extension. A substring test would
    # also match names such as "module.pyc" or "archive.csv.bak".
    extension = os.path.splitext(file_name)[1].lower()

    if extension == '.xlsx':
        file_type = 'Excel Sheet'
        return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use.
Load the file based on the file name with the pandas python library or use the read_excel_tool. Choose what works best for you.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""

    if extension == '.csv':
        file_type = 'CSV File'
        return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use.
Load the file based on the file name with the pandas python library.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""

    if extension == '.mp3':
        file_type = 'MP3 Audio File'
        return f"""
# File Information
For this question there is a file named '{file_name}' with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use with the available tools to load the specific file.
Carefully load the file and use its content in the best and correct way possible to help you answer the question.
If the file name mentioned specifically in the question is different from the following file name '{file_name}' then keep using the following file name: '{file_name}'.
"""

    if extension == '.png':
        file_type = 'PNG Image File'
        return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use. Use the 'vision_agent' to load the file and answer the question.
Make sure to pass the file name and question!!"""

    if extension == '.py':
        file_type = 'Python Script File'
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_name, "r", encoding="utf-8") as pyfile:
            python_script_contents = pyfile.read()
        return f"""
# File Information
For this question there is a file named '{file_name}' with additional information related to the question available.
The specific file is of type: {file_type}.
The file is already downloaded and available for use with the available tools to load the specific file.
As an extra service below is the content of the Python Script File also visible.
# Python Script File Content
```
{python_script_contents}
```
"""

    # Unsupported extension: still tell the agent about the file. Returning
    # None here would end up as the literal text "None" in the final prompt.
    return f"""
# File Information
For this question there is a file named "{file_name}" with additional information related to the question available.
The file is already downloaded and available for use.
Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
# Create Models for Vision Tool
# Use the GPU when one is present and fall back to the CPU otherwise, so that
# importing this module does not crash on a host without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
vision_model_path = "ibm-granite/granite-vision-3.2-2b"
# Processor and model are loaded once at import time and shared by every
# call of vision_language_tool.
vision_processor = AutoProcessor.from_pretrained(vision_model_path)
vision_model = AutoModelForVision2Seq.from_pretrained(
    vision_model_path,
    torch_dtype=torch.bfloat16,
).to(device)
def vision_language_tool(question: str, file_name: str) -> str:
    """
    This vision language tool will load any image based on the provided file_name and will answer the question that is provided.
    Args:
        question: A string that contains the question that we need to answer about the image.
        file_name: A string containing the image file name.
    Returns:
        A string containing the answer to the question.
    """
    prompt = f"""
You are provided with an image.
Answer the following question about the image very specifically and in detail:
{question}"""
    # Debug aid: shows which files are present in the working directory.
    print(f"vlt: {os.listdir('./')}")
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image", "url": file_name}, {"type": "text", "text": prompt}],
        },
    ]
    inputs = vision_processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    # autoregressively complete prompt
    model_output = vision_model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.2,
        do_sample=True,
        top_p=0.975,
        top_k=75,
        min_p=0.05,
        repetition_penalty=1.15,
    )
    # generate() returns the prompt tokens followed by the new tokens. Decode
    # only the new ones, otherwise the "answer" also contains the chat
    # template and the question itself.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = model_output[0][prompt_length:]
    answer = vision_processor.decode(generated_tokens, skip_special_tokens=True)
    return answer.strip()
def speech_to_text_tool(file_name: str) -> str:
    """
    This speech to text tool will use the provided file name to load an mp3 audio file and and output a transcription of the audio file as a text string.
    Args:
        file_name: A string containing the audio file name.
    Returns:
        A string containing the transcribed text of the audio file.
    """
    sampling_rate = 16000
    # Load and resample audio to 16kHz mono
    speech_array, _ = librosa.load(file_name, sr=sampling_rate, mono=True)
    # An empty recording has nothing to transcribe. Without this guard the
    # chunk list below is empty and chunks[-1] raises IndexError.
    if len(speech_array) == 0:
        return ""

    # Load model and processor
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name).to('cpu')
    model.config.forced_decoder_ids = None

    # Define chunk size: 30 seconds at 16kHz = 480000 samples
    chunk_size = 30 * sampling_rate
    # Split into chunks
    chunks = [
        speech_array[start:start + chunk_size]
        for start in range(0, len(speech_array), chunk_size)
    ]
    # Pad last chunk if it's shorter
    missing_samples = chunk_size - len(chunks[-1])
    if missing_samples > 0:
        chunks[-1] = np.pad(chunks[-1], (0, missing_samples))

    # Prepare input features in batch
    input_features = processor(chunks, sampling_rate=sampling_rate, return_tensors="pt").input_features
    # Generate predictions in batch
    predicted_ids = model.generate(input_features)
    # Decode all chunks and concatenate
    transcribed_texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    full_transcription = " ".join(text.strip() for text in transcribed_texts)
    return full_transcription
def _clean_vtt_captions(vtt_content: str) -> str:
    """Reduce raw WebVTT text to the caption lines, one per line, without consecutive duplicates."""
    # Remove headers and unnecessary metadata
    vtt_content = re.sub(r'WEBVTT.*?\n', '', vtt_content, flags=re.DOTALL)
    vtt_content = re.sub(r'^Kind:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^Language:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^NOTE.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'X-TIMESTAMP.*', '', vtt_content)
    vtt_content = re.sub(r'\[.*?\]', '', vtt_content)
    vtt_content = re.sub(r'<.*?>', '', vtt_content)  # Remove tags like <c> and <00:00:01.000>

    # Split by lines, remove lines that are timestamps, metadata, or blank
    cleaned_lines = []
    last_line = None
    for line in vtt_content.splitlines():
        line = line.strip()
        if not line:
            continue  # Skip blank lines
        if re.match(r'^\d{2}:\d{2}:\d{2}\.\d{3} -->', line):
            continue  # Skip timestamps
        if re.match(r'^\d+$', line):
            continue  # Skip sequence numbers
        if 'align:' in line or 'position:' in line:
            # Remove align/position metadata but keep the actual text
            line = re.sub(r'align:[^\s]+', '', line)
            line = re.sub(r'position:[^\s]+', '', line)
            line = line.strip()
            if not line:
                continue
        if line == last_line:
            continue  # Deduplicate consecutive lines
        cleaned_lines.append(line)
        last_line = line
    return '\n'.join(cleaned_lines).strip()


def youtube_captions_tool(youtube_video_url: str) -> str:
    """
    This youtube captions tool will use a youtube video url to retrieve the captions and output them as a string containing the conversations in the video.
    Args:
        youtube_video_url: A string containing the url for a youtube video from which the captions will be retrieved.
    Returns:
        A string containing the captions of the youtube video url.
    """
    # Local import: only this tool needs it.
    import tempfile

    # Download into a private temporary directory. A fixed file name in the
    # working directory lets concurrent calls overwrite each other and lets a
    # stale file from an earlier run be mistaken for the new captions. The
    # directory is removed on exit, also when reading fails.
    with tempfile.TemporaryDirectory() as work_dir:
        # '%' is special in yt-dlp output templates, so escape it in the path.
        outtmpl = os.path.join(work_dir.replace('%', '%%'), "caption.%(ext)s")
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'skip_download': True,
            'outtmpl': outtmpl,
            'quiet': True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_video_url, download=True)

        vtt_filename = None
        for ext in ('en.vtt', 'en-US.vtt'):
            candidate = os.path.join(work_dir, f'caption.{ext}')
            if os.path.isfile(candidate):
                vtt_filename = candidate
                break
        if not vtt_filename:
            raise FileNotFoundError("Could not find English captions (.vtt) after download.")

        with open(vtt_filename, encoding='utf-8') as f:
            vtt_content = f.read()

    captions = _clean_vtt_captions(vtt_content)
    return captions
def read_excel_tool(file_name: str) -> str:
    """
    This read excel tool will use the provided file name to load an Excel file into a Pandas DataFrame and output the various information as a text string.
    Args:
        file_name: A string containing the Excel file name.
    Returns:
        A string containing the structured output from a Pandas DataFrame after reading the Excel file.
    """
    # Read Excel File
    df = pd.read_excel(file_name)
    # describe must be CALLED. The bare attribute formats as
    # "<bound method NDFrame.describe of ...>" instead of the statistics.
    # pandas refuses to describe a frame without columns, so guard that case.
    if len(df.columns) > 0:
        describe_output = df.describe()
    else:
        describe_output = 'No columns available to describe.'
    # Excel String
    excel_string = f"""
# Summary
The text below contains the information from the Excel File that has been loaded into a Pandas DataFrame.
## DataFrame Shape
{df.shape}
## DataFrame Columns
{df.columns}
## DataFrame Describe
{describe_output}
## DataFrame Head
{df.head(25)}
"""
    return excel_string
# Added as a fallback: if the Google search doesn't work, maybe DuckDuckGo does.
class DuckDuckGoSearchTool(Tool):
    # Web-search tool built on the optional `duckduckgo_search` package.
    # The attributes below are the tool's name, description, input schema and output type.
    name = "alternative_web_search"
    description = """Use this as an alternative to perform a duckduckgo web search based on your query (think a Google search) then returns the top search results."""
    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
    output_type = "string"

    def __init__(self, max_results=10, **kwargs):
        """
        Create the search client.
        Args:
            max_results: Maximum number of results requested per query.
            **kwargs: Passed on unchanged to the `DDGS` constructor.
        """
        super().__init__()
        self.max_results = max_results
        # Imported here so the package is only required when the tool is actually created.
        try:
            from duckduckgo_search import DDGS
        except ImportError as import_error:
            install_hint = "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
            raise ImportError(install_hint) from import_error
        self.ddgs = DDGS(**kwargs)

    def forward(self, query: str) -> str:
        """
        Run the search and format the results as markdown.
        Args:
            query: The search query to perform.
        Returns:
            A string with a "## Search Results" heading followed by one linked title plus body per result.
        """
        hits = self.ddgs.text(query, max_results=self.max_results)
        if len(hits) == 0:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        formatted_hits = []
        for hit in hits:
            formatted_hits.append(f"[{hit['title']}]({hit['href']})\n{hit['body']}")
        return "## Search Results\n\n" + "\n\n".join(formatted_hits)