Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -361,31 +361,113 @@ class ImageAnalysisTool:
|
|
361 |
Makes the instance callable directly, invoking the _run method for convenience.
|
362 |
"""
|
363 |
return self._run(image_url)
|
364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
|
366 |
|
367 |
class BasicAgent:
|
368 |
def __init__(self):
|
369 |
token = os.environ.get("HF_API_TOKEN")
|
370 |
-
model = HfApiModel(
|
371 |
temperature=0.1,
|
372 |
token=token
|
373 |
)
|
374 |
|
375 |
-
#
|
376 |
-
search_tool = DuckDuckGoSearchTool()
|
377 |
-
wiki_search_tool = WikiSearchTool()
|
378 |
-
str_reverse_tool = StringReverseTool()
|
379 |
-
keywords_extract_tool = KeywordsExtractorTool()
|
380 |
-
speech_to_text_tool = SpeechToTextTool()
|
381 |
-
visit_webpage_tool = VisitWebpageTool()
|
382 |
-
final_answer_tool = FinalAnswerTool()
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
|
|
|
|
389 |
You are my general AI assistant. Your task is to answer the question I asked.
|
390 |
First, provide an explanation of your reasoning, step by step, to arrive at the answer.
|
391 |
Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
|
@@ -394,40 +476,45 @@ If the answer is a number, do not use commas or units (e.g., $, %) unless specif
|
|
394 |
If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
|
395 |
If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
|
396 |
"""
|
397 |
-
|
398 |
# Create web agent with image analysis capability
|
399 |
self.web_agent = ToolCallingAgent(
|
400 |
tools=[
|
401 |
-
|
402 |
-
visit_webpage_tool,
|
403 |
-
|
404 |
],
|
405 |
-
model=model,
|
406 |
max_steps=10,
|
407 |
name="web_search_agent",
|
408 |
description="Runs web searches and analyzes images",
|
409 |
)
|
410 |
|
411 |
-
# Create main agent with
|
412 |
self.agent = CodeAgent(
|
413 |
-
model=model,
|
414 |
tools=[
|
415 |
-
search_tool,
|
416 |
-
wiki_search_tool,
|
417 |
-
str_reverse_tool,
|
418 |
-
keywords_extract_tool,
|
419 |
-
speech_to_text_tool,
|
420 |
-
visit_webpage_tool,
|
421 |
-
final_answer_tool,
|
422 |
-
video_transcription_tool,
|
423 |
-
code_llama_tool,
|
424 |
-
|
|
|
425 |
],
|
426 |
-
add_base_tools=True
|
427 |
)
|
428 |
-
|
429 |
# Update system prompt
|
430 |
-
|
|
|
|
|
|
|
|
|
431 |
|
432 |
def __call__(self, question: str) -> str:
|
433 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
|
|
361 |
Makes the instance callable directly, invoking the _run method for convenience.
|
362 |
"""
|
363 |
return self._run(image_url)
|
364 |
+
|
365 |
+
|
366 |
+
import os
|
367 |
+
import requests
|
368 |
+
from transformers import pipeline
|
369 |
+
import yt_dlp
|
370 |
+
|
371 |
+
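# smolagents components used by the tools and agents below (NOTE(review): ToolCallingAgent is used later but is not in this import line — confirm it is imported elsewhere in the file)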
|
372 |
+
from smolagents import tool, FinalAnswerTool, DuckDuckGoSearchTool, HfApiModel, CodeAgent # Add other necessary imports
|
373 |
+
|
374 |
+
# --- Custom VideoTranscriptionTool Class ---
|
375 |
+
|
376 |
+
class VideoTranscriptionTool:
    """Transcribe the audio track of a YouTube video with Whisper.

    The tool downloads the best available audio stream with ``yt_dlp``,
    converts it to MP3 through the ``FFmpegExtractAudio`` post-processor and
    passes the resulting file to a Hugging Face automatic-speech-recognition
    pipeline (``openai/whisper-tiny``).

    Failures are reported as human-readable strings instead of being raised,
    so a caller always receives a ``str``.

    NOTE(review): this class follows the ``_run`` / ``__call__`` convention
    used elsewhere in this file and does not subclass anything; confirm that
    the agent framework accepts it in this form.
    """

    name = "video_transcription"
    description = (
        "Transcribes the audio from a given YouTube video URL and returns the text content. "
        "Useful for getting text from video lectures, interviews, etc."
    )
    inputs = {
        "video_url": {
            "type": "string",
            "description": "The URL of the YouTube video to transcribe (e.g., 'https://www.youtube.com/watch?v=dQw4w9WgXcQ').",
        }
    }

    def __init__(self):
        # Build the Whisper ASR pipeline once; it is reused for every call.
        # chunk_length_s=30 enables chunked long-form transcription, since
        # Whisper itself only consumes 30-second windows and most videos are
        # longer than that.
        self.transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny",
            chunk_length_s=30,
        )

    def _run(self, video_url: str) -> str:
        """Download the audio of ``video_url`` and return its transcription.

        Args:
            video_url: URL of the video whose audio should be transcribed.

        Returns:
            The transcribed text on success, otherwise a string starting with
            ``"Error"`` or ``"An error occurred"`` that describes the failure.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        import tempfile

        # Reject empty input before doing any network or model work.
        if not isinstance(video_url, str) or not video_url.strip():
            return "Error: No video URL provided."

        try:
            # A private temporary directory avoids filename clashes between
            # concurrent calls and is removed as a whole on exit, so no
            # unrelated file in the working directory is ever touched.
            with tempfile.TemporaryDirectory(prefix="video_transcription_") as work_dir:
                ydl_opts = {
                    'format': 'bestaudio/best',
                    # Only fetch the addressed video, even if the URL also
                    # references a playlist.
                    'noplaylist': True,
                    'postprocessors': [{
                        'key': 'FFmpegExtractAudio',
                        'preferredcodec': 'mp3',
                        'preferredquality': '192',
                    }],
                    # Let yt_dlp choose the download extension; the
                    # post-processor then writes "<stem>.mp3" next to it.
                    'outtmpl': os.path.join(work_dir, 'audio.%(ext)s'),
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info_dict = ydl.extract_info(video_url, download=True)
                    downloaded_file = ydl.prepare_filename(info_dict)

                # prepare_filename reports the pre-conversion name; the
                # extracted audio has the same stem with an .mp3 suffix.
                audio_path = os.path.splitext(downloaded_file)[0] + ".mp3"
                if not os.path.exists(audio_path):
                    return f"Error: Could not download audio from {video_url}"

                # Transcribe while the temporary file still exists.
                transcription_result = self.transcriber(audio_path)
                return transcription_result['text']

        except yt_dlp.DownloadError as e:
            return f"Error downloading video: {e}"
        except Exception as e:
            # Deliberate catch-all: the tool reports failures as text.
            return f"An error occurred during transcription: {e}"

    def __call__(self, video_url: str) -> str:
        """Make the instance callable; delegates to :meth:`_run`."""
        return self._run(video_url)
|
445 |
|
446 |
|
447 |
class BasicAgent:
|
448 |
def __init__(self):
|
449 |
token = os.environ.get("HF_API_TOKEN")
|
450 |
+
self.model = HfApiModel( # Store model as self.model if you need to access it later
|
451 |
temperature=0.1,
|
452 |
token=token
|
453 |
)
|
454 |
|
455 |
+
# Initialize all tool instances
|
456 |
+
self.search_tool = DuckDuckGoSearchTool()
|
457 |
+
self.wiki_search_tool = WikiSearchTool() # Ensure this class is defined/imported
|
458 |
+
self.str_reverse_tool = StringReverseTool() # Ensure this class is defined/imported
|
459 |
+
self.keywords_extract_tool = KeywordsExtractorTool() # Ensure this class is defined/imported
|
460 |
+
self.speech_to_text_tool = SpeechToTextTool() # Ensure this class is defined/imported
|
461 |
+
self.visit_webpage_tool = VisitWebpageTool() # Ensure this class is defined/imported
|
462 |
+
self.final_answer_tool = FinalAnswerTool()
|
463 |
+
|
464 |
+
# Custom tools - ensure these classes are defined and imported
|
465 |
+
self.video_transcription_tool = VideoTranscriptionTool()
|
466 |
+
self.image_analysis_tool_instance = ImageAnalysisTool() # Renamed for clarity
|
467 |
+
self.analyse_attachment_tool = AnalyseAttachmentTool() # Renamed for clarity
|
468 |
+
self.code_llama_tool = CodeLlamaTool() # Ensure this class is defined/imported
|
469 |
+
|
470 |
+
system_prompt_template = """
|
471 |
You are my general AI assistant. Your task is to answer the question I asked.
|
472 |
First, provide an explanation of your reasoning, step by step, to arrive at the answer.
|
473 |
Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
|
|
|
476 |
If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
|
477 |
If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
|
478 |
"""
|
479 |
+
|
480 |
# Create web agent with image analysis capability
|
481 |
self.web_agent = ToolCallingAgent(
|
482 |
tools=[
|
483 |
+
self.search_tool, # Use the initialized DuckDuckGoSearchTool instance
|
484 |
+
self.visit_webpage_tool,
|
485 |
+
self.image_analysis_tool_instance # Use the initialized instance of your ImageAnalysisTool
|
486 |
],
|
487 |
+
model=self.model, # Use self.model
|
488 |
max_steps=10,
|
489 |
name="web_search_agent",
|
490 |
description="Runs web searches and analyzes images",
|
491 |
)
|
492 |
|
493 |
+
# Create main agent with all capabilities
|
494 |
self.agent = CodeAgent(
|
495 |
+
model=self.model, # Use self.model
|
496 |
tools=[
|
497 |
+
self.search_tool,
|
498 |
+
self.wiki_search_tool,
|
499 |
+
self.str_reverse_tool,
|
500 |
+
self.keywords_extract_tool,
|
501 |
+
self.speech_to_text_tool,
|
502 |
+
self.visit_webpage_tool,
|
503 |
+
self.final_answer_tool,
|
504 |
+
self.video_transcription_tool,
|
505 |
+
self.code_llama_tool,
|
506 |
+
self.image_analysis_tool_instance, # Use the initialized instance
|
507 |
+
self.analyse_attachment_tool # Add the initialized attachment analysis tool
|
508 |
],
|
509 |
+
add_base_tools=True # Consider what this adds, ensure it doesn't duplicate.
|
510 |
)
|
511 |
+
|
512 |
# Update system prompt
|
513 |
+
# It's generally better to pass the system prompt directly if possible
|
514 |
+
# or manage it through prompt templates defined by smolagents.
|
515 |
+
# If smolagents adds its own system prompt, this appends to it.
|
516 |
+
self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt_template
|
517 |
+
|
518 |
|
519 |
def __call__(self, question: str) -> str:
|
520 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|