Final_Assignment_Project

Sleeping

App Files Files Community

wt002 committed 27 days ago

Commit

dd180a2

verified ·

1 Parent(s): f592791

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -36

app.py CHANGED Viewed

@@ -361,31 +361,113 @@ class ImageAnalysisTool:
         Makes the instance callable directly, invoking the _run method for convenience.
         """
         return self._run(image_url)
 class BasicAgent:
     def __init__(self):
         token = os.environ.get("HF_API_TOKEN")
-        model = HfApiModel(
             temperature=0.1,
             token=token
         )
-        # Existing tools
-        search_tool = DuckDuckGoSearchTool()
-        wiki_search_tool = WikiSearchTool()
-        str_reverse_tool = StringReverseTool()
-        keywords_extract_tool = KeywordsExtractorTool()
-        speech_to_text_tool = SpeechToTextTool()
-        visit_webpage_tool = VisitWebpageTool()
-        final_answer_tool = FinalAnswerTool()
-        video_transcription_tool = VideoTranscriptionTool()
-        Image_Analysis_Tool = ImageAnalysisTool()
-        Analyse_Attachment_Tool = AnalyseAttachmentTool()
-        code_llama_tool = CodeLlamaTool()
-        system_prompt = f"""
 You are my general AI assistant. Your task is to answer the question I asked.
 First, provide an explanation of your reasoning, step by step, to arrive at the answer.
 Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
@@ -394,40 +476,45 @@ If the answer is a number, do not use commas or units (e.g., $, %) unless specif
 If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
 If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
 """
         # Create web agent with image analysis capability
         self.web_agent = ToolCallingAgent(
             tools=[
-                WebSearchTool(),
-                visit_webpage_tool,
-                analyze_image  # Add image analysis to web agent
             ],
-            model=model,
             max_steps=10,
             name="web_search_agent",
             description="Runs web searches and analyzes images",
         )
-        # Create main agent with image analysis
         self.agent = CodeAgent(
-            model=model,
             tools=[
-                search_tool,
-                wiki_search_tool,
-                str_reverse_tool,
-                keywords_extract_tool,
-                speech_to_text_tool,
-                visit_webpage_tool,
-                final_answer_tool,
-                video_transcription_tool,
-                code_llama_tool,
-                analyze_image  # Add to main agent too
             ],
-            add_base_tools=True
         )
         # Update system prompt
-        self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")

         Makes the instance callable directly, invoking the _run method for convenience.
         """
         return self._run(image_url)
+import os
+import requests
+from transformers import pipeline
+import yt_dlp
+# Assuming 'tool' decorator and other smolagents components are imported
+from smolagents import tool, FinalAnswerTool, DuckDuckGoSearchTool, HfApiModel, CodeAgent # Add other necessary imports
+# --- Custom VideoTranscriptionTool Class ---
class VideoTranscriptionTool:
    """Transcribe the audio track of a YouTube video using Whisper.

    The tool downloads the best available audio stream with ``yt_dlp``,
    converts it to MP3 through the ``FFmpegExtractAudio`` post-processor and
    feeds the resulting file to a ``transformers`` speech-recognition
    pipeline (``openai/whisper-tiny``).

    Failures are reported as human-readable strings rather than raised, so
    a calling agent always receives text back.

    NOTE(review): this class exposes ``name``/``description``/``inputs`` but
    does not inherit from any base class; confirm the agent framework accepts
    it as a tool in this form.
    """

    name = "video_transcription"
    description = (
        "Transcribes the audio from a given YouTube video URL and returns the text content. "
        "Useful for getting text from video lectures, interviews, etc."
    )
    inputs = {
        "video_url": {
            "type": "string",
            "description": "The URL of the YouTube video to transcribe (e.g., 'https://www.youtube.com/watch?v=dQw4w9WgXcQ').",
        }
    }

    # Length (in seconds) of the windows the audio is split into before
    # transcription, so recordings longer than one window can be processed.
    CHUNK_LENGTH_S = 30

    def __init__(self):
        # Build the Whisper ASR pipeline once per instance so the model is
        # not reloaded on every call.
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    def _run(self, video_url: str) -> str:
        """Download the audio of ``video_url`` and return its transcription.

        Args:
            video_url: URL of the video whose audio should be transcribed.

        Returns:
            The transcribed text on success, otherwise a string starting
            with ``"Error"`` / ``"An error occurred"`` describing the failure.
        """
        # Local import keeps this block self-contained.
        import tempfile

        try:
            # A private, per-call directory avoids name clashes between
            # concurrent calls and guarantees that cleanup only ever touches
            # files this call created (including yt_dlp's partial files).
            with tempfile.TemporaryDirectory(prefix="video_transcription_") as work_dir:
                # 1. Download audio from the video and convert it to MP3.
                ydl_opts = {
                    'format': 'bestaudio/best',
                    'postprocessors': [{
                        'key': 'FFmpegExtractAudio',
                        'preferredcodec': 'mp3',
                        'preferredquality': '192',
                    }],
                    # Let yt_dlp pick the extension of the raw download; the
                    # post-processor then writes the .mp3 next to it.
                    'outtmpl': os.path.join(work_dir, 'audio.%(ext)s'),
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info_dict = ydl.extract_info(video_url, download=True)
                    # prepare_filename() reports the pre-conversion name, so
                    # swap its extension for the post-processed one.
                    raw_path = ydl.prepare_filename(info_dict)
                downloaded_file = os.path.splitext(raw_path)[0] + ".mp3"

                if not os.path.exists(downloaded_file):
                    return f"Error: Could not download audio from {video_url}"

                # 2. Transcribe the audio in fixed-size chunks.
                transcription_result = self.transcriber(
                    downloaded_file,
                    chunk_length_s=self.CHUNK_LENGTH_S,
                )
                return transcription_result['text']
        except yt_dlp.DownloadError as e:
            return f"Error downloading video: {e}"
        except Exception as e:
            # Best-effort tool: report any other failure as text.
            return f"An error occurred during transcription: {e}"

    def __call__(self, video_url: str) -> str:
        """Make the instance callable; delegates to :meth:`_run`."""
        return self._run(video_url)
 class BasicAgent:
     def __init__(self):
         token = os.environ.get("HF_API_TOKEN")
+        self.model = HfApiModel( # Store model as self.model if you need to access it later
             temperature=0.1,
             token=token
         )
+        # Initialize all tool instances
+        self.search_tool = DuckDuckGoSearchTool()
+        self.wiki_search_tool = WikiSearchTool() # Ensure this class is defined/imported
+        self.str_reverse_tool = StringReverseTool() # Ensure this class is defined/imported
+        self.keywords_extract_tool = KeywordsExtractorTool() # Ensure this class is defined/imported
+        self.speech_to_text_tool = SpeechToTextTool() # Ensure this class is defined/imported
+        self.visit_webpage_tool = VisitWebpageTool() # Ensure this class is defined/imported
+        self.final_answer_tool = FinalAnswerTool()
+        # Custom tools - ensure these classes are defined and imported
+        self.video_transcription_tool = VideoTranscriptionTool()
+        self.image_analysis_tool_instance = ImageAnalysisTool() # Renamed for clarity
+        self.analyse_attachment_tool = AnalyseAttachmentTool() # Renamed for clarity
+        self.code_llama_tool = CodeLlamaTool() # Ensure this class is defined/imported
+        system_prompt_template = """
 You are my general AI assistant. Your task is to answer the question I asked.
 First, provide an explanation of your reasoning, step by step, to arrive at the answer.
 Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
 If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
 If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
 """
         # Create web agent with image analysis capability
         self.web_agent = ToolCallingAgent(
             tools=[
+                self.search_tool, # Use the initialized DuckDuckGoSearchTool instance
+                self.visit_webpage_tool,
+                self.image_analysis_tool_instance # Use the initialized instance of your ImageAnalysisTool
             ],
+            model=self.model, # Use self.model
             max_steps=10,
             name="web_search_agent",
             description="Runs web searches and analyzes images",
         )
+        # Create main agent with all capabilities
         self.agent = CodeAgent(
+            model=self.model, # Use self.model
             tools=[
+                self.search_tool,
+                self.wiki_search_tool,
+                self.str_reverse_tool,
+                self.keywords_extract_tool,
+                self.speech_to_text_tool,
+                self.visit_webpage_tool,
+                self.final_answer_tool,
+                self.video_transcription_tool,
+                self.code_llama_tool,
+                self.image_analysis_tool_instance, # Use the initialized instance
+                self.analyse_attachment_tool # Add the initialized attachment analysis tool
             ],
+            add_base_tools=True # Consider what this adds, ensure it doesn't duplicate.
         )
         # Update system prompt
+        # It's generally better to pass the system prompt directly if possible
+        # or manage it through prompt templates defined by smolagents.
+        # If smolagents adds its own system prompt, this appends to it.
+        self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt_template
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")