wt002 committed on
Commit
dd180a2
·
verified ·
1 Parent(s): f592791

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -36
app.py CHANGED
@@ -361,31 +361,113 @@ class ImageAnalysisTool:
361
  Makes the instance callable directly, invoking the _run method for convenience.
362
  """
363
  return self._run(image_url)
364
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
 
367
  class BasicAgent:
368
  def __init__(self):
369
  token = os.environ.get("HF_API_TOKEN")
370
- model = HfApiModel(
371
  temperature=0.1,
372
  token=token
373
  )
374
 
375
- # Existing tools
376
- search_tool = DuckDuckGoSearchTool()
377
- wiki_search_tool = WikiSearchTool()
378
- str_reverse_tool = StringReverseTool()
379
- keywords_extract_tool = KeywordsExtractorTool()
380
- speech_to_text_tool = SpeechToTextTool()
381
- visit_webpage_tool = VisitWebpageTool()
382
- final_answer_tool = FinalAnswerTool()
383
- video_transcription_tool = VideoTranscriptionTool()
384
- Image_Analysis_Tool = ImageAnalysisTool()
385
- Analyse_Attachment_Tool = AnalyseAttachmentTool()
386
- code_llama_tool = CodeLlamaTool()
387
-
388
- system_prompt = f"""
 
 
389
  You are my general AI assistant. Your task is to answer the question I asked.
390
  First, provide an explanation of your reasoning, step by step, to arrive at the answer.
391
  Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
@@ -394,40 +476,45 @@ If the answer is a number, do not use commas or units (e.g., $, %) unless specif
394
  If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
395
  If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
396
  """
397
-
398
  # Create web agent with image analysis capability
399
  self.web_agent = ToolCallingAgent(
400
  tools=[
401
- WebSearchTool(),
402
- visit_webpage_tool,
403
- analyze_image # Add image analysis to web agent
404
  ],
405
- model=model,
406
  max_steps=10,
407
  name="web_search_agent",
408
  description="Runs web searches and analyzes images",
409
  )
410
 
411
- # Create main agent with image analysis
412
  self.agent = CodeAgent(
413
- model=model,
414
  tools=[
415
- search_tool,
416
- wiki_search_tool,
417
- str_reverse_tool,
418
- keywords_extract_tool,
419
- speech_to_text_tool,
420
- visit_webpage_tool,
421
- final_answer_tool,
422
- video_transcription_tool,
423
- code_llama_tool,
424
- analyze_image # Add to main agent too
 
425
  ],
426
- add_base_tools=True
427
  )
428
-
429
  # Update system prompt
430
- self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt
 
 
 
 
431
 
432
  def __call__(self, question: str) -> str:
433
  print(f"Agent received question (first 50 chars): {question[:50]}...")
 
361
  Makes the instance callable directly, invoking the _run method for convenience.
362
  """
363
  return self._run(image_url)
364
+
365
+
366
+ import os
367
+ import requests
368
+ from transformers import pipeline
369
+ import yt_dlp
370
+
371
+ # Assuming 'tool' decorator and other smolagents components are imported
372
+ from smolagents import tool, FinalAnswerTool, DuckDuckGoSearchTool, HfApiModel, CodeAgent # Add other necessary imports
373
+
374
+ # --- Custom VideoTranscriptionTool Class ---
375
+
376
class VideoTranscriptionTool:
    """Transcribe the audio track of a YouTube video with a Whisper pipeline.

    The instance is callable: ``tool(video_url)`` is equivalent to
    ``tool._run(video_url)``. Failures are reported as human-readable error
    strings rather than raised, so a caller always receives a ``str``.
    """

    name = "video_transcription"
    description = (
        "Transcribes the audio from a given YouTube video URL and returns the text content. "
        "Useful for getting text from video lectures, interviews, etc."
    )
    inputs = {
        "video_url": {
            "type": "string",
            "description": "The URL of the YouTube video to transcribe (e.g., 'https://www.youtube.com/watch?v=dQw4w9WgXcQ').",
        }
    }

    def __init__(self):
        # Build the ASR pipeline once per tool instance; loading the model is
        # the expensive part and should not be repeated on every call.
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    def _run(self, video_url: str) -> str:
        """Download the audio of ``video_url`` and return its transcription.

        Args:
            video_url: URL of the video to fetch with yt_dlp.

        Returns:
            The transcribed text on success, otherwise a string describing
            the download or transcription error.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import list.
        import tempfile

        try:
            # A private scratch directory per call avoids two problems of a
            # fixed "temp_audio.mp3" in the working directory: concurrent
            # calls overwriting each other's file, and prefix-based cleanup
            # deleting unrelated files. The directory and everything yt_dlp
            # leaves in it (partial downloads, .ytdl files) are removed when
            # the context exits.
            with tempfile.TemporaryDirectory(prefix="video_transcription_") as work_dir:
                ydl_opts = {
                    "format": "bestaudio/best",
                    "postprocessors": [{
                        "key": "FFmpegExtractAudio",
                        "preferredcodec": "mp3",
                        "preferredquality": "192",
                    }],
                    # Let yt_dlp pick the container extension for the raw
                    # download; the post-processor converts it to .mp3.
                    "outtmpl": os.path.join(work_dir, "temp_audio.%(ext)s"),
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info_dict = ydl.extract_info(video_url, download=True)
                    raw_download_path = ydl.prepare_filename(info_dict)

                # prepare_filename reports the pre-conversion name; the
                # extracted audio has the same stem with an .mp3 extension.
                audio_path = os.path.splitext(raw_download_path)[0] + ".mp3"
                if not os.path.exists(audio_path):
                    return f"Error: Could not download audio from {video_url}"

                # NOTE(review): long recordings may need chunking or
                # timestamp options on the pipeline call - confirm against
                # the installed transformers version.
                transcription_result = self.transcriber(audio_path)
                return transcription_result["text"]

        except yt_dlp.DownloadError as e:
            return f"Error downloading video: {e}"
        except Exception as e:
            # Deliberate catch-all: the tool reports failures as text
            # instead of raising into the caller.
            return f"An error occurred during transcription: {e}"

    def __call__(self, video_url: str) -> str:
        """Make the instance callable by delegating to ``_run``."""
        return self._run(video_url)
445
 
446
 
447
  class BasicAgent:
448
  def __init__(self):
449
  token = os.environ.get("HF_API_TOKEN")
450
+ self.model = HfApiModel( # Store model as self.model if you need to access it later
451
  temperature=0.1,
452
  token=token
453
  )
454
 
455
+ # Initialize all tool instances
456
+ self.search_tool = DuckDuckGoSearchTool()
457
+ self.wiki_search_tool = WikiSearchTool() # Ensure this class is defined/imported
458
+ self.str_reverse_tool = StringReverseTool() # Ensure this class is defined/imported
459
+ self.keywords_extract_tool = KeywordsExtractorTool() # Ensure this class is defined/imported
460
+ self.speech_to_text_tool = SpeechToTextTool() # Ensure this class is defined/imported
461
+ self.visit_webpage_tool = VisitWebpageTool() # Ensure this class is defined/imported
462
+ self.final_answer_tool = FinalAnswerTool()
463
+
464
+ # Custom tools - ensure these classes are defined and imported
465
+ self.video_transcription_tool = VideoTranscriptionTool()
466
+ self.image_analysis_tool_instance = ImageAnalysisTool() # Renamed for clarity
467
+ self.analyse_attachment_tool = AnalyseAttachmentTool() # Renamed for clarity
468
+ self.code_llama_tool = CodeLlamaTool() # Ensure this class is defined/imported
469
+
470
+ system_prompt_template = """
471
  You are my general AI assistant. Your task is to answer the question I asked.
472
  First, provide an explanation of your reasoning, step by step, to arrive at the answer.
473
  Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
 
476
  If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
477
  If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
478
  """
479
+
480
  # Create web agent with image analysis capability
481
  self.web_agent = ToolCallingAgent(
482
  tools=[
483
+ self.search_tool, # Use the initialized DuckDuckGoSearchTool instance
484
+ self.visit_webpage_tool,
485
+ self.image_analysis_tool_instance # Use the initialized instance of your ImageAnalysisTool
486
  ],
487
+ model=self.model, # Use self.model
488
  max_steps=10,
489
  name="web_search_agent",
490
  description="Runs web searches and analyzes images",
491
  )
492
 
493
+ # Create main agent with all capabilities
494
  self.agent = CodeAgent(
495
+ model=self.model, # Use self.model
496
  tools=[
497
+ self.search_tool,
498
+ self.wiki_search_tool,
499
+ self.str_reverse_tool,
500
+ self.keywords_extract_tool,
501
+ self.speech_to_text_tool,
502
+ self.visit_webpage_tool,
503
+ self.final_answer_tool,
504
+ self.video_transcription_tool,
505
+ self.code_llama_tool,
506
+ self.image_analysis_tool_instance, # Use the initialized instance
507
+ self.analyse_attachment_tool # Add the initialized attachment analysis tool
508
  ],
509
+ add_base_tools=True # Consider what this adds, ensure it doesn't duplicate.
510
  )
511
+
512
  # Update system prompt
513
+ # It's generally better to pass the system prompt directly if possible
514
+ # or manage it through prompt templates defined by smolagents.
515
+ # If smolagents adds its own system prompt, this appends to it.
516
+ self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt_template
517
+
518
 
519
  def __call__(self, question: str) -> str:
520
  print(f"Agent received question (first 50 chars): {question[:50]}...")