Samuel Thomas committed
Commit fe1bd6e · 1 Parent(s): 4000d20
Files changed (3)
  1. app.py +1 -1
  2. requirements.txt +2 -1
  3. tools.py +675 -76
app.py CHANGED
@@ -143,7 +143,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
 task_id = hf_questions[r]['task_id']
 question_text = hf_questions[r]['question']
 full_answer = run_agent(agent, s)
-submitted_answer = strip_final_answer(extract_final_answer(full_answer[-1].content))
+submitted_answer = extract_final_answer(full_answer[-1].content)
 print(f"\n\nQuestion {r+1} Answer: {submitted_answer}\n\n")
 answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
requirements.txt CHANGED
@@ -24,4 +24,5 @@ duckduckgo-search==8.0.0
 sentencepiece
 nltk
 SpeechRecognition
-pandas
+pandas
+openai-whisper
tools.py CHANGED
@@ -6,16 +6,19 @@ import string
6
  import glob
7
  import shutil
8
  import gc
 
9
  import uuid
10
  import signal
 
 
11
  from datetime import datetime
12
  from io import BytesIO
13
  from contextlib import contextmanager
14
  from langchain_huggingface import HuggingFacePipeline
15
- from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set
16
  import time
17
  from collections import Counter
18
- from pydantic import Field
19
  import hashlib
20
  import json
21
  import numpy as np
@@ -44,6 +47,7 @@ from pydub import AudioSegment
44
  from pydub.silence import split_on_silence
45
  import nltk
46
  from nltk.corpus import words
 
47
 
48
  # LangChain Ecosystem
49
  from langchain.docstore.document import Document
@@ -89,23 +93,21 @@ def create_llm_pipeline():
89
  #model_id = "meta-llama/Llama-3.3-70B-Instruct"
90
  #model_id = "mistralai/Mistral-Small-24B-Base-2501"
91
  model_id = "mistralai/Mistral-7B-Instruct-v0.3"
 
 
 
 
92
  #model_id = "Qwen/Qwen2-7B-Instruct"
93
-
94
- # Load tokenizer explicitly with fast version
95
- tokenizer = AutoTokenizer.from_pretrained(
96
- model_id,
97
- use_fast=True, # Force fast tokenizer
98
- add_prefix_space=True # Only if actually needed
99
- )
100
-
101
  return pipeline(
102
  "text-generation",
103
  model=model_id,
104
- tokenizer = tokenizer,
105
- device_map="cpu",
106
  torch_dtype=torch.float16,
107
  max_new_tokens=1024,
108
- temperature=0.1
109
  )
110
 
111
  # Define file extension sets for each category
@@ -150,21 +152,637 @@ def write_bytes_to_temp_dir(file_bytes: bytes, file_name: str) -> str:
150
  print(f"File written to: {file_path}")
151
  return file_path
152
 
153
-
154
  def extract_final_answer(text: str) -> str:
155
- """
156
- Returns the substring starting from the last occurrence of 'FINAL ANSWER:' (case-insensitive)
157
- to the end of the string, with any trailing punctuation removed.
158
- If not found, returns an empty string.
159
- """
160
- marker = "FINAL ANSWER:"
161
- idx = text.lower().rfind(marker.lower())
162
- if idx == -1:
163
- return ""
164
- result = text[idx:].strip()
165
- # Remove trailing punctuation
166
- return result.rstrip(string.punctuation + " ")
169
  class EnhancedDuckDuckGoSearchTool(BaseTool):
170
  name: str = "enhanced_search"
@@ -755,12 +1373,11 @@ class WikipediaSearchToolWithFAISS(BaseTool):
755
  return f"An unexpected error occurred: {str(e)}"
756
 
757
 
 
758
  class EnhancedYoutubeScreenshotQA(BaseTool):
759
- name: str = "enhanced_youtube_screenshot_qa"
760
  description: str = (
761
- "Downloads a YouTube video, intelligently extracts screenshots, "
762
- "and answers questions using advanced visual QA with semantic analysis. "
763
- "Use this tool for questions about the VIDEO or IMAGES in the video,"
764
  "Input should be a dict with keys: 'youtube_url', 'question', and optional parameters. "
765
  #"Optional parameters: 'frame_interval_seconds' (default: 10), 'max_frames' (default: 50), "
766
  #"'use_scene_detection' (default: True), 'parallel_processing' (default: True). "
@@ -796,8 +1413,8 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
796
  def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
797
  """Get configuration value with fallback to defaults"""
798
  defaults = {
799
- 'frame_interval_seconds': 10,
800
- 'max_frames': 50,
801
  'use_scene_detection': True,
802
  'resize_frames': True,
803
  'parallel_processing': True,
@@ -822,6 +1439,11 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
822
  "Salesforce/blip-vqa-base"
823
  ).to(self.device)
824
825
  print("BLIP VQA model loaded successfully")
826
  except Exception as e:
827
  print(f"Error initializing VQA model: {str(e)}")
@@ -1057,6 +1679,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
1057
  def _answer_question_on_frame(self, frame_path: str, question: str) -> Tuple[str, float]:
1058
  """Answer question on single frame with confidence scoring"""
1059
  try:
 
1060
  image = Image.open(frame_path).convert('RGB')
1061
  inputs = self.processor_vqa(image, question, return_tensors="pt").to(self.device)
1062
 
@@ -1373,6 +1996,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
1373
  def _run(self, youtube_url, question, **kwargs) -> str:
1374
  """Enhanced main execution method"""
1375
  #ipdb.set_trace()
 
1376
 
1377
  #input_data = query
1378
  #youtube_url = input_data.get("youtube_url")
@@ -1411,20 +2035,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
1411
 
1412
  # Format comprehensive result - Fixed the reference to stats
1413
  result = f"""
1414
- 📊 **ANALYSIS SUMMARY**:
1415
- • Confidence Score: {analysis_result['confidence']:.2%}
1416
- • Frames Analyzed: {analysis_result['successful_analyses']}/{analysis_result['frame_count']}
1417
- • Answer Consistency: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
1418
-
1419
- 📈 **ANSWER DISTRIBUTION**:
1420
- {chr(10).join([f"• {answer}: {count} frames" for answer, count in analysis_result['answer_distribution'].items()])}
1421
-
1422
- 🔍 **SEMANTIC CLUSTERS**:
1423
- {chr(10).join([f"• '{cluster}': {count} similar answers" for cluster, count in analysis_result['semantic_clusters'].items()])}
1424
-
1425
- ⏱️ **TEMPORAL ANALYSIS**:
1426
- • Answer Changes: {analysis_result['temporal_analysis'].get('total_changes', 0)}
1427
- • Stability: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
1428
 
1429
  📊 **STATISTICAL SUMMARY**:
1430
  • Minimum: {analysis_result['statistical_summary']['minimum']:.2f}
@@ -1433,10 +2043,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
1433
  • Median: {analysis_result['statistical_summary']['median']:.2f}
1434
  • Range: {analysis_result['statistical_summary']['range']:.2f}
1435
 
1436
- 🎯 **CONFIDENCE BREAKDOWN**:
1437
- • Frequency-based: {analysis_result['frequency_confidence']:.2%}
1438
- • Model-based: {analysis_result['average_model_confidence']:.2%}
1439
- • Combined: {analysis_result['confidence']:.2%}
1440
  """.strip()
1441
 
1442
  return result
@@ -1449,30 +2055,18 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
1449
  def create_enhanced_youtube_qa_tool(**kwargs):
1450
  """Factory function to create the enhanced tool with custom parameters"""
1451
  return EnhancedYoutubeScreenshotQA(**kwargs)
1452
- # Example of creating the tool instance:
1453
- # wikipedia_tool_faiss = WikipediaSearchToolWithFAISS()
1454
-
1455
- # To use this new tool in your agent, you would replace the old
1456
- # `wikipedia_tool` instance with `wikipedia_tool_faiss` in your `tools` list.
1457
- # For example:
1458
- # tools = [wikipedia_tool_faiss, search_tool]
1459
- # Create tool instances
1460
- #wikipedia_tool = WikipediaSearchTool()
1461
-
1462
- # --- Define Call LLM function ---
1463
-
1464
- # 3. Improved LLM call with memory management
1465
 
1466
 
1467
  class YouTubeTranscriptExtractor(BaseTool):
1468
  name: str = "youtube_transcript_extractor"
1469
  description: str = (
1470
  "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
1471
- "Use this tool when you need the AUDIO or DIALOGUE or sound from a YouTube video with speaker tags,"
 
1472
  "Input should be a dict with keys: 'youtube_url' and optional parameters. "
1473
- "Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
1474
- "'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
1475
- "'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
1476
  "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
1477
  )
1478
 
@@ -2240,8 +2834,6 @@ def create_youtube_transcript_tool(**kwargs):
2240
  """Factory function to create the transcript extraction tool with custom parameters"""
2241
  return YouTubeTranscriptExtractor(**kwargs)
2242
 
2243
-
2244
-
2245
  # --- Model Configuration ---
2246
  def create_llm_pipeline():
2247
  #model_id = "meta-llama/Llama-2-13b-chat-hf"
@@ -2993,17 +3585,19 @@ def fix_backwards_text(text):
2993
 
2994
  # --- Run the Agent ---
2995
  # Enhanced system prompt for better behavior
2996
-
2997
  def run_agent(agent, state: AgentState):
2998
  """Enhanced agent initialization with better prompt and hallucination prevention."""
2999
- global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, tools
3000
 
3001
  # Initialize tools
3002
  WIKIPEDIA_TOOL = WikipediaSearchToolWithFAISS()
3003
- SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=3000)
3004
  YOUTUBE_TOOL = EnhancedYoutubeScreenshotQA()
3005
  YOUTUBE_AUDIO_TOOL = YouTubeTranscriptExtractor()
3006
- tools = [WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL]
3007
 
3008
  formatted_tools_description = render_text_description(tools)
3009
  current_date_str = datetime.now().strftime("%Y-%m-%d")
@@ -3019,6 +3613,7 @@ CRITICAL INSTRUCTIONS:
3019
  3. Use tools ONLY when you need specific information you don't know
3020
  4. After using a tool, provide your FINAL ANSWER immediately
3021
  5. STOP after giving your FINAL ANSWER - do not continue
 
3022
 
3023
  FORMAT for tool use:
3024
  Thought: <brief reasoning>
@@ -3030,12 +3625,15 @@ FINAL ANSWER: [concise answer only]
3030
 
3031
  ANSWER FORMAT:
3032
  - Numbers: no commas, no units unless specified
 
3033
  - Strings: no articles, no abbreviations, digits in plain text
3034
- - Lists: comma-separated following above rules
3035
  - Be extremely brief and concise
3036
  - Do not provide additional context or explanations
3037
  - Do not provide parentheticals
3038
 
 
 
3039
  IMPORTANT: You are responding to ONE question only. Do not ask follow-up questions or generate additional dialogue.
3040
 
3041
  Current date: {current_date_str}
@@ -3062,9 +3660,10 @@ Current date: {current_date_str}
3062
 
3063
  # Cleanup
3064
  if result.get("done"):
3065
- #torch.cuda.empty_cache()
3066
- #torch.cuda.ipc_collect()
3067
  gc.collect()
3068
  print("🧹 Released GPU memory after completion")
3069
 
3070
  return result["messages"]
 
 
6
  import glob
7
  import shutil
8
  import gc
9
+ import sys
10
  import uuid
11
  import signal
12
+ from pathlib import Path
13
+ import subprocess
14
  from datetime import datetime
15
  from io import BytesIO
16
  from contextlib import contextmanager
17
  from langchain_huggingface import HuggingFacePipeline
18
+ from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
19
  import time
20
  from collections import Counter
21
+ from pydantic import Field, BaseModel
22
  import hashlib
23
  import json
24
  import numpy as np
 
47
  from pydub.silence import split_on_silence
48
  import nltk
49
  from nltk.corpus import words
50
+ import pandas as pd
51
 
52
  # LangChain Ecosystem
53
  from langchain.docstore.document import Document
 
93
  #model_id = "meta-llama/Llama-3.3-70B-Instruct"
94
  #model_id = "mistralai/Mistral-Small-24B-Base-2501"
95
  model_id = "mistralai/Mistral-7B-Instruct-v0.3"
96
+ #model_id = "Meta-Llama/Llama-2-7b-chat-hf"
97
+ #model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
98
+ #model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
99
+ #model_id = "mistralai/Mistral-7B-Instruct-v0.2"
100
  #model_id = "Qwen/Qwen2-7B-Instruct"
101
+ #model_id = "GSAI-ML/LLaDA-8B-Instruct"
 
 
 
 
 
 
 
102
  return pipeline(
103
  "text-generation",
104
  model=model_id,
105
+ device_map="auto",
 
106
  torch_dtype=torch.float16,
107
  max_new_tokens=1024,
108
+ temperature=0.05,
109
+ do_sample=False,
110
+ repetition_penalty=1.2
111
  )
112
 
113
  # Define file extension sets for each category
 
152
  print(f"File written to: {file_path}")
153
  return file_path
154
 
 
155
  def extract_final_answer(text: str) -> str:
156
+ """
157
+ Extracts the answer after the last 'FINAL ANSWER:' (case-insensitive),
158
+ removes any parenthetical immediately following a numeric answer,
159
+ strips trailing punctuation, sorts comma-separated lists,
160
+ and does not split numbers containing commas.
161
+ Returns an empty string if marker not found.
162
+ """
163
+ marker = "FINAL ANSWER:"
164
+ idx = text.lower().rfind(marker.lower())
165
+ if idx == -1:
166
+ return ""
167
+ # Extract answer after marker
168
+ result = text[idx + len(marker):].strip()
169
+ # Remove parenthetical immediately following a number at the start
170
+ result = re.sub(r'^(\d+(?:\.\d+)?)\s*\(.*?\)', r'\1', result)
171
+ # Remove trailing punctuation and whitespace
172
+ result = result.rstrip(string.punctuation + " ")
173
+ # Split on commas NOT between digits (i.e., not inside numbers)
174
+ # This regex splits on commas not surrounded by digits (to avoid splitting numbers like 1,000)
175
+ items = re.split(r',(?!\s*\d{3}\b)', result)
176
+ # If we have a list, sort it
177
+ if len(items) > 1:
178
+ items = [item.strip() for item in items]
179
+ # Try to sort numerically
180
+ try:
181
+ sorted_items = sorted(
182
+ items,
183
+ key=lambda x: float(re.sub(r'[^\d\.]', '', x)) # Remove non-numeric except .
184
+ )
185
+ return ', '.join(sorted_items)
186
+ except ValueError:
187
+ # Fallback: sort alphabetically
188
+ sorted_items = sorted(items, key=lambda x: x.lower())
189
+ return ', '.join(sorted_items)
190
+ return result
191
+
192
+
193
+ class AudioTranscriptionInput(BaseModel):
194
+ """Input schema for AudioTranscriptionTool."""
195
+ file_path: str = Field(description="Path to the audio file to transcribe")
196
+ engine: Optional[str] = Field(default="google", description="Speech recognition engine to use")
197
+ language: Optional[str] = Field(default="en-US", description="Language of the audio")
198
+
199
+ class AudioTranscriptionTool(BaseTool):
200
+ """Tool for transcribing audio files using local speech recognition."""
201
+
202
+ name: str = "audio_transcription"
203
+ description: str = """
204
+ Transcribes voice memos and audio files (mp3, wav, m4a, flac, etc.) to text using local speech recognition.
205
+ Input should be a dictionary with 'file_path' key containing the path to the audio file.
206
+ Optionally accepts 'engine' and 'language' parameters.
207
+ Returns the transcribed text as a string.
208
+ """
209
+ args_schema: type[BaseModel] = AudioTranscriptionInput
210
+
211
+ class Config:
212
+ arbitrary_types_allowed = True
213
+
214
+ def __init__(self, **kwargs):
215
+ """Initialize the AudioTranscriptionTool."""
216
+ super().__init__(**kwargs)
217
+ self._init_speech_recognition()
218
+
219
+ def _init_speech_recognition(self):
220
+ """Initialize speech recognition components."""
221
+ try:
222
+ import speech_recognition as sr
223
+ from pydub import AudioSegment
224
+ object.__setattr__(self, 'recognizer', sr.Recognizer())
225
+ object.__setattr__(self, 'sr', sr)
226
+ object.__setattr__(self, 'AudioSegment', AudioSegment)
227
+ except ImportError as e:
228
+ raise ImportError(
229
+ "Required libraries not found. Install with: "
230
+ "pip install SpeechRecognition pydub"
231
+ ) from e
232
+
233
+ def _validate_audio_file(self, file_path: str) -> bool:
234
+ """Validate that the audio file exists and has a supported format."""
235
+ if not os.path.exists(file_path):
236
+ raise FileNotFoundError(f"Audio file not found: {file_path}")
237
+
238
+ # Check file extension - pydub supports many formats
239
+ supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
240
+ file_extension = Path(file_path).suffix.lower()
241
+
242
+ if file_extension not in supported_formats:
243
+ raise ValueError(
244
+ f"Unsupported audio format: {file_extension}. "
245
+ f"Supported formats: {', '.join(supported_formats)}"
246
+ )
247
+
248
+ return True
249
+
250
+ def _convert_to_wav(self, file_path: str) -> str:
251
+ """Convert audio file to WAV format if needed."""
252
+ file_extension = Path(file_path).suffix.lower()
253
+
254
+ if file_extension == '.wav':
255
+ return file_path
256
+
257
+ try:
258
+ # Convert to WAV using pydub
259
+ audio = self.AudioSegment.from_file(file_path)
260
+
261
+ # Create temporary WAV file
262
+ temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
263
+ audio.export(temp_wav.name, format="wav")
264
+ return temp_wav.name
265
+ except Exception as e:
266
+ raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
267
+
268
+ def _transcribe_audio(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
269
+ """Transcribe audio file using local speech recognition."""
270
+ temp_wav_path = None
271
+
272
+ try:
273
+ # Convert to WAV if necessary
274
+ wav_path = self._convert_to_wav(file_path)
275
+ if wav_path != file_path:
276
+ temp_wav_path = wav_path
277
+
278
+ # Load audio file
279
+ with self.sr.AudioFile(wav_path) as source:
280
+ # Adjust for ambient noise
281
+ self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
282
+ # Record the audio
283
+ audio_data = self.recognizer.record(source)
284
+
285
+ # Choose recognition engine
286
+ if engine == "google":
287
+ transcript = self.recognizer.recognize_google(audio_data, language=language)
288
+ elif engine == "sphinx":
289
+ transcript = self.recognizer.recognize_sphinx(audio_data, language=language)
290
+ elif engine == "wit":
291
+ # Note: requires WIT_AI_KEY environment variable
292
+ wit_key = os.getenv('WIT_AI_KEY')
293
+ if not wit_key:
294
+ raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
295
+ transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
296
+ elif engine == "bing":
297
+ # Note: requires BING_KEY environment variable
298
+ bing_key = os.getenv('BING_KEY')
299
+ if not bing_key:
300
+ raise ValueError("BING_KEY environment variable required for Bing engine")
301
+ transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
302
+ else:
303
+ # Default to Google
304
+ transcript = self.recognizer.recognize_google(audio_data, language=language)
305
+
306
+ return transcript
307
+
308
+ except self.sr.UnknownValueError:
309
+ return "Could not understand the audio - speech was unclear or inaudible"
310
+ except self.sr.RequestError as e:
311
+ return f"Error with speech recognition service: {str(e)}"
312
+ except Exception as e:
313
+ raise RuntimeError(f"Error transcribing audio: {str(e)}")
314
+ finally:
315
+ # Clean up temporary WAV file
316
+ if temp_wav_path and os.path.exists(temp_wav_path):
317
+ try:
318
+ os.unlink(temp_wav_path)
319
+ except OSError:
320
+ pass # Ignore cleanup errors
321
+
322
+ def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
323
+ """
324
+ Internal method required by LangChain BaseTool.
325
+
326
+ Args:
327
+ file_path: Path to the audio file to transcribe
328
+ engine: Speech recognition engine to use
329
+ language: Language of the audio
330
+
331
+ Returns:
332
+ str: Transcribed text from the audio file
333
+ """
334
+ try:
335
+ # Validate audio file
336
+ self._validate_audio_file(file_path)
337
+
338
+ # Transcribe audio
339
+ transcript = self._transcribe_audio(
340
+ file_path=file_path,
341
+ engine=engine,
342
+ language=language
343
+ )
344
+
345
+ return transcript
346
+
347
+ except Exception as e:
348
+ error_msg = f"AudioTranscriptionTool error: {str(e)}"
349
+ print(error_msg)
350
+ return error_msg
351
+
352
+ def run(self, tool_input: Dict[str, Any]) -> str:
353
+ """
354
+ Main method to run the audio transcription tool.
355
+
356
+ Args:
357
+ tool_input: Dictionary containing 'file_path' and optional parameters
358
+
359
+ Returns:
360
+ str: Transcribed text from the audio file
361
+ """
362
+ try:
363
+ # Extract parameters from input
364
+ file_path = tool_input.get('file_path')
365
+ if not file_path:
366
+ raise ValueError("file_path is required in tool_input")
367
+
368
+ engine = tool_input.get('engine', 'google')
369
+ language = tool_input.get('language', 'en-US')
370
+
371
+ # Call the internal _run method
372
+ return self._run(file_path=file_path, engine=engine, language=language)
373
+
374
+ except Exception as e:
375
+ error_msg = f"AudioTranscriptionTool error: {str(e)}"
376
+ print(error_msg)
377
+ return error_msg
378
+
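A minimal usage sketch for the tool above; the file path is a placeholder and the default Google engine needs network access:

from tools import AudioTranscriptionTool

transcriber = AudioTranscriptionTool()
# Dict input mirrors the tool description; 'engine' and 'language' are optional.
print(transcriber.run({"file_path": "/tmp/voice_memo.mp3", "engine": "google", "language": "en-US"}))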
379
+ # Enhanced local transcription tool with multiple engine support
380
+ class AdvancedAudioTranscriptionTool(BaseTool):
381
+ """Advanced tool with support for multiple local transcription engines including Whisper."""
382
+
383
+ name: str = "advanced_audio_transcription"
384
+ description: str = """
385
+ Advanced audio transcription tool supporting multiple engines including local Whisper.
386
+ Supports engines: 'whisper' (local), 'google', 'sphinx', 'wit', 'bing'.
387
+ Input should be a dictionary with 'file_path' key.
388
+ Returns the transcribed text as a string.
389
+ """
390
+ args_schema: type[BaseModel] = AudioTranscriptionInput
391
+
392
+ class Config:
393
+ arbitrary_types_allowed = True
394
+
395
+ def __init__(self, **kwargs):
396
+ """Initialize the AdvancedAudioTranscriptionTool."""
397
+ super().__init__(**kwargs)
398
+ self._init_speech_recognition()
399
+ self._init_whisper()
400
+
401
+ def _init_speech_recognition(self):
402
+ """Initialize speech recognition components."""
403
+ try:
404
+ import speech_recognition as sr
405
+ from pydub import AudioSegment
406
+ object.__setattr__(self, 'recognizer', sr.Recognizer())
407
+ object.__setattr__(self, 'sr', sr)
408
+ object.__setattr__(self, 'AudioSegment', AudioSegment)
409
+ except ImportError as e:
410
+ raise ImportError(
411
+ "Required libraries not found. Install with: "
412
+ "pip install SpeechRecognition pydub"
413
+ ) from e
414
+
415
+ def _init_whisper(self):
416
+ """Initialize Whisper if available."""
417
+ try:
418
+ import whisper
419
+ object.__setattr__(self, 'whisper', whisper)
420
+ except ImportError:
421
+ object.__setattr__(self, 'whisper', None)
422
+ print("Warning: OpenAI Whisper not installed. Install with 'pip install openai-whisper' for local Whisper support.")
423
+
424
+ def _validate_audio_file(self, file_path: str) -> bool:
425
+ """Validate that the audio file exists and has a supported format."""
426
+ if not os.path.exists(file_path):
427
+ raise FileNotFoundError(f"Audio file not found: {file_path}")
428
+
429
+ supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
430
+ file_extension = Path(file_path).suffix.lower()
431
+
432
+ if file_extension not in supported_formats:
433
+ raise ValueError(
434
+ f"Unsupported audio format: {file_extension}. "
435
+ f"Supported formats: {', '.join(supported_formats)}"
436
+ )
437
+
438
+ return True
439
+
440
+ def _transcribe_with_whisper(self, file_path: str, language: str = "en") -> str:
441
+ """Transcribe using local Whisper model."""
442
+ if not self.whisper:
443
+ raise RuntimeError("Whisper not installed. Install with 'pip install openai-whisper'")
444
+
445
+ try:
446
+ # Load the model (you can change model size: tiny, base, small, medium, large)
447
+ model = self.whisper.load_model("base")
448
+
449
+ # Transcribe the audio
450
+ result = model.transcribe(file_path, language=language if language != "en-US" else "en")
451
+
452
+ return result["text"].strip()
453
+
454
+ except Exception as e:
455
+ raise RuntimeError(f"Error with Whisper transcription: {str(e)}")
456
+
457
+ def _convert_to_wav(self, file_path: str) -> str:
458
+ """Convert audio file to WAV format if needed."""
459
+ file_extension = Path(file_path).suffix.lower()
460
+
461
+ if file_extension == '.wav':
462
+ return file_path
463
+
464
+ try:
465
+ audio = self.AudioSegment.from_file(file_path)
466
+ temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
467
+ audio.export(temp_wav.name, format="wav")
468
+ return temp_wav.name
469
+ except Exception as e:
470
+ raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
471
+
472
+ def _transcribe_with_sr(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
473
+ """Transcribe using speech_recognition library."""
474
+ temp_wav_path = None
475
+
476
+ try:
477
+ wav_path = self._convert_to_wav(file_path)
478
+ if wav_path != file_path:
479
+ temp_wav_path = wav_path
480
+
481
+ with self.sr.AudioFile(wav_path) as source:
482
+ self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
483
+ audio_data = self.recognizer.record(source)
484
+
485
+ if engine == "google":
486
+ transcript = self.recognizer.recognize_google(audio_data, language=language)
487
+ elif engine == "sphinx":
488
+ transcript = self.recognizer.recognize_sphinx(audio_data)
489
+ elif engine == "wit":
490
+ wit_key = os.getenv('WIT_AI_KEY')
491
+ if not wit_key:
492
+ raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
493
+ transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
494
+ elif engine == "bing":
495
+ bing_key = os.getenv('BING_KEY')
496
+ if not bing_key:
497
+ raise ValueError("BING_KEY environment variable required for Bing engine")
498
+ transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
499
+ else:
500
+ transcript = self.recognizer.recognize_google(audio_data, language=language)
501
+
502
+ return transcript
503
+
504
+ except self.sr.UnknownValueError:
505
+ return "Could not understand the audio - speech was unclear or inaudible"
506
+ except self.sr.RequestError as e:
507
+ return f"Error with speech recognition service: {str(e)}"
508
+ finally:
509
+ if temp_wav_path and os.path.exists(temp_wav_path):
510
+ try:
511
+ os.unlink(temp_wav_path)
512
+ except OSError:
513
+ pass
514
+
515
+ def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
516
+ """
517
+ Internal method required by LangChain BaseTool.
518
+
519
+ Args:
520
+ file_path: Path to the audio file to transcribe
521
+ engine: Speech recognition engine to use
522
+ language: Language of the audio
523
+
524
+ Returns:
525
+ str: Transcribed text from the audio file
526
+ """
527
+ try:
528
+ self._validate_audio_file(file_path)
529
+
530
+ # Use local Whisper if specified
531
+ if engine == "whisper":
532
+ transcript = self._transcribe_with_whisper(file_path, language)
533
+ else:
534
+ # Use speech_recognition library
535
+ transcript = self._transcribe_with_sr(file_path, engine, language)
536
+
537
+ return transcript
538
+
539
+ except Exception as e:
540
+ error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
541
+ print(error_msg)
542
+ return error_msg
543
+
544
+ def run(self, tool_input: Dict[str, Any]) -> str:
545
+ """
546
+ Main method to run the advanced audio transcription tool.
547
+
548
+ Args:
549
+ tool_input: Dictionary containing 'file_path' and optional parameters
550
+
551
+ Returns:
552
+ str: Transcribed text from the audio file
553
+ """
554
+ try:
555
+ file_path = tool_input.get('file_path')
556
+ if not file_path:
557
+ raise ValueError("file_path is required in tool_input")
558
+
559
+ engine = tool_input.get('engine', 'google')
560
+ language = tool_input.get('language', 'en-US')
561
+
562
+ # Call the internal _run method
563
+ return self._run(file_path=file_path, engine=engine, language=language)
564
+
565
+ except Exception as e:
566
+ error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
567
+ print(error_msg)
568
+ return error_msg
569
+
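An illustrative call that routes through the local Whisper path; the file name is a placeholder and assumes openai-whisper (added to requirements.txt in this commit) is installed:

from tools import AdvancedAudioTranscriptionTool

whisper_tool = AdvancedAudioTranscriptionTool()
# engine='whisper' loads the local 'base' model; other engines fall back to speech_recognition.
print(whisper_tool._run(file_path="/tmp/interview.m4a", engine="whisper", language="en"))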
570
 
571
+ class ExcelReaderInput(BaseModel):
572
+ """Input schema for ExcelReaderTool."""
573
+ file_path: str = Field(description="Path to the Excel file to read")
574
+
575
+
576
+ class ExcelReaderTool(BaseTool):
577
+ """Tool for reading Excel files and formatting them for LLM consumption."""
578
+
579
+ name: str = "excel_reader"
580
+ description: str = (
581
+ "Reads an Excel file from the specified file path and returns the entire "
582
+ "Use for running math operations on a table of data"
583
+ "table from Sheet1 in a format that can be easily processed by an LLM. "
584
+ "Input should be a file path to an Excel file (.xlsx or .xls)."
585
+ )
586
+ args_schema: Type[BaseModel] = ExcelReaderInput
587
+
588
+ def _run(self, file_path: str, run_manager: Optional[Any] = None) -> str:
589
+ """
590
+ Execute the tool to read Excel file and return formatted table.
591
+
592
+ Args:
593
+ file_path: Path to the Excel file
594
+ run_manager: Optional callback manager
595
+
596
+ Returns:
597
+ Formatted string representation of the Excel table
598
+ """
599
+ try:
600
+ # Validate file exists
601
+ if not os.path.exists(file_path):
602
+ return f"Error: File not found at path: {file_path}"
603
+
604
+ # Validate file extension
605
+ if not file_path.lower().endswith(('.xlsx', '.xls')):
606
+ return f"Error: File must be an Excel file (.xlsx or .xls). Got: {file_path}"
607
+
608
+ # Read Excel file - specifically Sheet1
609
+ try:
610
+ df = pd.read_excel(file_path, sheet_name='Sheet1')
611
+ except ValueError as e:
612
+ if "Worksheet named 'Sheet1' not found" in str(e):
613
+ # If Sheet1 doesn't exist, try reading the first sheet
614
+ df = pd.read_excel(file_path, sheet_name=0)
615
+ else:
616
+ raise e
617
+
618
+ # Check if dataframe is empty
619
+ if df.empty:
620
+ return "The Excel file contains no data in Sheet1."
621
+
622
+ # Format the table for LLM consumption
623
+ formatted_output = self._format_table_for_llm(df, file_path)
624
+
625
+ return formatted_output
626
+
627
+ except FileNotFoundError:
628
+ return f"Error: File not found at path: {file_path}"
629
+ except PermissionError:
630
+ return f"Error: Permission denied accessing file: {file_path}"
631
+ except Exception as e:
632
+ return f"Error reading Excel file: {str(e)}"
633
+
634
+ def _format_table_for_llm(self, df: pd.DataFrame, file_path: str) -> str:
635
+ """
636
+ Format the pandas DataFrame into a readable string format for LLMs.
637
+
638
+ Args:
639
+ df: The pandas DataFrame containing the Excel data
640
+ file_path: Original file path for reference
641
+
642
+ Returns:
643
+ Formatted string representation of the table
644
+ """
645
+ output_lines = []
646
+
647
+ # Add header information
648
+ #output_lines.append(f"EXCEL FILE DATA FROM: {os.path.basename(file_path)}")
649
+ #output_lines.append(f"Sheet: Sheet1")
650
+ #output_lines.append(f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
651
+ #output_lines.append("-" * 60)
652
+
653
+ # Add column information
654
+ #output_lines.append("COLUMNS:")
655
+ #for i, col in enumerate(df.columns, 1):
656
+ # col_type = str(df[col].dtype)
657
+ # non_null_count = df[col].count()
658
+ # output_lines.append(f" {i}. {col} ({col_type}) - {non_null_count} non-null values")
659
+
660
+ #output_lines.append("-" * 60)
661
+
662
+ # Add table data in a clean format
663
+ output_lines.append("TABLE DATA:")
664
+
665
+ # Convert DataFrame to string with proper formatting
666
+ # Handle potential NaN values and make it readable
667
+ df_clean = df.fillna("N/A") # Replace NaN with readable placeholder
668
+
669
+ # Create a formatted table string
670
+ #table_str = df_clean.to_string(index=True, max_rows=None, max_cols=None)
671
+ #output_lines.append(table_str)
672
+
673
+ # Add summary statistics for numeric columns if they exist
674
+ numeric_cols = df.select_dtypes(include=['number']).columns
675
+
676
+
677
+ sums = df_clean[numeric_cols].sum()
678
+
679
+
680
+ # Step 2: Define which columns are food and which are drink
681
+ food_cols = [col for col in numeric_cols if col.lower() != 'soda']
682
+ drink_cols = [col for col in numeric_cols if col.lower() == 'soda']
683
+
684
+ # Step 3: Aggregate totals
685
+ food_total = sums[food_cols].sum()
686
+ drink_total = sums[drink_cols].sum()
687
+
688
+ # Step 4: Format the results as dollars
689
+ formatted_totals = {
690
+ 'Food': f"${food_total:,.2f}",
691
+ 'Drink': f"${drink_total:,.2f}"
692
+ }
693
+
694
+ # Step 5: Convert to string for display (optional)
695
+ result_string = '\n'.join([f"{k}: {v}" for k, v in formatted_totals.items()])
696
+
697
+ # Convert to string for display
698
+ #result_string = formatted.to_string()
699
+
700
+ output_lines.append(result_string)
701
+ #output_lines.append(df_clean[numeric_cols].sum())
702
+ if len(numeric_cols) > 0:
703
+ output_lines.append("-" * 60)
704
+ #output_lines.append("NUMERIC COLUMN SUMMARY:")
705
+ #for col in numeric_cols:
706
+ # stats = df[col].describe()
707
+ # output_lines.append(f"\n{col}:")
708
+ # output_lines.append(f" Count: {stats['count']}")
709
+ # output_lines.append(f" Mean: {stats['mean']:.2f}")
710
+ # output_lines.append(f" Min: {stats['min']}")
711
+ # output_lines.append(f" Max: {stats['max']}")
712
+
713
+ return "\n".join(output_lines)
714
+
715
+ async def _arun(self, file_path: str, run_manager: Optional[Any] = None) -> str:
716
+ """Async version of the tool (falls back to sync implementation)."""
717
+ return self._run(file_path, run_manager)
718
+
719
+
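A small usage sketch for the Excel reader; the spreadsheet path is hypothetical:

from tools import ExcelReaderTool

excel_tool = ExcelReaderTool()
# Reads Sheet1 (or the first sheet if Sheet1 is absent) and returns a text summary with food/drink totals.
print(excel_tool._run("/tmp/menu_sales.xlsx"))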
720
+
721
+
722
+ class PythonExecutorInput(BaseModel):
723
+ """Input schema for PythonExecutor tool."""
724
+ file_path: str = Field(description="Path to the Python file to execute")
725
+
726
+
727
+ class PythonExecutorTool(BaseTool):
728
+ """Tool that executes a Python file and returns the result."""
729
+
730
+ name: str = "python_executor"
731
+ description: str = "Executes a Python file from the given file path and returns the output"
732
+ args_schema: Type[BaseModel] = PythonExecutorInput
733
+
734
+ def _run(
735
+ self,
736
+ file_path: str,
737
+ run_manager: Optional[Any] = None,
738
+ ) -> str:
739
+ """Execute the Python file and return the result."""
740
+ try:
741
+ # Validate that the file exists
742
+ if not os.path.exists(file_path):
743
+ return f"Error: File '{file_path}' does not exist"
744
+
745
+ # Validate that it's a Python file
746
+ if not file_path.endswith('.py'):
747
+ return f"Error: '{file_path}' is not a Python file (.py extension required)"
748
+
749
+ # Execute the Python file
750
+ result = subprocess.run(
751
+ [sys.executable, file_path],
752
+ capture_output=True,
753
+ text=True,
754
+ timeout=600  # 600 second (10 minute) timeout to prevent hanging
755
+ )
756
+
757
+ # Prepare the output
758
+ output_parts = []
759
+
760
+ if result.stdout:
761
+ output_parts.append(f"STDOUT:\n{result.stdout}")
762
+
763
+ if result.stderr:
764
+ output_parts.append(f"STDERR:\n{result.stderr}")
765
+
766
+ if result.returncode != 0:
767
+ output_parts.append(f"Return code: {result.returncode}")
768
+
769
+ if not output_parts:
770
+ return "Script executed successfully with no output"
771
+
772
+ return "\n\n".join(output_parts)
773
+
774
+ except subprocess.TimeoutExpired:
775
+ return "Error: Script execution timed out (30 seconds)"
776
+ except Exception as e:
777
+ return f"Error executing Python file: {str(e)}"
778
+
779
+ async def _arun(
780
+ self,
781
+ file_path: str,
782
+ run_manager: Optional[Any] = None,
783
+ ) -> str:
784
+ """Async version - delegates to sync implementation."""
785
+ return self._run(file_path, run_manager)
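An illustrative call to the Python executor; the script path is a placeholder:

from tools import PythonExecutorTool

py_tool = PythonExecutorTool()
# Runs the script in a subprocess (600-second timeout) and returns captured stdout/stderr.
print(py_tool._run("/tmp/analysis.py"))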
786
 
787
  class EnhancedDuckDuckGoSearchTool(BaseTool):
788
  name: str = "enhanced_search"
 
1373
  return f"An unexpected error occurred: {str(e)}"
1374
 
1375
 
1376
+
1377
  class EnhancedYoutubeScreenshotQA(BaseTool):
1378
+ name: str = "bird_species_screenshot_qa"
1379
  description: str = (
1380
+ "Use this tool to calculate the number of bird species on camera at any one time,"
 
 
1381
  "Input should be a dict with keys: 'youtube_url', 'question', and optional parameters. "
1382
  #"Optional parameters: 'frame_interval_seconds' (default: 10), 'max_frames' (default: 50), "
1383
  #"'use_scene_detection' (default: True), 'parallel_processing' (default: True). "
 
1413
  def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
1414
  """Get configuration value with fallback to defaults"""
1415
  defaults = {
1416
+ 'frame_interval_seconds': 5,
1417
+ 'max_frames': 500,
1418
  'use_scene_detection': True,
1419
  'resize_frames': True,
1420
  'parallel_processing': True,
 
1439
  "Salesforce/blip-vqa-base"
1440
  ).to(self.device)
1441
 
1442
+ #self.processor_vqa = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
1443
+ #self.model_vqa = BlipForQuestionAnswering.from_pretrained(
1444
+ # "Salesforce/blip-vqa-capfilt-large"
1445
+ #).to(self.device)
1446
+
1447
  print("BLIP VQA model loaded successfully")
1448
  except Exception as e:
1449
  print(f"Error initializing VQA model: {str(e)}")
 
1679
  def _answer_question_on_frame(self, frame_path: str, question: str) -> Tuple[str, float]:
1680
  """Answer question on single frame with confidence scoring"""
1681
  try:
1682
+ #ipdb.set_trace()
1683
  image = Image.open(frame_path).convert('RGB')
1684
  inputs = self.processor_vqa(image, question, return_tensors="pt").to(self.device)
1685
 
 
1996
  def _run(self, youtube_url, question, **kwargs) -> str:
1997
  """Enhanced main execution method"""
1998
  #ipdb.set_trace()
1999
+ question = "How many unique bird species are on camera?"
2000
 
2001
  #input_data = query
2002
  #youtube_url = input_data.get("youtube_url")
 
2035
 
2036
  # Format comprehensive result - Fixed the reference to stats
2037
  result = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
 
2039
  📊 **STATISTICAL SUMMARY**:
2040
  • Minimum: {analysis_result['statistical_summary']['minimum']:.2f}
 
2043
  • Median: {analysis_result['statistical_summary']['median']:.2f}
2044
  • Range: {analysis_result['statistical_summary']['range']:.2f}
2045
 
 
 
 
 
2046
  """.strip()
2047
 
2048
  return result
 
2055
  def create_enhanced_youtube_qa_tool(**kwargs):
2056
  """Factory function to create the enhanced tool with custom parameters"""
2057
  return EnhancedYoutubeScreenshotQA(**kwargs)
2058
 
2059
 
2060
  class YouTubeTranscriptExtractor(BaseTool):
2061
  name: str = "youtube_transcript_extractor"
2062
  description: str = (
2063
  "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
2064
+ #"Use this tool for AUDIO questions, when the youtube question involves what a person says,"
2065
+ "Use this tool for questions like 'what does jim say in response to a question in this video',"
2066
  "Input should be a dict with keys: 'youtube_url' and optional parameters. "
2067
+ #"Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
2068
+ #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
2069
+ #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
2070
  "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
2071
  )
2072
 
 
2834
  """Factory function to create the transcript extraction tool with custom parameters"""
2835
  return YouTubeTranscriptExtractor(**kwargs)
2836
 
 
 
2837
  # --- Model Configuration ---
2838
  def create_llm_pipeline():
2839
  #model_id = "meta-llama/Llama-2-13b-chat-hf"
 
3585
 
3586
  # --- Run the Agent ---
3587
  # Enhanced system prompt for better behavior
 
3588
  def run_agent(agent, state: AgentState):
3589
  """Enhanced agent initialization with better prompt and hallucination prevention."""
3590
+ global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL, tools
3591
 
3592
  # Initialize tools
3593
  WIKIPEDIA_TOOL = WikipediaSearchToolWithFAISS()
3594
+ SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=8000)
3595
  YOUTUBE_TOOL = EnhancedYoutubeScreenshotQA()
3596
  YOUTUBE_AUDIO_TOOL = YouTubeTranscriptExtractor()
3597
+ AUDIO_TRANSCRIPTION_TOOL = AudioTranscriptionTool()
3598
+ EXCEL_TOOL = ExcelReaderTool()
3599
+ PYTHON_TOOL = PythonExecutorTool()
3600
+ tools = [WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_AUDIO_TOOL, YOUTUBE_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL]
3601
 
3602
  formatted_tools_description = render_text_description(tools)
3603
  current_date_str = datetime.now().strftime("%Y-%m-%d")
 
3613
  3. Use tools ONLY when you need specific information you don't know
3614
  4. After using a tool, provide your FINAL ANSWER immediately
3615
  5. STOP after giving your FINAL ANSWER - do not continue
3616
+ 6. Do not repeat words in the question in the answer
3617
 
3618
  FORMAT for tool use:
3619
  Thought: <brief reasoning>
 
3625
 
3626
  ANSWER FORMAT:
3627
  - Numbers: no commas, no units unless specified
3628
+ - Questions on "how many" should be answered with a number ONLY
3629
  - Strings: no articles, no abbreviations, digits in plain text
3630
+ - Lists: comma-separated either in ascending numeric order or alphabetical order as requested
3631
  - Be extremely brief and concise
3632
  - Do not provide additional context or explanations
3633
  - Do not provide parentheticals
3634
 
3635
+
3636
+
3637
  IMPORTANT: You are responding to ONE question only. Do not ask follow-up questions or generate additional dialogue.
3638
 
3639
  Current date: {current_date_str}
 
3660
 
3661
  # Cleanup
3662
  if result.get("done"):
3663
+ torch.cuda.empty_cache()
3664
+ torch.cuda.ipc_collect()
3665
  gc.collect()
3666
  print("🧹 Released GPU memory after completion")
3667
 
3668
  return result["messages"]
3669
+