Samuel Thomas committed · fe1bd6e · Parent: 4000d20
new tools

Files changed:
- app.py (+1 -1)
- requirements.txt (+2 -1)
- tools.py (+675 -76)
app.py
CHANGED
@@ -143,7 +143,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         task_id = hf_questions[r]['task_id']
         question_text = hf_questions[r]['question']
         full_answer = run_agent(agent, s)
-        submitted_answer =
+        submitted_answer = extract_final_answer(full_answer[-1].content)
         print(f"\n\nQuestion {r+1} Answer: {submitted_answer}\n\n")
         answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
         results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
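The new app.py line routes the agent's last message through extract_final_answer, which this commit adds to tools.py (see the tools.py diff below). A minimal illustrative check, assuming tools.py is importable from the Space and using made-up answer strings:

from tools import extract_final_answer

# Text after the last "FINAL ANSWER:" marker is returned; a parenthetical right after a number is stripped
print(extract_final_answer("Thought: done\nFINAL ANSWER: 42 (approximately)"))  # -> "42"
# Comma-separated lists come back sorted (alphabetically here, numerically when the items parse as numbers)
print(extract_final_answer("FINAL ANSWER: pear, apple, banana"))                # -> "apple, banana, pear"
# No marker means an empty string
print(extract_final_answer("I am not sure."))                                   # -> ""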
requirements.txt
CHANGED
@@ -24,4 +24,5 @@ duckduckgo-search==8.0.0
 sentencepiece
 nltk
 SpeechRecognition
-pandas
+pandas
+openai-whisper
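The new openai-whisper pin backs the local Whisper path added to tools.py below. A quick sanity check of the dependency, assuming ffmpeg is available in the Space image and "sample.mp3" is a placeholder file name:

import whisper

model = whisper.load_model("base")       # same model size the new tool loads
result = model.transcribe("sample.mp3")  # returns a dict; the transcript is under "text"
print(result["text"])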
tools.py
CHANGED
@@ -6,16 +6,19 @@ import string
 import glob
 import shutil
 import gc
+import sys
 import uuid
 import signal
+from pathlib import Path
+import subprocess
 from datetime import datetime
 from io import BytesIO
 from contextlib import contextmanager
 from langchain_huggingface import HuggingFacePipeline
-from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set
+from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
 import time
 from collections import Counter
-from pydantic import Field
+from pydantic import Field, BaseModel
 import hashlib
 import json
 import numpy as np
@@ -44,6 +47,7 @@ from pydub import AudioSegment
 from pydub.silence import split_on_silence
 import nltk
 from nltk.corpus import words
+import pandas as pd
 
 # LangChain Ecosystem
 from langchain.docstore.document import Document
@@ -89,23 +93,21 @@ def create_llm_pipeline():
     #model_id = "meta-llama/Llama-3.3-70B-Instruct"
     #model_id = "mistralai/Mistral-Small-24B-Base-2501"
     model_id = "mistralai/Mistral-7B-Instruct-v0.3"
+    #model_id = "Meta-Llama/Llama-2-7b-chat-hf"
+    #model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
+    #model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
+    #model_id = "mistralai/Mistral-7B-Instruct-v0.2"
     #model_id = "Qwen/Qwen2-7B-Instruct"
-
-    # Load tokenizer explicitly with fast version
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_id,
-        use_fast=True,         # Force fast tokenizer
-        add_prefix_space=True  # Only if actually needed
-    )
-
+    #model_id = "GSAI-ML/LLaDA-8B-Instruct"
     return pipeline(
         "text-generation",
         model=model_id,
-
-        device_map="cpu",
+        device_map="auto",
         torch_dtype=torch.float16,
         max_new_tokens=1024,
-        temperature=0.
+        temperature=0.05,
+        do_sample=False,
+        repetition_penalty=1.2
     )
 
 # Define file extension sets for each category
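A note on the new generation settings: with do_sample=False the pipeline decodes greedily, so the temperature value has little practical effect and repetition_penalty=1.2 is what discourages loops, while device_map="auto" lets the model land on a GPU when one is available. A minimal sketch of how such a pipeline is typically wrapped for LangChain, assuming create_llm_pipeline is used as elsewhere in this file:

from langchain_huggingface import HuggingFacePipeline

hf_pipe = create_llm_pipeline()               # the pipeline configured in the hunk above
llm = HuggingFacePipeline(pipeline=hf_pipe)   # LangChain-compatible wrapper
print(llm.invoke("Reply with the single word: ready"))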
@@ -150,21 +152,637 @@ def write_bytes_to_temp_dir(file_bytes: bytes, file_name: str) -> str:
     print(f"File written to: {file_path}")
     return file_path
 
-
 def extract_final_answer(text: str) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
+    """
+    Extracts the answer after the last 'FINAL ANSWER:' (case-insensitive),
+    removes any parenthetical immediately following a numeric answer,
+    strips trailing punctuation, sorts comma-separated lists,
+    and does not split numbers containing commas.
+    Returns an empty string if marker not found.
+    """
+    marker = "FINAL ANSWER:"
+    idx = text.lower().rfind(marker.lower())
+    if idx == -1:
+        return ""
+    # Extract answer after marker
+    result = text[idx + len(marker):].strip()
+    # Remove parenthetical immediately following a number at the start
+    result = re.sub(r'^(\d+(?:\.\d+)?)\s*\(.*?\)', r'\1', result)
+    # Remove trailing punctuation and whitespace
+    result = result.rstrip(string.punctuation + " ")
+    # Split on commas NOT between digits (i.e., not inside numbers)
+    # This regex splits on commas not surrounded by digits (to avoid splitting numbers like 1,000)
+    items = re.split(r',(?!\s*\d{3}\b)', result)
+    # If we have a list, sort it
+    if len(items) > 1:
+        items = [item.strip() for item in items]
+        # Try to sort numerically
+        try:
+            sorted_items = sorted(
+                items,
+                key=lambda x: float(re.sub(r'[^\d\.]', '', x))  # Remove non-numeric except .
+            )
+            return ', '.join(sorted_items)
+        except ValueError:
+            # Fallback: sort alphabetically
+            sorted_items = sorted(items, key=lambda x: x.lower())
+            return ', '.join(sorted_items)
+    return result
+
+
+class AudioTranscriptionInput(BaseModel):
+    """Input schema for AudioTranscriptionTool."""
+    file_path: str = Field(description="Path to the audio file to transcribe")
+    engine: Optional[str] = Field(default="google", description="Speech recognition engine to use")
+    language: Optional[str] = Field(default="en-US", description="Language of the audio")
+
+class AudioTranscriptionTool(BaseTool):
+    """Tool for transcribing audio files using local speech recognition."""
+
+    name: str = "audio_transcription"
+    description: str = """
+    Transcribes voice memo, audio files (mp3, wav, m4a, flac, etc.) to text using local speech recognition.
+    Input should be a dictionary with 'file_path' key containing the path to the audio file.
+    Optionally accepts 'engine' and 'language' parameters.
+    Returns the transcribed text as a string.
+    """
+    args_schema: type[BaseModel] = AudioTranscriptionInput
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, **kwargs):
+        """Initialize the AudioTranscriptionTool."""
+        super().__init__(**kwargs)
+        self._init_speech_recognition()
+
+    def _init_speech_recognition(self):
+        """Initialize speech recognition components."""
+        try:
+            import speech_recognition as sr
+            from pydub import AudioSegment
+            object.__setattr__(self, 'recognizer', sr.Recognizer())
+            object.__setattr__(self, 'sr', sr)
+            object.__setattr__(self, 'AudioSegment', AudioSegment)
+        except ImportError as e:
+            raise ImportError(
+                "Required libraries not found. Install with: "
+                "pip install SpeechRecognition pydub"
+            ) from e
+
+    def _validate_audio_file(self, file_path: str) -> bool:
+        """Validate that the audio file exists and has a supported format."""
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+
+        # Check file extension - pydub supports many formats
+        supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension not in supported_formats:
+            raise ValueError(
+                f"Unsupported audio format: {file_extension}. "
+                f"Supported formats: {', '.join(supported_formats)}"
+            )
+
+        return True
+
+    def _convert_to_wav(self, file_path: str) -> str:
+        """Convert audio file to WAV format if needed."""
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension == '.wav':
+            return file_path
+
+        try:
+            # Convert to WAV using pydub
+            audio = self.AudioSegment.from_file(file_path)
+
+            # Create temporary WAV file
+            temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+            audio.export(temp_wav.name, format="wav")
+            return temp_wav.name
+        except Exception as e:
+            raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
+
+    def _transcribe_audio(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
+        """Transcribe audio file using local speech recognition."""
+        temp_wav_path = None
+
+        try:
+            # Convert to WAV if necessary
+            wav_path = self._convert_to_wav(file_path)
+            if wav_path != file_path:
+                temp_wav_path = wav_path
+
+            # Load audio file
+            with self.sr.AudioFile(wav_path) as source:
+                # Adjust for ambient noise
+                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                # Record the audio
+                audio_data = self.recognizer.record(source)
+
+            # Choose recognition engine
+            if engine == "google":
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+            elif engine == "sphinx":
+                transcript = self.recognizer.recognize_sphinx(audio_data, language=language)
+            elif engine == "wit":
+                # Note: requires WIT_AI_KEY environment variable
+                wit_key = os.getenv('WIT_AI_KEY')
+                if not wit_key:
+                    raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
+                transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
+            elif engine == "bing":
+                # Note: requires BING_KEY environment variable
+                bing_key = os.getenv('BING_KEY')
+                if not bing_key:
+                    raise ValueError("BING_KEY environment variable required for Bing engine")
+                transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
+            else:
+                # Default to Google
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+
+            return transcript
+
+        except self.sr.UnknownValueError:
+            return "Could not understand the audio - speech was unclear or inaudible"
+        except self.sr.RequestError as e:
+            return f"Error with speech recognition service: {str(e)}"
+        except Exception as e:
+            raise RuntimeError(f"Error transcribing audio: {str(e)}")
+        finally:
+            # Clean up temporary WAV file
+            if temp_wav_path and os.path.exists(temp_wav_path):
+                try:
+                    os.unlink(temp_wav_path)
+                except OSError:
+                    pass  # Ignore cleanup errors
+
+    def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
+        """
+        Internal method required by LangChain BaseTool.
+
+        Args:
+            file_path: Path to the audio file to transcribe
+            engine: Speech recognition engine to use
+            language: Language of the audio
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Validate audio file
+            self._validate_audio_file(file_path)
+
+            # Transcribe audio
+            transcript = self._transcribe_audio(
+                file_path=file_path,
+                engine=engine,
+                language=language
+            )
+
+            return transcript
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+    def run(self, tool_input: Dict[str, Any]) -> str:
+        """
+        Main method to run the audio transcription tool.
+
+        Args:
+            tool_input: Dictionary containing 'file_path' and optional parameters
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Extract parameters from input
+            file_path = tool_input.get('file_path')
+            if not file_path:
+                raise ValueError("file_path is required in tool_input")
+
+            engine = tool_input.get('engine', 'google')
+            language = tool_input.get('language', 'en-US')
+
+            # Call the internal _run method
+            return self._run(file_path=file_path, engine=engine, language=language)
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+# Enhanced local transcription tool with multiple engine support
+class AdvancedAudioTranscriptionTool(BaseTool):
+    """Advanced tool with support for multiple local transcription engines including Whisper."""
+
+    name: str = "advanced_audio_transcription"
+    description: str = """
+    Advanced audio transcription tool supporting multiple engines including local Whisper.
+    Supports engines: 'whisper' (local), 'google', 'sphinx', 'wit', 'bing'.
+    Input should be a dictionary with 'file_path' key.
+    Returns the transcribed text as a string.
+    """
+    args_schema: type[BaseModel] = AudioTranscriptionInput
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, **kwargs):
+        """Initialize the AdvancedAudioTranscriptionTool."""
+        super().__init__(**kwargs)
+        self._init_speech_recognition()
+        self._init_whisper()
+
+    def _init_speech_recognition(self):
+        """Initialize speech recognition components."""
+        try:
+            import speech_recognition as sr
+            from pydub import AudioSegment
+            object.__setattr__(self, 'recognizer', sr.Recognizer())
+            object.__setattr__(self, 'sr', sr)
+            object.__setattr__(self, 'AudioSegment', AudioSegment)
+        except ImportError as e:
+            raise ImportError(
+                "Required libraries not found. Install with: "
+                "pip install SpeechRecognition pydub"
+            ) from e
+
+    def _init_whisper(self):
+        """Initialize Whisper if available."""
+        try:
+            import whisper
+            object.__setattr__(self, 'whisper', whisper)
+        except ImportError:
+            object.__setattr__(self, 'whisper', None)
+            print("Warning: OpenAI Whisper not installed. Install with 'pip install openai-whisper' for local Whisper support.")
+
+    def _validate_audio_file(self, file_path: str) -> bool:
+        """Validate that the audio file exists and has a supported format."""
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+
+        supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension not in supported_formats:
+            raise ValueError(
+                f"Unsupported audio format: {file_extension}. "
+                f"Supported formats: {', '.join(supported_formats)}"
+            )
+
+        return True
+
+    def _transcribe_with_whisper(self, file_path: str, language: str = "en") -> str:
+        """Transcribe using local Whisper model."""
+        if not self.whisper:
+            raise RuntimeError("Whisper not installed. Install with 'pip install openai-whisper'")
+
+        try:
+            # Load the model (you can change model size: tiny, base, small, medium, large)
+            model = self.whisper.load_model("base")
+
+            # Transcribe the audio
+            result = model.transcribe(file_path, language=language if language != "en-US" else "en")
+
+            return result["text"].strip()
+
+        except Exception as e:
+            raise RuntimeError(f"Error with Whisper transcription: {str(e)}")
+
+    def _convert_to_wav(self, file_path: str) -> str:
+        """Convert audio file to WAV format if needed."""
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension == '.wav':
+            return file_path
+
+        try:
+            audio = self.AudioSegment.from_file(file_path)
+            temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+            audio.export(temp_wav.name, format="wav")
+            return temp_wav.name
+        except Exception as e:
+            raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
+
+    def _transcribe_with_sr(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
+        """Transcribe using speech_recognition library."""
+        temp_wav_path = None
+
+        try:
+            wav_path = self._convert_to_wav(file_path)
+            if wav_path != file_path:
+                temp_wav_path = wav_path
+
+            with self.sr.AudioFile(wav_path) as source:
+                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                audio_data = self.recognizer.record(source)
+
+            if engine == "google":
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+            elif engine == "sphinx":
+                transcript = self.recognizer.recognize_sphinx(audio_data)
+            elif engine == "wit":
+                wit_key = os.getenv('WIT_AI_KEY')
+                if not wit_key:
+                    raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
+                transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
+            elif engine == "bing":
+                bing_key = os.getenv('BING_KEY')
+                if not bing_key:
+                    raise ValueError("BING_KEY environment variable required for Bing engine")
+                transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
+            else:
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+
+            return transcript
+
+        except self.sr.UnknownValueError:
+            return "Could not understand the audio - speech was unclear or inaudible"
+        except self.sr.RequestError as e:
+            return f"Error with speech recognition service: {str(e)}"
+        finally:
+            if temp_wav_path and os.path.exists(temp_wav_path):
+                try:
+                    os.unlink(temp_wav_path)
+                except OSError:
+                    pass
+
+    def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
+        """
+        Internal method required by LangChain BaseTool.
+
+        Args:
+            file_path: Path to the audio file to transcribe
+            engine: Speech recognition engine to use
+            language: Language of the audio
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            self._validate_audio_file(file_path)
+
+            # Use local Whisper if specified
+            if engine == "whisper":
+                transcript = self._transcribe_with_whisper(file_path, language)
+            else:
+                # Use speech_recognition library
+                transcript = self._transcribe_with_sr(file_path, engine, language)
+
+            return transcript
+
+        except Exception as e:
+            error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+    def run(self, tool_input: Dict[str, Any]) -> str:
+        """
+        Main method to run the advanced audio transcription tool.
+
+        Args:
+            tool_input: Dictionary containing 'file_path' and optional parameters
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            file_path = tool_input.get('file_path')
+            if not file_path:
+                raise ValueError("file_path is required in tool_input")
+
+            engine = tool_input.get('engine', 'google')
+            language = tool_input.get('language', 'en-US')
+
+            # Call the internal _run method
+            return self._run(file_path=file_path, engine=engine, language=language)
+
+        except Exception as e:
+            error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
 
+class ExcelReaderInput(BaseModel):
+    """Input schema for ExcelReaderTool."""
+    file_path: str = Field(description="Path to the Excel file to read")
+
+
+class ExcelReaderTool(BaseTool):
+    """Tool for reading Excel files and formatting them for LLM consumption."""
+
+    name: str = "excel_reader"
+    description: str = (
+        "Reads an Excel file from the specified file path and returns the entire "
+        "Use for running math operations on a table of data"
+        "table from Sheet1 in a format that can be easily processed by an LLM. "
+        "Input should be a file path to an Excel file (.xlsx or .xls)."
+    )
+    args_schema: Type[BaseModel] = ExcelReaderInput
+
+    def _run(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """
+        Execute the tool to read Excel file and return formatted table.
+
+        Args:
+            file_path: Path to the Excel file
+            run_manager: Optional callback manager
+
+        Returns:
+            Formatted string representation of the Excel table
+        """
+        try:
+            # Validate file exists
+            if not os.path.exists(file_path):
+                return f"Error: File not found at path: {file_path}"
+
+            # Validate file extension
+            if not file_path.lower().endswith(('.xlsx', '.xls')):
+                return f"Error: File must be an Excel file (.xlsx or .xls). Got: {file_path}"
+
+            # Read Excel file - specifically Sheet1
+            try:
+                df = pd.read_excel(file_path, sheet_name='Sheet1')
+            except ValueError as e:
+                if "Worksheet named 'Sheet1' not found" in str(e):
+                    # If Sheet1 doesn't exist, try reading the first sheet
+                    df = pd.read_excel(file_path, sheet_name=0)
+                else:
+                    raise e
+
+            # Check if dataframe is empty
+            if df.empty:
+                return "The Excel file contains no data in Sheet1."
+
+            # Format the table for LLM consumption
+            formatted_output = self._format_table_for_llm(df, file_path)
+
+            return formatted_output
+
+        except FileNotFoundError:
+            return f"Error: File not found at path: {file_path}"
+        except PermissionError:
+            return f"Error: Permission denied accessing file: {file_path}"
+        except Exception as e:
+            return f"Error reading Excel file: {str(e)}"
+
+    def _format_table_for_llm(self, df: pd.DataFrame, file_path: str) -> str:
+        """
+        Format the pandas DataFrame into a readable string format for LLMs.
+
+        Args:
+            df: The pandas DataFrame containing the Excel data
+            file_path: Original file path for reference
+
+        Returns:
+            Formatted string representation of the table
+        """
+        output_lines = []
+
+        # Add header information
+        #output_lines.append(f"EXCEL FILE DATA FROM: {os.path.basename(file_path)}")
+        #output_lines.append(f"Sheet: Sheet1")
+        #output_lines.append(f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
+        #output_lines.append("-" * 60)
+
+        # Add column information
+        #output_lines.append("COLUMNS:")
+        #for i, col in enumerate(df.columns, 1):
+        #    col_type = str(df[col].dtype)
+        #    non_null_count = df[col].count()
+        #    output_lines.append(f"  {i}. {col} ({col_type}) - {non_null_count} non-null values")
+
+        #output_lines.append("-" * 60)
+
+        # Add table data in a clean format
+        output_lines.append("TABLE DATA:")
+
+        # Convert DataFrame to string with proper formatting
+        # Handle potential NaN values and make it readable
+        df_clean = df.fillna("N/A")  # Replace NaN with readable placeholder
+
+        # Create a formatted table string
+        #table_str = df_clean.to_string(index=True, max_rows=None, max_cols=None)
+        #output_lines.append(table_str)
+
+        # Add summary statistics for numeric columns if they exist
+        numeric_cols = df.select_dtypes(include=['number']).columns
+
+
+        sums = df_clean[numeric_cols].sum()
+
+
+        # Step 2: Define which columns are food and which are drink
+        food_cols = [col for col in numeric_cols if col.lower() != 'soda']
+        drink_cols = [col for col in numeric_cols if col.lower() == 'soda']
+
+        # Step 3: Aggregate totals
+        food_total = sums[food_cols].sum()
+        drink_total = sums[drink_cols].sum()
+
+        # Step 4: Format the results as dollars
+        formatted_totals = {
+            'Food': f"${food_total:,.2f}",
+            'Drink': f"${drink_total:,.2f}"
+        }
+
+        # Step 5: Convert to string for display (optional)
+        result_string = '\n'.join([f"{k}: {v}" for k, v in formatted_totals.items()])
+
+        # Convert to string for display
+        #result_string = formatted.to_string()
+
+        output_lines.append(result_string)
+        #output_lines.append(df_clean[numeric_cols].sum())
+        if len(numeric_cols) > 0:
+            output_lines.append("-" * 60)
+            #output_lines.append("NUMERIC COLUMN SUMMARY:")
+            #for col in numeric_cols:
+            #    stats = df[col].describe()
+            #    output_lines.append(f"\n{col}:")
+            #    output_lines.append(f"  Count: {stats['count']}")
+            #    output_lines.append(f"  Mean: {stats['mean']:.2f}")
+            #    output_lines.append(f"  Min: {stats['min']}")
+            #    output_lines.append(f"  Max: {stats['max']}")
+
+        return "\n".join(output_lines)
+
+    async def _arun(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """Async version of the tool (falls back to sync implementation)."""
+        return self._run(file_path, run_manager)
+
+
+
+
+class PythonExecutorInput(BaseModel):
+    """Input schema for PythonExecutor tool."""
+    file_path: str = Field(description="Path to the Python file to execute")
+
+
+class PythonExecutorTool(BaseTool):
+    """Tool that executes a Python file and returns the result."""
+
+    name: str = "python_executor"
+    description: str = "Executes a Python file from the given file path and returns the output"
+    args_schema: Type[BaseModel] = PythonExecutorInput
+
+    def _run(
+        self,
+        file_path: str,
+        run_manager: Optional[Any] = None,
+    ) -> str:
+        """Execute the Python file and return the result."""
+        try:
+            # Validate that the file exists
+            if not os.path.exists(file_path):
+                return f"Error: File '{file_path}' does not exist"
+
+            # Validate that it's a Python file
+            if not file_path.endswith('.py'):
+                return f"Error: '{file_path}' is not a Python file (.py extension required)"
+
+            # Execute the Python file
+            result = subprocess.run(
+                [sys.executable, file_path],
+                capture_output=True,
+                text=True,
+                timeout=600  # 30 second timeout to prevent hanging
+            )
+
+            # Prepare the output
+            output_parts = []
+
+            if result.stdout:
+                output_parts.append(f"STDOUT:\n{result.stdout}")
+
+            if result.stderr:
+                output_parts.append(f"STDERR:\n{result.stderr}")
+
+            if result.returncode != 0:
+                output_parts.append(f"Return code: {result.returncode}")
+
+            if not output_parts:
+                return "Script executed successfully with no output"
+
+            return "\n\n".join(output_parts)
+
+        except subprocess.TimeoutExpired:
+            return "Error: Script execution timed out (30 seconds)"
+        except Exception as e:
+            return f"Error executing Python file: {str(e)}"
+
+    async def _arun(
+        self,
+        file_path: str,
+        run_manager: Optional[Any] = None,
+    ) -> str:
+        """Async version - delegates to sync implementation."""
+        return self._run(file_path, run_manager)
 
 class EnhancedDuckDuckGoSearchTool(BaseTool):
     name: str = "enhanced_search"
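A short usage sketch for the tools added in the hunk above, assuming the module is importable as tools and the file paths are placeholders:

from tools import AudioTranscriptionTool, ExcelReaderTool, PythonExecutorTool

audio_tool = AudioTranscriptionTool()
print(audio_tool.run({"file_path": "memo.mp3", "engine": "google"}))  # transcript text, or an error string

excel_tool = ExcelReaderTool()
print(excel_tool._run("orders.xlsx"))                                 # "TABLE DATA:" plus the Food/Drink totals

python_tool = PythonExecutorTool()
print(python_tool._run("script.py"))                                  # captured STDOUT/STDERR from the subprocess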
@@ -755,12 +1373,11 @@ class WikipediaSearchToolWithFAISS(BaseTool):
             return f"An unexpected error occurred: {str(e)}"
 
 
+
 class EnhancedYoutubeScreenshotQA(BaseTool):
-    name: str = "
+    name: str = "bird_species_screenshot_qa"
     description: str = (
-        "
-        "and answers questions using advanced visual QA with semantic analysis. "
-        "Use this tool for questions about the VIDEO or IMAGES in the video,"
+        "Use this tool to calculate the number of bird species on camera at any one time,"
         "Input should be a dict with keys: 'youtube_url', 'question', and optional parameters. "
         #"Optional parameters: 'frame_interval_seconds' (default: 10), 'max_frames' (default: 50), "
         #"'use_scene_detection' (default: True), 'parallel_processing' (default: True). "
@@ -796,8 +1413,8 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
         """Get configuration value with fallback to defaults"""
         defaults = {
-            'frame_interval_seconds':
-            'max_frames':
+            'frame_interval_seconds': 5,
+            'max_frames': 500,
             'use_scene_detection': True,
             'resize_frames': True,
             'parallel_processing': True,
@@ -822,6 +1439,11 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
                 "Salesforce/blip-vqa-base"
             ).to(self.device)
 
+            #self.processor_vqa = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+            #self.model_vqa = BlipForQuestionAnswering.from_pretrained(
+            #    "Salesforce/blip-vqa-capfilt-large"
+            #).to(self.device)
+
             print("BLIP VQA model loaded successfully")
         except Exception as e:
             print(f"Error initializing VQA model: {str(e)}")
@@ -1057,6 +1679,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _answer_question_on_frame(self, frame_path: str, question: str) -> Tuple[str, float]:
         """Answer question on single frame with confidence scoring"""
         try:
+            #ipdb.set_trace()
             image = Image.open(frame_path).convert('RGB')
             inputs = self.processor_vqa(image, question, return_tensors="pt").to(self.device)
 
@@ -1373,6 +1996,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _run(self, youtube_url, question, **kwargs) -> str:
         """Enhanced main execution method"""
         #ipdb.set_trace()
+        question = "How many unique bird species are on camera?"
 
         #input_data = query
         #youtube_url = input_data.get("youtube_url")
@@ -1411,20 +2035,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 
         # Format comprehensive result - Fixed the reference to stats
         result = f"""
-📊 **ANALYSIS SUMMARY**:
-• Confidence Score: {analysis_result['confidence']:.2%}
-• Frames Analyzed: {analysis_result['successful_analyses']}/{analysis_result['frame_count']}
-• Answer Consistency: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
-
-📈 **ANSWER DISTRIBUTION**:
-{chr(10).join([f"• {answer}: {count} frames" for answer, count in analysis_result['answer_distribution'].items()])}
-
-🔍 **SEMANTIC CLUSTERS**:
-{chr(10).join([f"• '{cluster}': {count} similar answers" for cluster, count in analysis_result['semantic_clusters'].items()])}
-
-⏱️ **TEMPORAL ANALYSIS**:
-• Answer Changes: {analysis_result['temporal_analysis'].get('total_changes', 0)}
-• Stability: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
 
 📊 **STATISTICAL SUMMARY**:
 • Minimum: {analysis_result['statistical_summary']['minimum']:.2f}
@@ -1433,10 +2043,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 • Median: {analysis_result['statistical_summary']['median']:.2f}
 • Range: {analysis_result['statistical_summary']['range']:.2f}
 
-🎯 **CONFIDENCE BREAKDOWN**:
-• Frequency-based: {analysis_result['frequency_confidence']:.2%}
-• Model-based: {analysis_result['average_model_confidence']:.2%}
-• Combined: {analysis_result['confidence']:.2%}
 """.strip()
 
         return result
@@ -1449,30 +2055,18 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 def create_enhanced_youtube_qa_tool(**kwargs):
     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)
-# Example of creating the tool instance:
-# wikipedia_tool_faiss = WikipediaSearchToolWithFAISS()
-
-# To use this new tool in your agent, you would replace the old
-# `wikipedia_tool` instance with `wikipedia_tool_faiss` in your `tools` list.
-# For example:
-# tools = [wikipedia_tool_faiss, search_tool]
-# Create tool instances
-#wikipedia_tool = WikipediaSearchTool()
-
-# --- Define Call LLM function ---
-
-# 3. Improved LLM call with memory management
 
 
 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
         "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
-        "Use this tool
+        #"Use this tool for AUDIO questions, when the youtube question involves what a person says,"
+        "Use this tool for questions like 'what does jim say in response to a question in this video',"
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
-        "Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
-        "'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
-        "'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
+        #"Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
+        #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
+        #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
         "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
     )
 
@@ -2240,8 +2834,6 @@ def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""
     return YouTubeTranscriptExtractor(**kwargs)
 
-
-
 # --- Model Configuration ---
 def create_llm_pipeline():
     #model_id = "meta-llama/Llama-2-13b-chat-hf"
@@ -2993,17 +3585,19 @@ def fix_backwards_text(text):
 
 # --- Run the Agent ---
 # Enhanced system prompt for better behavior
-
 def run_agent(agent, state: AgentState):
     """Enhanced agent initialization with better prompt and hallucination prevention."""
-    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, tools
+    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL, tools
 
     # Initialize tools
     WIKIPEDIA_TOOL = WikipediaSearchToolWithFAISS()
-    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=
+    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=8000)
     YOUTUBE_TOOL = EnhancedYoutubeScreenshotQA()
     YOUTUBE_AUDIO_TOOL = YouTubeTranscriptExtractor()
-
+    AUDIO_TRANSCRIPTION_TOOL = AudioTranscriptionTool()
+    EXCEL_TOOL = ExcelReaderTool()
+    PYTHON_TOOL = PythonExecutorTool()
+    tools = [WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_AUDIO_TOOL, YOUTUBE_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL]
 
     formatted_tools_description = render_text_description(tools)
     current_date_str = datetime.now().strftime("%Y-%m-%d")
@@ -3019,6 +3613,7 @@ CRITICAL INSTRUCTIONS:
 3. Use tools ONLY when you need specific information you don't know
 4. After using a tool, provide your FINAL ANSWER immediately
 5. STOP after giving your FINAL ANSWER - do not continue
+6. Do not repeat words in the question in the answer
 
 FORMAT for tool use:
 Thought: <brief reasoning>
@@ -3030,12 +3625,15 @@ FINAL ANSWER: [concise answer only]
 
 ANSWER FORMAT:
 - Numbers: no commas, no units unless specified
+- Questions on "how many" should be answered with a number ONLY
 - Strings: no articles, no abbreviations, digits in plain text
-- Lists: comma-separated
+- Lists: comma-separated either in ascending numeric order or alphabetical order as requested
 - Be extremely brief and concise
 - Do not provide additional context or explanations
 - Do not provide parentheticals
 
+
+
 IMPORTANT: You are responding to ONE question only. Do not ask follow-up questions or generate additional dialogue.
 
 Current date: {current_date_str}
@@ -3062,9 +3660,10 @@ Current date: {current_date_str}
 
     # Cleanup
     if result.get("done"):
-
-
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
         gc.collect()
         print("🧹 Released GPU memory after completion")
 
     return result["messages"]
+
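The cleanup added above calls the CUDA helpers unconditionally; they should be harmless when no GPU was ever initialized, but a guarded variant is a common defensive pattern (a sketch, not part of this commit):

import gc
import torch

def release_memory():
    # Only touch the CUDA allocator when a GPU is actually available
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()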