Enhance agent.py with image data extraction and YouTube transcript retrieval tools; update .gitignore to exclude the new test files and add the yt_dlp dependency to requirements.txt.
Files changed:
- .gitignore (+5 −1)
- agent.py (+150 −4)
- app.py (+7 −0)
- requirements.txt (+1 −1)
.gitignore
CHANGED
@@ -12,4 +12,8 @@ TEST_SET/
 test_results/
 
 #cursor
-.cursor/
+.cursor/
+
+#test
+test.py
+test_youtube.py
agent.py
CHANGED
@@ -15,7 +15,11 @@ import requests
 import json
 import time
 from daytona_sdk import Daytona, DaytonaConfig
-
+import yt_dlp
+import io
+import os
+import tempfile
+from pathlib import Path
 
 
 # Load environment variables
@@ -485,6 +489,69 @@ def extract_document_data(input_method: str, files: list, prompt: str, json_mode
     except Exception as e:
         return f"Error extracting document data: {str(e)}"
 
+@tool
+def extract_image_data(input_method: str, images: list, prompt: str, json_mode: bool = False) -> str:
+    """Extract visual information from images using Dumpling AI.
+
+    This tool allows you to extract detailed descriptions or specific information from images
+    using vision-capable Large Language Models (LLMs). It can identify objects, scenes, text,
+    and other visual elements based on your specific prompt.
+
+    Parameters:
+    - input_method: How to input images, either "url" or "base64"
+    - images: List of image URLs or base64-encoded strings depending on input_method
+    - prompt: Specific instructions for what information to extract from the image
+    - json_mode: Whether to return structured JSON (true) or free text (false)
+
+    Returns:
+    - Extracted visual data from the image based on your prompt
+    """
+    api_key = os.getenv("DUMPLING_API_KEY")
+    if not api_key:
+        return "Error: DUMPLING_API_KEY environment variable not set"
+
+    try:
+        url = "https://app.dumplingai.com/api/v1/extract-image"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}"
+        }
+
+        data = {
+            "inputMethod": input_method,
+            "images": images,
+            "prompt": prompt,
+            "jsonMode": json_mode
+        }
+
+        response = requests.post(url, headers=headers, json=data, timeout=120)
+        response.raise_for_status()
+
+        result = response.json()
+
+        # Format the response in a readable way
+        formatted_response = f"Image Analysis Results:\n\n"
+        formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
+        formatted_response += f"Images Processed: {result.get('imageCount', 'Unknown')}\n"
+        formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"
+
+        return formatted_response
+
+    except requests.exceptions.Timeout:
+        return "Error: Request to Dumpling AI API timed out after 120 seconds"
+    except requests.exceptions.HTTPError as e:
+        error_detail = f"HTTP Error: {e.response.status_code}"
+        try:
+            error_json = e.response.json()
+            error_detail += f" - {error_json.get('detail', error_json)}"
+        except:
+            error_detail += f" - {e.response.text[:500]}"
+        return error_detail
+    except requests.exceptions.RequestException as e:
+        return f"Error making request to Dumpling AI API: {str(e)}"
+    except Exception as e:
+        return f"Error extracting image data: {str(e)}"
+
 @tool
 def extract_url_content(url: str) -> str:
     """Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
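Since the new function is registered with LangChain's `@tool` decorator, it can be smoke-tested directly through the standard `.invoke` interface before wiring it into the agent. A minimal sketch (the image URL is a placeholder, not from the commit, and DUMPLING_API_KEY must be set in the environment):

    # Hedged example: direct tool invocation outside the agent loop.
    result = extract_image_data.invoke({
        "input_method": "url",
        "images": ["https://example.com/sample-chart.png"],  # placeholder URL
        "prompt": "Describe any text or labels visible in this image.",
        "json_mode": False,
    })
    print(result)  # "Image Analysis Results: ..." or an error string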
@@ -521,7 +588,7 @@ def extract_url_content(url: str) -> str:
 
     try:
         # Make the API request with a timeout
-        response = requests.get(api_url, params=params, timeout=30)  # 30 second timeout
+        response = requests.get(api_url, params=params, timeout=60)  # 60 second timeout
         response.raise_for_status()  # Raise exception for HTTP errors
 
         # Parse the response
@@ -559,6 +626,77 @@ def extract_url_content(url: str) -> str:
     except Exception as e:
         return f"Error extracting content from {url}: {str(e)}"
 
+@tool
+def get_youtube_transcript(url: str) -> str:
+    """Get the transcript (captions) from a YouTube video as text.
+
+    This tool extracts the transcript text from a YouTube video and returns it as a string.
+
+    Parameters:
+    - url: The YouTube video URL
+
+    Returns:
+    - The transcript as a string, or an error message if the transcript couldn't be obtained
+    """
+
+
+    # Create a temporary directory to store subtitle files
+    temp_dir = tempfile.mkdtemp()
+    current_dir = os.getcwd()
+
+    try:
+        # Change to temp directory for download
+        os.chdir(temp_dir)
+
+        ydl_opts = {
+            'writesubtitles': True,     # Download subtitles
+            'writeautomaticsub': True,  # Download automatic subtitles
+            'subtitleslangs': ['en'],   # Specify English language
+            'skip_download': True,      # Skip downloading the video, only get subtitles
+            'outtmpl': 'subtitle',      # Simple output template
+        }
+
+        # Download the subtitles
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info_dict = ydl.extract_info(url, download=True)
+            video_title = info_dict.get('title', 'Unknown Title')
+
+        # Look for subtitle files in the temp directory
+        subtitle_content = ""
+        subtitle_files = list(Path(temp_dir).glob("*.vtt")) + list(Path(temp_dir).glob("*.srt"))
+
+        if subtitle_files:
+            # Read the first subtitle file found
+            with open(subtitle_files[0], 'r', encoding='utf-8') as f:
+                subtitle_content = f.read()
+
+            # Clean up the subtitle content to remove timestamps and formatting
+            # This is a simple cleaning - more complex parsing may be needed for perfect results
+            lines = subtitle_content.split('\n')
+            cleaned_lines = []
+            for line in lines:
+                # Skip time codes, numbering and empty lines
+                if line.strip() and not line.strip().isdigit() and '-->' not in line and not line.startswith('WEBVTT'):
+                    cleaned_lines.append(line)
+
+            subtitle_content = ' '.join(cleaned_lines)
+            return f"Transcript from YouTube video: '{video_title}'\n\n{subtitle_content}"
+        else:
+            return f"No transcript found for YouTube video: '{video_title}'"
+
+    except Exception as e:
+        return f"Error retrieving YouTube transcript: {str(e)}"
+    finally:
+        # Change back to original directory and clean up
+        os.chdir(current_dir)
+        # Cleanup files (optional)
+        try:
+            for file in os.listdir(temp_dir):
+                os.remove(os.path.join(temp_dir, file))
+            os.rmdir(temp_dir)
+        except:
+            pass
+
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
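One design note on the transcript tool: `os.chdir` is process-global, so concurrent tool calls could write subtitle files into each other's directories. yt-dlp's `outtmpl` accepts an absolute path, which avoids changing directories at all. A minimal sketch of that variant (an assumption, not part of this commit; the video URL is a placeholder):

    import os
    import tempfile
    import yt_dlp

    # Write subtitles straight into the temp directory instead of chdir-ing.
    temp_dir = tempfile.mkdtemp()
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': os.path.join(temp_dir, 'subtitle'),  # absolute output path
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info("https://www.youtube.com/watch?v=PLACEHOLDER", download=True)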
@@ -594,7 +732,7 @@ class BasicAgent:
         )
 
         # Initialize tools
-        self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
+        self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_image_data, extract_url_content, get_youtube_transcript]
 
         # Bind tools only to the worker model
         self.worker_model = self.worker_model_base.bind_tools(self.tools)
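With the two new entries in `self.tools`, `bind_tools` advertises their schemas to the worker model, which then requests them via `tool_calls` on its responses. A rough sketch of the mechanism (the model class and model name are assumptions based on the langchain-anthropic dependency in requirements.txt; the video URL is a placeholder):

    from langchain_anthropic import ChatAnthropic

    # Assumed model name, for illustration only.
    worker_base = ChatAnthropic(model="claude-3-5-sonnet-latest")
    worker = worker_base.bind_tools([extract_image_data, get_youtube_transcript])

    response = worker.invoke("What is said in the first minute of https://youtu.be/PLACEHOLDER?")
    print(response.tool_calls)  # e.g. [{'name': 'get_youtube_transcript', 'args': {...}, ...}]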
@@ -721,6 +859,8 @@ class BasicAgent:
         Worker has access to the following tools:
         - Web search (using Tavily and Serper)
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (from PDFs, documents, etc.)
         - Secure code execution (for Python and other languages)
         - Secure shell command execution
         - Secure file operations
@@ -763,6 +903,8 @@ class BasicAgent:
         Worker has access to the following powerful tools:
         - Web search (using Tavily and Serper)
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (can extract data from PDFs, documents, etc.)
         - Secure code execution (for Python and other languages)
         - Secure shell command execution
         - Secure file operations
@@ -870,6 +1012,8 @@ class BasicAgent:
         Remember that the worker had access to:
         - Web search tools
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (from PDFs, documents, etc.)
         - Secure code execution
         - Secure shell commands
         - Secure file operations
@@ -882,6 +1026,8 @@ class BasicAgent:
         - Ensure any numerical values, dates, names, or technical terms are correct
         - Confirm that the formatting precisely matches what was requested
         - Do not add units to the final answer if not explicitly requested
+        - Do not use money symbols like $ in the final answer if not explicitly requested
+        - Don't use comma separators for integers like 1,000,000; just use 1000000
         - Answers tend to be as short as possible, so do not add extra data unless explicitly requested
 
         If the answer report is correct, format it exactly as asked in the question, and respond with:
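The two new checker rules are mechanical enough to express in code. A hypothetical helper (not in the commit) showing the intended transformation:

    def normalize_numeric_answer(answer: str) -> str:
        # Strip currency symbols and thousands separators, per the checker rules.
        return answer.replace("$", "").replace(",", "").strip()

    assert normalize_numeric_answer("$1,000,000") == "1000000"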
@@ -988,7 +1134,7 @@ class BasicAgent:
 
         try:
             # Run the workflow
-            final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit":
+            final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 35})
 
             # Return the final answer
             answer = final_state.get("final_answer", "")
app.py
CHANGED
@@ -117,6 +117,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
 
+        # Check if the question has an associated file and prepend information
+        file_name = item.get("file_name")
+        if file_name and file_name != "":
+            file_url = f"{api_url}/files/{task_id}"
+            question_with_file_info = f"For this task there is a file available, named {file_name}; it can be downloaded from {file_url}\n\n{question_text}"
+            question_text = question_with_file_info
+
         # Skip if we already have an answer for this question
         if task_id in existing_answers_dict:
             submitted_answer = existing_answers_dict[task_id]
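For a question with an attached file, the worker now sees the download location up front. An illustrative rendering of the prepended prompt (the API URL, task id, file name, and question are placeholders, not values from the commit):

    api_url = "https://example-scoring-api.hf.space"  # placeholder
    task_id = "abc-123"                               # placeholder
    file_name = "sales.xlsx"                          # placeholder
    question_text = "What is the total of column B?"

    file_url = f"{api_url}/files/{task_id}"
    question_text = (f"For this task there is a file available, named {file_name}; "
                     f"it can be downloaded from {file_url}\n\n{question_text}")
    print(question_text)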
requirements.txt
CHANGED
@@ -7,4 +7,4 @@ langchain-anthropic
 anthropic
 python-Levenshtein
 daytona_sdk
-
+yt_dlp