Enhance agent.py with image data extraction and YouTube transcript retrieval tools; update .gitignore to exclude test files and add the yt_dlp dependency to requirements.txt.
- .gitignore +5 -1
- agent.py +150 -4
- app.py +7 -0
- requirements.txt +1 -1
.gitignore
CHANGED

@@ -12,4 +12,8 @@ TEST_SET/
 test_results/
 
 #cursor
-.cursor/
+.cursor/
+
+#test
+test.py
+test_youtube.py

agent.py
CHANGED

@@ -15,7 +15,11 @@ import requests
 import json
 import time
 from daytona_sdk import Daytona, DaytonaConfig
-
+import yt_dlp
+import io
+import os
+import tempfile
+from pathlib import Path
 
 
 # Load environment variables
@@ -485,6 +489,69 @@ def extract_document_data(input_method: str, files: list, prompt: str, json_mode
     except Exception as e:
         return f"Error extracting document data: {str(e)}"
 
+@tool
+def extract_image_data(input_method: str, images: list, prompt: str, json_mode: bool = False) -> str:
+    """Extract visual information from images using Dumpling AI.
+
+    This tool allows you to extract detailed descriptions or specific information from images
+    using vision-capable Large Language Models (LLMs). It can identify objects, scenes, text,
+    and other visual elements based on your specific prompt.
+
+    Parameters:
+    - input_method: How to input images, either "url" or "base64"
+    - images: List of image URLs or base64-encoded strings depending on input_method
+    - prompt: Specific instructions for what information to extract from the image
+    - json_mode: Whether to return structured JSON (true) or free text (false)
+
+    Returns:
+    - Extracted visual data from the image based on your prompt
+    """
+    api_key = os.getenv("DUMPLING_API_KEY")
+    if not api_key:
+        return "Error: DUMPLING_API_KEY environment variable not set"
+
+    try:
+        url = "https://app.dumplingai.com/api/v1/extract-image"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}"
+        }
+
+        data = {
+            "inputMethod": input_method,
+            "images": images,
+            "prompt": prompt,
+            "jsonMode": json_mode
+        }
+
+        response = requests.post(url, headers=headers, json=data, timeout=120)
+        response.raise_for_status()
+
+        result = response.json()
+
+        # Format the response in a readable way
+        formatted_response = f"Image Analysis Results:\n\n"
+        formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
+        formatted_response += f"Images Processed: {result.get('imageCount', 'Unknown')}\n"
+        formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"
+
+        return formatted_response
+
+    except requests.exceptions.Timeout:
+        return "Error: Request to Dumpling AI API timed out after 120 seconds"
+    except requests.exceptions.HTTPError as e:
+        error_detail = f"HTTP Error: {e.response.status_code}"
+        try:
+            error_json = e.response.json()
+            error_detail += f" - {error_json.get('detail', error_json)}"
+        except:
+            error_detail += f" - {e.response.text[:500]}"
+        return error_detail
+    except requests.exceptions.RequestException as e:
+        return f"Error making request to Dumpling AI API: {str(e)}"
+    except Exception as e:
+        return f"Error extracting image data: {str(e)}"
+
 @tool
 def extract_url_content(url: str) -> str:
     """Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
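Since extract_image_data is registered with @tool, it can be exercised on its own before the agent ever calls it. A minimal sketch, assuming DUMPLING_API_KEY is set and using a placeholder image URL (LangChain tools accept a dict of arguments via .invoke):

    result = extract_image_data.invoke({
        "input_method": "url",
        "images": ["https://example.com/chart.png"],  # placeholder image URL
        "prompt": "List every axis label and the maximum value shown.",
        "json_mode": False,
    })
    print(result)  # "Image Analysis Results: ..." or an error string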
@@ -521,7 +588,7 @@ def extract_url_content(url: str) -> str:
 
     try:
         # Make the API request with a timeout
-        response = requests.get(api_url, params=params, timeout=
+        response = requests.get(api_url, params=params, timeout=60)  # 60 second timeout
         response.raise_for_status()  # Raise exception for HTTP errors
 
         # Parse the response
@@ -559,6 +626,77 @@ def extract_url_content(url: str) -> str:
     except Exception as e:
         return f"Error extracting content from {url}: {str(e)}"
 
+@tool
+def get_youtube_transcript(url: str) -> str:
+    """Get the transcript (captions) from a YouTube video as text.
+
+    This tool extracts the transcript text from a YouTube video and returns it as a string.
+
+    Parameters:
+    - url: The YouTube video URL
+
+    Returns:
+    - The transcript as a string, or an error message if the transcript couldn't be obtained
+    """
+
+
+    # Create a temporary directory to store subtitle files
+    temp_dir = tempfile.mkdtemp()
+    current_dir = os.getcwd()
+
+    try:
+        # Change to temp directory for download
+        os.chdir(temp_dir)
+
+        ydl_opts = {
+            'writesubtitles': True,       # Download subtitles
+            'writeautomaticsub': True,    # Download automatic subtitles
+            'subtitleslangs': ['en'],     # Specify English language
+            'skip_download': True,        # Skip downloading the video, only get subtitles
+            'outtmpl': 'subtitle',        # Simple output template
+        }
+
+        # Download the subtitles
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info_dict = ydl.extract_info(url, download=True)
+            video_title = info_dict.get('title', 'Unknown Title')
+
+        # Look for subtitle files in the temp directory
+        subtitle_content = ""
+        subtitle_files = list(Path(temp_dir).glob("*.vtt")) + list(Path(temp_dir).glob("*.srt"))
+
+        if subtitle_files:
+            # Read the first subtitle file found
+            with open(subtitle_files[0], 'r', encoding='utf-8') as f:
+                subtitle_content = f.read()
+
+            # Clean up the subtitle content to remove timestamps and formatting
+            # This is a simple cleaning - more complex parsing may be needed for perfect results
+            lines = subtitle_content.split('\n')
+            cleaned_lines = []
+            for line in lines:
+                # Skip time codes, numbering and empty lines
+                if line.strip() and not line.strip().isdigit() and not '-->' in line and not line.startswith('WEBVTT'):
+                    cleaned_lines.append(line)
+
+            subtitle_content = ' '.join(cleaned_lines)
+            return f"Transcript from YouTube video: '{video_title}'\n\n{subtitle_content}"
+        else:
+            return f"No transcript found for YouTube video: '{video_title}'"
+
+    except Exception as e:
+        return f"Error retrieving YouTube transcript: {str(e)}"
+    finally:
+        # Change back to original directory and clean up
+        os.chdir(current_dir)
+        # Cleanup files (optional)
+        try:
+            for file in os.listdir(temp_dir):
+                os.remove(os.path.join(temp_dir, file))
+            os.rmdir(temp_dir)
+        except:
+            pass
+
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
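One design note on get_youtube_transcript: os.chdir changes the working directory for the whole process, which can interfere with concurrent requests. A chdir-free sketch under the same assumptions would route output through yt-dlp's outtmpl instead:

    # Sketch of a chdir-free variant (an assumption, not the committed code):
    # yt-dlp writes subtitle files wherever 'outtmpl' points, so the temp
    # directory can be embedded in the template directly.
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': os.path.join(temp_dir, 'subtitle'),  # cwd stays untouched
    }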
@@ -594,7 +732,7 @@ class BasicAgent:
         )
 
         # Initialize tools
-        self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
+        self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_image_data, extract_url_content, get_youtube_transcript]
 
         # Bind tools only to the worker model
         self.worker_model = self.worker_model_base.bind_tools(self.tools)
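For context, bind_tools only advertises the tool schemas to the model; the graph still has to execute any tool calls the model emits. The usual langchain-core pattern looks roughly like the sketch below (an illustration of the flow, not the repo's actual LangGraph node):

    # Rough sketch of the standard tool-call loop (assumption: the real
    # dispatch lives inside the LangGraph workflow).
    ai_msg = self.worker_model.invoke(messages)        # may contain tool_calls
    tools_by_name = {t.name: t for t in self.tools}
    for call in ai_msg.tool_calls:                     # [{"name", "args", "id"}, ...]
        output = tools_by_name[call["name"]].invoke(call["args"])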
@@ -721,6 +859,8 @@ class BasicAgent:
         Worker has access to the following tools:
         - Web search (using Tavily and Serper)
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (from PDFs, documents, etc.)
         - Secure code execution (for Python and other languages)
         - Secure shell command execution
         - Secure file operations
@@ -763,6 +903,8 @@ class BasicAgent:
         Worker has access to the following powerful tools:
         - Web search (using Tavily and Serper)
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (can extract data from PDFs, documents, etc.)
         - Secure code execution (for Python and other languages)
         - Secure shell command execution
         - Secure file operations
@@ -870,6 +1012,8 @@ class BasicAgent:
         Remember that the worker had access to:
         - Web search tools
         - Web content extraction
+        - Image analysis (can extract visual information from images)
+        - Document data extraction (from PDFs, documents, etc.)
         - Secure code execution
         - Secure shell commands
         - Secure file operations
@@ -882,6 +1026,8 @@ class BasicAgent:
         - Ensure any numerical values, dates, names, or technical terms are correct
         - Confirm that the formatting precisely matches what was requested
         - Do not add units to the final answer if not explicitly requested
+        - Do not use money symbols like $ in the final answer if not explicitly requested
+        - Don't use comma separators for integers like 1,000,000; just use 1000000
         - Answers tend to be as short as possible, so do not add extra data unless explicitly requested
 
         If the answer report is correct, format it exactly as asked in the question, and respond with:
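These formatting rules live only in the prompt; nothing in the code enforces them. A hypothetical post-processing guard (names are illustrative, not from the commit) could normalize answers the same way:

    import re

    # Hypothetical helper: strip currency symbols and thousands separators,
    # matching the prompt's formatting rules.
    def normalize_numeric_answer(answer: str) -> str:
        answer = answer.strip().lstrip("$€£")          # drop leading money symbols
        if re.fullmatch(r"\d{1,3}(,\d{3})+(\.\d+)?", answer):
            answer = answer.replace(",", "")           # 1,000,000 -> 1000000
        return answer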
@@ -988,7 +1134,7 @@ class BasicAgent:
 
         try:
             # Run the workflow
-            final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit":
+            final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 35})
 
             # Return the final answer
             answer = final_state.get("final_answer", "")
app.py
CHANGED

@@ -117,6 +117,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
 
+        # Check if the question has an associated file and prepend information
+        file_name = item.get("file_name")
+        if file_name and file_name != "":
+            file_url = f"{api_url}/files/{task_id}"
+            question_with_file_info = f"For this task there is a file available, named {file_name}; it can be downloaded from {file_url}\n\n{question_text}"
+            question_text = question_with_file_info
+
         # Skip if we already have an answer for this question
         if task_id in existing_answers_dict:
             submitted_answer = existing_answers_dict[task_id]
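A concrete (hypothetical) item makes the effect visible; every value below is invented for illustration:

    # What the worker sees after the file info is prepended.
    api_url = "https://example-course-api.hf.space"    # placeholder
    task_id, file_name = "abc123", "sales.xlsx"        # placeholder values
    question_text = "What were the total Q3 sales?"
    question_text = (
        f"For this task there is a file available, named {file_name}; "
        f"it can be downloaded from {api_url}/files/{task_id}\n\n{question_text}"
    )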
requirements.txt
CHANGED

@@ -7,4 +7,4 @@ langchain-anthropic
 anthropic
 python-Levenshtein
 daytona_sdk
-
+yt_dlp