martinsu commited on
Commit
d70b82d
·
1 Parent(s): c48121b

Enhance agent.py with image data extraction and YouTube transcript retrieval tools; update .gitignore and requirements.txt to include test files and yt_dlp dependency.

Browse files
Files changed (4) hide show
  1. .gitignore +5 -1
  2. agent.py +150 -4
  3. app.py +7 -0
  4. requirements.txt +1 -1
.gitignore CHANGED
@@ -12,4 +12,8 @@ TEST_SET/
12
  test_results/
13
 
14
  #cursor
15
- .cursor/
 
 
 
 
 
12
  test_results/
13
 
14
  #cursor
15
+ .cursor/
16
+
17
+ #test
18
+ test.py
19
+ test_youtube.py
agent.py CHANGED
@@ -15,7 +15,11 @@ import requests
15
  import json
16
  import time
17
  from daytona_sdk import Daytona, DaytonaConfig
18
-
 
 
 
 
19
 
20
 
21
  # Load environment variables
@@ -485,6 +489,69 @@ def extract_document_data(input_method: str, files: list, prompt: str, json_mode
485
  except Exception as e:
486
  return f"Error extracting document data: {str(e)}"
487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  @tool
489
  def extract_url_content(url: str) -> str:
490
  """Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
@@ -521,7 +588,7 @@ def extract_url_content(url: str) -> str:
521
 
522
  try:
523
  # Make the API request with a timeout
524
- response = requests.get(api_url, params=params, timeout=30) # 30 second timeout
525
  response.raise_for_status() # Raise exception for HTTP errors
526
 
527
  # Parse the response
@@ -559,6 +626,77 @@ def extract_url_content(url: str) -> str:
559
  except Exception as e:
560
  return f"Error extracting content from {url}: {str(e)}"
561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  class BasicAgent:
563
  def __init__(self):
564
  print("BasicAgent initialized.")
@@ -594,7 +732,7 @@ class BasicAgent:
594
  )
595
 
596
  # Initialize tools
597
- self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
598
 
599
  # Bind tools only to the worker model
600
  self.worker_model = self.worker_model_base.bind_tools(self.tools)
@@ -721,6 +859,8 @@ class BasicAgent:
721
  Worker has access to the following tools:
722
  - Web search (using Tavily and Serper)
723
  - Web content extraction
 
 
724
  - Secure code execution (for Python and other languages)
725
  - Secure shell command execution
726
  - Secure file operations
@@ -763,6 +903,8 @@ class BasicAgent:
763
  Worker has access to the following powerful tools:
764
  - Web search (using Tavily and Serper)
765
  - Web content extraction
 
 
766
  - Secure code execution (for Python and other languages)
767
  - Secure shell command execution
768
  - Secure file operations
@@ -870,6 +1012,8 @@ class BasicAgent:
870
  Remember that the worker had access to:
871
  - Web search tools
872
  - Web content extraction
 
 
873
  - Secure code execution
874
  - Secure shell commands
875
  - Secure file operations
@@ -882,6 +1026,8 @@ class BasicAgent:
882
  - Ensure any numerical values, dates, names, or technical terms are correct
883
  - Confirm that the formatting precisely matches what was requested
884
  - Do not add units to the final answer if not explicitly requested
 
 
885
  - Answers tend to be as short as possible, so do not add extra data unless explicitly requested
886
 
887
  If the answer report is correct, format it exactly as asked in the question, and respond with:
@@ -988,7 +1134,7 @@ class BasicAgent:
988
 
989
  try:
990
  # Run the workflow
991
- final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 50})
992
 
993
  # Return the final answer
994
  answer = final_state.get("final_answer", "")
 
15
  import json
16
  import time
17
  from daytona_sdk import Daytona, DaytonaConfig
18
+ import yt_dlp
19
+ import io
20
+ import os
21
+ import tempfile
22
+ from pathlib import Path
23
 
24
 
25
  # Load environment variables
 
489
  except Exception as e:
490
  return f"Error extracting document data: {str(e)}"
491
 
492
@tool
def extract_image_data(input_method: str, images: list, prompt: str, json_mode: bool = False) -> str:
    """Extract visual information from images using Dumpling AI.

    This tool allows you to extract detailed descriptions or specific information from images
    using vision-capable Large Language Models (LLMs). It can identify objects, scenes, text,
    and other visual elements based on your specific prompt.

    Parameters:
    - input_method: How to input images, either "url" or "base64"
    - images: List of image URLs or base64-encoded strings depending on input_method
    - prompt: Specific instructions for what information to extract from the image
    - json_mode: Whether to return structured JSON (true) or free text (false)

    Returns:
    - Extracted visual data from the image based on your prompt, or an
      "Error: ..." string on any failure (the tool never raises to the caller).
    """
    api_key = os.getenv("DUMPLING_API_KEY")
    if not api_key:
        return "Error: DUMPLING_API_KEY environment variable not set"

    try:
        url = "https://app.dumplingai.com/api/v1/extract-image"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        data = {
            "inputMethod": input_method,
            "images": images,
            "prompt": prompt,
            "jsonMode": json_mode
        }

        # Vision extraction can be slow; allow up to 120 s before giving up.
        response = requests.post(url, headers=headers, json=data, timeout=120)
        response.raise_for_status()

        result = response.json()

        # Format the response in a readable way
        formatted_response = "Image Analysis Results:\n\n"
        formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
        formatted_response += f"Images Processed: {result.get('imageCount', 'Unknown')}\n"
        formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"

        return formatted_response

    except requests.exceptions.Timeout:
        return "Error: Request to Dumpling AI API timed out after 120 seconds"
    except requests.exceptions.HTTPError as e:
        error_detail = f"HTTP Error: {e.response.status_code}"
        try:
            error_json = e.response.json()
            error_detail += f" - {error_json.get('detail', error_json)}"
        except Exception:
            # Body was not JSON (or had no 'detail'); fall back to raw text,
            # truncated so a huge error page doesn't flood the agent context.
            error_detail += f" - {e.response.text[:500]}"
        return error_detail
    except requests.exceptions.RequestException as e:
        return f"Error making request to Dumpling AI API: {str(e)}"
    except Exception as e:
        return f"Error extracting image data: {str(e)}"
554
+
555
  @tool
556
  def extract_url_content(url: str) -> str:
557
  """Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
 
588
 
589
  try:
590
  # Make the API request with a timeout
591
+ response = requests.get(api_url, params=params, timeout=60) # 60 second timeout
592
  response.raise_for_status() # Raise exception for HTTP errors
593
 
594
  # Parse the response
 
626
  except Exception as e:
627
  return f"Error extracting content from {url}: {str(e)}"
628
 
629
@tool
def get_youtube_transcript(url: str) -> str:
    """Get the transcript (captions) from a YouTube video as text.

    This tool extracts the transcript text from YouTube videos, returns the transcript as a string.

    Parameters:
    - url: The YouTube video URL

    Returns:
    - The transcript as a string, or an error message if the transcript couldn't be obtained
    """
    try:
        # TemporaryDirectory cleans itself up on exit; pointing yt_dlp's
        # 'outtmpl' at it avoids os.chdir(), which mutated process-global
        # state and was unsafe if tools run concurrently.
        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                'writesubtitles': True,        # Download subtitles
                'writeautomaticsub': True,     # Download automatic subtitles
                'subtitleslangs': ['en'],      # Specify English language
                'skip_download': True,         # Skip the video itself, only get subtitles
                'outtmpl': os.path.join(temp_dir, 'subtitle'),  # Write into the temp dir
            }

            # Download the subtitles
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info_dict = ydl.extract_info(url, download=True)
                video_title = info_dict.get('title', 'Unknown Title')

            # Look for subtitle files in the temp directory
            subtitle_files = list(Path(temp_dir).glob("*.vtt")) + list(Path(temp_dir).glob("*.srt"))
            if not subtitle_files:
                return f"No transcript found for YouTube video: '{video_title}'"

            # Read the first subtitle file found
            subtitle_content = subtitle_files[0].read_text(encoding='utf-8')

            # Clean up the subtitle content to remove timestamps and formatting.
            # This is a simple cleaning - more complex parsing may be needed for perfect results.
            cleaned_lines = []
            for line in subtitle_content.split('\n'):
                # Skip time codes, numbering and empty lines
                stripped = line.strip()
                if stripped and not stripped.isdigit() and '-->' not in line and not line.startswith('WEBVTT'):
                    cleaned_lines.append(line)

            transcript = ' '.join(cleaned_lines)
            return f"Transcript from YouTube video: '{video_title}'\n\n{transcript}"

    except Exception as e:
        return f"Error retrieving YouTube transcript: {str(e)}"
699
+
700
  class BasicAgent:
701
  def __init__(self):
702
  print("BasicAgent initialized.")
 
732
  )
733
 
734
  # Initialize tools
735
+ self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_image_data, extract_url_content, get_youtube_transcript]
736
 
737
  # Bind tools only to the worker model
738
  self.worker_model = self.worker_model_base.bind_tools(self.tools)
 
859
  Worker has access to the following tools:
860
  - Web search (using Tavily and Serper)
861
  - Web content extraction
862
+ - Image analysis (can extract visual information from images)
863
+ - Document data extraction (from PDFs, documents, etc.)
864
  - Secure code execution (for Python and other languages)
865
  - Secure shell command execution
866
  - Secure file operations
 
903
  Worker has access to the following powerful tools:
904
  - Web search (using Tavily and Serper)
905
  - Web content extraction
906
+ - Image analysis (can extract visual information from images)
907
+ - Document data extraction (can extract data from PDFs, documents, etc.)
908
  - Secure code execution (for Python and other languages)
909
  - Secure shell command execution
910
  - Secure file operations
 
1012
  Remember that the worker had access to:
1013
  - Web search tools
1014
  - Web content extraction
1015
+ - Image analysis (can extract visual information from images)
1016
+ - Document data extraction (from PDFs, documents, etc.)
1017
  - Secure code execution
1018
  - Secure shell commands
1019
  - Secure file operations
 
1026
  - Ensure any numerical values, dates, names, or technical terms are correct
1027
  - Confirm that the formatting precisely matches what was requested
1028
  - Do not add units to the final answer if not explicitly requested
1029
+ - Do not use money symbols like $ in the final answer if not explicitly requested
1030
+ - Don't use comma separators for integers like 1,000,000, just use 1000000
1031
  - Answers tend to be as short as possible, so do not add extra data unless explicitly requested
1032
 
1033
  If the answer report is correct, format it exactly as asked in the question, and respond with:
 
1134
 
1135
  try:
1136
  # Run the workflow
1137
+ final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 35})
1138
 
1139
  # Return the final answer
1140
  answer = final_state.get("final_answer", "")
app.py CHANGED
@@ -117,6 +117,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
117
  print(f"Skipping item with missing task_id or question: {item}")
118
  continue
119
 
 
 
 
 
 
 
 
120
  # Skip if we already have an answer for this question
121
  if task_id in existing_answers_dict:
122
  submitted_answer = existing_answers_dict[task_id]
 
117
  print(f"Skipping item with missing task_id or question: {item}")
118
  continue
119
 
120
+ # Check if the question has an associated file and prepend information
121
+ file_name = item.get("file_name")
122
+ if file_name and file_name != "":
123
+ file_url = f"{api_url}/files/{task_id}"
124
+ question_with_file_info = f"For this task there is a file available, with name {file_name}; it's possible to download it from {file_url}\n\n{question_text}"
125
+ question_text = question_with_file_info
126
+
127
  # Skip if we already have an answer for this question
128
  if task_id in existing_answers_dict:
129
  submitted_answer = existing_answers_dict[task_id]
requirements.txt CHANGED
@@ -7,4 +7,4 @@ langchain-anthropic
7
  anthropic
8
  python-Levenshtein
9
  daytona_sdk
10
-
 
7
  anthropic
8
  python-Levenshtein
9
  daytona_sdk
10
+ yt_dlp