Commit 87aa741 by Yago Bolivar
1 Parent(s): b70394c

Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.
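At a glance: each module moves from free functions to a smolagents Tool subclass invoked through forward(). A minimal usage sketch of the new entry points, assuming only the class names and input schemas visible in the diffs below (file paths and printed fields are illustrative, not part of this commit):

    # Sketch only; paths are placeholders, not files in this repo.
    from src.markdown_table_parser import MarkdownTableParserTool
    from src.speech_to_text import SpeechToTextTool
    from src.spreadsheet_tool import SpreadsheetTool

    table = MarkdownTableParserTool().forward("| a | b |\n|---|---|\n| 1 | 2 |")
    print(table)  # headers as keys -> lists of cell values, or None if no table

    stt = SpeechToTextTool()          # builds (or reuses) the shared Whisper pipeline
    print(stt.forward("sample.mp3"))  # transcript string, or an "Error: ..." message

    sheets = SpreadsheetTool()
    print(sheets.forward(file_path="data.csv")["summary"])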

src/image_processing_tool.py CHANGED
@@ -46,7 +46,7 @@ class ImageProcessor(Tool):
     # For simplicity, let's assume a general 'process' action and specify task type in params
     inputs = {
         'image_filepath': {'type': 'string', 'description': 'Path to the image file.'},
-        'task': {'type': 'string', 'description': 'Specific task to perform (e.g., \'caption\', \'chess_analysis\').'}
+        'task': {'type': 'string', 'description': 'Specific task to perform (e.g., \'caption\', \'chess_analysis\').', 'nullable': True}  # Added nullable: True
     }
     outputs = {'result': {'type': 'object', 'description': 'The result of the image processing task (e.g., text caption, chess move, error message).'}}
     output_type = "object"
src/markdown_table_parser.py CHANGED
@@ -1,6 +1,9 @@
 import re
+from smolagents.tools import Tool
+from typing import Dict, List, Optional
 
-def parse_markdown_table(markdown_text: str) -> dict[str, list[str]] | None:
+# Original parsing function
+def _parse_markdown_table_string(markdown_text: str) -> Optional[Dict[str, List[str]]]:
     """
     Parses the first valid Markdown table found in a string.
     Returns a dictionary (headers as keys, lists of cell content as values)
@@ -48,15 +51,45 @@ def parse_markdown_table(markdown_text: str) -> dict[str, list[str]] | None:
                 # First cell is row label, rest are data
                 table[headers[0]].append(cells[0])
                 for k, h in enumerate(headers[1:], 1):
+                    # Ensure the key exists and is a list
+                    if h not in table or not isinstance(table[h], list):
+                        table[h] = []  # Initialize if not present or not a list
                     table[h].append(cells[k])
             else:
                 for k, h in enumerate(headers):
+                    if h not in table or not isinstance(table[h], list):
+                        table[h] = []
                     table[h].append(cells[k])
             j += 1
         return table
     return None
 
+
+class MarkdownTableParserTool(Tool):
+    """
+    Parses a Markdown table from a given text string.
+    Useful for converting markdown tables into Python data structures for further analysis.
+    """
+    name = "markdown_table_parser"
+    description = "Parses the first valid Markdown table found in a string and returns it as a dictionary."
+    inputs = {'markdown_text': {'type': 'string', 'description': 'The string containing the Markdown table.'}}
+    outputs = {'parsed_table': {'type': 'object', 'description': 'A dictionary representing the table (headers as keys, lists of cell content as values), or null if no table is found.'}}
+    output_type = "object"  # Or dict/None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.is_initialized = True
+
+    def forward(self, markdown_text: str) -> Optional[Dict[str, List[str]]]:
+        """
+        Wrapper for the _parse_markdown_table_string function.
+        """
+        return _parse_markdown_table_string(markdown_text)
+
+
+# Expose the original function name if other parts of the system expect it (optional)
+parse_markdown_table = _parse_markdown_table_string
+
 if __name__ == '__main__':
+    tool_instance = MarkdownTableParserTool()
     example_table = """
     |*|a|b|c|d|e|
     |---|---|---|---|---|---|
@@ -66,7 +99,7 @@ if __name__ == '__main__':
     |d|b|e|b|e|d|
     |e|d|b|a|d|c|
     """
-    parsed = parse_markdown_table(example_table)
+    parsed = tool_instance.forward(example_table)
     print("Parsed GAIA example:")
     if parsed:
         for header, column_data in parsed.items():
@@ -83,7 +116,7 @@ if __name__ == '__main__':
     | Carol | 45 | London |
     Some text after
     """
-    parsed_2 = parse_markdown_table(example_table_2)
+    parsed_2 = tool_instance.forward(example_table_2)
     print("\nParsed Table 2 (with surrounding text):")
     if parsed_2:
         for header, column_data in parsed_2.items():
@@ -95,36 +128,31 @@ if __name__ == '__main__':
     | Header1 | Header2 |
     |---------|---------|
     """
-    parsed_empty = parse_markdown_table(empty_table_with_header)
+    parsed_empty = tool_instance.forward(empty_table_with_header)
     print("\nParsed Empty Table with Header:")
     if parsed_empty:
         for header, column_data in parsed_empty.items():
             print(f"Header: {header}, Data: {column_data}")
     else:
-        print("Failed to parse empty table with header.")
-
-    malformed_separator = """
-    | Header1 | Header2 |
-    |---foo---|---------|
-    | data1 | data2 |
-    """
-    parsed_mal_sep = parse_markdown_table(malformed_separator)
-    print("\nParsed table with malformed separator:")
-    if parsed_mal_sep:
-        print(parsed_mal_sep)
-    else:
-        print("Failed to parse (correctly).")
+        print("Failed to parse table (empty with header).")  # Corrected message
 
-    table_with_alignment = """
-    | Syntax | Description |
-    | :-------- | :-----------: |
-    | Header | Title |
-    | Paragraph | Text |
+    malformed_table = """
+    | Header1 | Header2
+    |--- ---|
+    | cell1 | cell2 |
     """
-    parsed_align = parse_markdown_table(table_with_alignment)
-    print("\nParsed table with alignment in separator:")
-    if parsed_align:
-        for header, column_data in parsed_align.items():
+    parsed_malformed = tool_instance.forward(malformed_table)
+    print("\nParsed Malformed Table:")
+    if parsed_malformed:
+        for header, column_data in parsed_malformed.items():
             print(f"Header: {header}, Data: {column_data}")
     else:
-        print("Failed to parse table with alignment.")
+        print("Failed to parse malformed table.")
+
+    no_table_text = "This is just some text without a table."
+    parsed_no_table = tool_instance.forward(no_table_text)
+    print("\nParsed Text Without Table:")
+    if parsed_no_table:
+        print("Error: Should not have parsed a table.")
+    else:
+        print("Correctly found no table.")
src/python_tool.py CHANGED
@@ -6,21 +6,31 @@ import re
 import traceback
 from typing import Dict, Any, Optional, Union, List
 from smolagents.tools import Tool
+import os
 
 class CodeExecutionTool(Tool):
     """
     Executes Python code in a controlled environment for safe code interpretation.
     Useful for evaluating code snippets and returning their output or errors.
     """
+    name = "python_code_executor"
+    description = "Executes a given Python code string or Python code from a file. Returns the output or error."
+    inputs = {
+        'code_string': {'type': 'string', 'description': 'The Python code to execute directly.', 'nullable': True},
+        'filepath': {'type': 'string', 'description': 'The path to a Python file to execute.', 'nullable': True}
+    }
+    outputs = {'result': {'type': 'object', 'description': 'A dictionary containing \'success\', \'output\', and/or \'error\'.'}}
+    output_type = "object"
 
-    def __init__(self, timeout: int = 5, max_output_size: int = 10000):
-        self.timeout = timeout  # Maximum execution time in seconds
+    def __init__(self, timeout: int = 10, max_output_size: int = 20000, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.timeout = timeout
         self.max_output_size = max_output_size
-        # Restricted imports - add more as needed
         self.banned_modules = [
-            'os', 'subprocess', 'sys', 'builtins', 'importlib', 'eval',
-            'pickle', 'requests', 'socket', 'shutil'
+            'os', 'subprocess', 'sys', 'builtins', 'importlib',
+            'pickle', 'requests', 'socket', 'shutil', 'ctypes', 'multiprocessing'
         ]
+        self.is_initialized = True
 
     def _analyze_code_safety(self, code: str) -> Dict[str, Any]:
         """Perform static analysis to check for potentially harmful code."""
@@ -33,9 +43,11 @@ class CodeExecutionTool(Tool):
             if isinstance(node, ast.Import):
                 imports.extend(n.name for n in node.names)
             elif isinstance(node, ast.ImportFrom):
-                imports.append(node.module)
+                # Ensure node.module is not None before attempting to check against banned_modules
+                if node.module and any(banned in node.module for banned in self.banned_modules):
+                    imports.append(node.module)
 
-        dangerous_imports = [imp for imp in imports if any(
+        dangerous_imports = [imp for imp in imports if imp and any(
             banned in imp for banned in self.banned_modules)]
 
         if dangerous_imports:
@@ -83,141 +95,187 @@ class CodeExecutionTool(Tool):
 
         return None
 
-    def execute_file(self, filepath: str) -> Dict[str, Any]:
-        """Execute Python code from file and capture the output."""
-        try:
-            with open(filepath, 'r') as file:
-                code = file.read()
-
-            return self.execute_code(code)
-
-        except FileNotFoundError:
-            return {"success": False, "error": f"File not found: {filepath}"}
-        except Exception as e:
-            return {
-                "success": False,
-                "error": f"Error reading file: {str(e)}"
-            }
-
-    def execute_code(self, code: str) -> Dict[str, Any]:
-        """Execute Python code string and capture the output."""
-        # Check code safety first
+    # Main entry point for the agent
+    def forward(self, code_string: Optional[str] = None, filepath: Optional[str] = None) -> Dict[str, Any]:
+        if not code_string and not filepath:
+            return {"success": False, "error": "No code string or filepath provided."}
+        if code_string and filepath:
+            return {"success": False, "error": "Provide either a code string or a filepath, not both."}
+
+        code_to_execute = ""
+        if filepath:
+            if not os.path.exists(filepath):
+                return {"success": False, "error": f"File not found: {filepath}"}
+            if not filepath.endswith(".py"):
+                return {"success": False, "error": f"File is not a Python file: {filepath}"}
+            try:
+                with open(filepath, 'r') as file:
+                    code_to_execute = file.read()
+            except Exception as e:
+                return {"success": False, "error": f"Error reading file {filepath}: {str(e)}"}
+        elif code_string:
+            code_to_execute = code_string
+
+        return self._execute_actual_code(code_to_execute)
+
+    # Renamed from execute_code to _execute_actual_code to be internal
+    def _execute_actual_code(self, code: str) -> Dict[str, Any]:
+        """Execute Python code and capture the output or error."""
         safety_check = self._analyze_code_safety(code)
         if not safety_check["safe"]:
-            return {
-                "success": False,
-                "error": f"Security check failed: {safety_check['reason']}"
-            }
-
-        # Prepare a clean globals dictionary with minimal safe functions
-        safe_globals = {
-            'abs': abs,
-            'all': all,
-            'any': any,
-            'bin': bin,
-            'bool': bool,
-            'chr': chr,
-            'complex': complex,
-            'dict': dict,
-            'divmod': divmod,
-            'enumerate': enumerate,
-            'filter': filter,
-            'float': float,
-            'format': format,
-            'frozenset': frozenset,
-            'hash': hash,
-            'hex': hex,
-            'int': int,
-            'isinstance': isinstance,
-            'issubclass': issubclass,
-            'len': len,
-            'list': list,
-            'map': map,
-            'max': max,
-            'min': min,
-            'oct': oct,
-            'ord': ord,
-            'pow': pow,
-            'print': print,
-            'range': range,
-            'reversed': reversed,
-            'round': round,
-            'set': set,
-            'sorted': sorted,
-            'str': str,
-            'sum': sum,
-            'tuple': tuple,
-            'zip': zip,
-            '__builtins__': {},  # Empty builtins for extra security
-        }
-
-        # Add math module functions, commonly needed
-        try:
-            import math
-            for name in dir(math):
-                if not name.startswith('_'):
-                    safe_globals[name] = getattr(math, name)
-        except ImportError:
-            pass
-
-        # Capture output using StringIO
-        output_buffer = io.StringIO()
-
-        # Set timeout handler
-        old_handler = signal.getsignal(signal.SIGALRM)
+            return {"success": False, "error": f"Safety check failed: {safety_check['reason']}"}
+
+        # Setup timeout
         signal.signal(signal.SIGALRM, self._timeout_handler)
         signal.alarm(self.timeout)
-
+
+        captured_output = io.StringIO()
+        # It's generally safer to execute in a restricted scope
+        # and not provide access to all globals/locals by default.
+        # However, for a tool that might need to define functions/classes and use them,
+        # a shared scope might be necessary. This needs careful consideration.
+        exec_globals = {}
+
         try:
-            # Execute code with stdout/stderr capture
-            with contextlib.redirect_stdout(output_buffer):
-                with contextlib.redirect_stderr(output_buffer):
-                    exec(code, safe_globals)
+            with contextlib.redirect_stdout(captured_output):
+                with contextlib.redirect_stderr(captured_output):  # Capture stderr as well
+                    exec(code, exec_globals)  # Execute in a controlled global scope
 
-            output = output_buffer.getvalue()
+            output = captured_output.getvalue()
             if len(output) > self.max_output_size:
-                truncation_message = f"\n... [output truncated to {self.max_output_size} characters]"
-                output = output[:self.max_output_size - len(truncation_message)] + truncation_message
-            else:
-                output = output.strip()
+                output = output[:self.max_output_size] + "... [output truncated]"
 
-            # Extract the numeric value
-            numeric_result = self._extract_numeric_value(output)
+            # Attempt to extract a final numeric value if applicable
+            # This might be specific to certain tasks, consider making it optional
+            # numeric_result = self._extract_numeric_value(output)
 
             return {
-                "success": True,
-                "raw_output": output,
-                "numeric_value": numeric_result,
-                "has_numeric_result": numeric_result is not None
+                "success": True,
+                "output": output,
+                # "numeric_value": numeric_result
             }
-
         except TimeoutError:
-            return {
-                "success": False,
-                "error": f"Code execution timed out after {self.timeout} seconds"
-            }
+            return {"success": False, "error": "Code execution timed out"}
         except Exception as e:
-            error_info = traceback.format_exc()
-            return {
-                "success": False,
-                "error": str(e),
-                "traceback": error_info,
-                "raw_output": output_buffer.getvalue()
-            }
+            # Get detailed traceback
+            tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
+            error_details = "".join(tb_lines)
+            if len(error_details) > self.max_output_size:
+                error_details = error_details[:self.max_output_size] + "... [error truncated]"
+            return {"success": False, "error": f"Execution failed: {str(e)}\nTraceback:\n{error_details}"}
         finally:
-            # Reset alarm and signal handler
-            signal.alarm(0)
-            signal.signal(signal.SIGALRM, old_handler)
-
+            signal.alarm(0)  # Disable the alarm
+            captured_output.close()
+
+    # Kept execute_file and execute_code as helper methods if direct access is ever needed,
+    # but they now call the main _execute_actual_code method.
+    def execute_file(self, filepath: str) -> Dict[str, Any]:
+        """Helper to execute Python code from file."""
+        if not os.path.exists(filepath):
+            return {"success": False, "error": f"File not found: {filepath}"}
+        if not filepath.endswith(".py"):
+            return {"success": False, "error": f"File is not a Python file: {filepath}"}
+        try:
+            with open(filepath, 'r') as file:
+                code = file.read()
+            return self._execute_actual_code(code)
+        except Exception as e:
+            return {"success": False, "error": f"Error reading file {filepath}: {str(e)}"}
+
+    def execute_code(self, code: str) -> Dict[str, Any]:
+        """Helper to execute Python code from a string."""
+        return self._execute_actual_code(code)
 
-# Example usage
-if __name__ == "__main__":
-    executor = CodeExecutionTool()
-    result = executor.execute_code("""
-# Example code that calculates a value
-total = 0
-for i in range(10):
-    total += i * 2
-print(f"The result is {total}")
-""")
-    print(result)
+
+if __name__ == '__main__':
+    tool = CodeExecutionTool(timeout=5)
+
+    # Test 1: Safe code string
+    safe_code = "print('Hello from safe code!'); result = 10 * 2; print(result)"
+    print("\n--- Test 1: Safe Code String ---")
+    result1 = tool.forward(code_string=safe_code)
+    print(result1)
+    assert result1['success']
+    assert "Hello from safe code!" in result1['output']
+    assert "20" in result1['output']
+
+    # Test 2: Code with an error
+    error_code = "print(1/0)"
+    print("\n--- Test 2: Code with Error ---")
+    result2 = tool.forward(code_string=error_code)
+    print(result2)
+    assert not result2['success']
+    assert "ZeroDivisionError" in result2['error']
+
+    # Test 3: Code with a banned import
+    unsafe_import_code = "import os; print(os.getcwd())"
+    print("\n--- Test 3: Unsafe Import ---")
+    result3 = tool.forward(code_string=unsafe_import_code)
+    print(result3)
+    assert not result3['success']
+    assert "Safety check failed" in result3['error']
+    assert "os" in result3['error']
+
+    # Test 4: Timeout
+    timeout_code = "import time; time.sleep(10); print('Done sleeping')"
+    print("\n--- Test 4: Timeout ---")
+    # tool_timeout_short = CodeExecutionTool(timeout=2)  # For testing timeout specifically
+    # result4 = tool_timeout_short.forward(code_string=timeout_code)
+    result4 = tool.forward(code_string=timeout_code)  # Using the main tool instance with its timeout
+    print(result4)
+    assert not result4['success']
+    assert "timed out" in result4['error']
+
+    # Test 5: Execute from file
+    test_file_content = "print('Hello from file!'); x = 5; y = 7; print(f'Sum: {x+y}')"
+    test_filename = "temp_test_script.py"
+    with open(test_filename, "w") as f:
+        f.write(test_file_content)
+    print("\n--- Test 5: Execute from File ---")
+    result5 = tool.forward(filepath=test_filename)
+    print(result5)
+    assert result5['success']
+    assert "Hello from file!" in result5['output']
+    assert "Sum: 12" in result5['output']
+    os.remove(test_filename)
+
+    # Test 6: File not found
+    print("\n--- Test 6: File Not Found ---")
+    result6 = tool.forward(filepath="non_existent_script.py")
+    print(result6)
+    assert not result6['success']
+    assert "File not found" in result6['error']
+
+    # Test 7: Provide both code_string and filepath
+    print("\n--- Test 7: Both code_string and filepath ---")
+    result7 = tool.forward(code_string="print('hello')", filepath=test_filename)
+    print(result7)
+    assert not result7['success']
+    assert "Provide either a code string or a filepath, not both" in result7['error']
+
+    # Test 8: Provide neither
+    print("\n--- Test 8: Neither code_string nor filepath ---")
+    result8 = tool.forward()
+    print(result8)
+    assert not result8['success']
+    assert "No code string or filepath provided" in result8['error']
+
+    # Test 9: Code that defines a function and calls it
+    func_def_code = "def my_func(a, b): return a + b; print(my_func(3,4))"
+    print("\n--- Test 9: Function Definition and Call ---")
+    result9 = tool.forward(code_string=func_def_code)
+    print(result9)
+    assert result9['success']
+    assert "7" in result9['output']
+
+    # Test 10: Max output size
+    # tool_max_output = CodeExecutionTool(max_output_size=50)
+    # long_output_code = "for i in range(20): print(f'Line {i}')"
+    # print("\n--- Test 10: Max Output Size ---")
+    # result10 = tool_max_output.forward(code_string=long_output_code)
+    # print(result10)
+    # assert result10['success']
+    # assert "... [output truncated]" in result10['output']
+    # assert len(result10['output']) <= 50 + len("... [output truncated]") + 5  # a bit of leeway
 
+    print("\nAll tests seem to have passed (check output for details).")
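A note on the timeout mechanism retained above: signal.SIGALRM is POSIX-only and only fires in the main thread, so the timeout setting is not enforced on Windows or when the tool runs in a worker thread. A portable alternative (a sketch, not part of this commit) is to isolate the snippet in a subprocess with a wall-clock limit:

    # Illustrative alternative to SIGALRM-based timeouts: run the snippet in a
    # subprocess and let the OS enforce the wall-clock limit.
    import subprocess
    import sys

    def run_with_timeout(code: str, timeout_s: int = 5) -> dict:
        try:
            proc = subprocess.run(
                [sys.executable, "-c", code],
                capture_output=True, text=True, timeout=timeout_s,
            )
            return {"success": proc.returncode == 0,
                    "output": proc.stdout, "error": proc.stderr}
        except subprocess.TimeoutExpired:
            return {"success": False, "error": f"Code execution timed out after {timeout_s}s"}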
src/speech_to_text.py CHANGED
@@ -1,35 +1,134 @@
 from transformers import pipeline
-import librosa # Or soundfile
+import librosa # Or soundfile
 import os
+from smolagents.tools import Tool  # Added import
+from typing import Optional  # Added for type hinting
 
-# Initialize the ASR pipeline with a specific model
-# Using a smaller Whisper model for quicker setup, but larger models offer better accuracy
-asr_pipeline = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-tiny.en",
-)
+# Initialize the ASR pipeline once
+_asr_pipeline_instance = None
 
-def transcribe_audio(audio_filepath):
+
+def get_asr_pipeline():
+    global _asr_pipeline_instance
+    if _asr_pipeline_instance is None:
+        try:
+            # Using a smaller Whisper model for quicker setup, but larger models offer better accuracy
+            _asr_pipeline_instance = pipeline(
+                "automatic-speech-recognition",
+                model="openai/whisper-tiny.en",  # Consider making model configurable
+            )
+            print("ASR pipeline initialized.")  # For feedback
+        except Exception as e:
+            print(f"Error initializing ASR pipeline: {e}")
+            # Handle error appropriately, e.g., raise or log
+    return _asr_pipeline_instance
+
+
+# Original transcription function, renamed to be internal
+def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
     """
-    Converts speech in an audio file (e.g., .mp3) to text using speech recognition.
+    Converts speech in an audio file to text using the provided ASR pipeline.
     Args:
         audio_filepath (str): Path to the audio file.
+        asr_pipeline_instance: The initialized ASR pipeline.
     Returns:
-        str: Transcribed text from the audio.
+        str: Transcribed text from the audio or an error message.
     """
+    if not asr_pipeline_instance:
+        return "Error: ASR pipeline is not available."
+    if not os.path.exists(audio_filepath):
+        return f"Error: Audio file not found at {audio_filepath}"
     try:
-        transcription = asr_pipeline(audio_filepath, return_timestamps=True)
-        return transcription["text"]
+        # Ensure the file can be loaded by librosa (or your chosen audio library)
+        # This step can help catch corrupted or unsupported audio formats early.
+        y, sr = librosa.load(audio_filepath, sr=None)  # Load with original sample rate
+        if sr != 16000:  # Whisper models expect 16kHz
+            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+
+        # Pass the numpy array to the pipeline
+        transcription_result = asr_pipeline_instance(
+            {"raw": y, "sampling_rate": 16000}, return_timestamps=False
+        )  # Changed to False for simplicity
+        return transcription_result["text"]
     except Exception as e:
-        return f"Error during transcription: {e}"
+        return f"Error during transcription of {audio_filepath}: {e}"
+
+
+class SpeechToTextTool(Tool):
+    """
+    Transcribes audio from a given audio file path to text.
+    """
+
+    name = "speech_to_text_transcriber"
+    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
+    inputs = {
+        "audio_filepath": {"type": "string", "description": "Path to the audio file to transcribe."}
+    }
+    outputs = {
+        "transcribed_text": {
+            "type": "string",
+            "description": "The transcribed text from the audio, or an error message.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.asr_pipeline = get_asr_pipeline()  # Initialize or get the shared pipeline
+        self.is_initialized = True if self.asr_pipeline else False
+
+    def forward(self, audio_filepath: str) -> str:
+        """
+        Wrapper for the _transcribe_audio_file function.
+        """
+        if not self.is_initialized or not self.asr_pipeline:
+            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
+        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)
+
+
+# Expose the original function name if needed by other parts of the system (optional)
+# transcribe_audio = _transcribe_audio_file  # This would need adjustment if it expects the pipeline passed in
 
 # Example usage:
 if __name__ == "__main__":
-    audio_file = "./downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"
+    tool_instance = SpeechToTextTool()
+
+    # Create a dummy MP3 file for testing (requires ffmpeg to be installed for pydub to work)
+    # This part is tricky to make universally runnable without external dependencies for audio creation.
+    # For a simple test, we'll assume a file exists or skip this part if it doesn't.
+
+    # Path to a test audio file (replace with an actual .mp3 or .wav file for testing)
+    # You might need to download a short sample audio file and place it in your project.
+    # e.g., create a `test_data` directory and put `sample.mp3` there.
+    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
+    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"  # GAIA example
 
-    if os.path.exists(audio_file): # Check if the (placeholder or real) file exists
-        print(f"Attempting to transcribe: {audio_file}")
-        transcribed_text = transcribe_audio(audio_file)
-        print(f"Transcription:\n{transcribed_text}")
+    if tool_instance.is_initialized:
+        if os.path.exists(test_audio_file):
+            print(f"Attempting to transcribe: {test_audio_file}")
+            transcribed_text = tool_instance.forward(test_audio_file)
+            print(f"Transcription:\n{transcribed_text}")
+        else:
+            print(
+                f"Test audio file not found: {test_audio_file}. Skipping transcription test."
+            )
+            print("Please place a sample .mp3 or .wav file at that location for testing.")
+
+        # if os.path.exists(test_audio_file_2):
+        #     print(f"\nAttempting to transcribe: {test_audio_file_2}")
+        #     transcribed_text_2 = tool_instance.forward(test_audio_file_2)
+        #     print(f"Transcription 2:\n{transcribed_text_2}")
+        # else:
+        #     print(f"Test audio file 2 not found: {test_audio_file_2}. Skipping.")
     else:
-        print(f"File not found: {audio_file}. Please provide a valid audio file.")
+        print(
+            "SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped."
+        )
+
+    # Test with a non-existent file
+    non_existent_file = "./non_existent_audio.mp3"
+    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
+    error_text = tool_instance.forward(non_existent_file)
+    print(f"Result for non-existent file:\n{error_text}")
+    assert "Error:" in error_text  # Expect an error message
src/spreadsheet_tool.py CHANGED
@@ -1,59 +1,83 @@
 import os
 import pandas as pd
-from typing import Dict, List, Union, Tuple, Any
+from typing import Dict, List, Union, Tuple, Any, Optional
 import numpy as np
 from smolagents.tools import Tool
 
-
-
 class SpreadsheetTool(Tool):
     """
-    Parses spreadsheet files (e.g., .xlsx) and extracts tabular data for analysis.
+    Parses spreadsheet files (e.g., .xlsx) and extracts tabular data for analysis or allows querying.
     Useful for reading, processing, and converting spreadsheet content to Python data structures.
     """
+    name = "spreadsheet_processor"
+    description = "Parses a spreadsheet file (e.g., .xlsx, .xls, .csv) and can perform queries. Returns extracted data or query results."
+    inputs = {
+        'file_path': {'type': 'string', 'description': 'Path to the spreadsheet file.'},
+        'query_instructions': {'type': 'string', 'description': 'Optional. Instructions for querying the data (e.g., "Sum column A"). If None, parses the whole sheet.', 'nullable': True}
+    }
+    outputs = {'result': {'type': 'object', 'description': 'A dictionary containing parsed sheet data, query results, or an error message.'}}
+    output_type = "object"
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         """Initialize the SpreadsheetTool."""
-        pass
+        super().__init__(*args, **kwargs)
+        self.is_initialized = True
 
-    def parse_spreadsheet(self, file_path: str) -> Dict[str, Any]:
-        """
-        Parse an Excel spreadsheet and extract useful information.
-
-        Args:
-            file_path: Path to the .xlsx file
-
-        Returns:
-            Dictionary containing:
-            - sheets: Dictionary of sheet names and their DataFrames
-            - sheet_names: List of sheet names
-            - summary: Basic spreadsheet summary
-            - error: Error message if any
-        """
+    # Main entry point for the agent
+    def forward(self, file_path: str, query_instructions: Optional[str] = None) -> Dict[str, Any]:
         if not os.path.exists(file_path):
             return {"error": f"File not found: {file_path}"}
-
+
+        # Determine file type for appropriate parsing
+        _, file_extension = os.path.splitext(file_path)
+        file_extension = file_extension.lower()
+
+        parsed_data = None
+        if file_extension in ['.xlsx', '.xls']:
+            parsed_data = self._parse_excel(file_path)
+        elif file_extension == '.csv':
+            parsed_data = self._parse_csv(file_path)
+        else:
+            return {"error": f"Unsupported file type: {file_extension}. Supported types: .xlsx, .xls, .csv"}
+
+        if parsed_data.get("error"):
+            return parsed_data  # Return error from parsing step
+
+        if query_instructions:
+            return self._query_data(parsed_data, query_instructions)
+        else:
+            # If no query, return the parsed data and summary
+            return {
+                "parsed_sheets": parsed_data.get("sheets"),
+                "summary": parsed_data.get("summary"),
+                "message": "Spreadsheet parsed successfully."
+            }
+
+    def _parse_excel(self, file_path: str) -> Dict[str, Any]:
+        """Parse an Excel spreadsheet and extract useful information."""
         try:
-            # Read all sheets in the Excel file
             excel_file = pd.ExcelFile(file_path)
             sheet_names = excel_file.sheet_names
             sheets = {}
-
             for sheet_name in sheet_names:
                 sheets[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
-
-            # Create a summary of the spreadsheet
             summary = self._create_summary(sheets)
-
-            return {
-                "sheets": sheets,
-                "sheet_names": sheet_names,
-                "summary": summary,
-                "error": None
-            }
+            return {"sheets": sheets, "sheet_names": sheet_names, "summary": summary, "error": None}
         except Exception as e:
-            return {"error": f"Error parsing spreadsheet: {str(e)}"}
-
+            return {"error": f"Error parsing Excel spreadsheet: {str(e)}"}
+
+    def _parse_csv(self, file_path: str) -> Dict[str, Any]:
+        """Parse a CSV file."""
+        try:
+            df = pd.read_csv(file_path)
+            # CSVs don't have multiple sheets, so we adapt the structure
+            sheet_name = os.path.splitext(os.path.basename(file_path))[0]
+            sheets = {sheet_name: df}
+            summary = self._create_summary(sheets)
+            return {"sheets": sheets, "sheet_names": [sheet_name], "summary": summary, "error": None}
+        except Exception as e:
+            return {"error": f"Error parsing CSV file: {str(e)}"}
+
     def _create_summary(self, sheets_dict: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
         """Create a summary of the spreadsheet contents."""
         summary = {}
@@ -70,179 +94,113 @@ class SpreadsheetTool(Tool):
 
         return summary
 
-    def query_data(self, data: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
+    # Renamed from query_data to _query_data and adjusted arguments
+    def _query_data(self, parsed_data_dict: Dict[str, Any], query_instructions: str) -> Dict[str, Any]:
         """
         Execute a query on the spreadsheet data based on instructions.
-
-        Args:
-            data: The parsed spreadsheet data (from parse_spreadsheet)
-            query_instructions: Instructions for querying the data (e.g., "Sum column A")
-
-        Returns:
-            Dictionary with query results and potential explanation
+        This is a simplified placeholder. Real implementation would need robust query parsing.
         """
-        if data.get("error"):
-            return {"error": data["error"]}
+        if parsed_data_dict.get("error"):
+            return {"error": parsed_data_dict["error"]}
 
-        try:
-            # This is where you'd implement more sophisticated query logic
-            # For now, we'll implement some basic operations
-
-            sheets = data["sheets"]
-            result = {}
-
-            # Handle common operations based on query_instructions
-            if "sum" in query_instructions.lower():
-                # Extract column or range to sum
-                # This is a simple implementation - a more robust one would use regex or NLP
-                for sheet_name, df in sheets.items():
-                    numeric_cols = df.select_dtypes(include=[np.number]).columns
-                    if not numeric_cols.empty:
-                        result[f"{sheet_name}_sums"] = {
-                            col: df[col].sum() for col in numeric_cols
-                        }
-
-            elif "average" in query_instructions.lower() or "mean" in query_instructions.lower():
-                for sheet_name, df in sheets.items():
-                    numeric_cols = df.select_dtypes(include=[np.number]).columns
-                    if not numeric_cols.empty:
-                        result[f"{sheet_name}_averages"] = {
-                            col: df[col].mean() for col in numeric_cols
-                        }
-
-            elif "count" in query_instructions.lower():
-                for sheet_name, df in sheets.items():
-                    result[f"{sheet_name}_counts"] = {
-                        "rows": len(df),
-                        "non_null_counts": df.count().to_dict()
-                    }
-
-            # Add the raw data structure for more custom processing by the agent
-            result["data_structure"] = {
-                sheet_name: {
-                    "columns": df.columns.tolist(),
-                    "dtypes": df.dtypes.astype(str).to_dict()
-                } for sheet_name, df in sheets.items()
-            }
-
-            return result
-
-        except Exception as e:
-            return {"error": f"Error querying data: {str(e)}"}
-
-    def extract_specific_data(self, data: Dict[str, Any], sheet_name: str = None,
-                              column_names: List[str] = None,
-                              row_indices: List[int] = None) -> Dict[str, Any]:
-        """
-        Extract specific data from the spreadsheet.
-
-        Args:
-            data: The parsed spreadsheet data
-            sheet_name: Name of the sheet to extract from (default: first sheet)
-            column_names: List of column names to extract (default: all columns)
-            row_indices: List of row indices to extract (default: all rows)
-
-        Returns:
-            Dictionary with extracted data
-        """
-        if data.get("error"):
-            return {"error": data["error"]}
-
-        try:
-            sheets = data["sheets"]
-
-            # Default to the first sheet if not specified
-            if sheet_name is None:
-                sheet_name = data["sheet_names"][0]
-
-            if sheet_name not in sheets:
-                return {"error": f"Sheet '{sheet_name}' not found"}
-
-            df = sheets[sheet_name]
-
-            # Filter columns if specified
-            if column_names:
-                # Check if all requested columns exist
-                missing_columns = [col for col in column_names if col not in df.columns]
-                if missing_columns:
-                    return {"error": f"Columns not found: {missing_columns}"}
-                df = df[column_names]
-
-            # Filter rows if specified
-            if row_indices:
-                # Check if indices are in range
-                max_index = len(df) - 1
-                invalid_indices = [i for i in row_indices if i < 0 or i > max_index]
-                if invalid_indices:
-                    return {"error": f"Row indices out of range: {invalid_indices}. Valid range: 0-{max_index}"}
-                df = df.iloc[row_indices]
-
-            return {
-                "data": df.to_dict('records'),
-                "shape": df.shape
-            }
-
-        except Exception as e:
-            return {"error": f"Error extracting specific data: {str(e)}"}
-
-
-    # Example usage (if this script is run directly)
-    if __name__ == "__main__":
-        # Create a simple test spreadsheet for demonstration
-        test_dir = "spreadsheet_test"
-        os.makedirs(test_dir, exist_ok=True)
-
-        # Create a test DataFrame
-        test_data = {
-            'Product': ['Apple', 'Orange', 'Banana', 'Mango'],
-            'Price': [1.2, 0.8, 0.5, 1.5],
-            'Quantity': [100, 80, 200, 50],
-            'Revenue': [120, 64, 100, 75]
-        }
-
-        df = pd.DataFrame(test_data)
-        test_file_path = os.path.join(test_dir, "test_spreadsheet.xlsx")
-
-        # Save to Excel
-        with pd.ExcelWriter(test_file_path) as writer:
-            df.to_excel(writer, sheet_name='Sales', index=False)
-            # Create a second sheet with different data
-            pd.DataFrame({
-                'Month': ['Jan', 'Feb', 'Mar', 'Apr'],
-                'Expenses': [50, 60, 55, 70]
-            }).to_excel(writer, sheet_name='Expenses', index=False)
-
-        print(f"Created test spreadsheet at {test_file_path}")
-
-        # Test the tool
-        spreadsheet_tool = SpreadsheetTool()
-
-        # Parse the spreadsheet
-        print("\nParsing spreadsheet...")
-        parsed_data = spreadsheet_tool.parse_spreadsheet(test_file_path)
-
-        if parsed_data.get("error"):
-            print(f"Error: {parsed_data['error']}")
-        else:
-            print(f"Successfully parsed {len(parsed_data['sheet_names'])} sheets:")
-            print(f"Sheet names: {parsed_data['sheet_names']}")
-
-            # Show a sample of the first sheet
-            first_sheet_name = parsed_data['sheet_names'][0]
-            first_sheet = parsed_data['sheets'][first_sheet_name]
-            print(f"\nFirst few rows of '{first_sheet_name}':")
-            print(first_sheet.head())
-
-            # Test query
-            print("\nQuerying data (sum operation)...")
-            query_result = spreadsheet_tool.query_data(parsed_data, "sum")
-            print(f"Query result: {query_result}")
-
-            # Test specific data extraction
-            print("\nExtracting specific data...")
-            extract_result = spreadsheet_tool.extract_specific_data(
-                parsed_data,
-                sheet_name='Sales',
-                column_names=['Product', 'Revenue']
-            )
-            print(f"Extracted data: {extract_result}")
+        sheets = parsed_data_dict.get("sheets")
+        if not sheets:
+            return {"error": "No sheets data available for querying."}
+
+        # Placeholder for actual query logic.
+        # This would involve parsing `query_instructions` (e.g., using regex, NLP, or a DSL)
+        # and applying pandas operations.
+        # For now, let's return a message indicating the query was received and basic info.
+
+        results = {}
+        explanation = f"Query instruction received: '{query_instructions}'. Advanced query execution is not fully implemented. " \
+                      f"Returning summary of available sheets: {list(sheets.keys())}."
+
+        # Example: if query asks for sum, try to sum first numeric column of first sheet
+        if "sum" in query_instructions.lower():
+            first_sheet_name = next(iter(sheets))
+            df = sheets[first_sheet_name]
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            if not numeric_cols.empty:
+                col_to_sum = numeric_cols[0]
+                try:
+                    total_sum = df[col_to_sum].sum()
+                    results[f'{first_sheet_name}_{col_to_sum}_sum'] = total_sum
+                    explanation += f" Example sum of column '{col_to_sum}' in sheet '{first_sheet_name}': {total_sum}."
+                except Exception as e:
+                    explanation += f" Could not perform example sum: {e}."
+            else:
+                explanation += " No numeric columns found for example sum."
+
+        return {"query_results": results, "explanation": explanation, "original_query": query_instructions}
+
+# Example usage (for direct testing)
+if __name__ == '__main__':
+    tool = SpreadsheetTool()
+
+    # Create dummy files for testing
+    dummy_excel_file = "dummy_test.xlsx"
+    dummy_csv_file = "dummy_test.csv"
+
+    # Create a dummy Excel file
+    df_excel = pd.DataFrame({
+        'colA': [1, 2, 3, 4, 5],
+        'colB': ['apple', 'banana', 'cherry', 'date', 'elderberry'],
+        'colC': [10.1, 20.2, 30.3, 40.4, 50.5]
+    })
+    with pd.ExcelWriter(dummy_excel_file) as writer:
+        df_excel.to_excel(writer, sheet_name='Sheet1', index=False)
+        df_excel.head(2).to_excel(writer, sheet_name='Sheet2', index=False)
+
+    # Create a dummy CSV file
+    df_csv = pd.DataFrame({
+        'id': [101, 102, 103],
+        'product': ['widget', 'gadget', 'gizmo'],
+        'price': [19.99, 29.50, 15.00]
+    })
+    df_csv.to_csv(dummy_csv_file, index=False)
+
+    print("--- Test 1: Parse Excel file (no query) ---")
+    result1 = tool.forward(file_path=dummy_excel_file)
+    print(result1)
+    assert "error" not in result1 or result1["error"] is None
+    assert "Sheet1" in result1["parsed_sheets"]
+
+    print("\n--- Test 2: Parse CSV file (no query) ---")
+    result2 = tool.forward(file_path=dummy_csv_file)
+    print(result2)
+    assert "error" not in result2 or result2["error"] is None
+    assert dummy_csv_file.split('.')[0] in result2["parsed_sheets"]
+
+    print("\n--- Test 3: Query Excel file (simple sum example) ---")
+    result3 = tool.forward(file_path=dummy_excel_file, query_instructions="sum colA from Sheet1")
+    print(result3)
+    assert "error" not in result3 or result3["error"] is None
+    assert "query_results" in result3
+    if result3.get("query_results"):
+        assert "Sheet1_colA_sum" in result3["query_results"]
+        assert result3["query_results"]["Sheet1_colA_sum"] == 15
+
+    print("\n--- Test 4: File not found ---")
+    result4 = tool.forward(file_path="non_existent_file.xlsx")
+    print(result4)
+    assert result4["error"] is not None
+    assert "File not found" in result4["error"]
+
+    print("\n--- Test 5: Unsupported file type ---")
+    dummy_txt_file = "dummy_test.txt"
+    with open(dummy_txt_file, "w") as f:
+        f.write("hello")
+    result5 = tool.forward(file_path=dummy_txt_file)
+    print(result5)
+    assert result5["error"] is not None
+    assert "Unsupported file type" in result5["error"]
+    os.remove(dummy_txt_file)
+
+    # Clean up dummy files
+    if os.path.exists(dummy_excel_file):
+        os.remove(dummy_excel_file)
+    if os.path.exists(dummy_csv_file):
+        os.remove(dummy_csv_file)
+
+    print("\nSpreadsheetTool tests completed.")
src/video_processing_tool.py CHANGED
@@ -14,8 +14,18 @@ class VideoProcessingTool(Tool):
     Analyzes video content, extracting information such as frames, audio, or metadata.
     Useful for tasks like video summarization, frame extraction, transcript analysis, or content analysis.
     """
-
-    def __init__(self, model_cfg_path=None, model_weights_path=None, class_names_path=None, temp_dir_base=None):
+    name = "video_processor"
+    description = "Analyzes video content from a file path or YouTube URL. Can extract frames, detect objects, get transcripts, and provide video metadata."
+    inputs = {
+        "file_path": {"type": "string", "description": "Path to the video file or YouTube URL.", "nullable": True},
+        "task": {"type": "string", "description": "Specific task to perform (e.g., 'extract_frames', 'get_transcript', 'detect_objects', 'get_metadata').", "nullable": True},
+        "task_parameters": {"type": "object", "description": "Parameters for the specific task (e.g., frame extraction interval, object detection confidence).", "nullable": True}
+    }
+    outputs = {"result": {"type": "object", "description": "The result of the video processing task, e.g., list of frame paths, transcript text, object detection results, or metadata dictionary."}}
+    output_type = "object"
+
+
+    def __init__(self, model_cfg_path=None, model_weights_path=None, class_names_path=None, temp_dir_base=None, *args, **kwargs):
     """
     Initializes the VideoProcessingTool.
 
@@ -25,6 +35,9 @@ class VideoProcessingTool(Tool):
         class_names_path (str, optional): Path to the file containing class names for the model.
         temp_dir_base (str, optional): Base directory for temporary files. Defaults to system temp.
         """
+        […added lines collapsed in this view…]
         if temp_dir_base:
             self.temp_dir = tempfile.mkdtemp(dir=temp_dir_base)
         else:
@@ -37,16 +50,67 @@ class VideoProcessingTool(Tool):
         if os.path.exists(model_cfg_path) and os.path.exists(model_weights_path) and os.path.exists(class_names_path):
             try:
                 self.object_detection_model = cv2.dnn.readNetFromDarknet(model_cfg_path, model_weights_path)
-                # Set preferable backend and target
                 self.object_detection_model.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
                 self.object_detection_model.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
                 with open(class_names_path, "r") as f:
                     self.class_names = [line.strip() for line in f.readlines()]
             except Exception as e:
                 print(f"Error loading CV model: {e}. Object detection will not be available.")
                 self.object_detection_model = None
         else:
             print("Warning: One or more CV model paths are invalid. Object detection will not be available.")
+        […added lines collapsed in this view…]
 
     def _extract_video_id(self, youtube_url):
         """Extract the YouTube video ID from a URL."""
@@ -105,7 +169,69 @@ class VideoProcessingTool(Tool):
         except Exception as e:
             return {"error": f"Failed to download video: {str(e)}"}
 
-    def get_video_transcript(self, youtube_url, languages=None):
+        […added lines collapsed in this view…]
         """Get the transcript/captions of a YouTube video."""
         if languages is None:
             languages = ['en', 'en-US']  # Default to English
@@ -161,20 +287,18 @@ class VideoProcessingTool(Tool):
             # Catches other exceptions from YouTubeTranscriptApi calls or re-raised from fetch
             return {"error": f"Failed to get transcript: {str(e)}"}
 
-    def count_objects_in_video(self, video_path, target_classes=None, confidence_threshold=0.5, frame_skip=5):
         """
-        Counts specified objects appearing in the video using the loaded DNN model.
-        Determines the maximum number of target objects appearing simultaneously in any single frame.
         Args:
             video_path (str): Path to the video file.
-            target_classes (list, optional): A list of object classes (strings) to count (e.g., ["bird", "cat"]).
-                If None, counts all detected objects.
            confidence_threshold (float): Minimum confidence for an object to be counted.
-            frame_skip (int): Process every Nth frame to speed up analysis.
+        […added lines collapsed in this view…]
        Returns:
-            dict: A dictionary with counts or an error message.
-                e.g., {"success": True, "max_simultaneous_birds": 3, "max_simultaneous_cats": 1}
-                or {"error": "Object detection model not loaded."}
        """
        if not self.object_detection_model or not self.class_names:
            return {"error": "Object detection model not loaded or class names missing."}
@@ -185,168 +309,45 @@ class VideoProcessingTool(Tool):
 
         if not cap.isOpened():
             return {"error": "Could not open video file."}
 
-        max_counts_per_class = {cls: 0 for cls in target_classes} if target_classes else {}
-        # If target_classes is None, we'd need to initialize for all detected classes,
-        # but for simplicity, let's require target_classes for now or adjust later.
-        if not target_classes:
-            # Defaulting to a common class if none specified, e.g. 'person'
-            # Or, one could count all unique classes detected. For GAIA, specific targets are better.
-            return {"error": "target_classes must be specified for counting."}
-
-
         frame_count = 0
+        […added lines collapsed in this view…]
         while cap.isOpened():
             ret, frame = cap.read()
             if not ret:
                 break
+        […added lines collapsed in this view…]
             frame_count += 1
-            if frame_count % frame_skip != 0:
-                continue
 
-            height, width = frame.shape[:2]
-            blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
-            self.object_detection_model.setInput(blob)
-
-            layer_names = self.object_detection_model.getLayerNames()
-            # Handle potential differences in getUnconnectedOutLayers() return value
-            unconnected_out_layers_indices = self.object_detection_model.getUnconnectedOutLayers()
-            if isinstance(unconnected_out_layers_indices, np.ndarray) and unconnected_out_layers_indices.ndim > 1:  # For some OpenCV versions
-                output_layer_names = [layer_names[i[0] - 1] for i in unconnected_out_layers_indices]
-            else:  # For typical cases
-                output_layer_names = [layer_names[i - 1] for i in unconnected_out_layers_indices]
-
-            detections = self.object_detection_model.forward(output_layer_names)
-
-            current_frame_counts = {cls: 0 for cls in target_classes}
-
-            for detection_set in detections:  # Detections can come from multiple output layers
-                for detection in detection_set:
-                    scores = detection[5:]
-                    class_id = np.argmax(scores)
-                    confidence = scores[class_id]
-
-                    if confidence > confidence_threshold:
-                        detected_class_name = self.class_names[class_id]
-                        if detected_class_name in target_classes:
-                            current_frame_counts[detected_class_name] += 1
-
-            for cls in target_classes:
-                if current_frame_counts[cls] > max_counts_per_class[cls]:
-                    max_counts_per_class[cls] = current_frame_counts[cls]
-
         cap.release()
-        result = {"success": True}
-        for cls, count in max_counts_per_class.items():
-            result[f"max_simultaneous_{cls.replace(' ', '_')}"] = count  # e.g. "max_simultaneous_bird"
-        return result
-
-    def find_dialogue_response(self, transcript_entries, query_phrase, max_entries_gap=2, max_time_gap_s=5.0):
-        """
-        Finds what is said in response to a given query phrase in transcript entries.
-        Looks for the query phrase and then captures the text from subsequent entries.
-
-        Args:
-            transcript_entries (list): List of transcript dictionaries (from get_video_transcript).
-            query_phrase (str): The phrase to find (e.g., a question).
-            max_entries_gap (int): How many transcript entries to look ahead for a response.
-            max_time_gap_s (float): Maximum time in seconds after the query phrase to consider for a response.
-
-        Returns:
-            dict: {"success": True, "response_text": "...", "found_at_entry": {...}} or {"error": "..."}
-        """
-        if not transcript_entries:
-            return {"error": "Transcript entries are empty."}
-
-        query_phrase_lower = query_phrase.lower().rstrip('?.!,;')  # Strip common trailing punctuation
-
-        for i, entry in enumerate(transcript_entries):
-            # Correctly access attributes: .text, .start, .duration
-            if query_phrase_lower in entry.text.lower():
-                # Found the query phrase, now look for the response
-                response_parts = []
-                start_time_of_query = entry.start + entry.duration  # End time of query entry
-
-                for j in range(i + 1, min(i + 1 + max_entries_gap + 1, len(transcript_entries))):
-                    next_entry = transcript_entries[j]
-                    # Check if the next entry is within the time gap
-                    if next_entry.start - start_time_of_query > max_time_gap_s:
-                        break  # Too much time has passed
-
-                    # Add text if it's not just noise or very short (heuristic)
-                    if next_entry.text.strip() and len(next_entry.text.strip()) > 1:
-                        response_parts.append(next_entry.text)
-
-                    # If we have collected some response, and the next entry is significantly later, stop.
-                    if response_parts and (j + 1 < len(transcript_entries)):
-                        if transcript_entries[j+1].start - (next_entry.start + next_entry.duration) > 1.0:  # If gap > 1s
-                            break
-
-                if response_parts:
-                    return {
-                        "success": True,
-                        "response_text": " ".join(response_parts),
-                        "query_entry": entry,
-                        "response_start_entry_index": i + 1
-                    }
-                # If no response found immediately after, but query was found
-                return {"error": f"Query phrase '{query_phrase}' found, but no subsequent dialogue captured as response within gap."}
-
-        return {"error": f"Query phrase '{query_phrase}' not found in transcript."}
-
-
-    def process_video(self, youtube_url, query_type, query_params=None):
-        """
-        Main method to process a video based on the type of query.
-
-        Args:
-            youtube_url (str): URL of the YouTube video.
-            query_type (str): Type of processing: "transcript", "object_count", "dialogue_response".
-            query_params (dict, optional): Additional parameters for the specific query type.
-                For "object_count": {"target_classes": ["bird"], "confidence_threshold": 0.5, "resolution": "360p"}
-                For "dialogue_response": {"query_phrase": "Isn't that hot?", "languages": ['en']}
-        """
-        if query_params is None:
-            query_params = {}
-
-        if query_type == "transcript":
-            return self.get_video_transcript(youtube_url, languages=query_params.get("languages"))
-
-        elif query_type == "object_count":
-            if not self.object_detection_model:
-                return {"error": "Object detection model not initialized. Cannot count objects."}
-
-            resolution = query_params.get("resolution", "360p")
-            download_result = self.download_video(youtube_url, resolution=resolution)
-            if "error" in download_result:
-                return download_result
-
-            video_path = download_result["file_path"]
-            target_classes = query_params.get("target_classes")
-            if not target_classes or not isinstance(target_classes, list):
-                return {"error": "query_params must include 'target_classes' as a list for object_count."}
-
-            confidence = query_params.get("confidence_threshold", 0.5)
-            frame_skip = query_params.get("frame_skip", 5)
-            return self.count_objects_in_video(video_path, target_classes, confidence, frame_skip)
-
-        elif query_type == "dialogue_response":
-            transcript_result = self.get_video_transcript(youtube_url, languages=query_params.get("languages"))
-            if "error" in transcript_result:
-                return transcript_result
-
-            query_phrase = query_params.get("query_phrase")
-            if not query_phrase:
-                return {"error": "query_params must include 'query_phrase' for dialogue_response."}
-
-            return self.find_dialogue_response(
-                transcript_result["transcript_entries"],
-                query_phrase,
-                max_entries_gap=query_params.get("max_entries_gap", 2),
-                max_time_gap_s=query_params.get("max_time_gap_s", 5.0)
-            )
-
-        return {"error": f"Unsupported query type: {query_type}"}
 
     def cleanup(self):
         """Remove temporary files and directory."""
29
  """
30
  Initializes the VideoProcessingTool.
31
 
 
35
  class_names_path (str, optional): Path to the file containing class names for the model.
36
  temp_dir_base (str, optional): Base directory for temporary files. Defaults to system temp.
37
  """
38
+ super().__init__(*args, **kwargs)
39
+ self.is_initialized = False # Will be set to True after successful setup
40
+
41
  if temp_dir_base:
42
  self.temp_dir = tempfile.mkdtemp(dir=temp_dir_base)
43
  else:
 
50
  if os.path.exists(model_cfg_path) and os.path.exists(model_weights_path) and os.path.exists(class_names_path):
51
  try:
52
  self.object_detection_model = cv2.dnn.readNetFromDarknet(model_cfg_path, model_weights_path)
53
  self.object_detection_model.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
54
  self.object_detection_model.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
55
  with open(class_names_path, "r") as f:
56
  self.class_names = [line.strip() for line in f.readlines()]
57
+ print("CV Model loaded successfully.")
58
  except Exception as e:
59
  print(f"Error loading CV model: {e}. Object detection will not be available.")
60
  self.object_detection_model = None
61
  else:
62
  print("Warning: One or more CV model paths are invalid. Object detection will not be available.")
63
+ else:
64
+ print("CV model paths not provided. Object detection will not be available.")
65
+
66
+ self.is_initialized = True
67
+
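For context on the three paths consumed above: cv2.dnn.readNetFromDarknet expects a Darknet .cfg/.weights pair, and the class-names file is plain text with one label per line (the stock YOLOv3 release ships COCO's 80 classes). A minimal illustrative parse, mirroring the loop above (the path is a placeholder, not a file shipped with this repo):

    # coco.names begins: person, bicycle, car, ...
    class_names = [line.strip() for line in open("models/coco.names", encoding="utf-8")]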
68
+ def forward(self, file_path: str | None = None, task: str = "get_metadata", task_parameters: dict | None = None):
69
+ """
70
+ Main entry point for video processing tasks.
71
+ """
72
+ if not self.is_initialized:
73
+ return {"error": "Tool not initialized properly."}
74
+
75
+ if task_parameters is None:
76
+ task_parameters = {}
77
+
78
+ is_youtube_url = file_path and ("youtube.com/" in file_path or "youtu.be/" in file_path)
79
+ video_source_path = file_path
80
+
81
+ if is_youtube_url:
82
+ download_resolution = task_parameters.get("resolution", "360p")
83
+ download_result = self.download_video(file_path, resolution=download_resolution)
84
+ if download_result.get("error"):
85
+ return download_result
86
+ video_source_path = download_result.get("file_path")
87
+ if not video_source_path or not os.path.exists(video_source_path):
88
+ return {"error": f"Failed to download or locate video from URL: {file_path}"}
89
+
90
+ elif file_path and not os.path.exists(file_path):
91
+ return {"error": f"Video file not found: {file_path}"}
92
+ elif not file_path: # even 'get_transcript' needs the URL passed via file_path
93
+ return {"error": "A file path or YouTube URL is required for this task."}
94
+
95
+
96
+ if task == "get_metadata":
97
+ return self.get_video_metadata(video_source_path)
98
+ elif task == "extract_frames":
99
+ interval_seconds = task_parameters.get("interval_seconds", 5)
100
+ max_frames = task_parameters.get("max_frames")
101
+ return self.extract_frames_from_video(video_source_path, interval_seconds=interval_seconds, max_frames=max_frames)
102
+ elif task == "get_transcript":
103
+ # Use original file_path which might be the URL
104
+ return self.get_youtube_transcript(file_path)
105
+ elif task == "detect_objects":
106
+ if not self.object_detection_model:
107
+ return {"error": "Object detection model not loaded."}
108
+ confidence_threshold = task_parameters.get("confidence_threshold", 0.5)
109
+ frames_to_process = task_parameters.get("frames_to_process", 5) # Process N frames
110
+ return self.detect_objects_in_video(video_source_path, confidence_threshold=confidence_threshold, num_frames_to_sample=frames_to_process)
111
+ # Add more tasks as needed, e.g., extract_audio
112
+ else:
113
+ return {"error": f"Unsupported task: {task}"}
114
 
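For orientation, a minimal usage sketch of the dispatcher above. The model paths, clip path, and video ID below are placeholders, and forward() is called directly rather than through an agent framework:

    tool = VideoProcessingTool(
        model_cfg_path="models/yolov3.cfg",        # placeholder Darknet files
        model_weights_path="models/yolov3.weights",
        class_names_path="models/coco.names",
    )

    # Local file: probe basic properties.
    meta = tool.forward(file_path="clips/demo.mp4", task="get_metadata")

    # YouTube URL: forward() downloads the video first, then samples frames.
    frames = tool.forward(
        file_path="https://youtu.be/<video_id>",
        task="extract_frames",
        task_parameters={"interval_seconds": 10, "max_frames": 4, "resolution": "360p"},
    )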
115
  def _extract_video_id(self, youtube_url):
116
  """Extract the YouTube video ID from a URL."""
 
169
  except Exception as e:
170
  return {"error": f"Failed to download video: {str(e)}"}
171
 
172
+ def get_video_metadata(self, video_path):
173
+ """Extract metadata from the video file."""
174
+ if not os.path.exists(video_path):
175
+ return {"error": f"Video file not found: {video_path}"}
176
+
177
+ cap = cv2.VideoCapture(video_path)
178
+ if not cap.isOpened():
179
+ return {"error": "Could not open video file."}
180
+
181
+ metadata = {
182
+ "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
183
+ "fps": cap.get(cv2.CAP_PROP_FPS),
184
+ "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
185
+ "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
186
+ "duration": cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS)
187
+ }
188
+
189
+ cap.release()
190
+ return {"success": True, "metadata": metadata}
191
+
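A quick sanity check on the duration arithmetic above, since OpenCV reports 0.0 fps for some containers; the guard keeps the call from raising ZeroDivisionError (values below are made up):

    frame_count, fps = 900, 30.0
    duration = frame_count / fps if fps > 0 else None  # -> 30.0 seconds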
192
+ def extract_frames_from_video(self, video_path, interval_seconds=5, max_frames=None):
193
+ """
194
+ Extracts frames from the video at specified intervals.
195
+
196
+ Args:
197
+ video_path (str): Path to the video file.
198
+ interval_seconds (int): Interval in seconds between frames.
199
+ max_frames (int, optional): Maximum number of frames to extract.
200
+
201
+ Returns:
202
+ dict: {"success": True, "extracted_frame_paths": [...] } or {"error": "..."}
203
+ """
204
+ if not os.path.exists(video_path):
205
+ return {"error": f"Video file not found: {video_path}"}
206
+
207
+ cap = cv2.VideoCapture(video_path)
208
+ if not cap.isOpened():
209
+ return {"error": "Could not open video file."}
210
+
211
+ fps = cap.get(cv2.CAP_PROP_FPS)
212
+ frame_interval = int(fps * interval_seconds)
213
+ extracted_frame_paths = []
214
+ frame_count = 0
215
+
216
+ while cap.isOpened():
217
+ ret, frame = cap.read()
218
+ if not ret:
219
+ break
220
+
221
+ if frame_count % frame_interval == 0:
222
+ frame_id = int(frame_count / frame_interval)
223
+ frame_file_path = os.path.join(self.temp_dir, f"frame_{frame_id:04d}.jpg")
224
+ cv2.imwrite(frame_file_path, frame)
225
+ extracted_frame_paths.append(frame_file_path)
226
+ if max_frames and len(extracted_frame_paths) >= max_frames:
227
+ break
228
+
229
+ frame_count += 1
230
+
231
+ cap.release()
232
+ return {"success": True, "extracted_frame_paths": extracted_frame_paths}
233
+
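To make the sampling arithmetic concrete: at 25 fps with interval_seconds=5, frame_interval is 125, so frames 0, 125, 250, ... are written. A standalone sketch of the same selection rule, assuming nothing beyond the standard library:

    def sampled_indices(total_frames: int, fps: float, interval_seconds: int) -> list[int]:
        frame_interval = max(1, int(fps * interval_seconds))  # same guard as in the method above
        return [i for i in range(total_frames) if i % frame_interval == 0]

    print(sampled_indices(total_frames=300, fps=25.0, interval_seconds=5))  # [0, 125, 250]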
234
+ def get_youtube_transcript(self, youtube_url, languages=None):
235
  """Get the transcript/captions of a YouTube video."""
236
  if languages is None:
237
  languages = ['en', 'en-US'] # Default to English
 
287
  # Catches other exceptions from YouTubeTranscriptApi calls or re-raised from fetch
288
  return {"error": f"Failed to get transcript: {str(e)}"}
289
 
290
+ def detect_objects_in_video(self, video_path, confidence_threshold=0.5, num_frames_to_sample=5):
291
  """
292
+ Detects objects in sampled frames and returns cumulative per-class detection counts.
293
+
294
  Args:
295
  video_path (str): Path to the video file.
296
  confidence_threshold (float): Minimum confidence for an object to be counted.
297
+ num_frames_to_sample (int): Number of frames to sample for object detection.
299
+
300
  Returns:
301
+ dict: {"success": True, "object_counts": {...}} or {"error": "..."}
302
  """
303
  if not self.object_detection_model or not self.class_names:
304
  return {"error": "Object detection model not loaded or class names missing."}
 
309
  if not cap.isOpened():
310
  return {"error": "Could not open video file."}
311
 
312
+ object_counts = {cls: 0 for cls in self.class_names}
313
  frame_count = 0
314
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
315
+ sample_interval = max(1, total_frames // num_frames_to_sample)
316
+
317
  while cap.isOpened():
318
  ret, frame = cap.read()
319
  if not ret:
320
  break
321
 
322
+ if frame_count % sample_interval == 0:
324
+ blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
325
+ self.object_detection_model.setInput(blob)
326
+
327
+ layer_names = self.object_detection_model.getLayerNames()
328
+ # Handle potential differences in getUnconnectedOutLayers() return value
329
+ unconnected_out_layers_indices = self.object_detection_model.getUnconnectedOutLayers()
330
+ if isinstance(unconnected_out_layers_indices, np.ndarray) and unconnected_out_layers_indices.ndim > 1: # older OpenCV builds return shape (N, 1)
331
+ output_layer_names = [layer_names[i[0] - 1] for i in unconnected_out_layers_indices]
332
+ else: # newer builds return a flat array of indices
333
+ output_layer_names = [layer_names[i - 1] for i in unconnected_out_layers_indices]
334
+
335
+ detections = self.object_detection_model.forward(output_layer_names)
336
+
337
+ for detection_set in detections: # Detections can come from multiple output layers
338
+ for detection in detection_set:
339
+ scores = detection[5:]
340
+ class_id = np.argmax(scores)
341
+ confidence = scores[class_id]
342
+
343
+ if confidence > confidence_threshold:
344
+ detected_class_name = self.class_names[class_id]
345
+ object_counts[detected_class_name] += 1
346
+
347
  frame_count += 1
 
 
348
 
349
  cap.release()
350
+ return {"success": True, "object_counts": object_counts}
351
 
352
  def cleanup(self):
353
  """Remove temporary files and directory."""