# HF_Agents_Final_Project / src /file_processing_tool.py
# Yago Bolivar
# refactor: update tool classes to inherit from Tool base class for consistency and improved structure
# bffd09a
from __future__ import annotations
import os
import mimetypes
from typing import Self, Dict, Any
from smolagents.tools import Tool
class FileIdentifier(Tool):
    """
    Identifies file types and maps them to the appropriate processing tool.

    The file extension is checked first against ``file_type_map``; when the
    extension is not listed there, the MIME type guessed by ``mimetypes`` is
    used as a fallback. Useful for routing files to specialized tools such as
    speech-to-text, spreadsheet parser, image processor, etc.
    """
    name = "file_identifier"
    description = "Identifies the file type and suggests a processing action based on its path."
    inputs = {'filepath': {'type': 'string', 'description': 'The path to the file to be identified.'}}
    outputs = {'file_info': {'type': 'object', 'description': 'A dictionary with file type information or an error.'}}
    output_type = "object"

    def __init__(self, *args, **kwargs):
        """Build the type/action table and the reverse extension lookup."""
        super().__init__(*args, **kwargs)
        mimetypes.init()
        # Mapping from simple type to action and common extensions
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "image_processor", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
        }
        # For quick lookup from extension to simple type
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }
        self.is_initialized = True

    @staticmethod
    def _type_from_mime(mime_type: str) -> str | None:
        """Map a MIME type to a key of ``file_type_map``, or None if unrecognised."""
        # Exact matches are checked first so that "text/csv" is not swallowed
        # by the generic "text/" prefix below.
        exact_matches = {
            "application/pdf": "pdf",
            "text/csv": "csv",
            # Spreadsheet MIME types for the same formats the extension map
            # lists (.xlsx, .xls, .ods).
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
            "application/vnd.ms-excel": "spreadsheet",
            "application/vnd.oasis.opendocument.spreadsheet": "spreadsheet",
        }
        if mime_type in exact_matches:
            return exact_matches[mime_type]
        for prefix, simple_type in (("audio/", "audio"), ("image/", "image"), ("text/", "text")):
            if mime_type.startswith(prefix):
                return simple_type
        return None

    def forward(self: Self, filepath: str) -> Dict[str, Any]:
        """
        Identifies the file type and suggests a processing action.

        Args:
            filepath (str): The path to the file to be identified.

        Returns:
            Dict[str, Any]: On success, a dictionary with 'filepath',
                'determined_type', 'file_extension', 'mime_type' and
                'suggested_action'. Unrecognised files get the type
                'unknown' and the action 'unknown_handler'. If the path does
                not exist or is not a regular file, the dictionary holds only
                'filepath' and 'error'.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }
        # A directory (or other non-regular file) cannot be handed to any of
        # the file processors, even if its name ends in a known extension.
        if not os.path.isfile(filepath):
            return {
                "filepath": filepath,
                "error": "Path is not a file"
            }

        mime_type, _ = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Prioritize extension-based mapping; fall back to the MIME type only
        # when the extension is not specifically mapped.
        determined_type = self.extension_to_type.get(file_extension)
        if determined_type is None and mime_type:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }
if __name__ == '__main__':
    # Demo: identify a handful of dummy files and one missing path.
    # tempfile is imported here because only this demo needs it.
    import tempfile

    tool_instance = FileIdentifier()
    dummy_files = ["test.mp3", "document.xlsx", "image.png", "script.py", "unknown.xyz", "archive.zip"]

    # Work inside a throwaway directory so the demo never overwrites or
    # deletes same-named files in the current working directory, and so the
    # dummy files are cleaned up even if identification raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for fname in dummy_files:
            dummy_path = os.path.join(scratch_dir, fname)
            with open(dummy_path, "w", encoding="utf-8") as f:
                f.write("dummy content")  # Content is irrelevant; only the name matters
            result = tool_instance.forward(dummy_path)
            print(f"File: {fname}, Info: {result}")

        # Test with a non-existent file (guaranteed absent in the fresh directory)
        non_existent_file = os.path.join(scratch_dir, "no_such_file.txt")
        result_non_existent = tool_instance.forward(non_existent_file)
        print(f"File: {non_existent_file}, Info: {result_non_existent}")