File size: 5,636 Bytes
149163c 073b7fb c5a6e89 bffd09a 073b7fb 8ff7d8f bffd09a b1939df bffd09a 073b7fb c5a6e89 8ff7d8f c5a6e89 073b7fb c5a6e89 bffd09a 073b7fb bffd09a 073b7fb af6bf18 bffd09a af6bf18 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 bffd09a c5a6e89 bffd09a c5a6e89 bffd09a c5a6e89 073b7fb c5a6e89 073b7fb bffd09a 8ff7d8f bffd09a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
from __future__ import annotations
import os
import mimetypes
from typing import Self, Dict, Any
from smolagents.tools import Tool
class FileIdentifier(Tool):
    """
    Identify a file's type and suggest the tool that should process it.

    The extension is checked first against a table of known types; when the
    extension is not in that table, the MIME type guessed by ``mimetypes`` is
    used as a fallback. Useful for routing files to specialized tools such as
    speech-to-text, spreadsheet parser, image processor, etc.
    """

    name = "file_identifier"
    description = "Identifies the file type and suggests a processing action based on its path."
    inputs = {'filepath': {'type': 'string', 'description': 'The path to the file to be identified.'}}
    outputs = {'file_info': {'type': 'object', 'description': 'A dictionary with file type information or an error.'}}
    output_type = "object"

    def __init__(self, *args, **kwargs):
        """Build the lookup tables; all arguments are forwarded to the base class."""
        super().__init__(*args, **kwargs)
        mimetypes.init()

        # Simple type -> suggested action and the extensions that select it.
        # Kept on the instance (not the class) exactly as the original did.
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "image_processor", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
        }

        # Reverse index: extension -> simple type, for O(1) lookup in forward().
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }
        self.is_initialized = True

    @staticmethod
    def _type_from_mime(mime_type: str) -> "str | None":
        """Map a guessed MIME type to a simple type name, or None if unrecognized."""
        if mime_type.startswith("audio/"):
            return "audio"
        if mime_type.startswith("image/"):
            return "image"
        if mime_type == "application/pdf":
            return "pdf"
        # "text/csv" must be tested before the generic "text/" prefix.
        if mime_type == "text/csv":
            return "csv"
        if mime_type.startswith("text/"):
            return "text"
        if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            return "spreadsheet"
        return None

    def forward(self, filepath: str) -> Dict[str, Any]:
        """
        Identify the file type and suggest a processing action.

        Args:
            filepath (str): The path to the file to be identified.

        Returns:
            Dict[str, Any]: On success, a dictionary with 'filepath',
            'determined_type', 'file_extension', 'mime_type' and
            'suggested_action'. Unrecognized files get the type "unknown" and
            the action "unknown_handler". If the path does not exist or is a
            directory, a dictionary with 'filepath' and 'error' is returned
            instead.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }
        # A directory such as "album.mp3/" must not be routed to a file tool.
        if os.path.isdir(filepath):
            return {
                "filepath": filepath,
                "error": "Path is a directory, not a file"
            }

        mime_type, _ = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Extension mapping takes priority; the MIME type is only a fallback
        # for extensions that are not in the table.
        determined_type = self.extension_to_type.get(file_extension)
        if determined_type is None and mime_type:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }
if __name__ == '__main__':
    # Local import: only the demo needs it, so the module's imports stay untouched.
    import tempfile

    tool_instance = FileIdentifier()

    # Sample names covering mapped extensions, an unmapped extension and a
    # MIME-only case.
    dummy_files = ["test.mp3", "document.xlsx", "image.png", "script.py", "unknown.xyz", "archive.zip"]

    # Work inside a throwaway directory so the demo can never overwrite or
    # delete a real file (e.g. an existing "script.py") in the current
    # directory, and so cleanup happens even if forward() raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for fname in dummy_files:
            dummy_path = os.path.join(scratch_dir, fname)
            with open(dummy_path, "w") as f:
                f.write("dummy content")  # Small placeholder file for testing
            result = tool_instance.forward(dummy_path)
            print(f"File: {fname}, Info: {result}")

        # Test with a non-existent file
        non_existent_file = "no_such_file.txt"
        result_non_existent = tool_instance.forward(os.path.join(scratch_dir, non_existent_file))
        print(f"File: {non_existent_file}, Info: {result_non_existent}")