File size: 5,636 Bytes
149163c
073b7fb
c5a6e89
bffd09a
 
073b7fb
8ff7d8f
bffd09a
b1939df
 
 
 
bffd09a
 
 
 
 
 
 
 
073b7fb
c5a6e89
 
 
 
8ff7d8f
c5a6e89
 
 
 
073b7fb
c5a6e89
 
 
 
 
bffd09a
073b7fb
bffd09a
073b7fb
 
af6bf18
 
 
 
 
bffd09a
af6bf18
073b7fb
 
 
c5a6e89
 
073b7fb
 
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
 
 
 
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
bffd09a
c5a6e89
 
 
 
 
 
 
 
 
bffd09a
c5a6e89
 
bffd09a
c5a6e89
 
073b7fb
 
 
c5a6e89
 
073b7fb
 
 
 
bffd09a
 
 
 
 
 
 
8ff7d8f
bffd09a
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from __future__ import annotations
import os
import mimetypes
from typing import Self, Dict, Any
from smolagents.tools import Tool


class FileIdentifier(Tool):
    """
    Identifies file types and maps them to the appropriate processing tool.

    The file extension is checked first against ``file_type_map``; if the extension is not
    listed there, the MIME type guessed from the file name by :mod:`mimetypes` is used instead.
    Useful for routing files to specialized tools such as speech-to-text, spreadsheet parser,
    image processor, etc.
    """
    name = "file_identifier"
    description = "Identifies the file type and suggests a processing action based on its path."
    inputs = {'filepath': {'type': 'string', 'description': 'The path to the file to be identified.'}}
    outputs = {'file_info': {'type': 'object', 'description': 'A dictionary with file type information or an error.'}}
    output_type = "object"

    def __init__(self, *args, **kwargs):
        """Build the type/extension lookup tables; all arguments are forwarded to ``Tool``."""
        super().__init__(*args, **kwargs)
        # Only initialise the global MIME database if nobody has done so yet: calling
        # mimetypes.init() again rebuilds it and would drop any types that other code
        # registered through mimetypes.add_type().
        if not mimetypes.inited:
            mimetypes.init()
        # Mapping from simple type to action and common extensions
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "image_processor", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
        }
        # For quick lookup from extension to simple type
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }
        self.is_initialized = True

    def forward(self: Self, filepath: str) -> Dict[str, Any]:
        """
        Identifies the file type and suggests a processing action.

        Args:
            filepath (str): The path to the file to be identified.

        Returns:
            Dict[str, Any]: On success, a dictionary with the keys 'filepath',
                  'determined_type', 'file_extension' (lower-cased, including the dot),
                  'mime_type' (``None`` when it cannot be guessed) and 'suggested_action'.
                  Unrecognised files get 'determined_type' "unknown" and
                  'suggested_action' "unknown_handler".
                  If the path does not exist or is not a regular file, a dictionary
                  with only 'filepath' and 'error' is returned.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }
        # A directory (or other non-file entry) must not be routed to a file-processing
        # tool just because its name happens to end in a known extension.
        if not os.path.isfile(filepath):
            return {
                "filepath": filepath,
                "error": "Path is not a regular file"
            }

        # Only the guessed type is needed; the encoding part of the result is ignored.
        mime_type = mimetypes.guess_type(filepath)[0]
        file_extension = os.path.splitext(filepath)[1].lower()

        # Prioritize extension-based mapping for specific known types
        determined_type = self.extension_to_type.get(file_extension, "unknown")

        if determined_type == "unknown" and mime_type:
            # Fallback to MIME type if extension is not specifically mapped.
            # Exact matches are tested first so that "text/csv" wins over the
            # generic "text/" prefix.
            exact_mime_types = {
                "application/pdf": "pdf",
                "text/csv": "csv",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
            }
            if mime_type in exact_mime_types:
                determined_type = exact_mime_types[mime_type]
            elif mime_type.startswith("audio/"):
                determined_type = "audio"
            elif mime_type.startswith("image/"):
                determined_type = "image"
            elif mime_type.startswith("text/"):  # General text
                determined_type = "text"

        if determined_type == "unknown":
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }

if __name__ == '__main__':
    # Imported here because it is only needed by this demo.
    import tempfile

    tool_instance = FileIdentifier()
    dummy_files = ["test.mp3", "document.xlsx", "image.png", "script.py", "unknown.xyz", "archive.zip"]
    non_existent_file = "no_such_file.txt"

    # Work inside a throwaway directory: creating and deleting these names in the
    # current directory could overwrite or remove a real file with the same name,
    # and the directory is cleaned up even if forward() raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for fname in dummy_files:
            dummy_path = os.path.join(scratch_dir, fname)
            with open(dummy_path, "w") as f:
                f.write("dummy content")  # Placeholder content for the dummy file

            result = tool_instance.forward(dummy_path)
            print(f"File: {fname}, Info: {result}")

        # Test with a non-existent file (nothing with this name was created in the scratch directory)
        result_non_existent = tool_instance.forward(os.path.join(scratch_dir, non_existent_file))
        print(f"File: {non_existent_file}, Info: {result_non_existent}")