File size: 5,783 Bytes
073b7fb
c5a6e89
073b7fb
 
 
 
c5a6e89
 
 
 
 
 
 
 
 
 
073b7fb
c5a6e89
 
 
 
 
073b7fb
 
 
 
c5a6e89
 
073b7fb
 
 
c5a6e89
 
073b7fb
 
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
 
 
 
 
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
073b7fb
c5a6e89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
073b7fb
 
 
 
c5a6e89
 
073b7fb
 
 
 
c5a6e89
073b7fb
 
 
 
c5a6e89
073b7fb
 
 
 
 
 
 
 
c5a6e89
 
073b7fb
c5a6e89
073b7fb
 
c5a6e89
073b7fb
 
c5a6e89
073b7fb
c5a6e89
 
073b7fb
 
c5a6e89
073b7fb
 
c5a6e89
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import mimetypes

class FileIdentifier:
    """Classify files into coarse types and suggest a processing action.

    Classification is driven primarily by the file extension (see
    ``file_type_map``). The MIME type guessed by :mod:`mimetypes` is used
    only as a fallback for extensions that are not explicitly listed.

    Attributes:
        file_type_map: Maps each simple type name (e.g. ``"audio"``) to a
            dict holding its ``"action"`` and the ``"extensions"`` that
            select it.
        extension_to_type: Reverse index from a lower-case extension
            (including the leading dot) to its simple type name.
    """

    def __init__(self):
        # mimetypes.init() rebuilds the module-wide MIME database. Only call
        # it when nothing has initialised the module yet: re-running it for
        # every instance re-reads the system files and can discard types
        # that were registered at runtime through mimetypes.add_type().
        if not mimetypes.inited:
            mimetypes.init()

        # Mapping from simple type to action and common extensions
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
            # Add more specific types if needed
        }

        # For quick lookup from extension to simple type
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }

    def _type_from_mime(self, mime_type):
        """Return the simple type implied by *mime_type*, or ``None``.

        Exact MIME matches are checked before the ``audio/``, ``image/`` and
        ``text/`` prefixes so that ``text/csv`` maps to ``"csv"`` rather than
        to the generic ``"text"`` type.
        """
        exact_matches = {
            "application/pdf": "pdf",
            "text/csv": "csv",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
        }
        if mime_type in exact_matches:
            return exact_matches[mime_type]

        for prefix, simple_type in (("audio/", "audio"), ("image/", "image"), ("text/", "text")):
            if mime_type.startswith(prefix):
                return simple_type
        return None

    def identify_file(self, filepath):
        """Identify the file type and suggest a processing action.

        Args:
            filepath: Path of the file to inspect. Only its existence and
                its name are examined; the file content is never read.

        Returns:
            ``{"filepath": ..., "error": "File not found"}`` when the path
            does not exist. Otherwise a dict with the keys ``"filepath"``,
            ``"determined_type"``, ``"file_extension"`` (lower-cased, with
            the leading dot), ``"mime_type"`` (as guessed by ``mimetypes``,
            possibly ``None``) and ``"suggested_action"``. Files that match
            no rule get ``"unknown"`` / ``"unknown_handler"``.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }

        mime_type, encoding = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Explicitly mapped extensions always win over the MIME guess.
        determined_type = self.extension_to_type.get(file_extension)

        # For a compressed file such as "notes.txt.gz", guess_type() reports
        # the MIME type of the *inner* file together with a content encoding
        # ("gzip"). The bytes on disk are compressed, so the inner type's
        # handler must not be suggested; such files stay "unknown".
        if determined_type is None and mime_type and encoding is None:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }

# Example usage: run this module directly for a quick manual smoke test of
# FileIdentifier. Nothing in this block runs when the module is imported.
if __name__ == "__main__":
    import tempfile  # only needed by this demo, so kept out of the module imports

    identifier = FileIdentifier()

    # File name -> placeholder content. Only the name matters to the
    # identifier; the content just makes the file exist on disk.
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "archive.zip": "zip content",  # Example of an unmapped type by default
        "unknown_file.dat": "binary data"
    }

    # Create the fixtures in a throwaway directory so repeated runs do not
    # leave files behind in the current working directory. The directory is
    # removed when the block exits, even if identification raises.
    with tempfile.TemporaryDirectory(prefix="dummy_files_for_test_") as dummy_files_dir:
        test_filepaths = []
        for filename, content in test_files_info.items():
            fixture_path = os.path.join(dummy_files_dir, filename)
            with open(fixture_path, "w", encoding="utf-8") as f:
                f.write(content)  # Simple write for testing existence and extension
            test_filepaths.append(fixture_path)

        # A path that does not exist, to exercise the error result.
        test_filepaths.append("non_existent_file.doc")

        for filepath_to_test in test_filepaths:
            print(identifier.identify_file(filepath_to_test))