File size: 4,706 Bytes
073b7fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import mimetypes # For a more robust way to guess types

class FileIdentifier:
    """Classify a file on disk and suggest a processing action for it.

    Classification is done from the file *name* only (extension first, then
    the MIME type guessed by :mod:`mimetypes`); the file contents are never
    read.

    Attributes:
        file_type_actions: Mapping from a determined file type (e.g.
            ``"audio"``) to the name of the suggested processing action.
    """

    # Primary detection table: lower-cased extension -> file type.
    # (The original comments attribute this list to downloaded_files.md.)
    _EXTENSION_TYPES = {
        ".mp3": "audio",
        ".xlsx": "spreadsheet",
        ".png": "image",
        ".py": "python_code",
        ".pdf": "pdf",
        ".txt": "text",
        ".csv": "csv",
    }

    # Fallback table: exact MIME type -> file type.
    _EXACT_MIME_TYPES = {
        "application/vnd.ms-excel": "spreadsheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
        "text/x-python": "python_code",
        "application/x-python-code": "python_code",
        "application/pdf": "pdf",
        "text/csv": "csv",
    }

    # Fallback table: MIME type prefix -> file type. None of these prefixes
    # match a key of _EXACT_MIME_TYPES, so lookup order does not matter.
    _MIME_PREFIX_TYPES = (
        ("audio/", "audio"),
        ("image/", "image"),
        ("text/plain", "text"),
    )

    def __init__(self):
        # Initialize the mimetypes database only if nobody has done so yet.
        # An unconditional mimetypes.init() rebuilds the global database and
        # throws away types registered earlier via mimetypes.add_type().
        if not mimetypes.inited:
            mimetypes.init()
        self.file_type_actions = {
            "audio": "speech-to-text",
            "spreadsheet": "spreadsheet_parser",
            "image": "ocr_vision_reasoning",
            "python_code": "safe_code_interpreter",
            "pdf": "pdf_text_extractor",
            "text": "text_file_reader",
            "csv": "csv_parser",
            # Add more mappings as needed
        }

    def identify_file(self, filepath):
        """Identify the type of *filepath* and suggest a processing action.

        Args:
            filepath: Path of the file to classify.

        Returns:
            On success, a dictionary with the keys ``'filepath'``,
            ``'extension'`` (lower-cased, including the leading dot),
            ``'mime_type'`` (as guessed by :func:`mimetypes.guess_type`, may
            be ``None``), ``'determined_type'`` (``"unknown"`` when nothing
            matched) and ``'suggested_action'``
            (``"manual_inspection_required"`` when the type has no entry in
            ``file_type_actions``).

            If the path does not exist or is not a regular file, a dictionary
            with the keys ``'error'`` and ``'filepath'`` is returned instead.
        """
        if not os.path.exists(filepath):
            return {
                "error": "File not found",
                "filepath": filepath
            }
        if not os.path.isfile(filepath):
            return {
                "error": "Path is not a file",
                "filepath": filepath
            }

        _, extension = os.path.splitext(filepath)
        extension = extension.lower()

        # Primary detection by extension.
        file_type = self._EXTENSION_TYPES.get(extension, "unknown")

        # The MIME type is always reported, and doubles as a fallback when
        # the extension is not one of the explicitly known ones.
        mime_type, _ = mimetypes.guess_type(filepath)
        if file_type == "unknown" and mime_type:
            file_type = self._type_from_mime(mime_type)

        suggested_action = self.file_type_actions.get(file_type, "manual_inspection_required")

        return {
            "filepath": filepath,
            "extension": extension,
            "mime_type": mime_type,
            "determined_type": file_type,
            "suggested_action": suggested_action
        }

    def _type_from_mime(self, mime_type):
        """Map a MIME type string to a file type, or ``"unknown"``."""
        exact_match = self._EXACT_MIME_TYPES.get(mime_type)
        if exact_match is not None:
            return exact_match
        for prefix, file_type in self._MIME_PREFIX_TYPES:
            if mime_type.startswith(prefix):
                return file_type
        return "unknown"

# Example Usage:
if __name__ == "__main__":
    identifier = FileIdentifier()

    # Create dummy files for testing. Identification only looks at the file
    # name, so the contents are placeholders rather than valid media.
    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "unknown_file.zip": "zip content"
    }
    # Real files with these extensions would be binary, so they receive a
    # bytes placeholder; everything else gets its text content.
    binary_extensions = ('.xlsx', '.png', '.mp3', '.pdf', '.zip')
    for filename, content in test_files_info.items():
        target_path = os.path.join(dummy_files_dir, filename)
        # Open each file exactly once, in the mode that matches its kind.
        if filename.endswith(binary_extensions):
            with open(target_path, "wb") as fb:
                fb.write(b"dummy binary content") # placeholder
        else:
            with open(target_path, "w", encoding="utf-8") as f:
                f.write(content)

    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
    test_filepaths.append("non_existent_file.doc") # Test non-existent file

    for filepath in test_filepaths:
        result = identifier.identify_file(filepath)
        print(result)

    # The dummy files are intentionally left on disk; tell the user where.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")