Yago Bolivar
committed on
Commit
·
c5a6e89
1
Parent(s):
c511b4a
refactor: enhance file identification logic with improved type mapping and error handling
Browse files- src/file_processing_tool.py +71 -73
src/file_processing_tool.py
CHANGED
@@ -1,95 +1,98 @@
|
|
1 |
import os
|
2 |
-
import mimetypes
|
3 |
|
4 |
class FileIdentifier:
|
5 |
def __init__(self):
|
6 |
-
# Initialize mimetypes database
|
7 |
mimetypes.init()
|
8 |
-
|
9 |
-
|
10 |
-
"
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"
|
15 |
-
"
|
16 |
-
|
|
|
17 |
}
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
def identify_file(self, filepath):
|
20 |
"""
|
21 |
Identifies the file type and suggests a processing action.
|
22 |
-
Returns a dictionary with '
|
|
|
23 |
"""
|
24 |
if not os.path.exists(filepath):
|
25 |
return {
|
26 |
-
"
|
27 |
-
"
|
28 |
-
}
|
29 |
-
if not os.path.isfile(filepath):
|
30 |
-
return {
|
31 |
-
"error": "Path is not a file",
|
32 |
-
"filepath": filepath
|
33 |
}
|
34 |
|
35 |
-
|
36 |
-
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
if extension == ".mp3":
|
41 |
-
file_type = "audio"
|
42 |
-
elif extension == ".xlsx":
|
43 |
-
file_type = "spreadsheet"
|
44 |
-
elif extension == ".png": # Assuming .png for images as per downloaded_files.md
|
45 |
-
file_type = "image"
|
46 |
-
elif extension == ".py":
|
47 |
-
file_type = "python_code"
|
48 |
-
elif extension == ".pdf":
|
49 |
-
file_type = "pdf"
|
50 |
-
elif extension == ".txt":
|
51 |
-
file_type = "text"
|
52 |
-
elif extension == ".csv":
|
53 |
-
file_type = "csv"
|
54 |
|
55 |
-
#
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
60 |
if mime_type.startswith("audio/"):
|
61 |
-
|
62 |
-
|
63 |
-
file_type = "spreadsheet"
|
64 |
elif mime_type.startswith("image/"):
|
65 |
-
|
66 |
-
|
67 |
-
file_type = "python_code"
|
68 |
elif mime_type == "application/pdf":
|
69 |
-
|
70 |
-
|
71 |
-
file_type = "text"
|
72 |
elif mime_type == "text/csv":
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
suggested_action = self.file_type_actions.get(file_type, "manual_inspection_required")
|
77 |
|
78 |
return {
|
79 |
"filepath": filepath,
|
80 |
-
"
|
|
|
81 |
"mime_type": mime_type,
|
82 |
-
"determined_type": file_type,
|
83 |
"suggested_action": suggested_action
|
84 |
}
|
85 |
|
86 |
-
# Example Usage:
|
87 |
if __name__ == "__main__":
|
88 |
identifier = FileIdentifier()
|
89 |
-
|
90 |
-
# Create dummy files for testing
|
91 |
dummy_files_dir = "dummy_files_for_test"
|
92 |
os.makedirs(dummy_files_dir, exist_ok=True)
|
|
|
93 |
test_files_info = {
|
94 |
"audio_sample.mp3": "audio content",
|
95 |
"report_data.xlsx": "excel content",
|
@@ -98,27 +101,22 @@ if __name__ == "__main__":
|
|
98 |
"document.pdf": "pdf content",
|
99 |
"notes.txt": "text content",
|
100 |
"data.csv": "col1,col2\n1,2",
|
101 |
-
"
|
|
|
102 |
}
|
|
|
103 |
for filename, content in test_files_info.items():
|
104 |
with open(os.path.join(dummy_files_dir, filename), "w") as f:
|
105 |
-
#
|
106 |
-
# Real .xlsx, .png, .mp3, .pdf would be binary
|
107 |
-
if filename.endswith(('.xlsx', '.png', '.mp3', '.pdf', '.zip')):
|
108 |
-
with open(os.path.join(dummy_files_dir, filename), "wb") as fb: # Ensure binary files are writable
|
109 |
-
fb.write(b"dummy binary content") # placeholder
|
110 |
-
else:
|
111 |
-
f.write(content)
|
112 |
-
|
113 |
|
114 |
test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
|
115 |
-
test_filepaths.append("non_existent_file.doc")
|
116 |
|
117 |
-
for
|
118 |
-
result = identifier.identify_file(
|
119 |
print(result)
|
120 |
|
121 |
-
#
|
122 |
# import shutil
|
123 |
# shutil.rmtree(dummy_files_dir)
|
124 |
-
print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")
|
|
|
1 |
import os
|
2 |
+
import mimetypes
|
3 |
|
4 |
class FileIdentifier:
    """Classify a file by extension (then MIME type) and suggest an action.

    Attributes:
        file_type_map: Maps each simple type name (e.g. ``"audio"``) to a dict
            holding its ``"action"`` string and the list of ``"extensions"``
            that belong to it.
        extension_to_type: Reverse index built from ``file_type_map``, mapping
            a lowercase extension (with leading dot) to its simple type name.
    """

    def __init__(self):
        mimetypes.init()
        # Mapping from simple type to action and common extensions
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
            # Add more specific types if needed
        }
        # For quick lookup from extension to simple type
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }

    @staticmethod
    def _type_from_mime(mime_type):
        """Return the simple type for a MIME string, or None if no rule matches."""
        if mime_type.startswith("audio/"):
            return "audio"
        if mime_type.startswith("image/"):
            return "image"
        if mime_type == "application/pdf":
            return "pdf"
        # Must be tested before the generic "text/" prefix below.
        if mime_type == "text/csv":
            return "csv"
        if mime_type.startswith("text/"):
            return "text"
        if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            return "spreadsheet"
        return None

    def identify_file(self, filepath):
        """
        Identifies the file type and suggests a processing action.

        The lowercase file extension is looked up first; if it is not in
        ``extension_to_type`` the MIME type guessed by ``mimetypes`` is used
        as a fallback.

        Args:
            filepath: Path of the file to classify.

        Returns:
            On success, a dictionary with 'filepath', 'determined_type',
            'file_extension', 'mime_type' and 'suggested_action'. Files that
            match no rule get type "unknown" and action "unknown_handler".
            If the path does not exist or is not a regular file, a dictionary
            with only 'filepath' and 'error'.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }
        # Directories also pass os.path.exists(); without this guard a folder
        # such as "data.csv/" would be classified purely by its name.
        if not os.path.isfile(filepath):
            return {
                "filepath": filepath,
                "error": "Path is not a file"
            }

        mime_type, _ = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Prioritize extension-based mapping for specific known types, then
        # fall back to the guessed MIME type for everything else.
        determined_type = self.extension_to_type.get(file_extension)
        if determined_type is None and mime_type:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }
|
89 |
|
90 |
+
# Example Usage (optional, can be kept for testing this module directly):
|
91 |
if __name__ == "__main__":
|
92 |
identifier = FileIdentifier()
|
|
|
|
|
93 |
dummy_files_dir = "dummy_files_for_test"
|
94 |
os.makedirs(dummy_files_dir, exist_ok=True)
|
95 |
+
|
96 |
test_files_info = {
|
97 |
"audio_sample.mp3": "audio content",
|
98 |
"report_data.xlsx": "excel content",
|
|
|
101 |
"document.pdf": "pdf content",
|
102 |
"notes.txt": "text content",
|
103 |
"data.csv": "col1,col2\n1,2",
|
104 |
+
"archive.zip": "zip content", # Example of an unmapped type by default
|
105 |
+
"unknown_file.dat": "binary data"
|
106 |
}
|
107 |
+
|
108 |
for filename, content in test_files_info.items():
|
109 |
with open(os.path.join(dummy_files_dir, filename), "w") as f:
|
110 |
+
f.write(content) # Simple write for testing existence and extension
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
|
113 |
+
test_filepaths.append("non_existent_file.doc")
|
114 |
|
115 |
+
for filepath_to_test in test_filepaths:
|
116 |
+
result = identifier.identify_file(filepath_to_test)
|
117 |
print(result)
|
118 |
|
119 |
+
# Consider cleaning up dummy files if you run this main block frequently
|
120 |
# import shutil
|
121 |
# shutil.rmtree(dummy_files_dir)
|
122 |
+
print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")
|