File size: 5,783 Bytes
073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 073b7fb c5a6e89 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import os
import mimetypes
class FileIdentifier:
    """Classify a file into a simple type and suggest a processing action.

    Classification is driven primarily by the file extension; when the
    extension is not in the table, the MIME type guessed by ``mimetypes``
    is used as a fallback.
    """

    # Exact MIME types recognised by the fallback. These are consulted before
    # the prefix rules so that "text/csv" wins over the generic "text/" rule.
    _MIME_EXACT = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
        # MIME types of the other spreadsheet formats already listed in the
        # extension table (.xls / .ods), so both lookups agree.
        "application/vnd.ms-excel": "spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet": "spreadsheet",
    }

    # MIME type prefixes recognised by the fallback, as (prefix, simple type).
    _MIME_PREFIXES = (
        ("audio/", "audio"),
        ("image/", "image"),
        ("text/", "text"),
    )

    def __init__(self):
        # An unconditional mimetypes.init() rebuilds the module-wide database
        # and drops types registered earlier through mimetypes.add_type().
        # Only initialise when nothing has done so yet; guess_type() would
        # also initialise lazily on first use.
        if not mimetypes.inited:
            mimetypes.init()

        # Mapping from simple type to its action and the extensions it covers.
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
            # Add more specific types if needed
        }

        # Reverse index for quick lookup from extension to simple type.
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }

    @classmethod
    def _type_from_mime(cls, mime_type):
        """Return the simple type for *mime_type*, or None if unrecognised."""
        exact = cls._MIME_EXACT.get(mime_type)
        if exact is not None:
            return exact
        for prefix, simple_type in cls._MIME_PREFIXES:
            if mime_type.startswith(prefix):
                return simple_type
        return None

    def identify_file(self, filepath):
        """
        Identify the file type and suggest a processing action.

        Args:
            filepath: Path of the file to classify. The path must exist.

        Returns:
            If the path does not exist, a dictionary with 'filepath' and
            'error' ("File not found"). Otherwise a dictionary with
            'filepath', 'determined_type', 'file_extension' (lower-cased,
            including the leading dot), 'mime_type' (may be None) and
            'suggested_action'. Unrecognised files get the type "unknown"
            and the action "unknown_handler".
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found",
            }

        mime_type, _encoding = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Extension-based mapping takes priority; the MIME type is consulted
        # only when the extension is not in the table. Because every mapped
        # extension is resolved here, no later extension re-check is needed.
        determined_type = self.extension_to_type.get(file_extension)
        if determined_type is None and mime_type:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action,
        }
# Manual smoke test: run this module directly to see how a handful of sample
# files (plus one missing path) are classified.
if __name__ == "__main__":
    identifier = FileIdentifier()

    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    # File name -> placeholder content. Only the name and existence matter to
    # the identifier, so the content is not a valid payload for its format.
    sample_contents = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "archive.zip": "zip content",  # Example of an unmapped type by default
        "unknown_file.dat": "binary data",
    }

    # Create every sample file first, remembering where each one was written.
    sample_paths = []
    for sample_name, sample_body in sample_contents.items():
        sample_path = os.path.join(dummy_files_dir, sample_name)
        with open(sample_path, "w") as handle:
            handle.write(sample_body)
        sample_paths.append(sample_path)

    # A path that does not exist exercises the error result.
    sample_paths.append("non_existent_file.doc")

    for candidate in sample_paths:
        print(identifier.identify_file(candidate))

    # The sample directory is intentionally left in place; remove it by hand
    # (for example with shutil.rmtree) if this block is run frequently.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")