Yago Bolivar
refactor: enhance file identification logic with improved type mapping and error handling
c5a6e89
import os | |
import mimetypes | |
class FileIdentifier:
    """Classify a file path into a simple type and a suggested processing action.

    Classification is attempted in two steps:

    1. The lower-cased file extension is looked up in ``extension_to_type``.
    2. If the extension is not mapped, the MIME type guessed by
       :mod:`mimetypes` is matched against a small set of exact values and
       prefixes.

    Attributes:
        file_type_map: simple type -> ``{"action": ..., "extensions": [...]}``.
        extension_to_type: reverse index, extension (with dot) -> simple type.
    """

    # MIME types that identify a simple type only on an exact match. These are
    # consulted before the prefixes so "text/csv" wins over the "text/" rule.
    _MIME_EXACT_TO_TYPE = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
    }

    # MIME prefixes, checked in order after the exact matches.
    _MIME_PREFIX_TO_TYPE = (
        ("audio/", "audio"),
        ("image/", "image"),
        ("text/", "text"),
    )

    def __init__(self):
        # mimetypes.init() rebuilds the module-wide database; calling it
        # unconditionally would discard types other code registered through
        # mimetypes.add_type(), so only initialise when nobody has yet.
        if not mimetypes.inited:
            mimetypes.init()

        # Mapping from simple type to action and common extensions.
        self.file_type_map = {
            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
            "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
            # Add more specific types if needed
        }

        # Reverse index for quick lookup from extension to simple type.
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }

    @classmethod
    def _type_from_mime(cls, mime_type):
        """Return the simple type for ``mime_type``, or ``None`` if no rule matches."""
        exact = cls._MIME_EXACT_TO_TYPE.get(mime_type)
        if exact is not None:
            return exact
        for prefix, simple_type in cls._MIME_PREFIX_TO_TYPE:
            if mime_type.startswith(prefix):
                return simple_type
        return None

    def identify_file(self, filepath):
        """Identify the file type and suggest a processing action.

        Args:
            filepath: Path of the file to classify.

        Returns:
            On success, a dict with the keys ``'filepath'``,
            ``'determined_type'``, ``'file_extension'`` (lower-cased, including
            the dot), ``'mime_type'`` (may be ``None``) and
            ``'suggested_action'``. Files that match no rule get
            ``'determined_type' == "unknown"`` and
            ``'suggested_action' == "unknown_handler"``.

            If the path does not exist or is not a regular file, a dict with
            only ``'filepath'`` and ``'error'`` is returned instead.
        """
        if not os.path.exists(filepath):
            return {
                "filepath": filepath,
                "error": "File not found"
            }
        # A directory (or other non-file) can carry a mapped extension such as
        # "exports.csv"; handing it to a parser would only fail later.
        if not os.path.isfile(filepath):
            return {
                "filepath": filepath,
                "error": "Not a regular file"
            }

        mime_type, encoding = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Prioritize extension-based mapping for specific known types.
        determined_type = self.extension_to_type.get(file_extension)

        # Fall back to the MIME type only when the extension is unmapped.
        # guess_type() reports the type of the *decoded* content: for
        # "notes.txt.gz" it yields ("text/plain", "gzip"). The bytes on disk
        # are then compressed, so the MIME type must not select a handler.
        if determined_type is None and mime_type and encoding is None:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = "unknown"
            suggested_action = "unknown_handler"
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action
        }
# Manual smoke test: executed only when this module is run as a script.
if __name__ == "__main__":
    identifier = FileIdentifier()

    # Scratch directory (relative to the current working directory) that
    # holds the placeholder files; it is intentionally left behind afterwards.
    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    # File name -> placeholder content. The content is never inspected by the
    # identifier; only existence and the extension matter.
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "archive.zip": "zip content",  # Example of an unmapped type by default
        "unknown_file.dat": "binary data",
    }

    # Create every placeholder file, remembering its path for the run below.
    test_filepaths = []
    for filename, content in test_files_info.items():
        target = os.path.join(dummy_files_dir, filename)
        with open(target, "w") as handle:
            handle.write(content)
        test_filepaths.append(target)

    # One path that does not exist, to exercise the error result.
    test_filepaths.append("non_existent_file.doc")

    for filepath_to_test in test_filepaths:
        print(identifier.identify_file(filepath_to_test))

    # The scratch directory is not removed automatically; delete it by hand
    # (for example with shutil.rmtree) if this block is run frequently.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")