import os
import mimetypes


class FileIdentifier:
    """Classify a file path into a simple type and a suggested processing action.

    Classification is purely name-based (extension first, then the MIME type
    guessed from the name by ``mimetypes``); file contents are never read.

    Attributes:
        file_type_map: simple type -> {"action": str, "extensions": [str, ...]}.
        extension_to_type: lower-case extension (with dot) -> simple type,
            derived from ``file_type_map`` for O(1) lookup.
    """

    # Values reported when neither the extension nor the MIME type is recognised.
    _UNKNOWN_TYPE = "unknown"
    _UNKNOWN_ACTION = "unknown_handler"

    # MIME fallback rules, used only when the extension is not mapped.
    # Exact matches are consulted before prefixes so that "text/csv" wins over
    # the generic "text/" rule.
    _MIME_EXACT = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
    }
    _MIME_PREFIXES = (
        ("audio/", "audio"),
        ("image/", "image"),
        ("text/", "text"),
    )

    def __init__(self):
        mimetypes.init()

        # Mapping from simple type to action and common extensions.
        self.file_type_map = {
            "audio": {
                "action": "speech-to-text",
                "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"],
            },
            "spreadsheet": {
                "action": "spreadsheet_parser",
                "extensions": [".xlsx", ".xls", ".ods"],
            },
            "image": {
                "action": "ocr_vision_reasoning",
                "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"],
            },
            "python_code": {
                "action": "safe_code_interpreter",
                "extensions": [".py"],
            },
            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
            "text": {
                "action": "text_file_reader",
                "extensions": [".txt", ".md", ".rtf"],
            },
            "csv": {"action": "csv_parser", "extensions": [".csv"]},
            # Add more specific types if needed.
        }

        # Reverse index for quick lookup from extension to simple type.
        self.extension_to_type = {
            ext: simple_type
            for simple_type, details in self.file_type_map.items()
            for ext in details["extensions"]
        }

    def _type_from_mime(self, mime_type):
        """Return the simple type for *mime_type*, or None if no rule matches."""
        simple_type = self._MIME_EXACT.get(mime_type)
        if simple_type is not None:
            return simple_type
        for prefix, prefix_type in self._MIME_PREFIXES:
            if mime_type.startswith(prefix):
                return prefix_type
        return None

    def identify_file(self, filepath):
        """Identify the file type of *filepath* and suggest a processing action.

        The extension is checked against ``extension_to_type`` first. If it is
        not mapped, the MIME type guessed from the file name is used as a
        fallback, but only when no content encoding (e.g. gzip) is reported.

        Args:
            filepath: Path of the file to classify.

        Returns:
            On success, a dict with the keys 'filepath', 'determined_type',
            'file_extension', 'mime_type' and 'suggested_action'. Unrecognised
            files get 'unknown' / 'unknown_handler'.
            On failure, a dict with 'filepath' and 'error': "File not found"
            when the path does not exist, "Not a regular file" when it exists
            but is not a regular file (for example a directory).
        """
        if not os.path.exists(filepath):
            return {"filepath": filepath, "error": "File not found"}
        if not os.path.isfile(filepath):
            # A directory such as "scans.png/" must not be routed to a file handler.
            return {"filepath": filepath, "error": "Not a regular file"}

        mime_type, encoding = mimetypes.guess_type(filepath)
        file_extension = os.path.splitext(filepath)[1].lower()

        # Prioritise the explicit extension mapping for known types.
        determined_type = self.extension_to_type.get(file_extension)

        # Fall back to the MIME type. When an encoding is reported (as for
        # "data.csv.gz"), the MIME type describes the *decoded* payload, not the
        # bytes on disk, so handing the file to that type's parser would be wrong.
        if determined_type is None and mime_type and encoding is None:
            determined_type = self._type_from_mime(mime_type)

        if determined_type is None:
            determined_type = self._UNKNOWN_TYPE
            suggested_action = self._UNKNOWN_ACTION
        else:
            suggested_action = self.file_type_map[determined_type]["action"]

        return {
            "filepath": filepath,
            "determined_type": determined_type,
            "file_extension": file_extension,
            "mime_type": mime_type,
            "suggested_action": suggested_action,
        }


# Example usage (optional, can be kept for testing this module directly).
if __name__ == "__main__":
    identifier = FileIdentifier()

    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    # File name -> placeholder content. Only existence and extension matter.
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "archive.zip": "zip content",  # Example of an unmapped type by default
        "unknown_file.dat": "binary data",
    }

    for filename, content in test_files_info.items():
        with open(os.path.join(dummy_files_dir, filename), "w", encoding="utf-8") as f:
            f.write(content)

    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
    test_filepaths.append("non_existent_file.doc")  # exercises the error path

    for filepath_to_test in test_filepaths:
        result = identifier.identify_file(filepath_to_test)
        print(result)

    # The dummy directory is intentionally left in place; remove it manually
    # (e.g. with shutil.rmtree) if you run this block frequently.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")