import os
import mimetypes  # Fallback type guessing when the extension is not recognised

# Extensions recognised directly (the original notes these follow
# downloaded_files.md), mapped to the internal file-type label.
_EXTENSION_TYPES = {
    ".mp3": "audio",
    ".xlsx": "spreadsheet",
    ".png": "image",
    ".py": "python_code",
    ".pdf": "pdf",
    ".txt": "text",
    ".csv": "csv",
}

# Exact MIME types used by the fallback classification.
_SPREADSHEET_MIME_TYPES = frozenset({
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
})
_PYTHON_MIME_TYPES = frozenset({
    "text/x-python",
    "application/x-python-code",
})


def _type_from_mime(mime_type):
    """Map a MIME type string to an internal file-type label, or "unknown"."""
    if mime_type.startswith("audio/"):
        return "audio"
    if mime_type in _SPREADSHEET_MIME_TYPES:
        return "spreadsheet"
    if mime_type.startswith("image/"):
        return "image"
    if mime_type in _PYTHON_MIME_TYPES:
        return "python_code"
    if mime_type == "application/pdf":
        return "pdf"
    if mime_type.startswith("text/plain"):
        return "text"
    if mime_type == "text/csv":
        return "csv"
    return "unknown"


class FileIdentifier:
    """Classify files by extension (with a MIME fallback) and suggest an action."""

    def __init__(self):
        # mimetypes.init() rebuilds the module-global table, which would drop
        # types registered elsewhere via mimetypes.add_type(); only initialise
        # when nothing has done so yet.
        if not mimetypes.inited:
            mimetypes.init()

        # Internal file-type label -> name of the suggested processing action.
        self.file_type_actions = {
            "audio": "speech-to-text",
            "spreadsheet": "spreadsheet_parser",
            "image": "ocr_vision_reasoning",
            "python_code": "safe_code_interpreter",
            "pdf": "pdf_text_extractor",
            "text": "text_file_reader",
            "csv": "csv_parser",
            # Add more mappings as needed
        }

    def identify_file(self, filepath):
        """Identify the file type and suggest a processing action.

        Detection is by (case-insensitive) extension first; if the extension
        is not recognised, the MIME type guessed from the file name is used.
        File contents are never read.

        Args:
            filepath: Path of the file to classify.

        Returns:
            On success, a dict with the keys ``filepath``, ``extension``
            (lower-cased, including the dot), ``mime_type`` (``None`` when it
            cannot be guessed), ``determined_type`` and ``suggested_action``.
            Unrecognised files get ``determined_type == "unknown"`` and
            ``suggested_action == "manual_inspection_required"``.

            If the path does not exist or is not a regular file, a dict with
            only ``error`` and ``filepath`` is returned instead.
        """
        if not os.path.exists(filepath):
            return {"error": "File not found", "filepath": filepath}
        if not os.path.isfile(filepath):
            return {"error": "Path is not a file", "filepath": filepath}

        extension = os.path.splitext(filepath)[1].lower()

        # Primary detection by extension.
        file_type = _EXTENSION_TYPES.get(extension, "unknown")

        # Fallback or complementary check using MIME types.
        mime_type, encoding = mimetypes.guess_type(filepath)

        # Only trust the MIME type when no content encoding is reported: for
        # "data.csv.gz" guess_type() yields ("text/csv", "gzip"), and the
        # compressed bytes must not be routed to the plain CSV parser.
        if file_type == "unknown" and mime_type and encoding is None:
            file_type = _type_from_mime(mime_type)

        suggested_action = self.file_type_actions.get(
            file_type, "manual_inspection_required"
        )

        return {
            "filepath": filepath,
            "extension": extension,
            "mime_type": mime_type,
            "determined_type": file_type,
            "suggested_action": suggested_action,
        }


def main():
    """Create placeholder files, classify each one, and print the results."""
    identifier = FileIdentifier()

    # Create dummy files for testing
    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "unknown_file.zip": "zip content",
    }

    # Real .xlsx, .png, .mp3, .pdf and .zip files are binary, so those get a
    # bytes placeholder; identification only looks at the name anyway.
    binary_suffixes = (".xlsx", ".png", ".mp3", ".pdf", ".zip")

    for filename, content in test_files_info.items():
        path = os.path.join(dummy_files_dir, filename)
        # Open each file exactly once, in the mode that matches its kind.
        if filename.endswith(binary_suffixes):
            with open(path, "wb") as fb:
                fb.write(b"dummy binary content")  # placeholder
        else:
            with open(path, "w", encoding="utf-8") as f:
                f.write(content)

    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
    test_filepaths.append("non_existent_file.doc")  # Test non-existent file

    for filepath in test_filepaths:
        result = identifier.identify_file(filepath)
        print(result)

    # The dummy directory is intentionally left in place; remove it manually
    # (e.g. with shutil.rmtree) once it is no longer needed.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")


# Example Usage:
if __name__ == "__main__":
    main()