# File size: 4,706 Bytes
# 073b7fb
import os
import mimetypes # For a more robust way to guess types
class FileIdentifier:
    """Classify a file into a broad type and suggest a processing action.

    Detection is extension-first: a small set of known extensions maps
    directly to a type.  Only when the extension is not in that set is the
    ``mimetypes`` guess for the path used to infer the type.
    """

    # Extensions recognised directly, before the MIME guess is consulted.
    _EXTENSION_TYPES = {
        ".mp3": "audio",
        ".xlsx": "spreadsheet",
        ".png": "image",
        ".py": "python_code",
        ".pdf": "pdf",
        ".txt": "text",
        ".csv": "csv",
    }

    # Exact MIME strings treated as spreadsheets / Python sources.
    _SPREADSHEET_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    _PYTHON_MIME_TYPES = ("text/x-python", "application/x-python-code")

    def __init__(self):
        """Initialise the MIME database and the type -> action table."""
        # Initialize mimetypes database
        mimetypes.init()
        # Per-instance mapping from determined type to the name of the
        # suggested action; callers may extend it with more mappings.
        self.file_type_actions = {
            "audio": "speech-to-text",
            "spreadsheet": "spreadsheet_parser",
            "image": "ocr_vision_reasoning",
            "python_code": "safe_code_interpreter",
            "pdf": "pdf_text_extractor",
            "text": "text_file_reader",
            "csv": "csv_parser",
        }

    @classmethod
    def _type_from_mime(cls, mime_type):
        """Map a non-empty MIME string to a file type, or ``"unknown"``."""
        if mime_type.startswith("audio/"):
            return "audio"
        if mime_type in cls._SPREADSHEET_MIME_TYPES:
            return "spreadsheet"
        if mime_type.startswith("image/"):
            return "image"
        if mime_type in cls._PYTHON_MIME_TYPES:
            return "python_code"
        if mime_type == "application/pdf":
            return "pdf"
        if mime_type.startswith("text/plain"):
            return "text"
        if mime_type == "text/csv":
            return "csv"
        return "unknown"

    def identify_file(self, filepath):
        """Identify the file type of *filepath* and suggest a processing action.

        Returns:
            On success, a dict with the keys ``'filepath'``, ``'extension'``
            (lower-cased, including the leading dot), ``'mime_type'`` (the
            ``mimetypes`` guess, possibly ``None``), ``'determined_type'``
            and ``'suggested_action'``.  Types without an entry in
            ``self.file_type_actions`` get the action
            ``'manual_inspection_required'``.

            If the path does not exist or is not a regular file, a dict with
            the keys ``'error'`` and ``'filepath'`` is returned instead.
        """
        if not os.path.exists(filepath):
            return {"error": "File not found", "filepath": filepath}
        if not os.path.isfile(filepath):
            return {"error": "Path is not a file", "filepath": filepath}

        extension = os.path.splitext(filepath)[1].lower()

        # The MIME guess is always reported, even when the extension alone
        # already decided the type.
        mime_type = mimetypes.guess_type(filepath)[0]

        # Primary detection by extension; fall back to the MIME guess only
        # when the extension is not one of the known ones.
        file_type = self._EXTENSION_TYPES.get(extension, "unknown")
        if file_type == "unknown" and mime_type:
            file_type = self._type_from_mime(mime_type)

        return {
            "filepath": filepath,
            "extension": extension,
            "mime_type": mime_type,
            "determined_type": file_type,
            "suggested_action": self.file_type_actions.get(
                file_type, "manual_inspection_required"
            ),
        }
# Example usage: create placeholder files, run the identifier on each of them
# (plus one path that does not exist) and print the resulting dictionaries.
if __name__ == "__main__":
    identifier = FileIdentifier()

    # Create dummy files for testing
    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "unknown_file.zip": "zip content",
    }

    # Real .xlsx, .png, .mp3, .pdf and .zip files would be binary, so these
    # get a placeholder byte string instead of the text content above.
    binary_extensions = ('.xlsx', '.png', '.mp3', '.pdf', '.zip')

    for filename, content in test_files_info.items():
        target_path = os.path.join(dummy_files_dir, filename)
        # Open each file exactly once, in the mode that matches its kind,
        # rather than holding a text handle open while reopening in binary.
        if filename.endswith(binary_extensions):
            with open(target_path, "wb") as fb:
                fb.write(b"dummy binary content")  # placeholder
        else:
            with open(target_path, "w", encoding="utf-8") as f:
                f.write(content)

    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
    test_filepaths.append("non_existent_file.doc")  # Test non-existent file

    for filepath in test_filepaths:
        result = identifier.identify_file(filepath)
        print(result)

    # No cleanup is performed here; the note below reminds the user that the
    # dummy directory still exists and can be removed.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")