|
import os |
|
import mimetypes |
|
|
|
class FileIdentifier:
    """Classify a file on disk and recommend how it should be processed.

    Classification first looks at the (lower-cased) file extension and, when
    the extension is not one of the directly recognised ones, falls back to
    the MIME type guessed by the ``mimetypes`` module.
    """

    # Extensions that map straight to a file type without consulting MIME data.
    _EXTENSION_TYPES = {
        ".mp3": "audio",
        ".xlsx": "spreadsheet",
        ".png": "image",
        ".py": "python_code",
        ".pdf": "pdf",
        ".txt": "text",
        ".csv": "csv",
    }

    def __init__(self):
        """Initialise the MIME tables and the file-type -> action mapping."""
        mimetypes.init()

        # Maps each determined file type to the name of the suggested action.
        self.file_type_actions = dict(
            audio="speech-to-text",
            spreadsheet="spreadsheet_parser",
            image="ocr_vision_reasoning",
            python_code="safe_code_interpreter",
            pdf="pdf_text_extractor",
            text="text_file_reader",
            csv="csv_parser",
        )

    @staticmethod
    def _type_from_mime(mime_type):
        """Translate a non-empty MIME type string into a file type.

        Returns "unknown" when the MIME type matches none of the known
        categories.
        """
        if mime_type.startswith("audio/"):
            return "audio"
        if mime_type in (
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ):
            return "spreadsheet"
        if mime_type.startswith("image/"):
            return "image"
        if mime_type in ("text/x-python", "application/x-python-code"):
            return "python_code"
        if mime_type == "application/pdf":
            return "pdf"
        if mime_type.startswith("text/plain"):
            return "text"
        if mime_type == "text/csv":
            return "csv"
        return "unknown"

    def identify_file(self, filepath):
        """Identify the type of *filepath* and suggest a processing action.

        Args:
            filepath: Path of the file to inspect.

        Returns:
            On success, a dictionary with the keys 'filepath', 'extension'
            (lower-cased, including the leading dot), 'mime_type' (as guessed
            by ``mimetypes``; may be None), 'determined_type' and
            'suggested_action'. Types with no registered action get the
            action "manual_inspection_required".

            If the path does not exist or is not a regular file, a dictionary
            with the keys 'error' and 'filepath' is returned instead.
        """
        # Guard clauses: bail out early for paths that cannot be inspected.
        if not os.path.exists(filepath):
            return {"error": "File not found", "filepath": filepath}
        if not os.path.isfile(filepath):
            return {"error": "Path is not a file", "filepath": filepath}

        extension = os.path.splitext(filepath)[1].lower()
        mime_type = mimetypes.guess_type(filepath)[0]

        # The extension table wins; MIME data is only a fallback.
        file_type = self._EXTENSION_TYPES.get(extension, "unknown")
        if file_type == "unknown" and mime_type:
            file_type = self._type_from_mime(mime_type)

        return {
            "filepath": filepath,
            "extension": extension,
            "mime_type": mime_type,
            "determined_type": file_type,
            "suggested_action": self.file_type_actions.get(
                file_type, "manual_inspection_required"
            ),
        }
|
|
|
|
|
if __name__ == "__main__":
    # Demo / smoke test: create a few placeholder files, run each one through
    # FileIdentifier.identify_file, and print the resulting dictionaries.
    identifier = FileIdentifier()

    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    # File name -> text payload. The payload is only written for text-like
    # files; names with a binary suffix receive a fixed bytes payload instead.
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "unknown_file.zip": "zip content",
    }
    binary_suffixes = ('.xlsx', '.png', '.mp3', '.pdf', '.zip')

    test_filepaths = []
    for filename, content in test_files_info.items():
        path = os.path.join(dummy_files_dir, filename)
        # Pick the mode first so each file is opened exactly once; the earlier
        # version held a text-mode handle open while reopening the same path
        # in binary mode.
        if filename.endswith(binary_suffixes):
            with open(path, "wb") as fb:
                fb.write(b"dummy binary content")
        else:
            with open(path, "w", encoding="utf-8") as f:
                f.write(content)
        test_filepaths.append(path)

    # Also exercise the "File not found" branch.
    test_filepaths.append("non_existent_file.doc")

    for filepath in test_filepaths:
        result = identifier.identify_file(filepath)
        print(result)

    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")