# HF_Agents_Final_Project / src / file_processing_tool.py
# Yago Bolivar · reorder · 073b7fb · 4.71 kB
# (Repository-viewer header captured with this file; "raw" and "history blame" were page links.)
import os
import mimetypes # For a more robust way to guess types
class FileIdentifier:
    """Classify a file on disk and name the processing action suited to it.

    Classification is by file extension first; when the extension is not one
    of the directly recognised ones, the MIME type guessed from the file name
    is used as a fallback.
    """

    # Extensions recognised directly (compared after lower-casing).
    _EXTENSION_TYPES = {
        ".mp3": "audio",
        ".xlsx": "spreadsheet",
        ".png": "image",
        ".py": "python_code",
        ".pdf": "pdf",
        ".txt": "text",
        ".csv": "csv",
    }

    # MIME types that must match exactly.
    _EXACT_MIME_TYPES = {
        "application/vnd.ms-excel": "spreadsheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "spreadsheet",
        "text/x-python": "python_code",
        "application/x-python-code": "python_code",
        "application/pdf": "pdf",
        "text/csv": "csv",
    }

    # MIME types matched by prefix; none of the exact entries above share
    # these prefixes, so lookup order does not affect the outcome.
    _MIME_PREFIX_TYPES = (
        ("audio/", "audio"),
        ("image/", "image"),
        ("text/plain", "text"),
    )

    def __init__(self):
        """Initialise the MIME database and the type -> action mapping."""
        mimetypes.init()
        # Maps a determined file type to the name of the suggested action.
        self.file_type_actions = dict(
            audio="speech-to-text",
            spreadsheet="spreadsheet_parser",
            image="ocr_vision_reasoning",
            python_code="safe_code_interpreter",
            pdf="pdf_text_extractor",
            text="text_file_reader",
            csv="csv_parser",
        )

    @classmethod
    def _type_from_mime(cls, mime_type):
        """Translate a MIME type string into a file type, or "unknown"."""
        exact_match = cls._EXACT_MIME_TYPES.get(mime_type)
        if exact_match is not None:
            return exact_match
        for prefix, type_name in cls._MIME_PREFIX_TYPES:
            if mime_type.startswith(prefix):
                return type_name
        return "unknown"

    def identify_file(self, filepath):
        """Determine the type of ``filepath`` and suggest how to process it.

        Returns a dictionary. When the path does not exist or is not a regular
        file, the dictionary holds the keys 'error' and 'filepath'. Otherwise
        it holds 'filepath', 'extension' (lower-cased), 'mime_type' (as guessed
        from the name, possibly None), 'determined_type' and
        'suggested_action'. Types without a registered action get the action
        "manual_inspection_required".
        """
        # Guard clauses: report problems with the path instead of raising.
        if not os.path.exists(filepath):
            return {"error": "File not found", "filepath": filepath}
        if not os.path.isfile(filepath):
            return {"error": "Path is not a file", "filepath": filepath}

        extension = os.path.splitext(filepath)[1].lower()
        mime_type = mimetypes.guess_type(filepath)[0]

        # The extension decides when it is recognised; the MIME type is only
        # consulted when the extension gave no answer.
        file_type = self._EXTENSION_TYPES.get(extension, "unknown")
        if file_type == "unknown" and mime_type:
            file_type = self._type_from_mime(mime_type)

        return {
            "filepath": filepath,
            "extension": extension,
            "mime_type": mime_type,
            "determined_type": file_type,
            "suggested_action": self.file_type_actions.get(
                file_type, "manual_inspection_required"
            ),
        }
# Example usage: create placeholder files, classify each one, print the results.
if __name__ == "__main__":
    identifier = FileIdentifier()

    # Placeholder files are created relative to the current working directory.
    dummy_files_dir = "dummy_files_for_test"
    os.makedirs(dummy_files_dir, exist_ok=True)

    # File name -> text content (content is only used for the text formats).
    test_files_info = {
        "audio_sample.mp3": "audio content",
        "report_data.xlsx": "excel content",
        "diagram.png": "image content",
        "analysis_script.py": "print('hello')",
        "document.pdf": "pdf content",
        "notes.txt": "text content",
        "data.csv": "col1,col2\n1,2",
        "unknown_file.zip": "zip content",
    }

    # Formats that are binary in real life; they get a fixed byte placeholder.
    # Identification only looks at the file name, so the payload is irrelevant.
    binary_suffixes = ('.xlsx', '.png', '.mp3', '.pdf', '.zip')

    for filename, content in test_files_info.items():
        target_path = os.path.join(dummy_files_dir, filename)
        # Choose the mode first so each file is opened exactly once (previously
        # binary files were opened in text mode and again in binary mode).
        if filename.endswith(binary_suffixes):
            with open(target_path, "wb") as fb:
                fb.write(b"dummy binary content")  # placeholder
        else:
            with open(target_path, "w", encoding="utf-8") as f:
                f.write(content)

    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
    test_filepaths.append("non_existent_file.doc")  # Test non-existent file

    for filepath in test_filepaths:
        result = identifier.identify_file(filepath)
        print(result)

    # The dummy directory is intentionally left in place for inspection;
    # remove it manually (e.g. with shutil.rmtree) when no longer needed.
    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")