HF_Agents_Final_Project

Sleeping

HF_Agents_Final_Project / src /file_processing_tool.py

Yago Bolivar

reorder

073b7fb 4 months ago

4.71 kB

	import os
	import mimetypes # For a more robust way to guess types

	class FileIdentifier:
	    """Classify a file on disk and suggest which processing tool should handle it.

	    Detection is extension-first: a small table of known extensions is
	    consulted, and only when the extension is not in that table is the MIME
	    type guessed by the ``mimetypes`` module used as a fallback.
	    """

	    # Known extensions (lower-case, including the dot) -> internal file type.
	    _EXTENSION_TYPES = {
	        ".mp3": "audio",
	        ".xlsx": "spreadsheet",
	        ".png": "image",
	        ".py": "python_code",
	        ".pdf": "pdf",
	        ".txt": "text",
	        ".csv": "csv",
	    }

	    # Exact MIME types accepted by the fallback for spreadsheets and Python code.
	    _SPREADSHEET_MIME_TYPES = (
	        "application/vnd.ms-excel",
	        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	    )
	    _PYTHON_MIME_TYPES = ("text/x-python", "application/x-python-code")

	    def __init__(self):
	        """Initialize the MIME database and the file-type -> action mapping."""
	        # Initialize mimetypes database
	        mimetypes.init()
	        self.file_type_actions = {
	            "audio": "speech-to-text",
	            "spreadsheet": "spreadsheet_parser",
	            "image": "ocr_vision_reasoning",
	            "python_code": "safe_code_interpreter",
	            "pdf": "pdf_text_extractor",
	            "text": "text_file_reader",
	            "csv": "csv_parser",
	            # Add more mappings as needed
	        }

	    @staticmethod
	    def _type_from_mime(mime_type):
	        """Map a MIME type string to an internal file type, or ``"unknown"``."""
	        if mime_type.startswith("audio/"):
	            return "audio"
	        if mime_type in FileIdentifier._SPREADSHEET_MIME_TYPES:
	            return "spreadsheet"
	        if mime_type.startswith("image/"):
	            return "image"
	        if mime_type in FileIdentifier._PYTHON_MIME_TYPES:
	            return "python_code"
	        if mime_type == "application/pdf":
	            return "pdf"
	        if mime_type.startswith("text/plain"):
	            return "text"
	        if mime_type == "text/csv":
	            return "csv"
	        return "unknown"

	    def identify_file(self, filepath):
	        """
	        Identifies the file type and suggests a processing action.

	        Args:
	            filepath: Path of the file to inspect.

	        Returns:
	            On success, a dictionary with the keys 'filepath', 'extension'
	            (lower-cased, including the dot), 'mime_type' (as guessed by
	            ``mimetypes``, possibly None), 'determined_type' and
	            'suggested_action'. Types without a registered action get
	            'manual_inspection_required'.

	            If the path does not exist or is not a regular file, a dictionary
	            with only the keys 'error' and 'filepath' is returned instead.
	        """
	        if not os.path.exists(filepath):
	            return {
	                "error": "File not found",
	                "filepath": filepath
	            }
	        if not os.path.isfile(filepath):
	            return {
	                "error": "Path is not a file",
	                "filepath": filepath
	            }

	        _, extension = os.path.splitext(filepath)
	        extension = extension.lower()

	        # Primary detection by extension (as per downloaded_files.md)
	        file_type = self._EXTENSION_TYPES.get(extension, "unknown")

	        # Fallback or complementary check using MIME types
	        mime_type, encoding = mimetypes.guess_type(filepath)

	        # If extension-based detection was unknown, try to infer from MIME type.
	        # A non-None encoding (e.g. "gzip" for "notes.txt.gz") means the MIME
	        # type describes the *decoded* payload; the bytes on disk are compressed
	        # and must not be routed to a plain reader, so leave the type unknown.
	        if file_type == "unknown" and mime_type and encoding is None:
	            file_type = self._type_from_mime(mime_type)

	        suggested_action = self.file_type_actions.get(file_type, "manual_inspection_required")

	        return {
	            "filepath": filepath,
	            "extension": extension,
	            "mime_type": mime_type,
	            "determined_type": file_type,
	            "suggested_action": suggested_action
	        }

	# Example usage: create a few placeholder files, run the identifier on each
	# of them (plus one path that does not exist) and print the results.
	if __name__ == "__main__":
	    identifier = FileIdentifier()

	    # Create dummy files for testing
	    dummy_files_dir = "dummy_files_for_test"
	    os.makedirs(dummy_files_dir, exist_ok=True)
	    test_files_info = {
	        "audio_sample.mp3": "audio content",
	        "report_data.xlsx": "excel content",
	        "diagram.png": "image content",
	        "analysis_script.py": "print('hello')",
	        "document.pdf": "pdf content",
	        "notes.txt": "text content",
	        "data.csv": "col1,col2\n1,2",
	        "unknown_file.zip": "zip content"
	    }
	    # Real .xlsx, .png, .mp3, .pdf and .zip files are binary; a fixed byte
	    # placeholder is enough here because identification never reads contents.
	    binary_suffixes = ('.xlsx', '.png', '.mp3', '.pdf', '.zip')
	    for filename, content in test_files_info.items():
	        target_path = os.path.join(dummy_files_dir, filename)
	        # Open each file exactly once, in the mode that matches its kind,
	        # instead of holding a text handle open while re-opening in binary.
	        if filename.endswith(binary_suffixes):
	            with open(target_path, "wb") as fb:
	                fb.write(b"dummy binary content")  # placeholder
	        else:
	            with open(target_path, "w", encoding="utf-8") as f:
	                f.write(content)

	    test_filepaths = [os.path.join(dummy_files_dir, name) for name in test_files_info]
	    test_filepaths.append("non_existent_file.doc")  # Test non-existent file

	    for filepath in test_filepaths:
	        result = identifier.identify_file(filepath)
	        print(result)

	    # The dummy directory is intentionally left on disk so the generated files
	    # can be inspected; remove it manually when it is no longer needed.
	    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")