HF_Agents_Final_Project

Sleeping

HF_Agents_Final_Project / src /file_processing_tool.py

Yago Bolivar

feat: implement image processing and chess analysis tools with unit tests

8ff7d8f 4 months ago

7.35 kB

	import os
	import mimetypes

	import os
	import mimetypes

	class FileIdentifier:
	def __init__(self):
	mimetypes.init()
	# Mapping from simple type to action and common extensions
	self.file_type_map = {
	"audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
	"spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
	"image": {"action": "image_processor", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
	"python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
	"pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
	"text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
	"csv": {"action": "csv_parser", "extensions": [".csv"]},
	# Add more specific types if needed
	}
	# For quick lookup from extension to simple type
	self.extension_to_type = {}
	for simple_type, details in self.file_type_map.items():
	for ext in details["extensions"]:
	self.extension_to_type[ext] = simple_type

	def identify_file(self, filepath):
	"""
	Identifies the file type and suggests a processing action.
	Returns a dictionary with 'filepath', 'determined_type', 'mime_type',
	'suggested_action', or an 'error'.
	"""
	if not os.path.exists(filepath):
	return {
	"filepath": filepath,
	"error": "File not found"
	}

	mime_type, encoding = mimetypes.guess_type(filepath)
	file_extension = os.path.splitext(filepath)[1].lower()

	determined_type = "unknown"
	suggested_action = "unknown_handler"

	# Prioritize extension-based mapping for specific known types
	if file_extension in self.extension_to_type:
	determined_type = self.extension_to_type[file_extension]
	suggested_action = self.file_type_map[determined_type]["action"]
	elif mime_type:
	# Fallback to MIME type if extension is not specifically mapped
	# This part might need more sophisticated mapping from MIME to your simple types
	if mime_type.startswith("audio/"):
	determined_type = "audio"
	suggested_action = self.file_type_map["audio"]["action"]
	elif mime_type.startswith("image/"):
	determined_type = "image"
	suggested_action = self.file_type_map["image"]["action"]
	elif mime_type == "application/pdf":
	determined_type = "pdf"
	suggested_action = self.file_type_map["pdf"]["action"]
	elif mime_type == "text/csv":
	determined_type = "csv"
	suggested_action = self.file_type_map["csv"]["action"]
	elif mime_type.startswith("text/"): # General text
	# Check if it's python by extension, as text/x-python might not always be guessed
	if file_extension == ".py":
	determined_type = "python_code"
	suggested_action = self.file_type_map["python_code"]["action"]
	else:
	determined_type = "text"
	suggested_action = self.file_type_map["text"]["action"]
	elif file_extension == ".xlsx" or mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
	determined_type = "spreadsheet"
	suggested_action = self.file_type_map["spreadsheet"]["action"]
	# Add more MIME-based rules if necessary

	# If still unknown, but has a common extension not yet caught
	if determined_type == "unknown" and file_extension:
	# A final check for common types if MIME was unhelpful or generic
	if file_extension in self.extension_to_type: # Redundant if first check comprehensive
	determined_type = self.extension_to_type[file_extension]
	suggested_action = self.file_type_map[determined_type]["action"]


	return {
	"filepath": filepath,
	"determined_type": determined_type,
	"file_extension": file_extension,
	"mime_type": mime_type,
	"suggested_action": suggested_action
	}

	# Example Usage (optional, can be kept for testing this module directly):
	if __name__ == "__main__":
	identifier = FileIdentifier()
	dummy_files_dir = "dummy_files_for_test"
	os.makedirs(dummy_files_dir, exist_ok=True)

	test_files_info = {
	"audio_sample.mp3": "audio content",
	"report_data.xlsx": "excel content",
	"diagram.png": "image content",
	"analysis_script.py": "print('hello')",
	"document.pdf": "pdf content",
	"notes.txt": "text content",
	"data.csv": "col1,col2\n1,2",
	"archive.zip": "zip content", # Example of an unmapped type by default
	"unknown_file.dat": "binary data"
	}

	for filename, content in test_files_info.items():
	with open(os.path.join(dummy_files_dir, filename), "w") as f:
	f.write(content) # Simple write for testing existence and extension

	test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
	test_filepaths.append("non_existent_file.doc")

	for filepath_to_test in test_filepaths:
	result = identifier.identify_file(filepath_to_test)
	print(result)

	# Consider cleaning up dummy files if you run this main block frequently
	# import shutil
	# shutil.rmtree(dummy_files_dir)
	print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")

	# Example of how to process an image file specifically
	def process_image_file(filepath):
	"""
	Process an image file using the ImageProcessor class.
	Args:
	filepath: Path to the image file
	Returns:
	Dictionary with processing results
	"""
	try:
	from image_processing_tool import ImageProcessor

	processor = ImageProcessor()

	# Get basic image details
	image_details = processor.get_image_details(filepath)

	# Perform OCR text extraction
	text_content = processor.extract_text_from_image(filepath)

	# If it's potentially a chess image, add chess analysis
	chess_analysis = None
	if "chess" in text_content.lower() or "board" in text_content.lower():
	chess_analysis = processor.analyze_chess_position(filepath)
	# For our specific chess image with known task_id, always do chess analysis
	elif "cca530fc-4052-43b2-b130-b30968d8aa44" in filepath:
	chess_analysis = processor.analyze_chess_position(filepath)

	return {
	"filepath": filepath,
	"details": image_details,
	"extracted_text": text_content,
	"chess_analysis": chess_analysis
	}
	except ImportError:
	return {
	"error": "ImageProcessor not available. Make sure image_processing_tool.py is in your path."
	}
	except Exception as e:
	return {
	"error": f"Error processing image: {str(e)}"
	}