Spaces:

raksama19
/

Test_Voice

Paused

Test_Voice / demo_element.py

raksa-the-wildcats

Add all project files with proper LFS tracking

ee78b3d 21 days ago

4 kB

	"""
	Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
	SPDX-License-Identifier: MIT
	"""

	import argparse
	import glob
	import os

	from omegaconf import OmegaConf
	from PIL import Image

	from chat import DOLPHIN
	from utils.utils import *


	def process_element(image_path, model, element_type, save_dir=None):
	    """Recognize the content of a single cropped element image.

	    The image is loaded as RGB, passed through ``crop_margin`` and sent to
	    ``model.chat`` together with a prompt selected from ``element_type``.

	    Args:
	        image_path: Path to the element image.
	        model: Model instance exposing ``chat(prompt, image)``. The reply is
	            presumably a string, since ``strip()`` is called on it.
	        element_type: ``"table"`` or ``"formula"``; any other value is
	            handled as plain text.
	        save_dir: Directory handed to ``save_outputs``. When falsy
	            (``None`` or ``""``) nothing is saved.

	    Returns:
	        A ``(result, recognition_result)`` tuple: the unmodified reply from
	        ``model.chat`` and a one-item list ``[{"label": ..., "text": ...}]``
	        whose ``text`` is the stripped reply.
	    """
	    # Open inside a context manager so the file handle is released
	    # deterministically; convert() hands back an independent in-memory image.
	    with Image.open(image_path) as source_image:
	        pil_image = source_image.convert("RGB")
	    pil_image = crop_margin(pil_image)

	    # Select the prompt and output label for the element type. Formulas share
	    # the plain-text prompt and differ only in the label they are tagged with.
	    if element_type == "table":
	        prompt = "Parse the table in the image."
	        label = "tab"
	    elif element_type == "formula":
	        prompt = "Read text in the image."
	        label = "formula"
	    else:  # Default to text
	        prompt = "Read text in the image."
	        label = "text"

	    result = model.chat(prompt, pil_image)

	    # Wrap the reply in the list-of-dicts layout that save_outputs receives.
	    recognition_result = [
	        {
	            "label": label,
	            "text": result.strip(),
	        }
	    ]

	    if save_dir:
	        save_outputs(recognition_result, image_path, save_dir)
	        print(f"Results saved to {save_dir}")

	    return result, recognition_result


	def _collect_image_files(input_path):
	    """Return the sorted image paths designated by ``input_path``.

	    Args:
	        input_path: Path to a single image file or to a directory of images.

	    Returns:
	        For a directory, the sorted paths of its entries whose extension is
	        ``.jpg``, ``.jpeg`` or ``.png`` in any letter case. For anything else,
	        a one-item list containing ``input_path``.

	    Raises:
	        FileNotFoundError: If ``input_path`` does not exist.
	    """
	    if os.path.isdir(input_path):
	        image_extensions = (".jpg", ".jpeg", ".png")
	        # Glob once and compare extensions case-insensitively. Globbing per
	        # extension in both cases lists every file twice on case-insensitive
	        # filesystems and misses mixed-case extensions on case-sensitive ones.
	        # escape() keeps "[", "*" and "?" in the directory name literal.
	        pattern = os.path.join(glob.escape(input_path), "*")
	        return sorted(
	            path
	            for path in glob.glob(pattern)
	            if os.path.splitext(path)[1].lower() in image_extensions
	        )

	    if not os.path.exists(input_path):
	        raise FileNotFoundError(f"Input path {input_path} does not exist")
	    return [input_path]


	def main():
	    """Command-line entry point: run element-level recognition on images.

	    Parses the CLI arguments, gathers the input images, loads the DOLPHIN
	    model from the given config, and runs ``process_element`` on every image.
	    A failure on one image is reported and the remaining images are still
	    processed.

	    Raises:
	        FileNotFoundError: If ``--input_path`` does not exist.
	    """
	    parser = argparse.ArgumentParser(description="Element-level processing using DOLPHIN model")
	    parser.add_argument("--config", default="./config/Dolphin.yaml", help="Path to configuration file")
	    parser.add_argument("--input_path", type=str, required=True, help="Path to input image or directory of images")
	    parser.add_argument(
	        "--element_type",
	        type=str,
	        choices=["text", "table", "formula"],
	        default="text",
	        help="Type of element to process (text, table, formula)",
	    )
	    parser.add_argument(
	        "--save_dir",
	        type=str,
	        default=None,
	        help="Directory to save parsing results (default: same as input directory)",
	    )
	    parser.add_argument("--print_results", action="store_true", help="Print recognition results to console")
	    args = parser.parse_args()

	    # Resolve the inputs first so that a bad path fails before the model is
	    # loaded and before any output directory is created.
	    image_files = _collect_image_files(args.input_path)

	    # Load Model
	    config = OmegaConf.load(args.config)
	    model = DOLPHIN(config)

	    # Set save directory
	    if args.save_dir:
	        save_dir = args.save_dir
	    elif os.path.isdir(args.input_path):
	        save_dir = args.input_path
	    else:
	        # dirname() is "" for a bare filename such as "img.png"; an empty
	        # save_dir would also make process_element skip saving, so fall back
	        # to the current directory.
	        save_dir = os.path.dirname(args.input_path) or "."
	    setup_output_dirs(save_dir)

	    total_samples = len(image_files)
	    print(f"\nTotal samples to process: {total_samples}")

	    # Process images one by one
	    for image_path in image_files:
	        print(f"\nProcessing {image_path}")
	        try:
	            result, _ = process_element(
	                image_path=image_path,
	                model=model,
	                element_type=args.element_type,
	                save_dir=save_dir,
	            )

	            if args.print_results:
	                print("\nRecognition result:")
	                print(result)
	                print("-" * 40)

	        except Exception as e:
	            # Deliberate best-effort batch: report the failure and move on to
	            # the next image instead of aborting the whole run.
	            print(f"Error processing {image_path}: {e}")


	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()