Spaces:

John-Jiang
/

starfish_data_ai

Running

init commit

5301c48 2 months ago

1.81 kB

	from typing import Optional, List
	from .base_parser import BaseParser


	class UnstructuredParser(BaseParser):
	    """PDF parser backed by ``partition_pdf`` from the ``unstructured`` package.

	    The ``unstructured`` import is deferred until the first ``parse`` call, so
	    this class itself does not import the package at construction time.
	    """

	    def __init__(
	        self,
	        strategy: str = "auto",
	        ocr_languages: Optional[List[str]] = None,
	    ) -> None:
	        """
	        Initialize the unstructured parser

	        Args:
	            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
	            ocr_languages: List of languages for OCR (e.g., ["eng", "spa"]).
	                Defaults to ["eng"] when omitted or empty.
	        """
	        super().__init__()
	        self.strategy = strategy
	        # Copy so later mutation of the caller's list cannot change this parser.
	        self.ocr_languages = list(ocr_languages) if ocr_languages else ["eng"]
	        self._unstructured_loaded = False
	        # Filled in by _load_unstructured() on first use.
	        self._partition_pdf = None

	    def _load_unstructured(self) -> None:
	        """Import ``partition_pdf`` on first use and cache it on the instance.

	        The callable is stored on ``self`` rather than injected into the module
	        namespace with ``global``; repeated imports are cheap because Python
	        caches loaded modules.
	        """
	        if self._partition_pdf is not None:
	            return

	        from unstructured.partition.pdf import partition_pdf

	        self._partition_pdf = partition_pdf
	        self._unstructured_loaded = True

	    def parse(self, file_path: str) -> str:
	        """
	        Parse a PDF document using unstructured.io

	        Args:
	            file_path: Path to the PDF file

	        Returns:
	            str: Text of every extracted element, separated by blank lines

	        Raises:
	            Exception: If loading ``unstructured`` or partitioning the file
	                fails; the original error is attached as ``__cause__``.
	        """
	        try:
	            self._load_unstructured()

	            # The ``ocr_languages`` string uses Tesseract's "lang1+lang2"
	            # format, so the codes are joined with "+" (a comma would be read
	            # as part of a single, invalid language code).
	            # NOTE(review): newer unstructured releases reportedly prefer a
	            # ``languages`` list over ``ocr_languages`` - confirm against the
	            # pinned version before switching.
	            ocr_lang_str = "+".join(self.ocr_languages)

	            elements = self._partition_pdf(
	                filename=file_path,
	                strategy=self.strategy,
	                ocr_languages=ocr_lang_str,
	            )

	            # Join elements with double newlines for better readability
	            return "\n\n".join(str(el) for el in elements)

	        except Exception as e:
	            # Chain explicitly so the underlying traceback stays attached.
	            raise Exception(f"Failed to parse document {file_path}: {e!s}") from e