Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / app.py

hellorahulk

Improve error handling and file processing

fdbfd73 7 months ago

raw

history blame

5.84 kB

	import os
	import gradio as gr
	import pandas as pd
	from dockling_parser import DocumentParser
	from dockling_parser.exceptions import ParserError, UnsupportedFormatError
	import tempfile
	import mimetypes
	import traceback

	# Page heading; passed to gr.Blocks as the title and rendered as the top-level
	# Markdown header of the interface.
	TITLE = "📄 Smart Document Parser"
	# Introductory Markdown rendered directly below the heading.
	DESCRIPTION = """
	A powerful document parsing application that automatically extracts structured information from various document formats.
	Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
	"""

	# Closing Markdown (feature list and credits) rendered as the last element of
	# the interface.
	ARTICLE = """
	## 🚀 Features

	- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
	- Rich Information Extraction
	- Smart Processing with Confidence Scoring
	- Automatic Format Detection

	Made with ❤️ using Docling and Gradio
	"""

	# Placeholder values that every canned error tuple uses for the sections,
	# entities and confidence positions.
	_EMPTY_RESULT_TAIL = (
	    "No sections available",
	    "No entities available",
	    "Confidence Score: 0.0",
	)

	# Canned five-item tuples keyed by failure kind. Each value is laid out as
	# (headline, guidance, sections placeholder, entities placeholder, confidence).
	ERROR_MESSAGES = {
	    kind: (headline, guidance, *_EMPTY_RESULT_TAIL)
	    for kind, headline, guidance in (
	        (
	            "no_file",
	            "⚠️ No file uploaded",
	            "Please upload a document to process.",
	        ),
	        (
	            "unsupported_format",
	            "⚠️ Unsupported file format",
	            "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
	        ),
	        (
	            "processing_error",
	            "⚠️ Error processing document",
	            "An error occurred while processing the document. Please try again with a different file.",
	        ),
	    )
	}

	# Initialize the document parser once at import time; process_document calls
	# parse() on this single module-level instance.
	parser = DocumentParser()

	def _error_outputs(message, detail=None):
	    """Build the five-item output tuple used to report a failure.

	    Args:
	        message: Text placed in the first (content) position.
	        detail: Text placed in the "Value" cell of the metadata frame;
	            defaults to ``message``.

	    Returns:
	        ``(message, DataFrame, sections, entities, confidence)`` where the
	        DataFrame has one row with "Property"/"Value" columns, so the
	        metadata position always carries a DataFrame rather than a string.
	    """
	    return (
	        message,
	        pd.DataFrame([{"Property": "Error", "Value": message if detail is None else detail}]),
	        "No sections available",
	        "No entities available",
	        "Confidence Score: 0.0",
	    )


	def process_document(file_path):
	    """Process uploaded document and return structured information.

	    Args:
	        file_path: Path of the uploaded file, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A 5-tuple ``(content, metadata, sections, entities, confidence)``.
	        ``metadata`` is always a pandas DataFrame with "Property"/"Value"
	        columns; ``sections``, ``entities`` and ``confidence`` are strings;
	        ``content`` is ``result.content`` as returned by the parser, or an
	        error message string on failure.

	    Any exception raised while parsing or formatting is caught and reported
	    through the same tuple shape instead of propagating.
	    """
	    if file_path is None:
	        headline, guidance, _sections, _entities, _confidence = ERROR_MESSAGES["no_file"]
	        # The canned tuple holds plain strings; the second position feeds a
	        # table output, so wrap the guidance text in a one-row frame like
	        # every other error path does.
	        return (
	            headline,
	            pd.DataFrame([{"Property": "Error", "Value": guidance}]),
	            _sections,
	            _entities,
	            _confidence,
	        )

	    try:
	        # Parse the document directly using the file path
	        result = parser.parse(file_path)

	        # Metadata as a two-column Property/Value table
	        metadata_df = pd.DataFrame(
	            [
	                {"Property": key, "Value": str(value)}
	                for key, value in result.metadata.dict().items()
	            ]
	        )

	        # Structured sections, numbered from 1
	        sections = result.structured_content.get('sections', [])
	        sections_text = "\n\n".join(
	            f"Section {number}:\n{section}"
	            for number, section in enumerate(sections, start=1)
	        )

	        # Named entities, one "type: a, b, c" line per entity type.
	        # Values are passed through str() so non-string entries cannot
	        # break the join.
	        entities = result.structured_content.get('entities', {})
	        if entities:
	            entities_text = "\n".join(
	                f"{entity_type}: {', '.join(map(str, entity_values))}"
	                for entity_type, entity_values in entities.items()
	            )
	        else:
	            entities_text = "No entities detected"

	        return (
	            result.content,  # Main content
	            metadata_df,  # Metadata as table
	            sections_text,  # Structured sections
	            entities_text,  # Named entities
	            f"Confidence Score: {result.confidence_score:.2f}",  # Confidence score
	        )

	    except (UnsupportedFormatError, ParserError) as exc:
	        # Parser-raised errors carry a user-presentable message.
	        return _error_outputs(f"⚠️ {exc}")
	    except Exception as exc:
	        # Keep the stack trace in the server log rather than showing internal
	        # paths and code locations to the person using the page.
	        traceback.print_exc()
	        return _error_outputs(f"⚠️ Unexpected error: {exc}")

	# Create Gradio interface
	# NOTE(review): the component nesting below is inferred from the order of the
	# layout calls — confirm against the rendered page.
	with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
	    # Page header: heading followed by the introduction.
	    gr.Markdown(f"# {TITLE}")
	    gr.Markdown(DESCRIPTION)

	    # Top row: upload control and trigger button in one column, the
	    # confidence read-out in the other.
	    with gr.Row():
	        with gr.Column():
	            file_input = gr.File(
	                label="Upload Document",
	                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
	                type="filepath",
	            )
	            submit_btn = gr.Button("Process Document", variant="primary")

	        with gr.Column():
	            confidence = gr.Textbox(label="Processing Confidence")

	    # One tab per remaining value returned by process_document.
	    with gr.Tabs():
	        with gr.TabItem("📝 Content"):
	            content_output = gr.Textbox(label="Extracted Content", lines=10, max_lines=30)

	        with gr.TabItem("📊 Metadata"):
	            metadata_output = gr.Dataframe(label="Document Metadata", headers=["Property", "Value"])

	        with gr.TabItem("📑 Sections"):
	            sections_output = gr.Textbox(label="Document Sections", lines=10, max_lines=30)

	        with gr.TabItem("🏷️ Entities"):
	            entities_output = gr.Textbox(label="Named Entities", lines=5, max_lines=15)

	    # Handle file submission. The outputs are listed in the same order as the
	    # tuple returned by process_document.
	    submit_btn.click(
	        fn=process_document,
	        inputs=[file_input],
	        outputs=[content_output, metadata_output, sections_output, entities_output, confidence],
	    )

	    # Static help text listing the accepted file types.
	    gr.Markdown("""
	### 📌 Supported Formats
	- PDF Documents (*.pdf)
	- Word Documents (*.docx)
	- Text Files (*.txt)
	- HTML Files (*.html)
	- Markdown Files (*.md)
	""")

	    gr.Markdown(ARTICLE)

	# Launch the app only when this file is executed as a script, not when it is
	# imported as a module.
	if __name__ == "__main__":
	    iface.launch()