# Spaces: Running Running  (appears to be page-capture residue, not program text)
import os | |
import gradio as gr | |
import pandas as pd | |
from dockling_parser import DocumentParser | |
from dockling_parser.exceptions import ParserError, UnsupportedFormatError | |
import tempfile | |
import mimetypes | |
import traceback | |
# User-facing text shown in the Gradio page.
# NOTE(review): the leading symbols in these strings look like mis-decoded
# emoji; they are kept exactly as captured — confirm the intended characters.

# Page title (also rendered as the top-level heading).
TITLE = "π Smart Document Parser"

# Short introduction displayed directly under the title.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
"""

# Markdown footer listing the feature set.
ARTICLE = """
## π Features
- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection
Made with β€οΈ using Docling and Gradio
"""
# Placeholder values shared by every error entry: sections text, entities
# text and confidence text, in that order.
_EMPTY_RESULT_FIELDS = (
    "No sections available",
    "No entities available",
    "Confidence Score: 0.0",
)

# Canned five-element result tuples keyed by error kind. Each tuple is
# (headline, explanation, sections text, entities text, confidence text).
# NOTE(review): the second slot is plain text even though the second output
# of the UI is a Dataframe; wrap it in a DataFrame before handing it to Gradio.
# NOTE(review): the leading symbols in the headlines look like a mis-decoded
# warning emoji; kept exactly as captured.
ERROR_MESSAGES = {
    "no_file": (
        "β οΈ No file uploaded",
        "Please upload a document to process.",
        *_EMPTY_RESULT_FIELDS,
    ),
    "unsupported_format": (
        "β οΈ Unsupported file format",
        "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
        *_EMPTY_RESULT_FIELDS,
    ),
    "processing_error": (
        "β οΈ Error processing document",
        "An error occurred while processing the document. Please try again with a different file.",
        *_EMPTY_RESULT_FIELDS,
    ),
}
# Initialize the document parser once at import time; process_document
# reuses this single instance for every upload.
parser = DocumentParser()
def _metadata_table(text):
    """Wrap a single message in the two-column Property/Value table."""
    return pd.DataFrame([{"Property": "Error", "Value": text}])


def _error_outputs(message):
    """Build the five output values shown when a document cannot be processed."""
    return (
        message,
        _metadata_table(message),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def process_document(file_path):
    """Process an uploaded document and return structured information.

    Args:
        file_path: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        A 5-tuple ``(content, metadata table, sections text, entities text,
        confidence text)``. The metadata table is always a ``pandas.DataFrame``
        with ``Property``/``Value`` columns, including on the error paths, so
        it can be fed to a Dataframe output unchanged.

    No exception escapes this function: parser errors and unexpected errors
    are converted into an error tuple of the same shape.
    """
    if file_path is None:
        headline, hint, sections_text, entities_text, confidence_text = ERROR_MESSAGES["no_file"]
        # The canned hint is plain text; wrap it so the table slot always
        # carries a DataFrame (a bare string there is not tabular data).
        return (headline, _metadata_table(hint), sections_text, entities_text, confidence_text)

    try:
        # Parse the document directly using the file path
        result = parser.parse(file_path)

        # One row per metadata field; explicit columns keep the table shape
        # stable even when the metadata is empty.
        # NOTE(review): assumes result.metadata exposes .dict() (pydantic-style) — confirm.
        metadata_df = pd.DataFrame(
            [
                {"Property": key, "Value": str(value)}
                for key, value in result.metadata.dict().items()
            ],
            columns=["Property", "Value"],
        )

        # Extract structured content
        sections = result.structured_content.get('sections', [])
        sections_text = "\n\n".join(
            f"Section {number}:\n{section}"
            for number, section in enumerate(sections, start=1)
        )

        # Format entities if available; values are stringified so that
        # non-string entity values cannot break the join.
        entities = result.structured_content.get('entities', {})
        if entities:
            entities_text = "\n".join(
                f"{entity_type}: {', '.join(map(str, entity_values))}"
                for entity_type, entity_values in entities.items()
            )
        else:
            entities_text = "No entities detected"

        return (
            result.content,  # Main content
            metadata_df,  # Metadata as table
            sections_text,  # Structured sections
            entities_text,  # Named entities
            f"Confidence Score: {result.confidence_score:.2f}",  # Confidence score
        )
    except (UnsupportedFormatError, ParserError) as e:
        # Both parser-level failures are reported with the parser's own message.
        return _error_outputs(f"β οΈ {str(e)}")
    except Exception as e:
        # Last-resort boundary so the UI always receives five values.
        # NOTE(review): this exposes the full traceback to the end user;
        # consider logging it server-side instead.
        return _error_outputs(f"β οΈ Unexpected error: {str(e)}\n{traceback.format_exc()}")
# Create Gradio interface
# NOTE(review): the captured source lost its indentation, so the nesting of the
# layout containers below is a reconstruction — confirm against the deployed UI.
# NOTE(review): the leading symbols in the tab labels and heading look like
# mis-decoded emoji; they are kept exactly as captured.
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    # Top row: upload controls in one column, confidence read-out in the other.
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath",
            )
            submit_btn = gr.Button("Process Document", variant="primary")
        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    # One tab per result view.
    with gr.Tabs():
        with gr.TabItem("π Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30,
            )
        with gr.TabItem("π Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"],
            )
        with gr.TabItem("π Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30,
            )
        with gr.TabItem("π·οΈ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15,
            )

    # Handle file submission. The order of `outputs` must match the order of
    # the five values returned by process_document.
    submit_btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence,
        ],
    )

    gr.Markdown("""
### π Supported Formats
- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)
""")
    gr.Markdown(ARTICLE)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()