# Hugging Face Space: hellorahulk/docling_free (status: Running; file size: 5,950 bytes)

import os
import gradio as gr
import pandas as pd
from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError
import tempfile
import mimetypes

# Page title: passed to gr.Blocks(title=...) and rendered as the top-level heading.
TITLE = "📄 Smart Document Parser"
# Introductory blurb rendered as Markdown directly under the heading.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
"""

# Feature list / footer rendered as Markdown at the bottom of the page.
ARTICLE = """
## 🚀 Features

- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""

# Single module-level parser instance, created at import time and reused by
# process_document() for every request.
parser = DocumentParser()

def get_file_extension(file_type):
    """Return the file extension (including the leading dot) for a MIME type.

    The lookup is tolerant of the variations allowed in MIME type strings:
    surrounding whitespace, mixed case, and trailing parameters such as
    ``"text/html; charset=utf-8"`` are ignored, so only the bare
    ``type/subtype`` is matched.

    Args:
        file_type: MIME type string, or ``None`` / any non-string value when
            the type is unknown.

    Returns:
        The matching extension (``'.pdf'``, ``'.docx'``, ``'.txt'``,
        ``'.html'`` or ``'.md'``), or ``'.tmp'`` when the type is unknown
        or not a string.
    """
    extensions = {
        'application/pdf': '.pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'text/plain': '.txt',
        'text/html': '.html',
        'text/markdown': '.md',
    }
    if not isinstance(file_type, str):
        # e.g. None when mimetypes.guess_type() could not identify the file.
        return '.tmp'
    # Drop any ";param=value" suffix and normalise case before the lookup.
    bare_type = file_type.split(';', 1)[0].strip().lower()
    return extensions.get(bare_type, '.tmp')

def _error_outputs(message):
    """Build the five-element output tuple shown when processing fails."""
    return (
        message,
        pd.DataFrame(),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def process_document(file_obj):
    """Process an uploaded document and return structured information.

    The upload is written to a temporary file, handed to the module-level
    ``parser``, and the temporary file is removed again before returning,
    whether or not parsing succeeded.

    Args:
        file_obj: One of
            * a dict with a ``'data'`` entry (bytes or a readable object) and
              optional ``'name'`` / ``'mime_type'`` entries used to choose
              the temporary file's suffix;
            * raw ``bytes``, or an object with a ``read()`` method; these are
              saved with a ``.pdf`` suffix because no name is available;
            * ``None`` when nothing was uploaded.

    Returns:
        A 5-tuple ``(content, metadata_df, sections_text, entities_text,
        confidence_text)`` matching the five Gradio output components. On
        failure the first element carries the error message and the rest are
        placeholders; no exception is propagated to the caller.
    """
    if file_obj is None:
        # The button can be clicked without a file selected; report that
        # directly instead of surfacing an AttributeError message.
        return _error_outputs("Error parsing document: no file was uploaded")

    temp_path = None
    try:
        # Handle file upload based on type
        if isinstance(file_obj, dict):
            # Get file data and original name
            file_data = file_obj['data']
            original_name = file_obj.get('name', 'uploaded_file')
            file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
            # Prefer the original extension; fall back to the MIME type.
            extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
        else:
            # Handle binary data directly
            file_data = file_obj
            extension = '.pdf'  # Default to PDF for binary uploads

        # Create temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
            # Record the path before writing: the file already exists on
            # disk, so a failed write must still be cleaned up in `finally`.
            temp_path = tmp_file.name
            if isinstance(file_data, (bytes, bytearray)):
                tmp_file.write(file_data)
            else:
                tmp_file.write(file_data.read())

        # Parse the document
        result = parser.parse(temp_path)

        # Metadata as a two-column Property/Value table
        metadata_df = pd.DataFrame([
            {"Property": key, "Value": str(value)}
            for key, value in result.metadata.dict().items()
        ])

        # Extract structured content
        sections = result.structured_content.get('sections', [])
        sections_text = "\n\n".join(
            f"Section {index}:\n{section}"
            for index, section in enumerate(sections, start=1)
        )

        # Format entities if available; values are stringified so that
        # non-string entity entries cannot break the join.
        entities = result.structured_content.get('entities', {})
        if entities:
            entities_text = "\n".join(
                f"{entity_type}: {', '.join(map(str, entity_values))}"
                for entity_type, entity_values in entities.items()
            )
        else:
            entities_text = "No entities detected"

        return (
            result.content,  # Main content
            metadata_df,     # Metadata as table
            sections_text,   # Structured sections
            entities_text,   # Named entities
            f"Confidence Score: {result.confidence_score:.2f}"  # Confidence score
        )

    except ParserError as e:
        return _error_outputs(f"Error parsing document: {e}")
    except Exception as e:
        # UI boundary: report any other failure in the output box rather
        # than letting it propagate to Gradio.
        return _error_outputs(f"Unexpected error: {e}")
    finally:
        # Best-effort cleanup of the temporary file; a missing file or a
        # permission problem must not mask the result being returned.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

# Build the Gradio UI. Components are laid out in the order they are created
# inside the nested Row/Column/Tabs contexts, so statement order here is the
# page layout.
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)
    
    # Top row: upload controls in the first column, confidence readout in the second.
    with gr.Row():
        with gr.Column():
            # type="binary": process_document() handles this via its
            # non-dict branch, which treats the value as raw file data.
            # NOTE(review): presumably Gradio passes the contents as bytes
            # here — confirm against the installed Gradio version.
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="binary"
            )
            submit_btn = gr.Button("Process Document", variant="primary")
        
        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")
    
    # One tab per remaining element of the tuple returned by process_document().
    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )
            
        with gr.TabItem("📊 Metadata"):
            # Headers match the column names of the DataFrame built in process_document().
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )
            
        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )
            
        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )
    
    # Wire the button to process_document(). The order of `outputs` must match
    # the order of the 5-tuple it returns: content, metadata, sections,
    # entities, confidence.
    submit_btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )
    
    # Static help text; mirrors the file_types accepted by the upload component.
    gr.Markdown("""
    ### 📌 Supported Formats
    - PDF Documents (*.pdf)
    - Word Documents (*.docx)
    - Text Files (*.txt)
    - HTML Files (*.html)
    - Markdown Files (*.md)
    """)
    
    gr.Markdown(ARTICLE)

# Launch the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    iface.launch()