PDF_Recogni

Build error

File size: 6,462 Bytes

33f4e34

import os
import streamlit as st
from datetime import datetime
import re
from werkzeug.utils import secure_filename

from src.gpp import GPP, GPPConfig
from src.qa import AnswerGenerator

# --- Custom CSS for styling ---
st.markdown(
    """
    <style>
    body { background-color: #F5F7FA; }
    .header { text-align: center; padding: 10px; }
    .card { background: white; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
    .stButton>button { background-color: #4A90E2; color: white; }
    pre { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
    </style>
    """, unsafe_allow_html=True
)

# --- Page Configuration ---
st.set_page_config(
    page_title="Document Intelligence Q&A",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Header ---
st.markdown("<div class='header'>", unsafe_allow_html=True)
st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=50)
st.title("Document Intelligence Q&A")
st.markdown(
    "<p style='font-size:18px; color:#555;'>Upload any PDF and get instant insights via advanced RAG-powered Q&A.</p>",
    unsafe_allow_html=True
)
st.markdown(
    f"<p style='font-size:12px; color:#888;'>Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
    unsafe_allow_html=True
)
st.markdown("</div>", unsafe_allow_html=True)

# --- Sidebar: Instructions ---
with st.sidebar:
    st.header("How It Works")
    st.markdown(
        "1. Upload and parse your PDF; 2. LLM narrates tables/images and enriches context; 3. Hybrid retrieval surfaces relevant chunks; 4. Reranker refines and generates answer."
    )
    st.markdown("---")
    st.markdown("&copy; 2025 Document Intelligence Team")

# --- Session State ---
if "parsed" not in st.session_state:
    st.session_state.parsed = None

# --- Three-Column Layout ---
col1, col2, col3 = st.columns([2, 3, 3])

# --- Left Column: Upload & Layout ---
with col1:
    st.header("1. Upload & Layout")
    uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
    if uploaded_file:
        try:
            filename = secure_filename(uploaded_file.name)
            if not re.match(r'^[\w\-. ]+$', filename):
                st.error("Invalid file name.")
            elif st.button("Parse Document"):
                output_dir = os.path.join("./parsed", filename)
                os.makedirs(output_dir, exist_ok=True)
                pdf_path = os.path.join(output_dir, filename)
                with open(pdf_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                with st.spinner("Parsing document with MinerU and LLM...⏳"):
                    try:
                        gpp = GPP(GPPConfig())
                        parsed = gpp.run(pdf_path, output_dir)
                        st.success("✅ Parsing complete!")
                        st.session_state.parsed = parsed
                    except Exception as e:
                        st.error(f"Parsing failed: {e}")
                        st.session_state.parsed = None
        except Exception as e:
            st.error(f"File upload failed: {e}")
    parsed = st.session_state.parsed
    if parsed:
        try:
            st.subheader("Layout Preview")
            layout_pdf = parsed.get("layout_pdf")
            if layout_pdf and os.path.exists(layout_pdf):
                st.markdown(f"[Open Layout PDF]({layout_pdf})")
            st.subheader("Extracted Content (Preview)")
            md_path = parsed.get("md_path")
            if md_path and os.path.exists(md_path):
                try:
                    with open(md_path, 'r', encoding='utf-8') as md_file:
                        md_text = md_file.read()
                    st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text)>2000 else ''}</pre></div>", unsafe_allow_html=True)
                except Exception as e:
                    st.error(f"Error reading markdown: {e}")
        except Exception as e:
            st.error(f"Error displaying preview: {e}")

# --- Center Column: Q&A ---
with col2:
    st.header("2. Ask a Question")
    if parsed:
        try:
            question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'" )
            if st.button("Get Answer") and question:
                with st.spinner("Retrieving answer...🤖"):
                    try:
                        generator = AnswerGenerator()
                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
                        st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
                        st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
                        for sc in supporting_chunks:
                            st.write(f"- {sc['narration']}")
                    except Exception as e:
                        st.error(f"Failed to generate answer: {e}")
        except Exception as e:
            st.error(f"Error in Q&A section: {e}")
    else:
        st.info("Upload and parse a document to ask questions.")

# --- Right Column: Chunks ---
with col3:
    st.header("3. Relevant Chunks")
    if parsed:
        try:
            chunks = parsed.get('chunks', [])
            for idx, chunk in enumerate(chunks):
                with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
                    try:
                        st.write(chunk.get('narration', ''))
                        if 'table_structure' in chunk:
                            st.write("**Parsed Table:**")
                            st.table(chunk['table_structure'])
                        for blk in chunk.get('blocks', []):
                            if blk.get('type') == 'img_path':
                                img_path = os.path.join(parsed['images_dir'], blk.get('img_path',''))
                                if os.path.exists(img_path):
                                    st.image(img_path, caption=os.path.basename(img_path))
                    except Exception as e:
                        st.error(f"Error displaying chunk: {e}")
            st.info(f"Total chunks: {len(chunks)}")
        except Exception as e:
            st.error(f"Error displaying chunks: {e}")
    else:
        st.info("No chunks to display. Parse a document first.")