File size: 6,462 Bytes
33f4e34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import html
import os
import re
from datetime import datetime

import streamlit as st
from werkzeug.utils import secure_filename

from src.gpp import GPP, GPPConfig
from src.qa import AnswerGenerator

# --- Page Configuration ---
# set_page_config has to be the first Streamlit command executed in the script;
# Streamlit versions that enforce this raise StreamlitAPIException when any
# other st.* call (such as the CSS injection below) runs before it.
st.set_page_config(
    page_title="Document Intelligence Q&A",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Custom CSS for styling ---
# Injected as raw HTML; defines the .header and .card classes used by the
# markup emitted further down, plus button and <pre> styling.
st.markdown(
    """
    <style>
    body { background-color: #F5F7FA; }
    .header { text-align: center; padding: 10px; }
    .card { background: white; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
    .stButton>button { background-color: #4A90E2; color: white; }
    pre { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
    </style>
    """, unsafe_allow_html=True
)

# --- Header ---
# Page banner: icon, title, tagline, and the time at which this run rendered.
# NOTE(review): each st.markdown call is emitted as its own element, so the
# opening/closing <div> pair below presumably does not wrap the image and
# title in between -- confirm in the rendered DOM.
header_icon_url = "https://img.icons8.com/ios-filled/50/4A90E2/document.png"
tagline_html = (
    "<p style='font-size:18px; color:#555;'>Upload any PDF and get instant insights via advanced RAG-powered Q&A.</p>"
)
rendered_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
timestamp_html = f"<p style='font-size:12px; color:#888;'>Last updated: {rendered_at}</p>"

st.markdown("<div class='header'>", unsafe_allow_html=True)
st.image(header_icon_url, width=50)
st.title("Document Intelligence Q&A")
st.markdown(tagline_html, unsafe_allow_html=True)
st.markdown(timestamp_html, unsafe_allow_html=True)
st.markdown("</div>", unsafe_allow_html=True)

# --- Sidebar: Instructions ---
# Static help text describing the pipeline stages, followed by a footer.
# Uses st.sidebar's object notation instead of a `with` block.
sidebar = st.sidebar
pipeline_steps = (
    "1. Upload and parse your PDF; 2. LLM narrates tables/images and enriches context; 3. Hybrid retrieval surfaces relevant chunks; 4. Reranker refines and generates answer."
)
sidebar.header("How It Works")
sidebar.markdown(pipeline_steps)
sidebar.markdown("---")
sidebar.markdown("&copy; 2025 Document Intelligence Team")

# --- Session State ---
# Seed the slot that carries the parse result across Streamlit reruns.
if "parsed" not in st.session_state:
    st.session_state["parsed"] = None

# --- Three-Column Layout ---
# Relative widths for the upload, Q&A and chunk columns respectively.
column_weights = [2, 3, 3]
col1, col2, col3 = st.columns(column_weights)

# --- Left Column: Upload & Layout ---
# Accepts a PDF upload, runs the GPP pipeline when "Parse Document" is clicked,
# stores the result in st.session_state.parsed, and previews the layout PDF and
# the extracted markdown. Also binds the module-level name `parsed`, which the
# Q&A and chunk columns below read.
with col1:
    st.header("1. Upload & Layout")
    uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
    if uploaded_file:
        try:
            # The client-supplied name ends up in a filesystem path, so it is
            # sanitised first; the regex also rejects the empty string that
            # secure_filename can return.
            filename = secure_filename(uploaded_file.name)
            if not re.match(r'^[\w\-. ]+$', filename):
                st.error("Invalid file name.")
            elif st.button("Parse Document"):
                # Each upload gets its own directory: ./parsed/<filename>/<filename>
                output_dir = os.path.join("./parsed", filename)
                os.makedirs(output_dir, exist_ok=True)
                pdf_path = os.path.join(output_dir, filename)
                with open(pdf_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                with st.spinner("Parsing document with MinerU and LLM...⏳"):
                    try:
                        gpp = GPP(GPPConfig())
                        parsed = gpp.run(pdf_path, output_dir)
                        st.success("✅ Parsing complete!")
                        st.session_state.parsed = parsed
                    except Exception as e:
                        # Clear any earlier result so the other columns do not
                        # keep showing a stale document after a failed parse.
                        st.error(f"Parsing failed: {e}")
                        st.session_state.parsed = None
        except Exception as e:
            st.error(f"File upload failed: {e}")
    parsed = st.session_state.parsed
    if parsed:
        try:
            st.subheader("Layout Preview")
            layout_pdf = parsed.get("layout_pdf")
            if layout_pdf and os.path.isfile(layout_pdf):
                # A Markdown link to a server-side path is resolved by the
                # browser against the app URL and never reaches the file, so
                # the bytes are served through a download button instead.
                with open(layout_pdf, "rb") as layout_file:
                    st.download_button(
                        "Open Layout PDF",
                        data=layout_file.read(),
                        file_name=os.path.basename(layout_pdf),
                        mime="application/pdf",
                    )
            st.subheader("Extracted Content (Preview)")
            md_path = parsed.get("md_path")
            if md_path and os.path.exists(md_path):
                try:
                    preview_limit = 2000
                    with open(md_path, 'r', encoding='utf-8') as md_file:
                        # One character past the limit is enough to know
                        # whether the preview was truncated.
                        md_text = md_file.read(preview_limit + 1)
                    truncated = len(md_text) > preview_limit
                    # The text comes from the uploaded document; escape it so
                    # it cannot inject markup into the unsafe_allow_html block.
                    preview_html = html.escape(md_text[:preview_limit])
                    st.markdown(
                        f"<div class='card'><pre>{preview_html}{'...' if truncated else ''}</pre></div>",
                        unsafe_allow_html=True,
                    )
                except Exception as e:
                    st.error(f"Error reading markdown: {e}")
        except Exception as e:
            st.error(f"Error displaying preview: {e}")

# --- Center Column: Q&A ---
# Takes a free-text question, passes it with the parsed chunks to
# AnswerGenerator.answer, and renders the answer followed by the narration of
# each supporting chunk. Requires `parsed` to have been set by the upload column.
with col2:
    st.header("2. Ask a Question")
    if parsed:
        try:
            question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'" )
            # Strip so a whitespace-only entry does not trigger a retrieval call.
            question = question.strip() if question else ""
            if st.button("Get Answer") and question:
                with st.spinner("Retrieving answer...🤖"):
                    try:
                        generator = AnswerGenerator()
                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
                        # The answer is generated from document content; escape
                        # it before embedding it in raw HTML so it cannot inject
                        # markup into the page.
                        safe_answer = html.escape(str(answer))
                        st.markdown(f"<div class='card'><h3>Answer</h3><p>{safe_answer}</p></div>", unsafe_allow_html=True)
                        st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
                        for sc in supporting_chunks:
                            st.write(f"- {sc['narration']}")
                    except Exception as e:
                        st.error(f"Failed to generate answer: {e}")
        except Exception as e:
            st.error(f"Error in Q&A section: {e}")
    else:
        st.info("Upload and parse a document to ask questions.")

# --- Right Column: Chunks ---
# Lists every chunk of the parsed document in its own expander: narration,
# the parsed table when present, and any image blocks found on disk.
with col3:
    st.header("3. Relevant Chunks")
    if parsed:
        try:
            # Tolerate a missing or None 'chunks' entry.
            chunks = parsed.get('chunks') or []
            for idx, chunk in enumerate(chunks):
                # Read the type defensively: the expander label is built outside
                # the per-chunk try, so a chunk without 'type' would otherwise
                # abort the rendering of every remaining chunk.
                chunk_type = str(chunk.get('type', 'unknown')).capitalize()
                with st.expander(f"Chunk {idx} - {chunk_type}"):
                    try:
                        st.write(chunk.get('narration', ''))
                        if 'table_structure' in chunk:
                            st.write("**Parsed Table:**")
                            st.table(chunk['table_structure'])
                        for blk in chunk.get('blocks', []):
                            if blk.get('type') != 'img_path':
                                continue
                            rel_path = blk.get('img_path')
                            if not rel_path:
                                # An empty path would join to the images
                                # directory itself, which exists but is not an image.
                                continue
                            img_path = os.path.join(parsed['images_dir'], rel_path)
                            if os.path.isfile(img_path):
                                st.image(img_path, caption=os.path.basename(img_path))
                    except Exception as e:
                        st.error(f"Error displaying chunk: {e}")
            st.info(f"Total chunks: {len(chunks)}")
        except Exception as e:
            st.error(f"Error displaying chunks: {e}")
    else:
        st.info("No chunks to display. Parse a document first.")