Spaces:
Build error
Build error
File size: 6,462 Bytes
33f4e34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import html
import os
import re
from datetime import datetime

import streamlit as st
from werkzeug.utils import secure_filename

from src.gpp import GPP, GPPConfig
from src.qa import AnswerGenerator
# --- Page Configuration ---
# st.set_page_config is documented as having to be the first Streamlit command
# on a page (releases that enforce this raise StreamlitAPIException otherwise),
# so it is issued before the CSS injection below.
st.set_page_config(
    page_title="Document Intelligence Q&A",
    layout="wide",
    initial_sidebar_state="expanded",
)

# --- Custom CSS for styling ---
# The stylesheet is a static string, so rendering it with unsafe_allow_html
# does not expose any user-controlled content.
st.markdown(
    """
<style>
body { background-color: #F5F7FA; }
.header { text-align: center; padding: 10px; }
.card { background: white; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
.stButton>button { background-color: #4A90E2; color: white; }
pre { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
</style>
""",
    unsafe_allow_html=True,
)
# --- Header ---
# NOTE(review): every st.markdown call is rendered as its own element, so the
# opening/closing <div class='header'> pair below probably does not wrap the
# image and title - confirm in the browser before relying on the .header rule.
st.markdown("<div class='header'>", unsafe_allow_html=True)
st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=50)
st.title("Document Intelligence Q&A")

# Tagline shown directly under the title.
tagline_html = "<p style='font-size:18px; color:#555;'>Upload any PDF and get instant insights via advanced RAG-powered Q&A.</p>"
st.markdown(tagline_html, unsafe_allow_html=True)

# The timestamp is taken whenever the script executes, i.e. on every rerun.
rendered_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
st.markdown(
    f"<p style='font-size:12px; color:#888;'>Last updated: {rendered_at}</p>",
    unsafe_allow_html=True,
)
st.markdown("</div>", unsafe_allow_html=True)
# --- Sidebar: Instructions ---
# Uses the st.sidebar object notation; each call places one element in the
# sidebar, in the same order as before.
pipeline_summary = "1. Upload and parse your PDF; 2. LLM narrates tables/images and enriches context; 3. Hybrid retrieval surfaces relevant chunks; 4. Reranker refines and generates answer."
st.sidebar.header("How It Works")
st.sidebar.markdown(pipeline_summary)
st.sidebar.markdown("---")
st.sidebar.markdown("© 2025 Document Intelligence Team")
# --- Session State ---
# Seed the "parsed" slot once so the sections below can read it on every rerun.
if "parsed" not in st.session_state:
    st.session_state["parsed"] = None

# --- Three-Column Layout ---
# Relative widths: the upload column is narrower than the Q&A and chunk columns.
column_weights = [2, 3, 3]
col1, col2, col3 = st.columns(column_weights)
# --- Left Column: Upload & Layout ---
# Handles the PDF upload, runs the parser on demand, stores the result in
# st.session_state.parsed, and shows a preview of the parser outputs.
with col1:
    st.header("1. Upload & Layout")
    uploaded_file = st.file_uploader(
        "Select a PDF document", type=["pdf"], help="Supported: PDF files"
    )
    if uploaded_file:
        try:
            # The client-supplied name is sanitised first; the regex then
            # rejects an empty result or any remaining unexpected characters.
            filename = secure_filename(uploaded_file.name)
            if not re.match(r'^[\w\-. ]+$', filename):
                st.error("Invalid file name.")
            elif st.button("Parse Document"):
                # Each upload gets its own directory under ./parsed, which
                # holds both the saved PDF and the parser output.
                output_dir = os.path.join("./parsed", filename)
                os.makedirs(output_dir, exist_ok=True)
                pdf_path = os.path.join(output_dir, filename)
                with open(pdf_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                with st.spinner("Parsing document with MinerU and LLM...⏳"):
                    try:
                        gpp = GPP(GPPConfig())
                        st.session_state.parsed = gpp.run(pdf_path, output_dir)
                        st.success("✅ Parsing complete!")
                    except Exception as e:
                        # UI boundary: report the failure and clear any stale
                        # result so the other columns do not show old data.
                        st.error(f"Parsing failed: {e}")
                        st.session_state.parsed = None
        except Exception as e:
            st.error(f"File upload failed: {e}")

    # Read back from session state so the preview survives Streamlit reruns.
    parsed = st.session_state.parsed
    if parsed:
        try:
            st.subheader("Layout Preview")
            layout_pdf = parsed.get("layout_pdf")
            if layout_pdf and os.path.exists(layout_pdf):
                # A markdown link to a local filesystem path is not served by
                # Streamlit, so the file is offered through a download button.
                with open(layout_pdf, "rb") as layout_file:
                    st.download_button(
                        "Open Layout PDF",
                        data=layout_file,
                        file_name=os.path.basename(layout_pdf),
                        mime="application/pdf",
                    )

            st.subheader("Extracted Content (Preview)")
            md_path = parsed.get("md_path")
            if md_path and os.path.exists(md_path):
                try:
                    preview_limit = 2000
                    with open(md_path, 'r', encoding='utf-8') as md_file:
                        # Read one character past the limit: enough to know
                        # whether the text was truncated without loading the
                        # whole file.
                        md_text = md_file.read(preview_limit + 1)
                    suffix = '...' if len(md_text) > preview_limit else ''
                    # The text comes from the uploaded document, so it is
                    # escaped before being embedded in raw HTML.
                    preview = html.escape(md_text[:preview_limit])
                    st.markdown(
                        f"<div class='card'><pre>{preview}{suffix}</pre></div>",
                        unsafe_allow_html=True,
                    )
                except Exception as e:
                    st.error(f"Error reading markdown: {e}")
        except Exception as e:
            st.error(f"Error displaying preview: {e}")
# --- Center Column: Q&A ---
# Takes a free-text question, asks AnswerGenerator for an answer over the
# parsed chunks, and shows the answer with its supporting context.
with col2:
    st.header("2. Ask a Question")
    if parsed:
        try:
            question = st.text_input(
                "Type your question here:",
                placeholder="E.g., 'What was the Q2 revenue?'",
            )
            # Strip first so a whitespace-only entry is treated as empty.
            question = (question or "").strip()
            if st.button("Get Answer") and question:
                with st.spinner("Retrieving answer...🤖"):
                    try:
                        generator = AnswerGenerator()
                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
                        # The answer is model output derived from the uploaded
                        # document, so it is escaped before being placed in
                        # raw HTML.
                        safe_answer = html.escape(str(answer))
                        st.markdown(
                            f"<div class='card'><h3>Answer</h3><p>{safe_answer}</p></div>",
                            unsafe_allow_html=True,
                        )
                        st.markdown(
                            "<div class='card'><h4>Supporting Context</h4></div>",
                            unsafe_allow_html=True,
                        )
                        for sc in supporting_chunks:
                            # .get keeps one chunk without a narration from
                            # aborting the rest of the list.
                            st.write(f"- {sc.get('narration', '')}")
                    except Exception as e:
                        st.error(f"Failed to generate answer: {e}")
        except Exception as e:
            st.error(f"Error in Q&A section: {e}")
    else:
        st.info("Upload and parse a document to ask questions.")
# --- Right Column: Chunks ---
# Lists every parsed chunk in its own expander: narration text, an optional
# parsed table, and any images referenced by the chunk's blocks.
with col3:
    st.header("3. Relevant Chunks")
    if parsed:
        try:
            chunks = parsed.get('chunks', [])
            for idx, chunk in enumerate(chunks):
                # The label is built with a safe lookup: it is evaluated
                # outside the per-chunk try, so a chunk without 'type' would
                # otherwise stop every remaining chunk from rendering.
                chunk_type = str(chunk.get('type') or 'unknown').capitalize()
                with st.expander(f"Chunk {idx} - {chunk_type}"):
                    try:
                        st.write(chunk.get('narration', ''))
                        if 'table_structure' in chunk:
                            st.write("**Parsed Table:**")
                            st.table(chunk['table_structure'])
                        images_dir = parsed.get('images_dir')
                        for blk in chunk.get('blocks', []):
                            # NOTE(review): the block type is compared against
                            # 'img_path', the same string used as the path key
                            # - confirm this matches the parser output.
                            if blk.get('type') != 'img_path':
                                continue
                            rel_path = blk.get('img_path', '')
                            if not rel_path or not images_dir:
                                # An empty path would join to the images
                                # directory itself, which is not an image.
                                continue
                            img_path = os.path.join(images_dir, rel_path)
                            if os.path.isfile(img_path):
                                st.image(img_path, caption=os.path.basename(img_path))
                    except Exception as e:
                        st.error(f"Error displaying chunk: {e}")
            st.info(f"Total chunks: {len(chunks)}")
        except Exception as e:
            st.error(f"Error displaying chunks: {e}")
    else:
        st.info("No chunks to display. Parse a document first.")
|