# Scraped Hugging Face file-viewer header (not part of the program):
#   Tesneem's picture
#   Update app.py
#   f8d2230 verified
#   raw
#   history blame
#   4.31 kB
import os
import streamlit as st
import tempfile
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient
from urllib.parse import quote_plus
from document_chunker import DocumentChunker
# === MongoDB connection via Hugging Face secrets ===
# Fail fast with a readable message when a required secret is absent.
# quote_plus(None) would otherwise raise an opaque TypeError, and a missing
# cluster would silently produce a URI containing the literal "None".
_missing_secrets = [
    name
    for name in ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
    if not os.getenv(name)
]
if _missing_secrets:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_secrets)}"
    )

# Credentials are percent-encoded because they are embedded in the URI.
user = quote_plus(os.environ["MONGO_USER"])
password = quote_plus(os.environ["MONGO_PASS"])
cluster = os.environ["MONGO_CLUSTER"]
db_name = os.environ.get("MONGO_DB", "grant_docs")

# NOTE(review): tlsAllowInvalidCertificates=true disables server certificate
# verification and exposes the connection to man-in-the-middle attacks. It is
# kept as-is because the reason it was enabled is not visible here -- confirm
# whether it is still required and remove it if not.
mongo_uri = (
    f"mongodb+srv://{user}:{password}@{cluster}/{db_name}"
    "?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
)


@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri: str) -> MongoClient:
    """Return one shared MongoClient per URI.

    Streamlit re-executes this script on every interaction; without caching,
    each rerun would build a new client (with its own connection pool) that
    is never closed. The spinner is disabled so no element is rendered before
    ``st.set_page_config`` runs.
    """
    return MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
        serverSelectionTimeoutMS=20000,
    )


client = _get_mongo_client(mongo_uri)
db = client[db_name]
# === Streamlit UI ===
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# NOTE(review): the nesting below is reconstructed -- the settings widgets are
# placed in the sidebar and the uploader in the main area; confirm against the
# intended layout.
with st.sidebar:
    st.header("Settings")

    # Offer the existing collections plus a sentinel entry for creating a new
    # one; fall back to the default collection name if listing fails.
    try:
        existing_collections = db.list_collection_names()
        existing_collections.append("Create New Collection")
        default_index = (
            existing_collections.index("doc_chunks_cat")
            if "doc_chunks_cat" in existing_collections
            else 0
        )
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )
    except Exception as e:
        st.error(f"Failed to list collections: {e}")
        selected_collection = "doc_chunks_cat"

    if selected_collection == "Create New Collection":
        # Strip so a whitespace-only entry is treated as "no name yet" rather
        # than being used as a collection name.
        selected_collection = st.sidebar.text_input("Enter Collection Name:").strip()
        if not selected_collection:
            st.warning("⚠️ Enter a collection name to proceed.")
            # Halt this run; the script re-executes once a name is entered.
            st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)

uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])
# === Store session state after upload ===
# The uploader widget keeps returning the same file on every rerun, so each
# upload is given an identity and remembered once handled. Without this, the
# session reset at the end of processing makes the file look new again and it
# is saved and processed on every subsequent run.
if uploaded_file is None:
    # Uploader cleared: forget the last handled upload so the same file can be
    # uploaded again later.
    st.session_state.pop("processed_upload_id", None)
    upload_id = None
else:
    # Prefer Streamlit's per-upload id when present; fall back to name + size.
    upload_id = (
        getattr(uploaded_file, "file_id", None)
        or f"{uploaded_file.name}:{uploaded_file.size}"
    )

if (
    uploaded_file is not None
    and "ready_to_process" not in st.session_state
    and st.session_state.get("processed_upload_id") != upload_id
):
    # Write into a private temp directory so concurrent sessions uploading a
    # file with the same name cannot overwrite each other, and keep only the
    # final path component of the client-supplied name. The original file name
    # (and therefore its extension) is preserved for the chunker.
    temp_dir = Path(tempfile.mkdtemp(prefix="doc_chunker_"))
    temp_path = temp_dir / Path(uploaded_file.name).name
    with open(temp_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Snapshot the settings in effect at upload time.
    st.session_state["uploaded_file_name"] = uploaded_file.name
    st.session_state["collection_name"] = selected_collection
    st.session_state["is_grant_app"] = is_grant_app
    st.session_state["temp_path"] = str(temp_path)
    st.session_state["upload_id"] = upload_id
    st.session_state["ready_to_process"] = True
    st.rerun()

# === Process document ===
if st.session_state.get("ready_to_process"):
    file_name = st.session_state["uploaded_file_name"]
    collection_name = st.session_state["collection_name"]
    is_grant_app = st.session_state["is_grant_app"]
    temp_path = st.session_state["temp_path"]

    try:
        st.success(f"Uploaded `{file_name}`")
        collection = db[collection_name]

        # A document is considered already ingested when any chunk in the
        # collection carries its file name as metadata.title.
        if collection.find_one({"metadata.title": file_name}):
            st.warning("⚠️ This file already exists in the collection. Skipping...")
        else:
            st.write("⏳ Processing with DocumentChunker...")
            chunker = DocumentChunker()
            chunks = chunker.process_document(temp_path)

            if chunks:
                # One timestamp for the whole document so all of its chunks
                # share the same uploaded_at value.
                uploaded_at = datetime.now().isoformat()
                for chunk in chunks:
                    chunk['metadata'].update({
                        "title": file_name,
                        "uploaded_at": uploaded_at,
                        "is_grant_app": is_grant_app,
                    })
                # Single bulk write instead of one round trip per chunk.
                collection.insert_many(chunks)
                st.success(f"✅ {len(chunks)} chunks inserted into `{collection_name}`")

                # Preview the first three chunks.
                for i, c in enumerate(chunks[:3]):
                    st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                    st.markdown(c['text'][:400] + "...")
                    st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                    st.progress(c['metadata']['confidence_score'])
                if len(chunks) > 3:
                    st.info(f"... and {len(chunks)-3} more chunks processed.")
            else:
                st.warning("⚠️ No chunks were generated.")
    finally:
        # Clean up (runs even when chunking or the insert raises).
        try:
            os.remove(temp_path)
            os.rmdir(os.path.dirname(temp_path))
        except OSError as e:
            st.warning(f"⚠️ Could not delete temp file: {e}")

        # Reset session, remembering which upload was handled so it is not
        # picked up again while it remains in the uploader. No rerun is
        # triggered here: rerunning would immediately wipe the results above.
        st.session_state["processed_upload_id"] = st.session_state.get("upload_id")
        for key in [
            "uploaded_file_name",
            "collection_name",
            "is_grant_app",
            "temp_path",
            "upload_id",
            "ready_to_process",
        ]:
            st.session_state.pop(key, None)