# Hugging Face Spaces — status: Running
import os | |
import streamlit as st | |
import tempfile | |
from datetime import datetime | |
from pathlib import Path | |
from pymongo import MongoClient | |
from urllib.parse import quote_plus | |
from document_chunker import DocumentChunker | |
# === MongoDB connection via Hugging Face secrets ===
_REQUIRED_ENV_VARS = ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")

# Fail fast with a readable message instead of the opaque TypeError that
# quote_plus(None) raises, or a URI whose host is the literal string "None".
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )


@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri):
    """Return a MongoClient for *uri*, created once and reused across reruns.

    Streamlit re-executes this script on every widget interaction; without the
    cache each rerun would build a new client. ``show_spinner=False`` keeps
    this call from emitting a UI element.
    """
    # NOTE(review): tlsAllowInvalidCertificates=True turns off TLS certificate
    # verification. Kept to preserve current behaviour — confirm whether the
    # deployment really needs it and remove it (here and in the URI) if not.
    return MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
        serverSelectionTimeoutMS=20000,
    )


# Credentials are percent-encoded because they are embedded in the URI.
user = quote_plus(os.getenv("MONGO_USER"))
password = quote_plus(os.getenv("MONGO_PASS"))
cluster = os.getenv("MONGO_CLUSTER")
db_name = os.environ.get("MONGO_DB", "grant_docs")
mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
client = _get_mongo_client(mongo_uri)
db = client[db_name]
# === Streamlit UI ===
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# NOTE(review): the source's indentation was lost. The nesting below (settings
# widgets inside the sidebar, uploader in the main area) is a reconstruction —
# confirm against the deployed layout.
with st.sidebar:
    st.header("Settings")
    try:
        # Only the database call is expected to fail, so only it is guarded.
        existing_collections = db.list_collection_names()
    except Exception as e:
        st.error(f"Failed to list collections: {e}")
        selected_collection = "doc_chunks_cat"
    else:
        existing_collections.append("Create New Collection")
        # Preselect "doc_chunks_cat" when it exists, otherwise the first entry.
        if "doc_chunks_cat" in existing_collections:
            default_index = existing_collections.index("doc_chunks_cat")
        else:
            default_index = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )

    if selected_collection == "Create New Collection":
        # Strip so a whitespace-only entry is treated as "no name entered"
        # instead of silently becoming a collection name.
        selected_collection = st.sidebar.text_input("Enter Collection Name:").strip()
        if not selected_collection:
            st.warning("⚠️ Enter a collection name to proceed.")
            st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)

uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])
# === Process the uploaded document ===
def _ingest_upload(uploaded, collection_name, grant_app):
    """Chunk *uploaded* with DocumentChunker and store the chunks in MongoDB.

    The file is skipped when the target collection already holds a document
    whose ``metadata.title`` equals the upload's name. On success, a preview of
    the first three chunks is rendered.

    Args:
        uploaded: The object returned by ``st.file_uploader``.
        collection_name: Name of the MongoDB collection to insert into.
        grant_app: Stored on every chunk as ``metadata.is_grant_app``.
    """
    file_name = uploaded.name
    st.success(f"Uploaded `{file_name}`")

    collection = db[collection_name]
    if collection.find_one({"metadata.title": file_name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
        return

    st.write("⏳ Processing with DocumentChunker...")

    # Keep only the basename of the client-supplied name so it cannot point
    # outside the working directory.
    safe_name = Path(file_name).name
    if safe_name in ("", ".."):
        safe_name = "upload"

    # A private temporary directory avoids clashes between sessions uploading
    # files with the same name, and is removed even if chunking raises.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_path = Path(work_dir) / safe_name
        temp_path.write_bytes(uploaded.getbuffer())
        chunks = DocumentChunker().process_document(str(temp_path))

    if not chunks:
        st.warning("⚠️ No chunks were generated.")
        return

    # One timestamp for the whole document, in the same naive local ISO format
    # that was stored before.
    uploaded_at = datetime.now().isoformat()
    for chunk in chunks:
        chunk['metadata'].update({
            "title": file_name,
            "uploaded_at": uploaded_at,
            "is_grant_app": grant_app,
        })
    # Single bulk write instead of one round trip per chunk.
    collection.insert_many(chunks)
    st.success(f"✅ {len(chunks)} chunks inserted into `{collection_name}`")

    for position, preview in enumerate(chunks[:3], start=1):
        meta = preview['metadata']
        st.subheader(f"Chunk {position}: {meta.get('header') or 'No Header'}")
        st.markdown(preview['text'][:400] + "...")
        st.caption(f"Topics: {', '.join(meta['topics'])} | Category: {meta['category']}")
        # NOTE(review): st.progress accepts an int in 0-100 or a float in
        # 0.0-1.0; this assumes confidence_score is already in range — confirm.
        st.progress(meta['confidence_score'])
    if len(chunks) > 3:
        st.info(f"... and {len(chunks)-3} more chunks processed.")


# The uploader keeps returning the same file on every rerun. Remember which
# upload was handled so it is processed exactly once; resetting state and
# calling st.rerun() afterwards re-triggered processing in an endless loop and
# erased the results.
if uploaded_file is None:
    # File removed from the widget: allow the same file to be uploaded again.
    st.session_state.pop("processed_upload", None)
else:
    upload_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_upload") == upload_key:
        st.info(
            f"`{uploaded_file.name}` was already handled in this session. "
            "Remove it from the uploader to process another file."
        )
    else:
        # Mark before processing so a failure cannot re-trigger on every rerun.
        st.session_state["processed_upload"] = upload_key
        _ingest_upload(uploaded_file, selected_collection, is_grant_app)