Spaces:
Running
Running
File size: 4,306 Bytes
b9ae50a 23a7785 b9ae50a f8d2230 3246e10 f8d2230 b9ae50a 23a7785 3246e10 23a7785 f8d2230 3246e10 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 2cdf48b f8d2230 23a7785 c7c0d2c 2cdf48b c7c0d2c 2cdf48b c7c0d2c d75a194 b9ae50a 9564b65 f8d2230 b9ae50a f8d2230 b9ae50a f8d2230 23a7785 f8d2230 b9ae50a 23a7785 b9ae50a f8d2230 b9ae50a f8d2230 23a7785 b9ae50a c7c0d2c f8d2230 b9ae50a c7c0d2c f8d2230 c7c0d2c f8d2230 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import streamlit as st
import tempfile
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient
from urllib.parse import quote_plus
from document_chunker import DocumentChunker
# === MongoDB connection via Hugging Face secrets ===
def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing secret
            fails with a readable message instead of a TypeError from
            quote_plus(None) or a connection attempt to the host "None".
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Missing required environment variable {name!r}; "
            "define it in the Space secrets before starting the app."
        )
    return value


# show_spinner=False keeps this call from emitting any UI element, since it
# runs before st.set_page_config() further down in the script.
@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri: str) -> MongoClient:
    """Build the MongoClient once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every interaction; without the
    cache each rerun would open a fresh client and connection pool.
    """
    # NOTE(security): tlsAllowInvalidCertificates=True disables TLS certificate
    # validation. Kept as in the original deployment; review before relying on
    # this connection for sensitive data.
    return MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
        serverSelectionTimeoutMS=20000,
    )


# Credentials are percent-encoded because they are embedded in the URI.
user = quote_plus(_require_env("MONGO_USER"))
password = quote_plus(_require_env("MONGO_PASS"))
cluster = _require_env("MONGO_CLUSTER")
db_name = os.getenv("MONGO_DB", "grant_docs")
mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
client = _get_mongo_client(mongo_uri)
db = client[db_name]
# === Streamlit UI ===
# Page setup, then the sidebar settings, then the uploader.
# NOTE(review): the source lost its indentation; the nesting below (toggle and
# collection-name prompt inside the sidebar, uploader in the main area) is the
# reconstructed layout -- confirm against the deployed app.
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")

    try:
        # Existing collections plus a sentinel entry for creating a new one.
        collection_options = [*db.list_collection_names(), "Create New Collection"]
        if "doc_chunks_cat" in collection_options:
            preselected = collection_options.index("doc_chunks_cat")
        else:
            preselected = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            collection_options,
            index=preselected,
        )
    except Exception as exc:
        # Listing failed: report it and fall back to the default collection.
        st.error(f"Failed to list collections: {exc}")
        selected_collection = "doc_chunks_cat"

    if selected_collection == "Create New Collection":
        selected_collection = st.sidebar.text_input("Enter Collection Name:")
        if not selected_collection:
            # Nothing typed yet: halt this script run until a name is entered.
            st.warning("⚠️ Enter a collection name to proceed.")
            st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)

uploaded_file = st.file_uploader(
    "Upload a DOCX, TXT, or PDF file",
    type=["docx", "txt", "pdf"],
)
# === Store session state after upload ===
# Session keys that exist only while an upload is staged for processing.
_STAGED_KEYS = (
    "uploaded_file_name",
    "collection_name",
    "is_grant_app",
    "temp_path",
    "upload_signature",
    "ready_to_process",
)

# The uploader widget keeps returning the same file on every rerun, so the
# last handled upload is remembered in "last_processed_upload". Without it the
# file would be staged and processed again after every reset, looping forever.
if uploaded_file is None:
    # Uploader was cleared: allow the same file to be uploaded again later.
    st.session_state.pop("last_processed_upload", None)
elif "ready_to_process" not in st.session_state:
    payload = uploaded_file.getbuffer()
    upload_signature = f"{uploaded_file.name}:{payload.nbytes}"
    if st.session_state.get("last_processed_upload") != upload_signature:
        # Keep only the basename and write into a private directory so that
        # concurrent sessions uploading equally named files cannot overwrite
        # each other and the name cannot point outside the temp directory.
        safe_name = Path(uploaded_file.name).name
        temp_path = Path(tempfile.mkdtemp(prefix="doc_chunker_")) / safe_name
        with open(temp_path, "wb") as f:
            f.write(payload)
        # Snapshot the settings as they were at upload time.
        st.session_state["uploaded_file_name"] = uploaded_file.name
        st.session_state["collection_name"] = selected_collection
        st.session_state["is_grant_app"] = is_grant_app
        st.session_state["temp_path"] = str(temp_path)
        st.session_state["upload_signature"] = upload_signature
        st.session_state["ready_to_process"] = True
        st.rerun()

# === Process document ===
if st.session_state.get("ready_to_process"):
    file_name = st.session_state["uploaded_file_name"]
    collection_name = st.session_state["collection_name"]
    is_grant_app = st.session_state["is_grant_app"]
    temp_path = st.session_state["temp_path"]
    st.success(f"Uploaded `{file_name}`")
    try:
        collection = db[collection_name]
        if collection.find_one({"metadata.title": file_name}):
            st.warning("⚠️ This file already exists in the collection. Skipping...")
        else:
            st.write("⏳ Processing with DocumentChunker...")
            chunks = DocumentChunker().process_document(temp_path)
            if chunks:
                # One timestamp for the whole document (naive local time, same
                # format the original stored) instead of one per chunk.
                uploaded_at = datetime.now().isoformat()
                for chunk in chunks:
                    chunk.setdefault("metadata", {}).update({
                        "title": file_name,
                        "uploaded_at": uploaded_at,
                        "is_grant_app": is_grant_app,
                    })
                # Single bulk write instead of one round trip per chunk.
                collection.insert_many(chunks)
                st.success(f"✅ {len(chunks)} chunks inserted into `{collection_name}`")
                # Preview the first three chunks; tolerate missing metadata
                # keys so a display problem cannot fail a finished upload.
                for i, c in enumerate(chunks[:3], start=1):
                    meta = c.get("metadata", {})
                    text = c.get("text", "")
                    st.subheader(f"Chunk {i}: {meta.get('header') or 'No Header'}")
                    st.markdown(text[:400] + ("..." if len(text) > 400 else ""))
                    topics = ", ".join(str(t) for t in (meta.get("topics") or []))
                    st.caption(f"Topics: {topics} | Category: {meta.get('category', 'N/A')}")
                    score = meta.get("confidence_score")
                    if score is not None:
                        st.progress(score)
                if len(chunks) > 3:
                    st.info(f"... and {len(chunks)-3} more chunks processed.")
            else:
                st.warning("⚠️ No chunks were generated.")
    finally:
        # Runs on failure too (the exception still propagates to Streamlit),
        # so a crash cannot leave the session stuck in "ready_to_process" or
        # leak the temp file.
        # Clean up
        try:
            os.remove(temp_path)
            os.rmdir(os.path.dirname(temp_path))
        except OSError as e:
            st.warning(f"⚠️ Could not delete temp file: {e}")
        # Reset session, remembering this upload as handled. There is no
        # st.rerun() here: rerunning would wipe the result messages at once.
        st.session_state["last_processed_upload"] = st.session_state.get("upload_signature")
        for key in _STAGED_KEYS:
            st.session_state.pop(key, None)
|