Spaces:
Running
Running
File size: 3,936 Bytes
b9ae50a 23a7785 b9ae50a 3246e10 b9ae50a 23a7785 3246e10 23a7785 3246e10 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 2cdf48b 23a7785 2cdf48b d75a194 b9ae50a 9564b65 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a e3a5d18 b9ae50a 23a7785 b9ae50a 8284226 e3a5d18 8284226 b9ae50a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker
from urllib.parse import quote_plus
# === MongoDB connection via Hugging Face secrets ===
# The credentials and cluster host are read from environment variables.
# Fail fast with an explicit message when one is missing: otherwise
# quote_plus(None) raises an opaque TypeError, and a missing cluster would
# silently produce a URI containing the literal text "None".
_missing_vars = [
    name
    for name in ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
    if not os.getenv(name)
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# User name and password are percent-encoded so that reserved characters
# (e.g. "@", ":", "/") cannot break the URI structure.
user = quote_plus(os.environ["MONGO_USER"])
password = quote_plus(os.environ["MONGO_PASS"])
cluster = os.environ["MONGO_CLUSTER"]

# The database name is optional and falls back to "grant_docs".
db_name = os.environ.get("MONGO_DB", "grant_docs")

# NOTE(review): tlsAllowInvalidCertificates=true turns off TLS certificate
# verification, which leaves the connection open to man-in-the-middle
# attacks. It is kept here to preserve the existing deployment behaviour;
# confirm whether it is still required and remove it if not.
mongo_uri = (
    f"mongodb+srv://{user}:{password}@{cluster}/{db_name}"
    "?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
)
client = MongoClient(
    mongo_uri,
    tls=True,
    tlsAllowInvalidCertificates=True,
    serverSelectionTimeoutMS=20000,
)
db = client[db_name]
# === Streamlit UI ===
# Page-level configuration and title.
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# Sidebar: choose the target collection and flag grant applications.
# Produces `selected_collection` (str) and `is_grant_app` (bool), which the
# upload section further down the script reads.
with st.sidebar:
    st.header("Settings")

    # Fetch collection names for the dropdown; the extra entry lets the user
    # type the name of a collection that does not exist yet.
    try:
        existing_collections = db.list_collection_names()
        existing_collections.append("Create New Collection")
        # Preselect "doc_chunks_cat" when present, otherwise the first entry.
        if "doc_chunks_cat" in existing_collections:
            default_index = existing_collections.index("doc_chunks_cat")
        else:
            default_index = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )
    except Exception as e:
        # Listing failed (e.g. the server is unreachable): report it and fall
        # back to the default collection name so the page still renders.
        st.error(f"Failed to list collections: {e}")
        selected_collection = "doc_chunks_cat"

    if selected_collection == "Create New Collection":
        # Strip surrounding whitespace so a blank-looking entry such as " "
        # is treated as empty instead of becoming a collection name.
        selected_collection = st.sidebar.text_input("Enter Collection Name:").strip()
        if not selected_collection:
            st.warning("⚠️ Enter a collection name to proceed.")
            # Halt this script run until a name has been entered.
            st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)
# === Upload, chunk and store ===
# Accepts one document, splits it with DocumentChunker and stores the chunks
# in the selected MongoDB collection. A file is identified by its name: if a
# chunk with the same "metadata.title" already exists, the upload is skipped.
uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")
    collection = db[selected_collection]

    # Check for a duplicate before writing anything to disk, so the skip path
    # leaves no temporary file behind.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        modified_time = datetime.now().isoformat()

        # DocumentChunker takes a file path, so the upload is spooled to disk.
        # A private temporary directory keeps the original file name (and
        # extension) while preventing collisions between uploads that share a
        # name; Path(...).name drops any directory components in the name.
        temp_dir = tempfile.TemporaryDirectory()
        try:
            temp_path = Path(temp_dir.name) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())

            st.write("⏳ Processing with DocumentChunker...")
            chunker = DocumentChunker()
            chunks = chunker.process_document(str(temp_path))
        finally:
            # Always remove the temporary copy, including when chunking raises
            # or yields nothing. Cleanup stays best-effort: a failure is
            # reported but does not abort the page.
            try:
                temp_dir.cleanup()
            except OSError as e:
                st.warning(f"⚠️ Could not delete temp file: {e}")

        if chunks:
            # Stamp every chunk with upload-level metadata, then store them in
            # a single batch instead of one round trip per chunk.
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                    "is_grant_app": is_grant_app,
                })
            collection.insert_many(chunks)
            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a few previews
            for i, c in enumerate(chunks[:3]):
                st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                st.markdown(c['text'][:400] + "...")
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                st.progress(c['metadata']['confidence_score'])
            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")

            # NOTE: no st.rerun() here. An immediate rerun discards the
            # success message and previews rendered above, and because the
            # uploader still holds the file, the rerun reports the document
            # that was just inserted as "already exists".
        else:
            st.warning("⚠️ No chunks were generated.")
|