File size: 4,306 Bytes
b9ae50a
23a7785
b9ae50a
 
 
f8d2230
3246e10
f8d2230
b9ae50a
23a7785
3246e10
 
 
23a7785
f8d2230
3246e10
 
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
23a7785
 
2cdf48b
f8d2230
 
23a7785
 
 
c7c0d2c
2cdf48b
c7c0d2c
2cdf48b
 
 
c7c0d2c
d75a194
b9ae50a
9564b65
 
f8d2230
 
b9ae50a
 
 
 
f8d2230
 
 
 
 
 
 
 
 
 
 
 
 
b9ae50a
f8d2230
 
23a7785
f8d2230
b9ae50a
 
23a7785
b9ae50a
f8d2230
b9ae50a
 
 
 
f8d2230
 
23a7785
b9ae50a
 
c7c0d2c
f8d2230
b9ae50a
 
 
 
 
 
 
 
 
 
 
c7c0d2c
f8d2230
 
 
 
 
c7c0d2c
f8d2230
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import streamlit as st
import tempfile
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient
from urllib.parse import quote_plus
from document_chunker import DocumentChunker

# === MongoDB connection via Hugging Face secrets ===
# All connection settings are read from environment variables. MONGO_DB is
# optional (defaults to "grant_docs"); the other three are required.
_REQUIRED_MONGO_VARS = ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
_missing_mongo_vars = [name for name in _REQUIRED_MONGO_VARS if not os.getenv(name)]
if _missing_mongo_vars:
    # Fail early with a readable message: quote_plus(None) would otherwise
    # raise an opaque TypeError, and an unset cluster would silently build a
    # URI that points at the literal host "None".
    raise RuntimeError(
        "Missing required MongoDB environment variable(s): "
        + ", ".join(_missing_mongo_vars)
    )

# Percent-encode the credentials so characters such as "@" or ":" cannot
# break the structure of the connection URI.
user = quote_plus(os.environ["MONGO_USER"])
password = quote_plus(os.environ["MONGO_PASS"])
cluster = os.environ["MONGO_CLUSTER"]
db_name = os.environ.get("MONGO_DB", "grant_docs")

# NOTE(review): tlsAllowInvalidCertificates=true turns off server certificate
# validation (set both in the URI and as a keyword argument below). Confirm
# this is really required for the deployment; it weakens TLS.
# NOTE(review): Streamlit re-executes this script on every interaction, so a
# new MongoClient is constructed on each rerun; consider caching the client
# (e.g. with st.cache_resource) if connection churn becomes a problem.
mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
db = client[db_name]

# === Streamlit UI ===
# Page setup, sidebar settings (target collection and grant-application
# flag), and the file uploader. Defines `selected_collection`,
# `is_grant_app` and `uploaded_file` for the rest of the script.
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")
    try:
        # Keep the try body limited to the MongoDB call so that an unrelated
        # widget error is not misreported as a failure to list collections.
        existing_collections = db.list_collection_names()
    except Exception as e:
        # UI boundary: report the problem and fall back to the default
        # collection name instead of aborting the whole page.
        st.error(f"Failed to list collections: {e}")
        selected_collection = "doc_chunks_cat"
    else:
        # The extra entry is a sentinel that switches to free-text input below.
        existing_collections.append("Create New Collection")
        default_index = existing_collections.index("doc_chunks_cat") if "doc_chunks_cat" in existing_collections else 0
        selected_collection = st.selectbox("Choose MongoDB Collection", existing_collections, index=default_index)

    if selected_collection == "Create New Collection":
        # Strip surrounding whitespace so a blank-looking name such as "  "
        # is treated as "nothing entered" rather than used as a collection.
        selected_collection = st.sidebar.text_input("Enter Collection Name:").strip()
        if not selected_collection:
            st.warning("⚠️ Enter a collection name to proceed.")
            st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)

uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])

# === Ingest the uploaded document ===
# Streamlit re-runs this script on every interaction, and the uploader keeps
# returning the same file until the user clears it. Each upload is therefore
# ingested exactly once; its outcome is kept in session state and re-rendered
# on later runs. (Resetting the state and calling st.rerun() after processing
# would re-ingest the still-present upload in an endless loop and discard the
# result messages before they could be read.)
_PROCESSED_UPLOAD_KEY = "processed_upload_token"
_LAST_RESULT_KEY = "last_ingest_result"
_PREVIEW_LIMIT = 3  # number of chunks previewed after a successful insert
_EXCERPT_CHARS = 400  # characters of chunk text shown per preview


def _upload_token(upload) -> str:
    """Return a value that identifies this particular upload across reruns."""
    # `file_id` is only present on some Streamlit versions, hence getattr;
    # the name/size pair is a best-effort fallback.
    return str(getattr(upload, "file_id", None) or f"{upload.name}:{upload.size}")


def _save_upload_to_temp(upload) -> str:
    """Write the upload to a uniquely named temp file and return its path.

    A generated name (instead of the user-supplied one) avoids collisions
    between concurrent sessions and keeps a crafted file name from escaping
    the temp directory. The original extension is preserved.
    """
    # NOTE(review): assumes DocumentChunker selects its parser from the file
    # extension only and does not need the original file name - confirm.
    suffix = Path(upload.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(upload.getbuffer())
    return tmp.name


def _summarize_chunk(chunk: dict) -> dict:
    """Extract the preview fields of one chunk, tolerating missing keys."""
    meta = chunk.get("metadata") or {}
    return {
        "header": meta.get("header"),
        "excerpt": str(chunk.get("text", ""))[:_EXCERPT_CHARS],
        "topics": [str(topic) for topic in meta.get("topics") or []],
        "category": meta.get("category", "N/A"),
        "confidence_score": meta.get("confidence_score"),
    }


def _is_valid_progress(value) -> bool:
    """Return True if st.progress accepts `value` (int 0-100 or float 0.0-1.0)."""
    if isinstance(value, bool):
        return False
    if isinstance(value, int):
        return 0 <= value <= 100
    if isinstance(value, float):
        return 0.0 <= value <= 1.0
    return False


def _process_upload(upload, collection, collection_name: str, is_grant_app: bool) -> dict:
    """Chunk the upload, insert the chunks, and return a summary of the outcome.

    The returned dict has the keys ``file_name``, ``collection_name``,
    ``status`` ("duplicate", "empty" or "inserted"), ``count`` and
    ``preview`` (summaries of the first few chunks). Errors raised by the
    chunker or by MongoDB propagate to the caller; the temp file is removed
    either way.
    """
    file_name = upload.name
    result = {
        "file_name": file_name,
        "collection_name": collection_name,
        "status": "duplicate",
        "count": 0,
        "preview": [],
    }

    # Skip files whose title is already present, before writing anything to disk.
    if collection.find_one({"metadata.title": file_name}):
        return result

    temp_path = _save_upload_to_temp(upload)
    try:
        with st.spinner("⏳ Processing with DocumentChunker..."):
            chunks = DocumentChunker().process_document(temp_path)

        if not chunks:
            result["status"] = "empty"
            return result

        # One timestamp for the whole document so all of its chunks agree.
        uploaded_at = datetime.now().isoformat()
        for chunk in chunks:
            chunk.setdefault("metadata", {}).update({
                "title": file_name,
                "uploaded_at": uploaded_at,
                "is_grant_app": is_grant_app,
            })
        # Single round trip instead of one insert per chunk.
        collection.insert_many(chunks)

        result["status"] = "inserted"
        result["count"] = len(chunks)
        result["preview"] = [_summarize_chunk(c) for c in chunks[:_PREVIEW_LIMIT]]
        return result
    finally:
        # Always clean up, including when chunking or inserting fails.
        try:
            os.remove(temp_path)
        except OSError as e:
            st.warning(f"⚠️ Could not delete temp file: {e}")


def _render_result(result: dict) -> None:
    """Show the stored outcome of the most recent ingestion."""
    st.success(f"Uploaded `{result['file_name']}`")

    if result["status"] == "duplicate":
        st.warning("⚠️ This file already exists in the collection. Skipping...")
        return
    if result["status"] == "empty":
        st.warning("⚠️ No chunks were generated.")
        return

    count = result["count"]
    st.success(f"✅ {count} chunks inserted into `{result['collection_name']}`")

    for position, item in enumerate(result["preview"], start=1):
        st.subheader(f"Chunk {position}: {item['header'] or 'No Header'}")
        st.markdown(item["excerpt"] + "...")
        st.caption(f"Topics: {', '.join(item['topics'])} | Category: {item['category']}")
        # st.progress rejects other types and out-of-range values; a bad
        # score must not break the page after the data is already stored.
        if _is_valid_progress(item["confidence_score"]):
            st.progress(item["confidence_score"])

    remaining = count - len(result["preview"])
    if remaining > 0:
        st.info(f"... and {remaining} more chunks processed.")


if uploaded_file is None:
    # Uploader cleared: forget the previous upload so the same file can be
    # submitted again later.
    st.session_state.pop(_PROCESSED_UPLOAD_KEY, None)
    st.session_state.pop(_LAST_RESULT_KEY, None)
else:
    _current_token = _upload_token(uploaded_file)
    if st.session_state.get(_PROCESSED_UPLOAD_KEY) != _current_token:
        # New upload: ingest it with the settings chosen at this moment.
        st.session_state[_LAST_RESULT_KEY] = _process_upload(
            uploaded_file, db[selected_collection], selected_collection, is_grant_app
        )
        st.session_state[_PROCESSED_UPLOAD_KEY] = _current_token

    _render_result(st.session_state[_LAST_RESULT_KEY])