Spaces:

masadonline
/

Quasa

Sleeping

App Files Files Community

masadonline committed on May 18

Commit

b089ae9

verified ·

1 Parent(s): 30db2dc

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -21

app.py CHANGED Viewed

@@ -18,10 +18,11 @@ import faiss
 from twilio.rest import Client
 from twilio.base.exceptions import TwilioRestException
-# Start time for filtering incoming messages
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
 # ---------------- PDF / DOCX / JSON LOADERS ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
@@ -68,15 +69,6 @@ def load_json_data(json_path):
         print(f"[JSON error] {e}")
         return ""
-def _format_tables_internal(tables):
-    formatted = []
-    for table in tables:
-        with StringIO() as csvfile:
-            writer = csv.writer(csvfile)
-            writer.writerows(table)
-            formatted.append(csvfile.getvalue())
-    return "\n\n".join(formatted)
 def clean_extracted_text(text):
     return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
@@ -154,25 +146,32 @@ def handle_incoming_messages(index, embed_model, tokenizer, text_chunks):
 # ---------------- STREAMLIT UI ----------------
 st.title("🎁 ToyShop Assistant – RAG WhatsApp Bot")
-uploaded_files = st.file_uploader("📎 Upload your documents (PDF, DOCX, JSON)", accept_multiple_files=True)
-if uploaded_files:
     full_text = ""
     all_tables = []
-    for file in uploaded_files:
-        ext = file.name.lower().split(".")[-1]
         if ext == "pdf":
-            text, tables = extract_text_from_pdf(file)
             all_tables.extend(tables)
         elif ext == "docx":
-            text = extract_text_from_docx(file)
         elif ext == "json":
-            text = load_json_data(file)
         else:
-            text = file.read().decode("utf-8")
         full_text += clean_extracted_text(text) + "\n\n"
-    # Load models
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
     embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     chunks = chunk_text(full_text, tokenizer)
@@ -181,10 +180,9 @@ if uploaded_files:
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
-    # Start listener thread
     if "listener_started" not in st.session_state:
         threading.Thread(target=handle_incoming_messages, args=(index, embed_model, tokenizer, chunks), daemon=True).start()
         st.session_state.listener_started = True
         st.success("✅ WhatsApp listener started.")
-    st.success(f"📚 Knowledge base built with {len(chunks)} chunks")

 from twilio.rest import Client
 from twilio.base.exceptions import TwilioRestException
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
+DOCS_FOLDER = "./docs"
 # ---------------- PDF / DOCX / JSON LOADERS ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
         print(f"[JSON error] {e}")
         return ""
 def clean_extracted_text(text):
     return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
 # ---------------- STREAMLIT UI ----------------
 st.title("🎁 ToyShop Assistant – RAG WhatsApp Bot")
+def load_all_documents(folder_path):
     full_text = ""
     all_tables = []
+    for filename in os.listdir(folder_path):
+        filepath = os.path.join(folder_path, filename)
+        ext = filename.lower().split(".")[-1]
         if ext == "pdf":
+            text, tables = extract_text_from_pdf(filepath)
             all_tables.extend(tables)
         elif ext == "docx":
+            text = extract_text_from_docx(filepath)
         elif ext == "json":
+            text = load_json_data(filepath)
         else:
+            try:
+                with open(filepath, "r", encoding="utf-8") as f:
+                    text = f.read()
+            except Exception:
+                continue
         full_text += clean_extracted_text(text) + "\n\n"
+    return full_text, all_tables
+with st.spinner("Loading documents..."):
+    full_text, tables = load_all_documents(DOCS_FOLDER)
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
     embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     chunks = chunk_text(full_text, tokenizer)
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
     if "listener_started" not in st.session_state:
         threading.Thread(target=handle_incoming_messages, args=(index, embed_model, tokenizer, chunks), daemon=True).start()
         st.session_state.listener_started = True
         st.success("✅ WhatsApp listener started.")
+    st.success(f"📚 Loaded {len(chunks)} text chunks from docs/ folder")