Spaces:

masadonline
/

Quasa

Sleeping

masadonline committed on May 17

Commit

20cc7a2

verified ·

1 Parent(s): de2271c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -179,14 +179,26 @@ def send_twilio_message(client, conversation_sid, body):
 def setup_knowledge_base():
     folder_path = "docs"
     all_text = ""
-    for file in os.listdir(folder_path):
-        path = os.path.join(folder_path, file)
-        if file.endswith(".pdf"):
-            raw_text = extract_text_from_pdf(path)
-            all_text += clean_extracted_text(raw_text) + "\n"
-        elif file.endswith((".docx", ".doc")):
-            all_text += extract_text_from_docx(path) + "\n"
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -196,6 +208,7 @@ def setup_knowledge_base():
     index.add(np.array(embeddings).astype('float32'))
     return index, model, chunks
 # --- Monitor Conversations ---
 def start_conversation_monitor(client, index, embed_model, text_chunks):
     processed_convos = set()

 def setup_knowledge_base():
     folder_path = "docs"
     all_text = ""
+    # Process PDFs
+    for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
+        pdf_path = os.path.join(folder_path, filename)
+        text, tables = extract_text_from_pdf(pdf_path)
+        all_text += clean_extracted_text(text) + "\n"
+        all_text += _format_tables_internal(tables) + "\n"
+    # Process CSVs
+    for filename in ["CustomerOrders.csv", "Products.csv"]:
+        csv_path = os.path.join(folder_path, filename)
+        try:
+            with open(csv_path, newline='', encoding='utf-8') as csvfile:
+                reader = csv.reader(csvfile)
+                lines = [" | ".join(row) for row in reader]
+                all_text += "\n".join(lines) + "\n"
+        except Exception as e:
+            print(f"❌ Error reading {filename}: {e}")
+    # Tokenization & chunking
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
     model = SentenceTransformer('all-mpnet-base-v2')
     index.add(np.array(embeddings).astype('float32'))
     return index, model, chunks
 # --- Monitor Conversations ---
 def start_conversation_monitor(client, index, embed_model, text_chunks):
     processed_convos = set()