Spaces:

masadonline
/

Quasa

Sleeping

App Files Files Community

masadonline committed on May 18

Commit

c99b5df

verified ·

1 Parent(s): 95b3fe6

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -19

app.py CHANGED Viewed

@@ -17,11 +17,13 @@ from twilio.base.exceptions import TwilioRestException
 import pdfplumber
 import datetime
 import csv
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
-# ---------------- PDF & DOCX Extraction ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
     formatted_tables = []
@@ -68,6 +70,26 @@ def extract_text_from_docx(docx_path):
     except:
         return ""
 # ---------------- Chunking ----------------
 def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32):
     tokens = tokenizer.tokenize(text)
@@ -96,7 +118,7 @@ def generate_answer_with_groq(question, context):
     }
     prompt = (
         f"Customer asked: '{question}'\n\n"
-        f"Here is the relevant product or policy info to help:\n{context}\n\n"
         f"Respond in a friendly and helpful tone as a toy shop support agent."
     )
     payload = {
@@ -144,22 +166,27 @@ def setup_knowledge_base():
     folder_path = "docs"
     all_text = ""
-    for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
-        pdf_path = os.path.join(folder_path, filename)
-        text, tables = extract_text_from_pdf(pdf_path)
-        all_text += clean_extracted_text(text) + "\n"
-        all_text += _format_tables_internal(tables) + "\n"
-    for filename in ["CustomerOrders.csv", "Products.csv"]:
-        path = os.path.join(folder_path, filename)
-        try:
-            with open(path, newline='', encoding='utf-8') as csvfile:
-                reader = csv.DictReader(csvfile)
-                for row in reader:
-                    line = ' | '.join(f"{k}: {v}" for k, v in row.items())
-                    all_text += line + "\n"
-        except Exception as e:
-            print(f"CSV read error: {e}")
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
@@ -196,7 +223,7 @@ def start_conversation_monitor(client, index, embed_model, text_chunks):
             threading.Thread(target=poll_convo, args=(convo.sid,), daemon=True).start()
 # ---------------- Main Entry ----------------
-if _name_ == "_main_":
     st.title("🤖 ToyBot WhatsApp Assistant")
     st.write("Initializing knowledge base...")

 import pdfplumber
 import datetime
 import csv
+import json
+import re
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
+# ---------------- PDF & DOCX & JSON Extraction ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
     formatted_tables = []
     except:
         return ""
def _flatten_record(record):
    """Render the scalar fields of *record* as newline-separated ``key: value`` lines.

    Values that are themselves dicts or lists are skipped so nested
    structures are not dumped into the text as Python reprs.
    """
    return "\n".join(
        f"{key}: {value}"
        for key, value in record.items()
        if not isinstance(value, (dict, list))
    )


def load_json_data(json_path):
    """Load a JSON file and flatten it into plain text.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A string built from the decoded document:

        * top-level object -> one ``key: value`` line per scalar field;
        * top-level array  -> one paragraph per element, separated by a
          blank line. Dict elements are flattened like a top-level object,
          scalar elements are kept as their string form, and nested lists
          are skipped;
        * any other top-level value -> its pretty-printed JSON form.

        An empty string is returned (after printing the error) when the
        file cannot be read or parsed.
    """
    # Only the read/parse step can realistically fail; keep the best-effort
    # guard around it so one bad file does not abort the caller.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"JSON read error: {e}")
        return ""

    if isinstance(data, dict):
        return _flatten_record(data)

    if isinstance(data, list):
        paragraphs = []
        for item in data:
            if isinstance(item, dict):
                paragraphs.append(_flatten_record(item))
            elif not isinstance(item, list):
                # Scalar entries (strings, numbers, booleans, null) used to be
                # dropped silently; keep them so their content is not lost.
                paragraphs.append(str(item))
        return "\n\n".join(paragraphs)

    return json.dumps(data, ensure_ascii=False, indent=2)
 # ---------------- Chunking ----------------
 def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32):
     tokens = tokenizer.tokenize(text)
     }
     prompt = (
         f"Customer asked: '{question}'\n\n"
+        f"Here is the relevant information to help:\n{context}\n\n"
         f"Respond in a friendly and helpful tone as a toy shop support agent."
     )
     payload = {
     folder_path = "docs"
     all_text = ""
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        if filename.endswith(".pdf"):
+            text, tables = extract_text_from_pdf(file_path)
+            all_text += clean_extracted_text(text) + "\n"
+            all_text += _format_tables_internal(tables) + "\n"
+        elif filename.endswith(".docx"):
+            text = extract_text_from_docx(file_path)
+            all_text += clean_extracted_text(text) + "\n"
+        elif filename.endswith(".json"):
+            text = load_json_data(file_path)
+            all_text += text + "\n"
+        elif filename.endswith(".csv"):
+            try:
+                with open(file_path, newline='', encoding='utf-8') as csvfile:
+                    reader = csv.DictReader(csvfile)
+                    for row in reader:
+                        line = ' | '.join(f"{k}: {v}" for k, v in row.items())
+                        all_text += line + "\n"
+            except Exception as e:
+                print(f"CSV read error: {e}")
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
             threading.Thread(target=poll_convo, args=(convo.sid,), daemon=True).start()
 # ---------------- Main Entry ----------------
+if __name__ == "__main__":
     st.title("🤖 ToyBot WhatsApp Assistant")
     st.write("Initializing knowledge base...")