masadonline committed
Commit b089ae9 · verified · 1 Parent(s): 30db2dc

Update app.py

Files changed (1)
  1. app.py +19 -21
app.py CHANGED
@@ -18,10 +18,11 @@ import faiss
 from twilio.rest import Client
 from twilio.base.exceptions import TwilioRestException
 
-# Start time for filtering incoming messages
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
 
+DOCS_FOLDER = "./docs"
+
 # ---------------- PDF / DOCX / JSON LOADERS ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
@@ -68,15 +69,6 @@ def load_json_data(json_path):
         print(f"[JSON error] {e}")
         return ""
 
-def _format_tables_internal(tables):
-    formatted = []
-    for table in tables:
-        with StringIO() as csvfile:
-            writer = csv.writer(csvfile)
-            writer.writerows(table)
-            formatted.append(csvfile.getvalue())
-    return "\n\n".join(formatted)
-
 def clean_extracted_text(text):
     return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
 
@@ -154,25 +146,32 @@ def handle_incoming_messages(index, embed_model, tokenizer, text_chunks):
 # ---------------- STREAMLIT UI ----------------
 st.title("🎁 ToyShop Assistant – RAG WhatsApp Bot")
 
-uploaded_files = st.file_uploader("📎 Upload your documents (PDF, DOCX, JSON)", accept_multiple_files=True)
-if uploaded_files:
+def load_all_documents(folder_path):
     full_text = ""
     all_tables = []
 
-    for file in uploaded_files:
-        ext = file.name.lower().split(".")[-1]
+    for filename in os.listdir(folder_path):
+        filepath = os.path.join(folder_path, filename)
+        ext = filename.lower().split(".")[-1]
         if ext == "pdf":
-            text, tables = extract_text_from_pdf(file)
+            text, tables = extract_text_from_pdf(filepath)
             all_tables.extend(tables)
         elif ext == "docx":
-            text = extract_text_from_docx(file)
+            text = extract_text_from_docx(filepath)
         elif ext == "json":
-            text = load_json_data(file)
+            text = load_json_data(filepath)
         else:
-            text = file.read().decode("utf-8")
+            try:
+                with open(filepath, "r", encoding="utf-8") as f:
+                    text = f.read()
+            except Exception:
+                continue
        full_text += clean_extracted_text(text) + "\n\n"
+    return full_text, all_tables
+
+with st.spinner("Loading documents..."):
+    full_text, tables = load_all_documents(DOCS_FOLDER)
 
-    # Load models
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
     embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     chunks = chunk_text(full_text, tokenizer)
@@ -181,10 +180,9 @@ if uploaded_files:
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
 
-    # Start listener thread
     if "listener_started" not in st.session_state:
         threading.Thread(target=handle_incoming_messages, args=(index, embed_model, tokenizer, chunks), daemon=True).start()
         st.session_state.listener_started = True
         st.success("✅ WhatsApp listener started.")
 
-    st.success(f"📚 Knowledge base built with {len(chunks)} chunks")
+st.success(f"📚 Loaded {len(chunks)} text chunks from docs/ folder")
 
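
handle_incoming_messages receives the index, the embedding model, the tokenizer and the chunk list, but its body sits outside this diff. The retrieval it would typically perform looks like the sketch below: embed the incoming question and ask the index for the k nearest chunks. This is an illustration, not the function from app.py.

# Illustrative retrieval sketch: embed the question and return the k chunks
# whose vectors sit closest to it in the FAISS index.
import numpy as np

def retrieve(question, index, embed_model, chunks, k=3):
    query_vec = embed_model.encode([question])                        # shape (1, dim)
    distances, ids = index.search(np.array(query_vec, dtype="float32"), k)
    return [chunks[i] for i in ids[0] if i != -1]                     # -1 means "no hit"

The returned chunks would then be folded into whatever prompt or reply template the bot uses before answering over WhatsApp.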