masadonline committed on
Commit ce4e9d7 · verified · 1 Parent(s): 021a9d3

Update app.py

Files changed (1)
  1. app.py +49 -52
app.py CHANGED
@@ -3,30 +3,35 @@ import time
 import threading
 import streamlit as st
 from twilio.rest import Client
-from pdfminer.high_level import extract_text
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer
 import faiss
 import numpy as np
 import docx
 from groq import Groq
-import PyPDF2
 import requests
+from io import StringIO
+from pdfminer.high_level import extract_text_to_fp
+from pdfminer.layout import LAParams
 
-# --- Text Extraction Utilities ---
+# --- PDF Extraction (Improved for Tables & Paragraphs) ---
 def extract_text_from_pdf(pdf_path):
-    try:
-        text = ""
-        with open(pdf_path, 'rb') as file:
-            reader = PyPDF2.PdfReader(file)
-            for page in reader.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text
-        return text
-    except:
-        return extract_text(pdf_path)
-
+    output_string = StringIO()
+    with open(pdf_path, 'rb') as file:
+        extract_text_to_fp(file, output_string, laparams=LAParams(), output_type='text', codec=None)
+    return output_string.getvalue()
+
+def clean_extracted_text(text):
+    lines = text.splitlines()
+    cleaned = []
+    for line in lines:
+        line = line.strip()
+        if line:
+            line = ' '.join(line.split())  # remove extra spaces
+            cleaned.append(line)
+    return '\n'.join(cleaned)
+
+# --- DOCX Extraction ---
 def extract_text_from_docx(docx_path):
     try:
         doc = docx.Document(docx_path)
@@ -42,20 +47,16 @@ def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512
     while start < len(tokens):
         end = min(start + chunk_size, len(tokens))
         chunk_tokens = tokens[start:end]
-        # Drop chunk if it's too long after detokenization
         chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
-        # Double-check token count with tokenizer to be safe
         if len(tokenizer.encode(chunk_text)) <= max_tokens:
            chunks.append(chunk_text.strip())
         start += chunk_size - chunk_overlap
     return chunks
 
-
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode(question)
     D, I = index.search(np.array([question_embedding]), k)
-    relevant_chunks = [text_chunks[i] for i in I[0]]
-    return relevant_chunks
+    return [text_chunks[i] for i in I[0]]
 
 # --- Groq Answer Generator ---
 def generate_answer_with_groq(question, context):
@@ -119,7 +120,6 @@ def fetch_latest_incoming_message(client, conversation_sid):
            }
     return None
 
-
 def send_twilio_message(client, conversation_sid, body):
     return client.conversations.v1.conversations(conversation_sid).messages.create(
         author="system", body=body
@@ -132,45 +132,44 @@ def setup_knowledge_base():
     for file in os.listdir(folder_path):
         path = os.path.join(folder_path, file)
         if file.endswith(".pdf"):
-            all_text += extract_text_from_pdf(path) + "\n"
+            raw_text = extract_text_from_pdf(path)
+            all_text += clean_extracted_text(raw_text) + "\n"
         elif file.endswith((".docx", ".doc")):
             all_text += extract_text_from_docx(path) + "\n"
+
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
     model = SentenceTransformer('all-mpnet-base-v2')
-    embeddings = model.encode(chunks)
+    embeddings = model.encode(chunks, truncate=True, show_progress_bar=False)
     dim = embeddings[0].shape[0]
     index = faiss.IndexFlatL2(dim)
     index.add(np.array(embeddings).astype('float32'))
     return index, model, chunks
 
-# --- Monitor All Conversations ---
+# --- Monitor Conversations ---
 def start_conversation_monitor(client, index, embed_model, text_chunks):
-    last_msg_index = {}
     monitored_sids = set()
 
-def poll_conversation(convo_sid):
-    last_processed_timestamp = None
-    while True:
-        try:
-            latest_msg = fetch_latest_incoming_message(client, convo_sid)
-            if latest_msg:
-                msg_time = latest_msg["timestamp"]
-                if last_processed_timestamp is None or msg_time > last_processed_timestamp:
-                    last_processed_timestamp = msg_time
-                    question = latest_msg["body"]
-                    sender = latest_msg["author"]
-                    print(f"\n📥 New message from {sender} in {convo_sid}: {question}")
-                    context = "\n\n".join(retrieve_chunks(question, index, embed_model, text_chunks))
-                    answer = generate_answer_with_groq(question, context)
-                    send_twilio_message(client, convo_sid, answer)
-                    print(f"📤 Replied to {sender}: {answer}")
-            time.sleep(3)
-        except Exception as e:
-            print(f"❌ Error in convo {convo_sid} polling:", e)
-            time.sleep(5)
-
-
+    def poll_conversation(convo_sid):
+        last_processed_timestamp = None
+        while True:
+            try:
+                latest_msg = fetch_latest_incoming_message(client, convo_sid)
+                if latest_msg:
+                    msg_time = latest_msg["timestamp"]
+                    if last_processed_timestamp is None or msg_time > last_processed_timestamp:
+                        last_processed_timestamp = msg_time
+                        question = latest_msg["body"]
+                        sender = latest_msg["author"]
+                        print(f"\n📥 New message from {sender} in {convo_sid}: {question}")
+                        context = "\n\n".join(retrieve_chunks(question, index, embed_model, text_chunks))
+                        answer = generate_answer_with_groq(question, context)
+                        send_twilio_message(client, convo_sid, answer)
+                        print(f"📤 Replied to {sender}: {answer}")
+                time.sleep(3)
+            except Exception as e:
+                print(f"❌ Error in convo {convo_sid} polling:", e)
+                time.sleep(5)
 
     def monitor_all_conversations():
         while True:
@@ -178,18 +177,16 @@ def poll_conversation(convo_sid):
                current_sids = set(get_whatsapp_conversation_sids(client))
                new_sids = current_sids - monitored_sids
                for sid in new_sids:
-                    print(f"➡️ Starting to monitor new conversation: {sid}")
+                    print(f"➡️ Monitoring new conversation: {sid}")
                    monitored_sids.add(sid)
                    threading.Thread(target=poll_conversation, args=(sid,), daemon=True).start()
-                time.sleep(15)  # refresh every 15 seconds or adjust as needed
+                time.sleep(15)
            except Exception as e:
                print("❌ Error in conversation monitoring loop:", e)
                time.sleep(15)
 
-    # Start the monitoring loop in a separate thread so it runs in background
     threading.Thread(target=monitor_all_conversations, daemon=True).start()
 
-
 # --- Streamlit UI ---
 st.set_page_config(page_title="Quasa – A Smart WhatsApp Chatbot", layout="wide")
 st.title("📱 Quasa – A Smart WhatsApp Chatbot")
@@ -215,4 +212,4 @@ if all([account_sid, auth_token, GROQ_API_KEY]):
        start_conversation_monitor(client, index, model, chunks)
        st.success("🟢 Chatbot is running in background and will reply to new messages.")
    else:
-        st.error("❌ No WhatsApp conversations found.")
+        st.error("❌ No WhatsApp conversations found.")
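
For reviewers who want to exercise this change without Twilio, here is a minimal smoke test of the new extraction-and-retrieval path. It is a sketch, not part of the commit: it assumes the functions defined in app.py above (setup_knowledge_base, retrieve_chunks, generate_answer_with_groq), a populated knowledge-base folder, and a valid GROQ_API_KEY in the environment; the sample question is invented for illustration.

# Sketch only, not part of the commit.
# Builds the FAISS index via the new pdfminer-based extract_text_from_pdf
# + clean_extracted_text path, then answers one question end-to-end.
index, model, chunks = setup_knowledge_base()
question = "What services do you offer?"  # hypothetical test query
context = "\n\n".join(retrieve_chunks(question, index, model, chunks))
print(generate_answer_with_groq(question, context))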