Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,30 +3,35 @@ import time
|
|
3 |
import threading
|
4 |
import streamlit as st
|
5 |
from twilio.rest import Client
|
6 |
-
from pdfminer.high_level import extract_text
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from transformers import AutoTokenizer
|
9 |
import faiss
|
10 |
import numpy as np
|
11 |
import docx
|
12 |
from groq import Groq
|
13 |
-
import PyPDF2
|
14 |
import requests
|
|
|
|
|
|
|
15 |
|
16 |
-
# ---
|
17 |
def extract_text_from_pdf(pdf_path):
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
30 |
def extract_text_from_docx(docx_path):
|
31 |
try:
|
32 |
doc = docx.Document(docx_path)
|
@@ -42,20 +47,16 @@ def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512
|
|
42 |
while start < len(tokens):
|
43 |
end = min(start + chunk_size, len(tokens))
|
44 |
chunk_tokens = tokens[start:end]
|
45 |
-
# Drop chunk if it's too long after detokenization
|
46 |
chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
|
47 |
-
# Double-check token count with tokenizer to be safe
|
48 |
if len(tokenizer.encode(chunk_text)) <= max_tokens:
|
49 |
chunks.append(chunk_text.strip())
|
50 |
start += chunk_size - chunk_overlap
|
51 |
return chunks
|
52 |
|
53 |
-
|
54 |
def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
|
55 |
question_embedding = embed_model.encode(question)
|
56 |
D, I = index.search(np.array([question_embedding]), k)
|
57 |
-
|
58 |
-
return relevant_chunks
|
59 |
|
60 |
# --- Groq Answer Generator ---
|
61 |
def generate_answer_with_groq(question, context):
|
@@ -119,7 +120,6 @@ def fetch_latest_incoming_message(client, conversation_sid):
|
|
119 |
}
|
120 |
return None
|
121 |
|
122 |
-
|
123 |
def send_twilio_message(client, conversation_sid, body):
|
124 |
return client.conversations.v1.conversations(conversation_sid).messages.create(
|
125 |
author="system", body=body
|
@@ -132,45 +132,44 @@ def setup_knowledge_base():
|
|
132 |
for file in os.listdir(folder_path):
|
133 |
path = os.path.join(folder_path, file)
|
134 |
if file.endswith(".pdf"):
|
135 |
-
|
|
|
136 |
elif file.endswith((".docx", ".doc")):
|
137 |
all_text += extract_text_from_docx(path) + "\n"
|
|
|
138 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
139 |
chunks = chunk_text(all_text, tokenizer)
|
140 |
model = SentenceTransformer('all-mpnet-base-v2')
|
141 |
-
embeddings = model.encode(chunks)
|
142 |
dim = embeddings[0].shape[0]
|
143 |
index = faiss.IndexFlatL2(dim)
|
144 |
index.add(np.array(embeddings).astype('float32'))
|
145 |
return index, model, chunks
|
146 |
|
147 |
-
# --- Monitor
|
148 |
def start_conversation_monitor(client, index, embed_model, text_chunks):
|
149 |
-
last_msg_index = {}
|
150 |
monitored_sids = set()
|
151 |
|
152 |
-
def poll_conversation(convo_sid):
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
|
175 |
def monitor_all_conversations():
|
176 |
while True:
|
@@ -178,18 +177,16 @@ def poll_conversation(convo_sid):
|
|
178 |
current_sids = set(get_whatsapp_conversation_sids(client))
|
179 |
new_sids = current_sids - monitored_sids
|
180 |
for sid in new_sids:
|
181 |
-
print(f"β‘οΈ
|
182 |
monitored_sids.add(sid)
|
183 |
threading.Thread(target=poll_conversation, args=(sid,), daemon=True).start()
|
184 |
-
time.sleep(15)
|
185 |
except Exception as e:
|
186 |
print("β Error in conversation monitoring loop:", e)
|
187 |
time.sleep(15)
|
188 |
|
189 |
-
# Start the monitoring loop in a separate thread so it runs in background
|
190 |
threading.Thread(target=monitor_all_conversations, daemon=True).start()
|
191 |
|
192 |
-
|
193 |
# --- Streamlit UI ---
|
194 |
st.set_page_config(page_title="Quasa β A Smart WhatsApp Chatbot", layout="wide")
|
195 |
st.title("π± Quasa β A Smart WhatsApp Chatbot")
|
@@ -215,4 +212,4 @@ if all([account_sid, auth_token, GROQ_API_KEY]):
|
|
215 |
start_conversation_monitor(client, index, model, chunks)
|
216 |
st.success("π’ Chatbot is running in background and will reply to new messages.")
|
217 |
else:
|
218 |
-
st.error("β No WhatsApp conversations found.")
|
|
|
3 |
import threading
|
4 |
import streamlit as st
|
5 |
from twilio.rest import Client
|
|
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
from transformers import AutoTokenizer
|
8 |
import faiss
|
9 |
import numpy as np
|
10 |
import docx
|
11 |
from groq import Groq
|
|
|
12 |
import requests
|
13 |
+
from io import StringIO
|
14 |
+
from pdfminer.high_level import extract_text_to_fp
|
15 |
+
from pdfminer.layout import LAParams
|
16 |
|
17 |
+
# --- PDF Extraction (Improved for Tables & Paragraphs) ---
def extract_text_from_pdf(pdf_path):
    """Extract the full text of a PDF using pdfminer's layout analysis.

    Writes the decoded text into an in-memory buffer and returns it as one
    string. LAParams enables layout analysis so paragraphs and table cells
    come out in a more readable order than a raw character dump.
    """
    buffer = StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        extract_text_to_fp(
            pdf_file,
            buffer,
            laparams=LAParams(),
            output_type='text',
            codec=None,
        )
    return buffer.getvalue()
23 |
+
|
24 |
+
def clean_extracted_text(text):
    """Normalize extracted text line by line.

    Collapses runs of internal whitespace to single spaces, trims each line,
    and drops lines that are empty after trimming. Returns the surviving
    lines joined with newlines.
    """
    normalized = (' '.join(raw.split()) for raw in text.splitlines())
    return '\n'.join(line for line in normalized if line)
|
33 |
+
|
34 |
+
# --- DOCX Extraction ---
|
35 |
def extract_text_from_docx(docx_path):
|
36 |
try:
|
37 |
doc = docx.Document(docx_path)
|
|
|
47 |
while start < len(tokens):
|
48 |
end = min(start + chunk_size, len(tokens))
|
49 |
chunk_tokens = tokens[start:end]
|
|
|
50 |
chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
|
|
|
51 |
if len(tokenizer.encode(chunk_text)) <= max_tokens:
|
52 |
chunks.append(chunk_text.strip())
|
53 |
start += chunk_size - chunk_overlap
|
54 |
return chunks
|
55 |
|
|
|
56 |
def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
    """Return the k text chunks most similar to the question.

    Encodes the question with the sentence-transformer model, runs a FAISS
    nearest-neighbour search, and maps the resulting indices back onto the
    chunk list.
    """
    query_vector = embed_model.encode(question)
    _distances, neighbor_ids = index.search(np.array([query_vector]), k)
    return [text_chunks[idx] for idx in neighbor_ids[0]]
|
|
|
60 |
|
61 |
# --- Groq Answer Generator ---
|
62 |
def generate_answer_with_groq(question, context):
|
|
|
120 |
}
|
121 |
return None
|
122 |
|
|
|
123 |
def send_twilio_message(client, conversation_sid, body):
    """Post *body* into the given Twilio conversation, authored by "system".

    Returns the created message resource from the Twilio Conversations API.
    """
    conversation = client.conversations.v1.conversations(conversation_sid)
    return conversation.messages.create(author="system", body=body)
|
|
|
132 |
for file in os.listdir(folder_path):
|
133 |
path = os.path.join(folder_path, file)
|
134 |
if file.endswith(".pdf"):
|
135 |
+
raw_text = extract_text_from_pdf(path)
|
136 |
+
all_text += clean_extracted_text(raw_text) + "\n"
|
137 |
elif file.endswith((".docx", ".doc")):
|
138 |
all_text += extract_text_from_docx(path) + "\n"
|
139 |
+
|
140 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
141 |
chunks = chunk_text(all_text, tokenizer)
|
142 |
model = SentenceTransformer('all-mpnet-base-v2')
|
143 |
+
embeddings = model.encode(chunks, truncate=True, show_progress_bar=False)
|
144 |
dim = embeddings[0].shape[0]
|
145 |
index = faiss.IndexFlatL2(dim)
|
146 |
index.add(np.array(embeddings).astype('float32'))
|
147 |
return index, model, chunks
|
148 |
|
149 |
+
# --- Monitor Conversations ---
|
150 |
def start_conversation_monitor(client, index, embed_model, text_chunks):
|
|
|
151 |
monitored_sids = set()
|
152 |
|
153 |
+
def poll_conversation(convo_sid):
    """Poll one Twilio conversation forever, answering each new inbound message.

    Intended to run as a per-conversation daemon thread. Reads the closure
    variables of the enclosing monitor function (client, index, embed_model,
    text_chunks) and the module-level helpers fetch_latest_incoming_message,
    retrieve_chunks, generate_answer_with_groq, send_twilio_message.
    """
    # Timestamp of the newest message already answered; None until the first reply.
    last_processed_timestamp = None
    while True:
        try:
            latest_msg = fetch_latest_incoming_message(client, convo_sid)
            if latest_msg:
                msg_time = latest_msg["timestamp"]
                # Only react to messages strictly newer than the last one handled,
                # so the same message is not answered on every poll.
                if last_processed_timestamp is None or msg_time > last_processed_timestamp:
                    last_processed_timestamp = msg_time
                    question = latest_msg["body"]
                    sender = latest_msg["author"]
                    print(f"\n📥 New message from {sender} in {convo_sid}: {question}")
                    # RAG pipeline: retrieve top chunks, then ask Groq with that context.
                    context = "\n\n".join(retrieve_chunks(question, index, embed_model, text_chunks))
                    answer = generate_answer_with_groq(question, context)
                    send_twilio_message(client, convo_sid, answer)
                    print(f"📤 Replied to {sender}: {answer}")
            # Poll interval between checks of this conversation.
            time.sleep(3)
        except Exception as e:
            # Keep the thread alive on transient Twilio/Groq errors; back off briefly.
            print(f"❌ Error in convo {convo_sid} polling:", e)
            time.sleep(5)
|
|
|
|
|
173 |
|
174 |
def monitor_all_conversations():
|
175 |
while True:
|
|
|
177 |
current_sids = set(get_whatsapp_conversation_sids(client))
|
178 |
new_sids = current_sids - monitored_sids
|
179 |
for sid in new_sids:
|
180 |
+
print(f"β‘οΈ Monitoring new conversation: {sid}")
|
181 |
monitored_sids.add(sid)
|
182 |
threading.Thread(target=poll_conversation, args=(sid,), daemon=True).start()
|
183 |
+
time.sleep(15)
|
184 |
except Exception as e:
|
185 |
print("β Error in conversation monitoring loop:", e)
|
186 |
time.sleep(15)
|
187 |
|
|
|
188 |
threading.Thread(target=monitor_all_conversations, daemon=True).start()
|
189 |
|
|
|
190 |
# --- Streamlit UI ---
|
191 |
st.set_page_config(page_title="Quasa β A Smart WhatsApp Chatbot", layout="wide")
|
192 |
st.title("π± Quasa β A Smart WhatsApp Chatbot")
|
|
|
212 |
start_conversation_monitor(client, index, model, chunks)
|
213 |
st.success("π’ Chatbot is running in background and will reply to new messages.")
|
214 |
else:
|
215 |
+
st.error("β No WhatsApp conversations found.")
|