masadonline commited on
Commit
0a4a544
Β·
verified Β·
1 Parent(s): 72f0f38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -159
app.py CHANGED
@@ -13,240 +13,283 @@ import requests
13
  from io import StringIO
14
  from pdfminer.high_level import extract_text_to_fp
15
  from pdfminer.layout import LAParams
16
- from twilio.base.exceptions import TwilioRestException
17
  import pdfplumber
18
  import datetime
19
  import csv
20
- import json
21
- import re
22
 
23
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
24
- os.environ["PYTORCH_JIT"] = "0"
25
 
26
- # Twilio Setup
27
- TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
28
- TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
29
- twilio_client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
30
 
31
- # ---------------- PDF & DOCX & JSON Extraction ----------------
32
  def _extract_tables_from_page(page):
 
 
33
  tables = page.extract_tables()
 
 
 
34
  formatted_tables = []
35
  for table in tables:
36
- formatted_row = [[cell if cell is not None else "" for cell in row] for row in table]
37
- formatted_tables.append(formatted_row)
 
 
 
 
 
 
38
  return formatted_tables
39
-
40
  def extract_text_from_pdf(pdf_path):
41
  text_output = StringIO()
42
  all_tables = []
43
  try:
44
  with pdfplumber.open(pdf_path) as pdf:
45
  for page in pdf.pages:
46
- all_tables.extend(_extract_tables_from_page(page))
 
 
 
 
47
  text = page.extract_text()
48
  if text:
49
  text_output.write(text + "\n\n")
50
  except Exception as e:
 
 
51
  with open(pdf_path, 'rb') as file:
52
- extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text')
53
- return text_output.getvalue(), all_tables
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def _format_tables_internal(tables):
56
- formatted = []
 
 
57
  for table in tables:
 
58
  with StringIO() as csvfile:
59
- writer = csv.writer(csvfile)
60
- writer.writerows(table)
61
- formatted.append(csvfile.getvalue())
62
- return "\n\n".join(formatted)
63
 
64
- def clean_extracted_text(text):
65
- return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
66
-
67
- def extract_text_from_docx(path):
68
  try:
69
- doc = docx.Document(path)
70
- return '\n'.join(p.text for p in doc.paragraphs)
71
- except:
72
- return ""
73
-
74
- def load_json_data(path):
75
- try:
76
- with open(path, 'r', encoding='utf-8') as f:
77
- data = json.load(f)
78
- if isinstance(data, dict):
79
- return "\n".join(f"{k}: {v}" for k, v in data.items() if not isinstance(v, (dict, list)))
80
- elif isinstance(data, list):
81
- return "\n\n".join("\n".join(f"{k}: {v}" for k, v in item.items() if isinstance(item, dict)) for item in data)
82
- else:
83
- return json.dumps(data, ensure_ascii=False, indent=2)
84
- except Exception as e:
85
  return ""
86
 
87
- # ---------------- Chunking ----------------
88
- def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32):
89
  tokens = tokenizer.tokenize(text)
90
  chunks = []
91
  start = 0
92
  while start < len(tokens):
93
  end = min(start + chunk_size, len(tokens))
94
- chunk = tokens[start:end]
95
- chunks.append(tokenizer.convert_tokens_to_string(chunk))
 
 
 
96
  start += chunk_size - chunk_overlap
97
  return chunks
98
 
99
  def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
100
- q_embedding = embed_model.encode(question)
101
- D, I = index.search(np.array([q_embedding]), k)
102
  return [text_chunks[i] for i in I[0]]
103
 
104
- # ---------------- Groq Answer Generator ----------------
105
  def generate_answer_with_groq(question, context):
106
  url = "https://api.groq.com/openai/v1/chat/completions"
107
- api_key = os.getenv("GROQ_API_KEY")
108
- headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
109
- prompt = f"Customer asked: '{question}'\n\nHere is the relevant information to help:\n{context}"
 
 
 
 
 
 
 
110
  payload = {
111
  "model": "llama3-8b-8192",
112
  "messages": [
113
- {"role": "system", "content": "You are ToyBot, a friendly WhatsApp assistant..."},
 
 
 
 
 
 
 
114
  {"role": "user", "content": prompt},
115
  ],
116
  "temperature": 0.5,
117
  "max_tokens": 300,
118
  }
119
  response = requests.post(url, headers=headers, json=payload)
 
120
  return response.json()['choices'][0]['message']['content'].strip()
121
 
122
- # ---------------- Twilio Integration ----------------
123
  def fetch_latest_incoming_message(client, conversation_sid):
124
  try:
125
  messages = client.conversations.v1.conversations(conversation_sid).messages.list()
126
  for msg in reversed(messages):
127
- if (
128
- msg.author.startswith("whatsapp:") and
129
- msg.date_created and
130
- msg.date_created > APP_START_TIME
131
- ):
132
  return {
133
  "sid": msg.sid,
134
  "body": msg.body,
135
  "author": msg.author,
136
  "timestamp": msg.date_created,
137
  }
138
- except TwilioRestException:
139
- return None
140
-
141
- def send_twilio_message(client, conversation_sid, body):
142
- return client.conversations.v1.conversations(conversation_sid).messages.create(author="system", body=body)
143
-
144
- def get_latest_whatsapp_conversation_sid(client):
145
- try:
146
- conversations = client.conversations.v1.conversations.list(limit=20)
147
- filtered = [
148
- c for c in conversations
149
- if c.date_created and c.date_created > APP_START_TIME
150
- ]
151
- for convo in sorted(filtered, key=lambda c: c.date_created, reverse=True):
152
- messages = convo.messages.list(limit=1)
153
- if messages and any(m.author.startswith("whatsapp:") and m.date_created > APP_START_TIME for m in messages):
154
- return convo.sid
155
  except Exception as e:
156
- print("Error fetching valid conversation SID:", e)
157
- return None
158
 
159
- # ---------------- Load Orders ----------------
160
- def load_orders():
161
- orders_path = "docs/CustomersOrder.json"
162
- try:
163
- with open(orders_path, "r", encoding="utf-8") as f:
164
- return json.load(f)
165
- except Exception as e:
166
- print(f"Error loading orders: {e}")
167
- return {}
168
-
169
- def extract_order_id(text):
170
- pattern = r"(order_id\s+\d+)"
171
- match = re.search(pattern, text, re.IGNORECASE)
172
- if match:
173
- return match.group(1).lower()
174
  return None
175
 
176
- def format_order_response(order_id, order_data):
177
- if not order_data:
178
- return f"Sorry, I could not find details for order ID {order_id}."
179
- details = [
180
- f"Order ID: {order_id}",
181
- f"Customer Name: {order_data.get('customer_name', 'N/A')}",
182
- f"Address: {order_data.get('address', 'N/A')}",
183
- f"Items: {', '.join(order_data.get('items', []))}",
184
- f"Status: {order_data.get('status', 'N/A')}",
185
- ]
186
- return "\n".join(details)
187
-
188
- # ---------------- Knowledge Base Setup ----------------
189
  def setup_knowledge_base():
190
- folder = "docs"
191
- text = ""
192
- for f in os.listdir(folder):
193
- path = os.path.join(folder, f)
194
- if f.endswith(".pdf"):
195
- t, tables = extract_text_from_pdf(path)
196
- text += clean_extracted_text(t) + "\n" + _format_tables_internal(tables) + "\n"
197
- elif f.endswith(".docx"):
198
- text += clean_extracted_text(extract_text_from_docx(path)) + "\n"
199
- elif f.endswith(".json"):
200
- # Skip orders.json here to avoid mixing with KB text
201
- if f == "CustomersOrder.json":
202
- continue
203
- text += load_json_data(path) + "\n"
204
- elif f.endswith(".csv"):
205
- with open(path, newline='', encoding='utf-8') as csvfile:
206
- reader = csv.reader(csvfile)
207
- text += "\n".join(", ".join(row) for row in reader) + "\n"
208
- return text
209
-
210
- # ---------------- App Logic ----------------
211
- def process_messages_loop():
212
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
213
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
214
- knowledge_text = setup_knowledge_base()
215
- text_chunks = chunk_text(knowledge_text, tokenizer)
216
- embeddings = embed_model.encode(text_chunks)
217
- index = faiss.IndexFlatL2(embeddings.shape[1])
218
- index.add(embeddings)
219
-
220
- orders = load_orders() # Load orders once at start
221
-
222
- seen_sids = set()
223
 
224
- while True:
225
- conversation_sid = get_latest_whatsapp_conversation_sid(twilio_client)
226
- if not conversation_sid:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  time.sleep(5)
228
- continue
229
 
230
- message = fetch_latest_incoming_message(twilio_client, conversation_sid)
231
- if message and message["sid"] not in seen_sids:
232
- seen_sids.add(message["sid"])
233
- question = message["body"]
234
 
235
- order_id = extract_order_id(question)
236
- if order_id and order_id in orders:
237
- answer = format_order_response(order_id, orders[order_id])
238
- else:
239
- chunks = retrieve_chunks(question, index, embed_model, text_chunks)
240
- answer = generate_answer_with_groq(question, "\n\n".join(chunks))
241
 
242
- send_twilio_message(twilio_client, conversation_sid, answer)
243
 
244
- time.sleep(5)
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- # ---------------- Streamlit UI ----------------
247
- st.title("ToyShop WhatsApp Assistant (Groq + Twilio)")
 
248
 
249
- if st.button("Start WhatsApp Bot"):
250
- thread = threading.Thread(target=process_messages_loop, daemon=True)
251
- thread.start()
252
- st.success("WhatsApp assistant started and monitoring for new messages.")
 
13
  from io import StringIO
14
  from pdfminer.high_level import extract_text_to_fp
15
  from pdfminer.layout import LAParams
16
+ from twilio.base.exceptions import TwilioRestException # Add this at the top
17
  import pdfplumber
18
  import datetime
19
  import csv
 
 
20
 
21
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 
22
 
23
+ os.environ["PYTORCH_JIT"] = "0"
 
 
 
24
 
25
+ # --- PDF Extraction ---
26
  def _extract_tables_from_page(page):
27
+ """Extracts tables from a single page of a PDF."""
28
+
29
  tables = page.extract_tables()
30
+ if not tables:
31
+ return []
32
+
33
  formatted_tables = []
34
  for table in tables:
35
+ formatted_table = []
36
+ for row in table:
37
+ if row: # Filter out empty rows
38
+ formatted_row = [cell if cell is not None else "" for cell in row] # Replace None with ""
39
+ formatted_table.append(formatted_row)
40
+ else:
41
+ formatted_table.append([""]) # Append an empty row if the row is None
42
+ formatted_tables.append(formatted_table)
43
  return formatted_tables
44
+
45
  def extract_text_from_pdf(pdf_path):
46
  text_output = StringIO()
47
  all_tables = []
48
  try:
49
  with pdfplumber.open(pdf_path) as pdf:
50
  for page in pdf.pages:
51
+ # Extract tables
52
+ page_tables = _extract_tables_from_page(page)
53
+ if page_tables:
54
+ all_tables.extend(page_tables)
55
+ # Extract text
56
  text = page.extract_text()
57
  if text:
58
  text_output.write(text + "\n\n")
59
  except Exception as e:
60
+ print(f"Error extracting with pdfplumber: {e}")
61
+ # Fallback to pdfminer if pdfplumber fails
62
  with open(pdf_path, 'rb') as file:
63
+ extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
64
+ extracted_text = text_output.getvalue()
65
+ return extracted_text, all_tables # Return text and list of tables
66
+
67
+ def clean_extracted_text(text):
68
+ lines = text.splitlines()
69
+ cleaned = []
70
+ for line in lines:
71
+ line = line.strip()
72
+ if line:
73
+ line = ' '.join(line.split())
74
+ cleaned.append(line)
75
+ return '\n'.join(cleaned)
76
 
77
  def _format_tables_internal(tables):
78
+ """Formats extracted tables into a string representation."""
79
+
80
+ formatted_tables_str = []
81
  for table in tables:
82
+ # Use csv writer to handle commas and quotes correctly
83
  with StringIO() as csvfile:
84
+ csvwriter = csv.writer(csvfile)
85
+ csvwriter.writerows(table)
86
+ formatted_tables_str.append(csvfile.getvalue())
87
+ return "\n\n".join(formatted_tables_str)
88
 
89
+ # --- DOCX Extraction ---
90
+ def extract_text_from_docx(docx_path):
 
 
91
  try:
92
+ doc = docx.Document(docx_path)
93
+ return '\n'.join(para.text for para in doc.paragraphs)
94
+ except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  return ""
96
 
97
+ # --- Chunking ---
98
+ def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
99
  tokens = tokenizer.tokenize(text)
100
  chunks = []
101
  start = 0
102
  while start < len(tokens):
103
  end = min(start + chunk_size, len(tokens))
104
+ chunk_tokens = tokens[start:end]
105
+ chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
106
+ chunks.append(chunk_text)
107
+ if end == len(tokens):
108
+ break
109
  start += chunk_size - chunk_overlap
110
  return chunks
111
 
112
  def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
113
+ question_embedding = embed_model.encode(question)
114
+ D, I = index.search(np.array([question_embedding]), k)
115
  return [text_chunks[i] for i in I[0]]
116
 
117
+ # --- Groq Answer Generator ---
118
  def generate_answer_with_groq(question, context):
119
  url = "https://api.groq.com/openai/v1/chat/completions"
120
+ api_key = os.environ.get("GROQ_API_KEY")
121
+ headers = {
122
+ "Authorization": f"Bearer {api_key}",
123
+ "Content-Type": "application/json",
124
+ }
125
+ prompt = (
126
+ f"Customer asked: '{question}'\n\n"
127
+ f"Here is the relevant product or policy info to help:\n{context}\n\n"
128
+ f"Respond in a friendly and helpful tone as a toy shop support agent."
129
+ )
130
  payload = {
131
  "model": "llama3-8b-8192",
132
  "messages": [
133
+ {
134
+ "role": "system",
135
+ "content": (
136
+ "You are ToyBot, a friendly and helpful WhatsApp assistant for an online toy shop. "
137
+ "Your goal is to politely answer customer questions, help them choose the right toys, "
138
+ "provide order or delivery information, explain return policies, and guide them through purchases."
139
+ )
140
+ },
141
  {"role": "user", "content": prompt},
142
  ],
143
  "temperature": 0.5,
144
  "max_tokens": 300,
145
  }
146
  response = requests.post(url, headers=headers, json=payload)
147
+ response.raise_for_status()
148
  return response.json()['choices'][0]['message']['content'].strip()
149
 
150
+ # --- Twilio Functions ---
151
  def fetch_latest_incoming_message(client, conversation_sid):
152
  try:
153
  messages = client.conversations.v1.conversations(conversation_sid).messages.list()
154
  for msg in reversed(messages):
155
+ if msg.author.startswith("whatsapp:"):
 
 
 
 
156
  return {
157
  "sid": msg.sid,
158
  "body": msg.body,
159
  "author": msg.author,
160
  "timestamp": msg.date_created,
161
  }
162
+ except TwilioRestException as e:
163
+ if e.status == 404:
164
+ print(f"Conversation {conversation_sid} not found, skipping...")
165
+ else:
166
+ print(f"Twilio error fetching messages for {conversation_sid}:", e)
 
 
 
 
 
 
 
 
 
 
 
 
167
  except Exception as e:
168
+ #print(f"Unexpected error in fetch_latest_incoming_message for {conversation_sid}:", e)
169
+ pass
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  return None
172
 
173
+ def send_twilio_message(client, conversation_sid, body):
174
+ return client.conversations.v1.conversations(conversation_sid).messages.create(
175
+ author="system", body=body
176
+ )
177
+
178
+ # --- Load Knowledge Base ---
 
 
 
 
 
 
 
179
  def setup_knowledge_base():
180
+ folder_path = "docs"
181
+ all_text = ""
182
+
183
+ # Process PDFs
184
+ for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
185
+ pdf_path = os.path.join(folder_path, filename)
186
+ text, tables = extract_text_from_pdf(pdf_path)
187
+ all_text += clean_extracted_text(text) + "\n"
188
+ all_text += _format_tables_internal(tables) + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
+ # Process CSVs
191
+ for filename in ["CustomerOrders.csv"]:
192
+ csv_path = os.path.join(folder_path, filename)
193
+ try:
194
+ with open(csv_path, newline='', encoding='utf-8') as csvfile:
195
+ reader = csv.DictReader(csvfile)
196
+ for row in reader:
197
+ line = f"Order ID: {row.get('OrderID')} | Customer Name: {row.get('CustomerName')} | Order Date: {row.get('OrderDate')} | ProductID: {row.get('ProductID')} | Date: {row.get('OrderDate')} | Quantity: {row.get('Quantity')} | UnitPrice(USD): {row.get('UnitPrice(USD)')} | TotalPrice(USD): {row.get('TotalPrice(USD)')} | ShippingAddress: {row.get('ShippingAddress')} | OrderStatus: {row.get('OrderStatus')}"
198
+ all_text += line + "\n"
199
+ except Exception as e:
200
+ print(f"❌ Error reading {filename}: {e}")
201
+
202
+ for filename in ["Products.csv"]:
203
+ csv_path = os.path.join(folder_path, filename)
204
+ try:
205
+ with open(csv_path, newline='', encoding='utf-8') as csvfile:
206
+ reader = csv.DictReader(csvfile)
207
+ for row in reader:
208
+ line = f"Product ID: {row.get('ProductID')} | Toy Name: {row.get('ToyName')} | Category: {row.get('Category')} | Price(USD): {row.get('Price(USD)')} | Stock Quantity: {row.get('StockQuantity')} | Description: {row.get('Description')}"
209
+ all_text += line + "\n"
210
+ except Exception as e:
211
+ print(f"❌ Error reading {filename}: {e}")
212
+
213
+ # Tokenization & chunking
214
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
215
+ chunks = chunk_text(all_text, tokenizer)
216
+ model = SentenceTransformer('all-mpnet-base-v2')
217
+ embeddings = model.encode(chunks, show_progress_bar=False, truncation=True, max_length=512)
218
+ dim = embeddings[0].shape[0]
219
+ index = faiss.IndexFlatL2(dim)
220
+ index.add(np.array(embeddings).astype('float32'))
221
+ return index, model, chunks
222
+
223
+
224
+
225
+ # --- Monitor Conversations ---
226
+ def start_conversation_monitor(client, index, embed_model, text_chunks):
227
+ processed_convos = set()
228
+ last_processed_timestamp = {}
229
+
230
+ def poll_conversation(convo_sid):
231
+ while True:
232
+ try:
233
+ latest_msg = fetch_latest_incoming_message(client, convo_sid)
234
+ if latest_msg:
235
+ msg_time = latest_msg["timestamp"]
236
+ if convo_sid not in last_processed_timestamp or msg_time > last_processed_timestamp[convo_sid]:
237
+ last_processed_timestamp[convo_sid] = msg_time
238
+ question = latest_msg["body"]
239
+ sender = latest_msg["author"]
240
+ print(f"\nπŸ“₯ New message from {sender} in {convo_sid}: {question}")
241
+ context = "\n\n".join(retrieve_chunks(question, index, embed_model, text_chunks))
242
+ answer = generate_answer_with_groq(question, context)
243
+ send_twilio_message(client, convo_sid, answer)
244
+ print(f"πŸ“€ Replied to {sender}: {answer}")
245
+ time.sleep(3)
246
+ except Exception as e:
247
+ print(f"❌ Error in convo {convo_sid} polling:", e)
248
+ time.sleep(5)
249
+
250
+ def poll_new_conversations():
251
+ print("➑️ Monitoring for new WhatsApp conversations...")
252
+ while True:
253
+ try:
254
+ conversations = client.conversations.v1.conversations.list(limit=20)
255
+ for convo in conversations:
256
+ convo_full = client.conversations.v1.conversations(convo.sid).fetch()
257
+ if convo.sid not in processed_convos and convo_full.date_created > APP_START_TIME:
258
+ participants = client.conversations.v1.conversations(convo.sid).participants.list()
259
+ for p in participants:
260
+ address = p.messaging_binding.get("address", "") if p.messaging_binding else ""
261
+ if address.startswith("whatsapp:"):
262
+ print(f"πŸ†• New WhatsApp convo found: {convo.sid}")
263
+ processed_convos.add(convo.sid)
264
+ threading.Thread(target=poll_conversation, args=(convo.sid,), daemon=True).start()
265
+ except Exception as e:
266
+ print("❌ Error polling conversations:", e)
267
  time.sleep(5)
 
268
 
269
+ # βœ… Launch conversation polling monitor
270
+ threading.Thread(target=poll_new_conversations, daemon=True).start()
 
 
271
 
 
 
 
 
 
 
272
 
 
273
 
274
+ # --- Streamlit UI ---
275
+ st.set_page_config(page_title="Quasa – A Smart WhatsApp Chatbot", layout="wide")
276
+ st.title("πŸ“± Quasa – A Smart WhatsApp Chatbot")
277
+
278
+ account_sid = st.secrets.get("TWILIO_SID")
279
+ auth_token = st.secrets.get("TWILIO_TOKEN")
280
+ GROQ_API_KEY = st.secrets.get("GROQ_API_KEY")
281
+
282
+ if not all([account_sid, auth_token, GROQ_API_KEY]):
283
+ st.warning("⚠️ Provide all credentials below:")
284
+ account_sid = st.text_input("Twilio SID", value=account_sid or "")
285
+ auth_token = st.text_input("Twilio Token", type="password", value=auth_token or "")
286
+ GROQ_API_KEY = st.text_input("GROQ API Key", type="password", value=GROQ_API_KEY or "")
287
 
288
+ if all([account_sid, auth_token, GROQ_API_KEY]):
289
+ os.environ["GROQ_API_KEY"] = GROQ_API_KEY
290
+ client = Client(account_sid, auth_token)
291
 
292
+ st.success("🟒 Monitoring new WhatsApp conversations...")
293
+ index, model, chunks = setup_knowledge_base()
294
+ threading.Thread(target=start_conversation_monitor, args=(client, index, model, chunks), daemon=True).start()
295
+ st.info("⏳ Waiting for new messages...")