masadonline committed on
Commit e1e01d8 · verified · 1 Parent(s): 2d6e369

Update app.py

Files changed (1)
  1. app.py +45 -34
app.py CHANGED
@@ -2,7 +2,6 @@ import os
 import time
 import streamlit as st
 from twilio.rest import Client
-from twilio.base.exceptions import TwilioRestException
 from pdfminer.high_level import extract_text
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer
@@ -12,31 +11,33 @@ import docx
 from groq import Groq
 import PyPDF2
 import requests
-from streamlit_extras.st_autorefresh import st_autorefresh  # <- fixed import
+from streamlit_extras.st_autorefresh import st_autorefresh
 
-
-# --- Document Loaders ---
+# Extract text from PDF with fallback
 def extract_text_from_pdf(pdf_path):
     try:
         text = ""
         with open(pdf_path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
+            for page in pdf_reader.pages:
                 page_text = page.extract_text()
                 if page_text:
                     text += page_text
         return text
-    except:
+    except Exception as e:
+        st.write(f"Fallback pdfminer extraction: {e}")
         return extract_text(pdf_path)
 
+# Extract text from DOCX
 def extract_text_from_docx(docx_path):
     try:
         doc = docx.Document(docx_path)
         return '\n'.join(para.text for para in doc.paragraphs)
-    except:
+    except Exception as e:
+        st.write(f"Docx extraction error: {e}")
         return ""
 
+# Chunk text based on tokens
 def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
     tokens = tokenizer.tokenize(text)
     chunks, start = [], 0
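
Note on the chunking helper: the middle of chunk_text (the windowing loop) is unchanged by this commit, so the diff elides it. A sliding-window chunker consistent with the visible signature and with the start += chunk_size - chunk_overlap step in the next hunk would look roughly like the sketch below; chunk_text_sketch and the convert_tokens_to_string call are illustrative assumptions, not lines from app.py.

# Hedged sketch only, not the committed code: a plain sliding window over
# tokenizer tokens, with chunk_overlap tokens shared between adjacent chunks.
from transformers import AutoTokenizer

def chunk_text_sketch(text, tokenizer, chunk_size=150, chunk_overlap=30):
    tokens = tokenizer.tokenize(text)
    chunks, start = [], 0
    while start < len(tokens):
        window = tokens[start:start + chunk_size]
        # convert_tokens_to_string restores readable text for embedding.
        chunks.append(tokenizer.convert_tokens_to_string(window))
        start += chunk_size - chunk_overlap
    return chunks

# Example usage (hypothetical):
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# pieces = chunk_text_sketch(long_document_text, tokenizer)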
@@ -47,15 +48,19 @@ def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
         start += chunk_size - chunk_overlap
     return chunks
 
+# Retrieve relevant chunks from index
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode([question])[0]
-    D, I = index.search(np.array([question_embedding]), k)
+    D, I = index.search(np.array([question_embedding]).astype('float32'), k)
     return [text_chunks[i] for i in I[0]]
 
-# --- GROQ Answer Generation ---
+# Generate answer using Groq API with retries and timeout
 def generate_answer_with_groq(question, context, retries=3, delay=2):
     url = "https://api.groq.com/openai/v1/chat/completions"
-    api_key = os.environ["GROQ_API_KEY"]
+    api_key = os.environ.get("GROQ_API_KEY")
+    if not api_key:
+        return "⚠️ GROQ_API_KEY not set."
+
     headers = {
         "Authorization": f"Bearer {api_key}",
         "Content-Type": "application/json",
@@ -85,17 +90,20 @@ def generate_answer_with_groq(question, context, retries=3, delay=2):
 
     for attempt in range(retries):
         try:
-            response = requests.post(url, headers=headers, json=payload)
+            response = requests.post(url, headers=headers, json=payload, timeout=10)
+            response.raise_for_status()
             result = response.json()
             return result['choices'][0]['message']['content'].strip()
-        except Exception as e:
-            if "503" in str(e) and attempt < retries - 1:
+        except requests.exceptions.HTTPError as e:
+            if response.status_code == 503 and attempt < retries - 1:
                 time.sleep(delay)
                 continue
             else:
-                return f"⚠️ Groq API Error: {str(e)}"
+                return f"⚠️ Groq API HTTPError: {e}"
+        except Exception as e:
+            return f"⚠️ Groq API Error: {e}"
 
-# --- Twilio Chat Handlers ---
+# Twilio message fetch and send
 def fetch_latest_incoming_message(account_sid, auth_token, conversation_sid):
     client = Client(account_sid, auth_token)
     messages = client.conversations.v1.conversations(conversation_sid).messages.list(limit=10)
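
The retry logic is the substance of this hunk: the request now carries a timeout, raise_for_status() turns 4xx/5xx responses into exceptions, and only 503 responses are retried with a fixed delay. Pulled out into a standalone helper, the pattern looks roughly like this; post_with_retry is an illustrative name, not part of the commit.

import time
import requests

def post_with_retry(url, headers, payload, retries=3, delay=2, timeout=10):
    for attempt in range(retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()          # 4xx/5xx -> requests HTTPError
            return response.json()
        except requests.exceptions.HTTPError:
            # Retry only transient 503s, and only while attempts remain.
            if response.status_code == 503 and attempt < retries - 1:
                time.sleep(delay)
                continue
            raise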
@@ -112,15 +120,13 @@ def send_twilio_message(account_sid, auth_token, conversation_sid, body):
     except Exception as e:
         return str(e)
 
-# --- Streamlit UI ---
+# Streamlit UI
 st.set_page_config(page_title="Quasa – A Smart WhatsApp Chatbot", layout="wide")
 st.title("📱 Quasa – A Smart WhatsApp Chatbot")
 
-# Initialize session state for last index
 if "last_index" not in st.session_state:
     st.session_state.last_index = -1
 
-# Load secrets or allow manual input
 account_sid = st.secrets.get("TWILIO_SID")
 auth_token = st.secrets.get("TWILIO_TOKEN")
 GROQ_API_KEY = st.secrets.get("GROQ_API_KEY")
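
The last_index value initialised in this hunk exists so that a Streamlit rerun does not answer the same Conversations message twice: each message in the Twilio Conversations API carries a monotonically increasing index. A hedged sketch of that guard around the same messages.list call used in app.py; get_new_message and the wiring comments are illustrative, not committed code.

from twilio.rest import Client

def get_new_message(account_sid, auth_token, conversation_sid, last_index):
    client = Client(account_sid, auth_token)
    messages = client.conversations.v1.conversations(conversation_sid).messages.list(limit=10)
    # Newest first; anything at or below last_index has already been answered.
    for msg in sorted(messages, key=lambda m: m.index, reverse=True):
        if msg.index > last_index:
            return msg
    return None

# Hypothetical wiring inside the app:
# msg = get_new_message(account_sid, auth_token, conversation_sid, st.session_state.last_index)
# if msg is not None:
#     st.session_state.last_index = msg.index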
@@ -133,7 +139,6 @@ if not all([account_sid, auth_token, GROQ_API_KEY]):
 
 conversation_sid = st.text_input("Enter Conversation SID", value="")
 
-# Auto-refresh toggle and interval selector
 enable_autorefresh = st.checkbox("🔄 Enable Auto-Refresh", value=True)
 interval_seconds = st.selectbox("Refresh Interval (seconds)", options=[5, 10, 15, 30, 60], index=1)
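
The checkbox and interval selector feed the st_autorefresh call behind the if enable_autorefresh: guard shown as context in the next hunk header. A sketch of that call follows; the interval-in-milliseconds plus key signature is the commonly used one for st_autorefresh and is an assumption here, as is the key name.

from streamlit_extras.st_autorefresh import st_autorefresh  # same import as committed above

# enable_autorefresh and interval_seconds come from the widgets above.
if enable_autorefresh:
    # Re-run the script every interval_seconds to poll the conversation.
    st_autorefresh(interval=interval_seconds * 1000, key="whatsapp_poll")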
 
@@ -143,25 +148,31 @@ if enable_autorefresh:
 if all([account_sid, auth_token, GROQ_API_KEY, conversation_sid]):
     os.environ["GROQ_API_KEY"] = GROQ_API_KEY
 
-    @st.cache_resource
+    @st.cache_data(show_spinner=False)
     def setup_knowledge_base():
         folder_path = "docs"
         all_text = ""
-        for file in os.listdir(folder_path):
-            if file.endswith(".pdf"):
-                all_text += extract_text_from_pdf(os.path.join(folder_path, file)) + "\n"
-            elif file.endswith((".docx", ".doc")):
-                all_text += extract_text_from_docx(os.path.join(folder_path, file)) + "\n"
-        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-        chunks = chunk_text(all_text, tokenizer)
-        model = SentenceTransformer('all-mpnet-base-v2')
-        embeddings = model.encode(chunks)
-        dim = embeddings[0].shape[0]
-        index = faiss.IndexFlatL2(dim)
-        index.add(np.array(embeddings))
-        return index, model, chunks
+        try:
+            for file in os.listdir(folder_path):
+                if file.endswith(".pdf"):
+                    all_text += extract_text_from_pdf(os.path.join(folder_path, file)) + "\n"
+                elif file.endswith((".docx", ".doc")):
+                    all_text += extract_text_from_docx(os.path.join(folder_path, file)) + "\n"
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+            chunks = chunk_text(all_text, tokenizer)
+            model = SentenceTransformer('all-mpnet-base-v2')
+            embeddings = model.encode(chunks)
+            dim = embeddings[0].shape[0]
+            index = faiss.IndexFlatL2(dim)
+            index.add(np.array(embeddings).astype('float32'))
+            return index, model, chunks
+        except Exception as e:
+            st.error(f"Error setting up knowledge base: {e}")
+            return None, None, None
 
     index, embedding_model, text_chunks = setup_knowledge_base()
+    if index is None:
+        st.stop()
 
     st.success("✅ Knowledge base ready. Monitoring WhatsApp...")
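
Both float32 casts in this commit, the query vector in retrieve_chunks and index.add in setup_knowledge_base, exist because the FAISS Python bindings operate on float32 matrices and reject other dtypes. A self-contained sketch of the same search pattern, with illustrative dimensions and random vectors:

import numpy as np
import faiss

dim = 768                                   # embedding size of all-mpnet-base-v2
index = faiss.IndexFlatL2(dim)              # exact L2 nearest-neighbour index
corpus = np.random.rand(10, dim).astype('float32')
index.add(corpus)                           # stored vectors must be float32

query = np.random.rand(1, dim).astype('float32')
D, I = index.search(query, 3)               # D: squared L2 distances, I: row ids
print(I[0])                                 # positions of the 3 closest chunks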