masadonline committed on
Commit
6bda95c
·
verified ·
1 Parent(s): df1b68d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -116
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
- from flask import Flask, request
3
- from twilio.twiml.messaging_response import MessagingResponse
4
  import PyPDF2
5
  from pdfminer.high_level import extract_text
6
  from transformers import AutoTokenizer
@@ -9,152 +8,138 @@ import faiss
9
  import numpy as np
10
  from groq import Groq
11
  import docx
 
 
 
 
 
 
12
 
13
- # --- Helper functions from your code ---
 
 
 
 
 
 
 
 
14
 
15
  def extract_text_from_pdf(pdf_path):
16
  try:
17
  text = ""
18
  with open(pdf_path, 'rb') as file:
19
  pdf_reader = PyPDF2.PdfReader(file)
20
- for page_num in range(len(pdf_reader.pages)):
21
- page = pdf_reader.pages[page_num]
22
  page_text = page.extract_text()
23
  if page_text:
24
  text += page_text
25
  return text
26
  except Exception as e:
27
- print(f"PyPDF2 failed with error: {e}. Trying pdfminer.six...")
28
  return extract_text(pdf_path)
29
 
30
  def extract_text_from_docx(docx_path):
31
  try:
32
  doc = docx.Document(docx_path)
33
- full_text = []
34
- for para in doc.paragraphs:
35
- full_text.append(para.text)
36
- return '\n'.join(full_text)
37
- except Exception as e:
38
- print(f"Failed to read DOCX {docx_path}: {e}")
39
  return ""
40
 
41
- def chunk_text_with_tokenizer(text, tokenizer, chunk_size=150, chunk_overlap=30):
42
  tokens = tokenizer.tokenize(text)
43
- chunks = []
44
- start = 0
45
  while start < len(tokens):
46
  end = min(start + chunk_size, len(tokens))
47
- chunk_tokens = tokens[start:end]
48
- chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
49
- chunks.append(chunk_text)
50
- start += chunk_size - chunk_overlap
51
  return chunks
52
 
53
- def retrieve_relevant_chunks(question, index, embeddings_model, text_chunks, k=3):
54
- question_embedding = embeddings_model.encode([question])[0]
55
- D, I = index.search(np.array([question_embedding]), k)
56
- relevant_chunks = [text_chunks[i] for i in I[0]]
57
- return relevant_chunks
 
58
 
59
- def generate_answer_with_groq(question, context):
 
 
 
 
 
60
  prompt = f"Based on the following context, answer the question: '{question}'\n\nContext:\n{context}"
61
- model_name = "llama-3.3-70b-versatile" # Adjust if needed
62
  try:
63
- groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
64
- response = groq_client.chat.completions.create(
65
- model=model_name,
66
  messages=[
67
- {"role": "system", "content": "You are an AI Assistant for Small Businesses. You are an SME expert."},
68
  {"role": "user", "content": prompt},
69
- ]
70
  )
71
  return response.choices[0].message.content
72
  except Exception as e:
73
- print(f"Error generating answer with Groq API: {e}")
74
- return "I'm sorry, I couldn't generate an answer at this time."
 
 
 
75
 
76
- # --- Load and prepare docs on startup ---
77
 
78
- def load_and_prepare_docs(folder_path="docs"):
79
- print("Loading documents from", folder_path)
80
  all_text = ""
81
- if not os.path.exists(folder_path):
82
- print(f"Folder '{folder_path}' does not exist!")
83
- return None, None, None
84
-
85
- files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.pdf', '.docx', '.doc'))]
86
- if not files:
87
- print(f"No PDF or DOCX files found in folder '{folder_path}'.")
88
- return None, None, None
89
-
90
- for file in files:
91
- path = os.path.join(folder_path, file)
92
- if file.lower().endswith('.pdf'):
93
- text = extract_text_from_pdf(path)
94
- else:
95
- text = extract_text_from_docx(path)
96
- if text:
97
- all_text += text + "\n\n"
98
-
99
- if not all_text.strip():
100
- print("No text extracted from documents.")
101
- return None, None, None
102
-
103
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
104
- text_chunks = chunk_text_with_tokenizer(all_text, tokenizer)
105
-
106
- embedding_model = SentenceTransformer('all-mpnet-base-v2')
107
- all_embeddings = embedding_model.encode(text_chunks) if text_chunks else None
108
-
109
- if all_embeddings is None or len(all_embeddings) == 0:
110
- print("No text chunks found to create embeddings.")
111
- return None, None, None
112
-
113
- embedding_dim = all_embeddings[0].shape[0]
114
- index = faiss.IndexFlatL2(embedding_dim)
115
- index.add(np.array(all_embeddings))
116
-
117
- print("Documents loaded and FAISS index created.")
118
- return index, embedding_model, text_chunks
119
-
120
- # --- Flask app and WhatsApp webhook ---
121
-
122
- from flask_cors import CORS
123
- app = Flask(__name__)
124
- CORS(app) # Optional, if you call API from other domains
125
-
126
- # Load documents once at start
127
- index, embedding_model, text_chunks = load_and_prepare_docs()
128
-
129
- @app.route("/whatsapp", methods=["POST"])
130
- def whatsapp_reply():
131
- incoming_msg = request.values.get('Body', '').strip()
132
- from_number = request.values.get('From', '')
133
- print(f"Incoming message from {from_number}: {incoming_msg}")
134
-
135
- resp = MessagingResponse()
136
- msg = resp.message()
137
-
138
- if not incoming_msg:
139
- msg.body("Please send a question.")
140
- return str(resp)
141
-
142
- if index is None or embedding_model is None or text_chunks is None:
143
- msg.body("Sorry, the knowledge base is not ready. Please try again later.")
144
- return str(resp)
145
-
146
- # Retrieve context and generate answer
147
- relevant_chunks = retrieve_relevant_chunks(incoming_msg, index, embedding_model, text_chunks)
148
- context = "\n\n".join(relevant_chunks)
149
- answer = generate_answer_with_groq(incoming_msg, context)
150
-
151
- msg.body(answer)
152
- return str(resp)
153
-
154
- if __name__ == "__main__":
155
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
156
- if not GROQ_API_KEY:
157
- print("Please set the GROQ_API_KEY environment variable before running.")
158
- exit(1)
159
- print("Starting WhatsApp SMEHelpBot server...")
160
- app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 5000)))
 
1
  import os
2
+ import streamlit as st
 
3
  import PyPDF2
4
  from pdfminer.high_level import extract_text
5
  from transformers import AutoTokenizer
 
8
  import numpy as np
9
  from groq import Groq
10
  import docx
11
+ from fastapi import FastAPI, Request
12
+ import uvicorn
13
+ import threading
14
+ from pydantic import BaseModel
15
+ from twilio.rest import Client
16
+ from fastapi.responses import JSONResponse
17
 
18
+ # --- Global Config ---
19
+
20
+ WHATSAPP_FROM = "whatsapp:+14155238886" # Twilio sandbox number
21
+ WHATSAPP_TO = os.getenv("WHATSAPP_TO") or "whatsapp:+YOUR_NUMBER"
22
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
23
+ TWILIO_SID = os.getenv("TWILIO_SID")
24
+ TWILIO_TOKEN = os.getenv("TWILIO_TOKEN")
25
+
26
+ # --- Helper Functions ---
27
 
28
  def extract_text_from_pdf(pdf_path):
29
  try:
30
  text = ""
31
  with open(pdf_path, 'rb') as file:
32
  pdf_reader = PyPDF2.PdfReader(file)
33
+ for page in pdf_reader.pages:
 
34
  page_text = page.extract_text()
35
  if page_text:
36
  text += page_text
37
  return text
38
  except Exception as e:
 
39
  return extract_text(pdf_path)
40
 
41
  def extract_text_from_docx(docx_path):
42
  try:
43
  doc = docx.Document(docx_path)
44
+ return '\n'.join(para.text for para in doc.paragraphs)
45
+ except Exception:
 
 
 
 
46
  return ""
47
 
48
+ def chunk_text(text, tokenizer, chunk_size=150, overlap=30):
49
  tokens = tokenizer.tokenize(text)
50
+ chunks, start = [], 0
 
51
  while start < len(tokens):
52
  end = min(start + chunk_size, len(tokens))
53
+ chunk = tokenizer.convert_tokens_to_string(tokens[start:end])
54
+ chunks.append(chunk)
55
+ start += chunk_size - overlap
 
56
  return chunks
57
 
58
+ def get_embeddings_and_index(chunks, model):
59
+ embeddings = model.encode(chunks)
60
+ dim = embeddings[0].shape[0]
61
+ index = faiss.IndexFlatL2(dim)
62
+ index.add(np.array(embeddings))
63
+ return index, embeddings
64
 
65
+ def get_relevant_chunks(question, index, model, chunks, k=3):
66
+ query_vector = model.encode([question])[0]
67
+ _, I = index.search(np.array([query_vector]), k)
68
+ return [chunks[i] for i in I[0]]
69
+
70
+ def generate_answer(question, context):
71
  prompt = f"Based on the following context, answer the question: '{question}'\n\nContext:\n{context}"
 
72
  try:
73
+ client = Groq(api_key=GROQ_API_KEY)
74
+ response = client.chat.completions.create(
75
+ model="llama-3.3-70b-versatile",
76
  messages=[
77
+ {"role": "system", "content": "You are an AI assistant for small businesses."},
78
  {"role": "user", "content": prompt},
79
+ ],
80
  )
81
  return response.choices[0].message.content
82
  except Exception as e:
83
+ return f"Error: {e}"
84
+
85
+ def send_whatsapp_reply(to, message):
86
+ client = Client(TWILIO_SID, TWILIO_TOKEN)
87
+ client.messages.create(body=message, from_=WHATSAPP_FROM, to=to)
88
 
89
+ # --- Load Documents ---
90
 
91
+ @st.cache_data
92
+ def load_documents(folder="docs"):
93
  all_text = ""
94
+ for file in os.listdir(folder):
95
+ path = os.path.join(folder, file)
96
+ if file.endswith(".pdf"):
97
+ all_text += extract_text_from_pdf(path) + "\n"
98
+ elif file.endswith((".docx", ".doc")):
99
+ all_text += extract_text_from_docx(path) + "\n"
100
+ return all_text
101
+
102
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
103
+ embedder = SentenceTransformer("all-mpnet-base-v2")
104
+ docs = load_documents()
105
+ chunks = chunk_text(docs, tokenizer)
106
+ index, embeddings = get_embeddings_and_index(chunks, embedder)
107
+
108
+ # --- Streamlit UI ---
109
+
110
+ st.set_page_config(page_title="SMEHelpBot 🤖", layout="wide")
111
+ st.title("🤖 SMEHelpBot – Ask your business questions!")
112
+
113
+ question = st.text_input("💬 Ask something:")
114
+
115
+ if st.button("Get Answer") and question:
116
+ with st.spinner("Searching..."):
117
+ top_chunks = get_relevant_chunks(question, index, embedder, chunks)
118
+ context = "\n".join(top_chunks)
119
+ answer = generate_answer(question, context)
120
+ st.success(answer)
121
+
122
+ # --- FastAPI WhatsApp Webhook Server ---
123
+
124
+ app = FastAPI()
125
+
126
+ class WhatsAppMessage(BaseModel):
127
+ Body: str
128
+ From: str
129
+
130
+ @app.post("/whatsapp-webhook")
131
+ async def whatsapp_webhook(msg: WhatsAppMessage):
132
+ question = msg.Body.strip()
133
+ from_number = msg.From
134
+ relevant_chunks = get_relevant_chunks(question, index, embedder, chunks)
135
+ context = "\n".join(relevant_chunks)
136
+ answer = generate_answer(question, context)
137
+ send_whatsapp_reply(from_number, answer)
138
+ return JSONResponse(content={"status": "sent"})
139
+
140
+ # --- Run FastAPI in background ---
141
+
142
+ def run_fastapi():
143
+ uvicorn.run(app, host="0.0.0.0", port=7860)
144
+
145
+ threading.Thread(target=run_fastapi, daemon=True).start()