masadonline committed on
Commit
20cc7a2
·
verified ·
1 Parent(s): de2271c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -7
app.py CHANGED
@@ -179,14 +179,26 @@ def send_twilio_message(client, conversation_sid, body):
179
  def setup_knowledge_base():
180
  folder_path = "docs"
181
  all_text = ""
182
- for file in os.listdir(folder_path):
183
- path = os.path.join(folder_path, file)
184
- if file.endswith(".pdf"):
185
- raw_text = extract_text_from_pdf(path)
186
- all_text += clean_extracted_text(raw_text) + "\n"
187
- elif file.endswith((".docx", ".doc")):
188
- all_text += extract_text_from_docx(path) + "\n"
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
191
  chunks = chunk_text(all_text, tokenizer)
192
  model = SentenceTransformer('all-mpnet-base-v2')
@@ -196,6 +208,7 @@ def setup_knowledge_base():
196
  index.add(np.array(embeddings).astype('float32'))
197
  return index, model, chunks
198
 
 
199
  # --- Monitor Conversations ---
200
  def start_conversation_monitor(client, index, embed_model, text_chunks):
201
  processed_convos = set()
 
179
  def setup_knowledge_base():
180
  folder_path = "docs"
181
  all_text = ""
 
 
 
 
 
 
 
182
 
183
+ # Process PDFs
184
+ for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
185
+ pdf_path = os.path.join(folder_path, filename)
186
+ text, tables = extract_text_from_pdf(pdf_path)
187
+ all_text += clean_extracted_text(text) + "\n"
188
+ all_text += _format_tables_internal(tables) + "\n"
189
+
190
+ # Process CSVs
191
+ for filename in ["CustomerOrders.csv", "Products.csv"]:
192
+ csv_path = os.path.join(folder_path, filename)
193
+ try:
194
+ with open(csv_path, newline='', encoding='utf-8') as csvfile:
195
+ reader = csv.reader(csvfile)
196
+ lines = [" | ".join(row) for row in reader]
197
+ all_text += "\n".join(lines) + "\n"
198
+ except Exception as e:
199
+ print(f"❌ Error reading {filename}: {e}")
200
+
201
+ # Tokenization & chunking
202
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
203
  chunks = chunk_text(all_text, tokenizer)
204
  model = SentenceTransformer('all-mpnet-base-v2')
 
208
  index.add(np.array(embeddings).astype('float32'))
209
  return index, model, chunks
210
 
211
+
212
  # --- Monitor Conversations ---
213
  def start_conversation_monitor(client, index, embed_model, text_chunks):
214
  processed_convos = set()