Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -179,14 +179,26 @@ def send_twilio_message(client, conversation_sid, body):
|
|
179 |
def setup_knowledge_base():
|
180 |
folder_path = "docs"
|
181 |
all_text = ""
|
182 |
-
for file in os.listdir(folder_path):
|
183 |
-
path = os.path.join(folder_path, file)
|
184 |
-
if file.endswith(".pdf"):
|
185 |
-
raw_text = extract_text_from_pdf(path)
|
186 |
-
all_text += clean_extracted_text(raw_text) + "\n"
|
187 |
-
elif file.endswith((".docx", ".doc")):
|
188 |
-
all_text += extract_text_from_docx(path) + "\n"
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
191 |
chunks = chunk_text(all_text, tokenizer)
|
192 |
model = SentenceTransformer('all-mpnet-base-v2')
|
@@ -196,6 +208,7 @@ def setup_knowledge_base():
|
|
196 |
index.add(np.array(embeddings).astype('float32'))
|
197 |
return index, model, chunks
|
198 |
|
|
|
199 |
# --- Monitor Conversations ---
|
200 |
def start_conversation_monitor(client, index, embed_model, text_chunks):
|
201 |
processed_convos = set()
|
|
|
179 |
def setup_knowledge_base():
|
180 |
folder_path = "docs"
|
181 |
all_text = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
+
# Process PDFs
|
184 |
+
for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
|
185 |
+
pdf_path = os.path.join(folder_path, filename)
|
186 |
+
text, tables = extract_text_from_pdf(pdf_path)
|
187 |
+
all_text += clean_extracted_text(text) + "\n"
|
188 |
+
all_text += _format_tables_internal(tables) + "\n"
|
189 |
+
|
190 |
+
# Process CSVs
|
191 |
+
for filename in ["CustomerOrders.csv", "Products.csv"]:
|
192 |
+
csv_path = os.path.join(folder_path, filename)
|
193 |
+
try:
|
194 |
+
with open(csv_path, newline='', encoding='utf-8') as csvfile:
|
195 |
+
reader = csv.reader(csvfile)
|
196 |
+
lines = [" | ".join(row) for row in reader]
|
197 |
+
all_text += "\n".join(lines) + "\n"
|
198 |
+
except Exception as e:
|
199 |
+
print(f"❌ Error reading {filename}: {e}")
|
200 |
+
|
201 |
+
# Tokenization & chunking
|
202 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
203 |
chunks = chunk_text(all_text, tokenizer)
|
204 |
model = SentenceTransformer('all-mpnet-base-v2')
|
|
|
208 |
index.add(np.array(embeddings).astype('float32'))
|
209 |
return index, model, chunks
|
210 |
|
211 |
+
|
212 |
# --- Monitor Conversations ---
|
213 |
def start_conversation_monitor(client, index, embed_model, text_chunks):
|
214 |
processed_convos = set()
|