masadonline committed
Commit c99b5df · verified · 1 Parent(s): 95b3fe6

Update app.py

Files changed (1)
  1. app.py +46 -19
app.py CHANGED
@@ -17,11 +17,13 @@ from twilio.base.exceptions import TwilioRestException
 import pdfplumber
 import datetime
 import csv
+import json
+import re

 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"

-# ---------------- PDF & DOCX Extraction ----------------
+# ---------------- PDF & DOCX & JSON Extraction ----------------
 def _extract_tables_from_page(page):
     tables = page.extract_tables()
     formatted_tables = []
@@ -68,6 +70,26 @@ def extract_text_from_docx(docx_path):
     except:
         return ""

+def load_json_data(json_path):
+    try:
+        with open(json_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        if isinstance(data, dict):
+            # Flatten dictionary values (avoiding nested structures as strings)
+            return "\n".join(f"{key}: {value}" for key, value in data.items() if not isinstance(value, (dict, list)))
+        elif isinstance(data, list):
+            # Flatten list of dictionaries
+            all_items = []
+            for item in data:
+                if isinstance(item, dict):
+                    all_items.append("\n".join(f"{key}: {value}" for key, value in item.items() if not isinstance(value, (dict, list))))
+            return "\n\n".join(all_items)
+        else:
+            return json.dumps(data, ensure_ascii=False, indent=2)
+    except Exception as e:
+        print(f"JSON read error: {e}")
+        return ""
+
 # ---------------- Chunking ----------------
 def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32):
     tokens = tokenizer.tokenize(text)
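
For reference, here is roughly what the new load_json_data helper yields for a small list-of-dicts file, assuming the helper above is in scope. This is only a sketch: the products.json name and its contents are made up, not files from the repository.

import json

# Hypothetical sample data; docs/products.json is not part of the repo.
sample = [
    {"name": "Rocket Kit", "price": 19.99, "tags": ["stem", "age8+"]},
    {"name": "Plush Bear", "price": 9.50},
]
with open("docs/products.json", "w", encoding="utf-8") as f:
    json.dump(sample, f)

print(load_json_data("docs/products.json"))
# name: Rocket Kit
# price: 19.99
#
# name: Plush Bear
# price: 9.5
#
# Nested values (the "tags" list) are filtered out before flattening,
# so only scalar fields reach the knowledge-base text.
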
@@ -96,7 +118,7 @@ def generate_answer_with_groq(question, context):
     }
     prompt = (
         f"Customer asked: '{question}'\n\n"
-        f"Here is the relevant product or policy info to help:\n{context}\n\n"
+        f"Here is the relevant information to help:\n{context}\n\n"
         f"Respond in a friendly and helpful tone as a toy shop support agent."
     )
     payload = {
@@ -144,22 +166,27 @@ def setup_knowledge_base():
     folder_path = "docs"
     all_text = ""

-    for filename in ["FAQ.pdf", "ProductReturnPolicy.pdf"]:
-        pdf_path = os.path.join(folder_path, filename)
-        text, tables = extract_text_from_pdf(pdf_path)
-        all_text += clean_extracted_text(text) + "\n"
-        all_text += _format_tables_internal(tables) + "\n"
-
-    for filename in ["CustomerOrders.csv", "Products.csv"]:
-        path = os.path.join(folder_path, filename)
-        try:
-            with open(path, newline='', encoding='utf-8') as csvfile:
-                reader = csv.DictReader(csvfile)
-                for row in reader:
-                    line = ' | '.join(f"{k}: {v}" for k, v in row.items())
-                    all_text += line + "\n"
-        except Exception as e:
-            print(f"CSV read error: {e}")
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        if filename.endswith(".pdf"):
+            text, tables = extract_text_from_pdf(file_path)
+            all_text += clean_extracted_text(text) + "\n"
+            all_text += _format_tables_internal(tables) + "\n"
+        elif filename.endswith(".docx"):
+            text = extract_text_from_docx(file_path)
+            all_text += clean_extracted_text(text) + "\n"
+        elif filename.endswith(".json"):
+            text = load_json_data(file_path)
+            all_text += text + "\n"
+        elif filename.endswith(".csv"):
+            try:
+                with open(file_path, newline='', encoding='utf-8') as csvfile:
+                    reader = csv.DictReader(csvfile)
+                    for row in reader:
+                        line = ' | '.join(f"{k}: {v}" for k, v in row.items())
+                        all_text += line + "\n"
+            except Exception as e:
+                print(f"CSV read error: {e}")

     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
     chunks = chunk_text(all_text, tokenizer)
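
With this change every file under docs/ is picked up and routed by extension instead of coming from a hard-coded list; files with other extensions are silently skipped. For the CSV branch, each row is flattened into one pipe-separated line before chunking, as in this small sketch (the sample data is invented; Products.csv is only an example name):

import csv
import io

# Invented sample rows standing in for a real CSV in docs/.
rows = io.StringIO("sku,name,price\nT-100,Rocket Kit,19.99\n")
for row in csv.DictReader(rows):
    print(' | '.join(f"{k}: {v}" for k, v in row.items()))
# sku: T-100 | name: Rocket Kit | price: 19.99
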
@@ -196,7 +223,7 @@ def start_conversation_monitor(client, index, embed_model, text_chunks):
     threading.Thread(target=poll_convo, args=(convo.sid,), daemon=True).start()

 # ---------------- Main Entry ----------------
-if _name_ == "_main_":
+if __name__ == "__main__":
     st.title("🤖 ToyBot WhatsApp Assistant")
     st.write("Initializing knowledge base...")
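
The last hunk fixes the entry-point guard. With single underscores, _name_ is just an ordinary, undefined variable, so unless it happened to be defined elsewhere in the module, the old guard raised NameError before the Streamlit UI code below it could run. A minimal illustration:

# __name__ is set by the interpreter: "__main__" when the file is run
# directly, the module's import name otherwise.
print(__name__)         # prints "__main__" when run as a script

# The old spelling referenced an undefined name and failed immediately:
try:
    _name_ == "_main_"  # hypothetical reproduction of the old bug
except NameError as e:
    print(e)            # name '_name_' is not defined
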