Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -18,10 +18,11 @@ import faiss
|
|
18 |
from twilio.rest import Client
|
19 |
from twilio.base.exceptions import TwilioRestException
|
20 |
|
21 |
-
# Start time for filtering incoming messages
|
22 |
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
|
23 |
os.environ["PYTORCH_JIT"] = "0"
|
24 |
|
|
|
|
|
25 |
# ---------------- PDF / DOCX / JSON LOADERS ----------------
|
26 |
def _extract_tables_from_page(page):
|
27 |
tables = page.extract_tables()
|
@@ -68,15 +69,6 @@ def load_json_data(json_path):
|
|
68 |
print(f"[JSON error] {e}")
|
69 |
return ""
|
70 |
|
71 |
-
def _format_tables_internal(tables):
|
72 |
-
formatted = []
|
73 |
-
for table in tables:
|
74 |
-
with StringIO() as csvfile:
|
75 |
-
writer = csv.writer(csvfile)
|
76 |
-
writer.writerows(table)
|
77 |
-
formatted.append(csvfile.getvalue())
|
78 |
-
return "\n\n".join(formatted)
|
79 |
-
|
80 |
def clean_extracted_text(text):
|
81 |
return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
|
82 |
|
@@ -154,25 +146,32 @@ def handle_incoming_messages(index, embed_model, tokenizer, text_chunks):
|
|
154 |
# ---------------- STREAMLIT UI ----------------
|
155 |
st.title("π ToyShop Assistant β RAG WhatsApp Bot")
|
156 |
|
157 |
-
|
158 |
-
if uploaded_files:
|
159 |
full_text = ""
|
160 |
all_tables = []
|
161 |
|
162 |
-
for
|
163 |
-
|
|
|
164 |
if ext == "pdf":
|
165 |
-
text, tables = extract_text_from_pdf(
|
166 |
all_tables.extend(tables)
|
167 |
elif ext == "docx":
|
168 |
-
text = extract_text_from_docx(
|
169 |
elif ext == "json":
|
170 |
-
text = load_json_data(
|
171 |
else:
|
172 |
-
|
|
|
|
|
|
|
|
|
173 |
full_text += clean_extracted_text(text) + "\n\n"
|
|
|
|
|
|
|
|
|
174 |
|
175 |
-
# Load models
|
176 |
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
177 |
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
178 |
chunks = chunk_text(full_text, tokenizer)
|
@@ -181,10 +180,9 @@ if uploaded_files:
|
|
181 |
index = faiss.IndexFlatL2(embeddings.shape[1])
|
182 |
index.add(np.array(embeddings))
|
183 |
|
184 |
-
# Start listener thread
|
185 |
if "listener_started" not in st.session_state:
|
186 |
threading.Thread(target=handle_incoming_messages, args=(index, embed_model, tokenizer, chunks), daemon=True).start()
|
187 |
st.session_state.listener_started = True
|
188 |
st.success("β
WhatsApp listener started.")
|
189 |
|
190 |
-
st.success(f"π
|
|
|
18 |
from twilio.rest import Client
|
19 |
from twilio.base.exceptions import TwilioRestException
|
20 |
|
|
|
21 |
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
|
22 |
os.environ["PYTORCH_JIT"] = "0"
|
23 |
|
24 |
+
DOCS_FOLDER = "./docs"
|
25 |
+
|
26 |
# ---------------- PDF / DOCX / JSON LOADERS ----------------
|
27 |
def _extract_tables_from_page(page):
|
28 |
tables = page.extract_tables()
|
|
|
69 |
print(f"[JSON error] {e}")
|
70 |
return ""
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def clean_extracted_text(text):
|
73 |
return '\n'.join(' '.join(line.strip().split()) for line in text.splitlines() if line.strip())
|
74 |
|
|
|
146 |
# ---------------- STREAMLIT UI ----------------
|
147 |
st.title("π ToyShop Assistant β RAG WhatsApp Bot")
|
148 |
|
149 |
+
def load_all_documents(folder_path):
|
|
|
150 |
full_text = ""
|
151 |
all_tables = []
|
152 |
|
153 |
+
for filename in os.listdir(folder_path):
|
154 |
+
filepath = os.path.join(folder_path, filename)
|
155 |
+
ext = filename.lower().split(".")[-1]
|
156 |
if ext == "pdf":
|
157 |
+
text, tables = extract_text_from_pdf(filepath)
|
158 |
all_tables.extend(tables)
|
159 |
elif ext == "docx":
|
160 |
+
text = extract_text_from_docx(filepath)
|
161 |
elif ext == "json":
|
162 |
+
text = load_json_data(filepath)
|
163 |
else:
|
164 |
+
try:
|
165 |
+
with open(filepath, "r", encoding="utf-8") as f:
|
166 |
+
text = f.read()
|
167 |
+
except Exception:
|
168 |
+
continue
|
169 |
full_text += clean_extracted_text(text) + "\n\n"
|
170 |
+
return full_text, all_tables
|
171 |
+
|
172 |
+
with st.spinner("Loading documents..."):
|
173 |
+
full_text, tables = load_all_documents(DOCS_FOLDER)
|
174 |
|
|
|
175 |
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
176 |
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
177 |
chunks = chunk_text(full_text, tokenizer)
|
|
|
180 |
index = faiss.IndexFlatL2(embeddings.shape[1])
|
181 |
index.add(np.array(embeddings))
|
182 |
|
|
|
183 |
if "listener_started" not in st.session_state:
|
184 |
threading.Thread(target=handle_incoming_messages, args=(index, embed_model, tokenizer, chunks), daemon=True).start()
|
185 |
st.session_state.listener_started = True
|
186 |
st.success("β
WhatsApp listener started.")
|
187 |
|
188 |
+
st.success(f"π Loaded {len(chunks)} text chunks from docs/ folder")
|