Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
import json
|
4 |
-
import tempfile
|
5 |
import pdfplumber
|
6 |
import faiss
|
7 |
import numpy as np
|
@@ -22,68 +21,65 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
22 |
LLM_MODEL = "llama3-8b-8192"
|
23 |
embedder = SentenceTransformer(EMBEDDING_MODEL)
|
24 |
|
25 |
-
# Streamlit
|
26 |
st.set_page_config(page_title="π§Έ ToyShop Assistant", layout="wide")
|
27 |
st.title("π§Έ ToyShop RAG-Based Assistant")
|
28 |
|
29 |
-
# --- Helper functions ---
|
30 |
-
|
31 |
def extract_pdf_text(file):
|
32 |
text = ""
|
33 |
with pdfplumber.open(file) as pdf:
|
34 |
for page in pdf.pages:
|
35 |
-
|
36 |
-
if
|
37 |
-
text +=
|
38 |
return text.strip()
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def load_json_orders(json_file):
|
41 |
-
valid_orders = []
|
42 |
try:
|
43 |
data = json.load(json_file)
|
44 |
-
if isinstance(data,
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
for k, order in data.items():
|
53 |
-
try:
|
54 |
-
json.dumps(order)
|
55 |
-
valid_orders.append(order)
|
56 |
-
except Exception as e:
|
57 |
-
st.warning(f"β οΈ Skipping invalid order with key '{k}': {e}")
|
58 |
except Exception as e:
|
59 |
-
st.error(f"β Error parsing JSON
|
60 |
-
|
61 |
|
62 |
-
def build_index(
|
63 |
-
vectors = embedder.encode(
|
64 |
index = faiss.IndexFlatL2(vectors.shape[1])
|
65 |
index.add(np.array(vectors))
|
66 |
-
return index,
|
67 |
|
68 |
def ask_llm(context, query):
|
69 |
-
prompt =
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
response = client.chat.completions.create(
|
78 |
model=LLM_MODEL,
|
79 |
messages=[{"role": "user", "content": prompt}]
|
80 |
)
|
81 |
-
# Log full response for inspection (can be commented out in production)
|
82 |
-
st.expander("Raw LLM API Response").json(response)
|
83 |
return response.choices[0].message.content.strip()
|
84 |
|
85 |
-
#
|
86 |
-
|
87 |
st.subheader("π Upload Customer Orders (JSON)")
|
88 |
orders_file = st.file_uploader("Upload JSON file", type="json")
|
89 |
|
@@ -92,57 +88,50 @@ pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multi
|
|
92 |
|
93 |
order_chunks, pdf_chunks = [], []
|
94 |
|
95 |
-
#
|
96 |
if orders_file:
|
97 |
orders = load_json_orders(orders_file)
|
98 |
if orders:
|
99 |
-
order_chunks = [
|
100 |
-
st.success(f"β
Loaded {len(order_chunks)}
|
101 |
-
|
102 |
try:
|
103 |
df = pd.json_normalize(orders)
|
104 |
st.dataframe(df, use_container_width=True)
|
105 |
except Exception:
|
106 |
-
st.warning("β οΈ
|
107 |
st.json(orders)
|
108 |
-
else:
|
109 |
-
st.error("No valid orders found in the JSON file.")
|
110 |
|
111 |
-
#
|
112 |
if pdf_files:
|
113 |
-
for
|
114 |
try:
|
115 |
-
text = extract_pdf_text(
|
116 |
-
|
117 |
-
|
118 |
-
pdf_chunks.extend(paragraphs)
|
119 |
-
st.success(f"π Processed {pdf_file.name}")
|
120 |
except Exception as e:
|
121 |
-
st.error(f"β
|
122 |
|
|
|
123 |
combined_chunks = order_chunks + pdf_chunks
|
124 |
|
125 |
-
# --- Question Answering Section ---
|
126 |
if combined_chunks:
|
127 |
index, sources = build_index(combined_chunks)
|
128 |
|
129 |
st.subheader("β Ask a Question")
|
130 |
-
user_query = st.text_input("What would you like to know?", placeholder="e.g
|
131 |
|
132 |
if user_query:
|
133 |
query_vector = embedder.encode([user_query])
|
134 |
D, I = index.search(query_vector, k=5)
|
135 |
-
# Prepare context from the top-K results:
|
136 |
context = "\n---\n".join([sources[i] for i in I[0]])
|
137 |
-
st.expander("Combined Context").code(context)
|
138 |
|
139 |
with st.spinner("π€ Thinking..."):
|
140 |
try:
|
141 |
answer = ask_llm(context, user_query)
|
142 |
st.markdown("### π§ Answer")
|
143 |
-
# Use st.write() to render the answer as text.
|
144 |
st.write(answer)
|
145 |
except Exception as e:
|
146 |
-
st.error(f"β GROQ
|
147 |
else:
|
148 |
-
st.info("π Please upload
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
import json
|
|
|
4 |
import pdfplumber
|
5 |
import faiss
|
6 |
import numpy as np
|
|
|
21 |
LLM_MODEL = "llama3-8b-8192"
|
22 |
embedder = SentenceTransformer(EMBEDDING_MODEL)
|
23 |
|
24 |
+
# Streamlit UI — set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="π§Έ ToyShop Assistant", layout="wide")
st.title("π§Έ ToyShop RAG-Based Assistant")
|
27 |
|
|
|
|
|
28 |
def extract_pdf_text(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Pages for which pdfplumber yields no text are skipped; page texts are
    joined with newlines and the final result is stripped of surrounding
    whitespace.
    """
    pages_text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                pages_text.append(extracted)
    return "\n".join(pages_text).strip()
|
36 |
|
37 |
+
def flatten_order(order):
    """Render one order dict as newline-separated "key: value" lines.

    Nested dict/list values are JSON-encoded (non-ASCII preserved);
    anything that is not a dict flattens to an empty string.
    """
    if not isinstance(order, dict):
        return ""

    def _render(value):
        # Nested structures become compact JSON; scalars are formatted as-is.
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return value

    return "\n".join(f"{key}: {_render(value)}" for key, value in order.items())
|
46 |
+
|
47 |
def load_json_orders(json_file):
    """Parse an uploaded JSON file into a list of order dicts.

    Accepts either a mapping (order-id -> order) or a list of orders;
    entries that are not dicts are silently dropped. Returns [] when the
    top-level value is neither dict nor list, or when parsing fails (a
    parse failure is also reported in the UI via st.error).
    """
    try:
        data = json.load(json_file)
    except Exception as e:
        st.error(f"β Error parsing JSON: {e}")
        return []
    if isinstance(data, dict):
        candidates = list(data.values())
    elif isinstance(data, list):
        candidates = data
    else:
        candidates = []
    return [entry for entry in candidates if isinstance(entry, dict)]
|
61 |
|
62 |
+
def build_index(chunks):
    """Embed *chunks* and build a FAISS flat-L2 index over the vectors.

    Returns (index, chunks) so callers can map search hits back to the
    original text. NOTE(review): assumes embedder.encode returns a 2-D
    array-like of shape (len(chunks), dim) — confirm against the
    SentenceTransformer API.
    """
    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(np.array(embeddings))
    return faiss_index, chunks
|
67 |
|
68 |
def ask_llm(context, query):
    """Send the retrieved context plus the user question to the chat model
    and return the stripped answer text.

    Raises whatever the client raises on API failure; the caller handles it.
    """
    prompt = f"""You are a helpful assistant for an online toy shop.

Knowledge base:
{context}

Question: {query}
"""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=LLM_MODEL, messages=messages)
    answer = completion.choices[0].message.content
    return answer.strip()
|
81 |
|
82 |
+
# Uploads — orders arrive as a single JSON file; orders_file is None until
# the user uploads one.
st.subheader("π Upload Customer Orders (JSON)")
orders_file = st.file_uploader("Upload JSON file", type="json")
|
85 |
|
|
|
88 |
|
89 |
# Text chunks destined for the vector index: one per order, one per PDF paragraph.
order_chunks, pdf_chunks = [], []

# Handle JSON orders: parse the upload, flatten each order into one text
# chunk, and show a preview of the data.
if orders_file:
    orders = load_json_orders(orders_file)
    if orders:
        order_chunks = [flatten_order(o) for o in orders]
        st.success(f"β Loaded {len(order_chunks)} valid orders.")

        # Preview: try a flat DataFrame first; fall back to raw JSON when
        # the orders do not normalize into a table.
        try:
            df = pd.json_normalize(orders)
            st.dataframe(df, use_container_width=True)
        except Exception:
            st.warning("β οΈ Unable to normalize JSON. Showing raw preview.")
            st.json(orders)
|
|
|
|
|
104 |
|
105 |
+
# Handle PDFs: extract the text of each upload and split it into
# paragraph-sized chunks (blank-line separated) for indexing.
if pdf_files:
    for file in pdf_files:
        try:
            text = extract_pdf_text(file)
            pdf_chunks.extend(text.split("\n\n"))
            st.success(f"π Processed: {file.name}")
        except Exception as e:
            # Best-effort: a bad PDF is reported but does not stop the others.
            st.error(f"β Error in {file.name}: {e}")
|
114 |
|
115 |
+
# Combine & build index
combined_chunks = order_chunks + pdf_chunks

if combined_chunks:
    index, sources = build_index(combined_chunks)

    st.subheader("β Ask a Question")
    user_query = st.text_input("What would you like to know?", placeholder="e.g., What is the status of order 105?")

    if user_query:
        # Retrieve the 5 chunks nearest to the query embedding; I holds the
        # row indices back into `sources`, D the L2 distances (unused).
        query_vector = embedder.encode([user_query])
        D, I = index.search(query_vector, k=5)
        context = "\n---\n".join([sources[i] for i in I[0]])

        with st.spinner("π€ Thinking..."):
            try:
                answer = ask_llm(context, user_query)
                st.markdown("### π§ Answer")
                st.write(answer)
            except Exception as e:
                st.error(f"β GROQ Error: {e}")
else:
    # Nothing indexed yet — prompt the user to upload source material.
    st.info("π Please upload orders (JSON) and info files (PDF) to get started.")
|