Spaces:

masadonline
/

RAG-PDF

Sleeping

App Files Community

masadonline committed on May 18

Commit

8a8a6d6

verified ·

1 Parent(s): 12fd03c

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -25

app.py CHANGED Viewed

@@ -9,27 +9,25 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer
 from openai import OpenAI
 from dotenv import load_dotenv
-import torch
 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-# Setup GROQ LLM client
 client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
-# Load embedding model with device specification
-device = "cuda" if torch.cuda.is_available() else "cpu"
-embedder = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)
-embedder.to(device)
-# LLM model name
 LLM_MODEL = "llama3-8b-8192"
-# Streamlit setup
 st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
 st.title("🧸 ToyShop RAG-Based Assistant")
 def extract_pdf_text(file):
     text = ""
     with pdfplumber.open(file) as pdf:
@@ -40,8 +38,17 @@ def extract_pdf_text(file):
     return text.strip()
 def load_json_orders(json_file):
-    data = json.load(json_file)
-    return data if isinstance(data, list) else list(data.values())
 def build_index(text_chunks):
     vectors = embedder.encode(text_chunks)
@@ -57,49 +64,56 @@ def ask_llm(context, query):
     )
     return response.choices[0].message.content.strip()
-# File upload
 st.subheader("📁 Upload Customer Orders (JSON)")
 orders_file = st.file_uploader("Upload JSON file", type="json")
-st.subheader("📚 Upload FAQ / Product Info / Return Policy (PDFs)")
 pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)
 order_chunks, pdf_chunks = [], []
-# Handle JSON
 if orders_file:
-    try:
-        orders = load_json_orders(orders_file)
         order_chunks = [json.dumps(order, ensure_ascii=False) for order in orders]
         st.success(f"✅ Loaded {len(order_chunks)} customer order records.")
-        st.dataframe(pd.DataFrame(orders), use_container_width=True)
-    except Exception as e:
-        st.error(f"❌ Error loading JSON: {e}")
-# Handle PDFs
 if pdf_files:
     for pdf_file in pdf_files:
         try:
             text = extract_pdf_text(pdf_file)
-            pdf_chunks.extend(text.split("\n\n"))  # simple paragraph chunking
         except Exception as e:
             st.error(f"❌ Failed to read {pdf_file.name}: {e}")
-# Build index if we have content
 combined_chunks = order_chunks + pdf_chunks
 if combined_chunks:
     index, sources = build_index(combined_chunks)
     st.subheader("❓ Ask a Question")
-    user_query = st.text_input("What would you like to know?")
     if user_query:
         query_vector = embedder.encode([user_query])
         D, I = index.search(query_vector, k=5)
         context = "\n---\n".join([sources[i] for i in I[0]])
-        with st.spinner("Thinking..."):
             try:
                 answer = ask_llm(context, user_query)
                 st.markdown("### 🧠 Answer")
@@ -107,4 +121,4 @@ if combined_chunks:
             except Exception as e:
                 st.error(f"❌ GROQ API Error: {e}")
 else:
-    st.info("📂 Please upload both JSON orders and PDFs to begin.")

 from sentence_transformers import SentenceTransformer
 from openai import OpenAI
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+# Setup GROQ client
 client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
+# Constants
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 LLM_MODEL = "llama3-8b-8192"
+embedder = SentenceTransformer(EMBEDDING_MODEL)
+# Streamlit app setup
 st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
 st.title("🧸 ToyShop RAG-Based Assistant")
+# --- Helper functions ---
 def extract_pdf_text(file):
     text = ""
     with pdfplumber.open(file) as pdf:
     return text.strip()
 def load_json_orders(json_file):
+    try:
+        data = json.load(json_file)
+        if isinstance(data, list):
+            return data
+        elif isinstance(data, dict):
+            return list(data.values())
+        else:
+            return []
+    except Exception as e:
+        st.error(f"Error parsing JSON: {e}")
+        return []
 def build_index(text_chunks):
     vectors = embedder.encode(text_chunks)
     )
     return response.choices[0].message.content.strip()
+# --- File upload section ---
 st.subheader("📁 Upload Customer Orders (JSON)")
 orders_file = st.file_uploader("Upload JSON file", type="json")
+st.subheader("📚 Upload FAQs / Product Info / Return Policy (PDFs)")
 pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)
 order_chunks, pdf_chunks = [], []
+# --- Process JSON ---
 if orders_file:
+    orders = load_json_orders(orders_file)
+    if orders:
         order_chunks = [json.dumps(order, ensure_ascii=False) for order in orders]
         st.success(f"✅ Loaded {len(order_chunks)} customer order records.")
+        # Try to flatten for DataFrame view
+        try:
+            df = pd.json_normalize(orders)
+            st.dataframe(df, use_container_width=True)
+        except Exception:
+            st.warning("⚠️ Nested JSON detected. Showing raw JSON preview instead.")
+            st.json(orders)
+# --- Process PDFs ---
 if pdf_files:
     for pdf_file in pdf_files:
         try:
             text = extract_pdf_text(pdf_file)
+            pdf_chunks.extend(text.split("\n\n"))  # paragraph-wise
+            st.success(f"📄 Processed {pdf_file.name}")
         except Exception as e:
             st.error(f"❌ Failed to read {pdf_file.name}: {e}")
 combined_chunks = order_chunks + pdf_chunks
+# --- Question Answering Section ---
 if combined_chunks:
     index, sources = build_index(combined_chunks)
     st.subheader("❓ Ask a Question")
+    user_query = st.text_input("What would you like to know?", placeholder="e.g. What is the status of order 123?")
     if user_query:
         query_vector = embedder.encode([user_query])
         D, I = index.search(query_vector, k=5)
         context = "\n---\n".join([sources[i] for i in I[0]])
+        with st.spinner("🤔 Thinking..."):
             try:
                 answer = ask_llm(context, user_query)
                 st.markdown("### 🧠 Answer")
             except Exception as e:
                 st.error(f"❌ GROQ API Error: {e}")
 else:
+    st.info("📂 Please upload both JSON orders and relevant PDFs to begin.")