masadonline committed on
Commit
1b72738
·
verified ·
1 Parent(s): 8fe6699

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -40
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import streamlit as st
2
- import json
3
  import os
 
 
 
4
  import faiss
5
  import numpy as np
6
  import pandas as pd
@@ -8,11 +10,11 @@ from sentence_transformers import SentenceTransformer
8
  from openai import OpenAI
9
  from dotenv import load_dotenv
10
 
11
- # Load environment variables
12
  load_dotenv()
13
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
14
 
15
- # Setup GROQ client
16
  client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
17
 
18
  # Constants
@@ -20,65 +22,92 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "llama3-8b-8192"
21
  embedder = SentenceTransformer(EMBEDDING_MODEL)
22
 
23
- def load_orders_from_json(json_file):
 
 
 
 
 
 
 
 
 
 
 
 
24
  data = json.load(json_file)
25
  if isinstance(data, list):
26
- rows = data
27
  elif isinstance(data, dict):
28
- rows = list(data.values())
29
  else:
30
- rows = []
31
- return rows
32
 
33
- def build_index(chunks):
34
- text_chunks = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
35
  vectors = embedder.encode(text_chunks)
36
  index = faiss.IndexFlatL2(vectors.shape[1])
37
  index.add(np.array(vectors))
38
  return index, text_chunks
39
 
40
  def ask_llm(context, query):
41
- prompt = f"You are a helpful assistant for an online toy shop.\n\nHere is the customer order data:\n{context}\n\nQuestion: {query}"
42
  response = client.chat.completions.create(
43
  model=LLM_MODEL,
44
  messages=[{"role": "user", "content": prompt}]
45
  )
46
- return response.choices[0].message.content
 
 
47
 
48
- # Streamlit UI
49
- st.set_page_config(page_title="🧸 ToyShop Order Status Assistant", layout="wide")
50
- st.title("πŸ“¦ ToyShop Order Status Assistant")
51
 
52
- uploaded_file = st.file_uploader("Upload a Customer Orders JSON File", type="json")
 
53
 
54
- if uploaded_file:
 
 
 
 
55
  try:
56
- rows = load_orders_from_json(uploaded_file)
 
 
 
 
 
 
57
 
58
- if not rows:
59
- st.error("❌ No valid order data found in the JSON file.")
60
- else:
61
- st.success(f"βœ… Loaded {len(rows)} order records.")
62
- df = pd.DataFrame(rows)
63
- st.subheader("πŸ“‹ Customer Orders")
64
- st.dataframe(df, use_container_width=True)
65
 
66
- index, text_chunks = build_index(rows)
67
 
68
- query = st.text_input("Ask a question (e.g., 'What is the status of order #1002?')")
69
 
70
- if query:
71
- query_vec = embedder.encode([query])
72
- D, I = index.search(query_vec, k=3)
73
- context = "\n".join([text_chunks[i] for i in I[0]])
74
 
75
- with st.spinner("Generating answer..."):
76
- try:
77
- answer = ask_llm(context, query)
78
- st.markdown("### 🧠 Answer")
79
- st.write(answer)
80
- except Exception as e:
81
- st.error(f"LLM Error: {str(e)}")
82
 
83
- except Exception as e:
84
- st.error(f"❌ Failed to load or process JSON file: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  import os
3
+ import json
4
+ import tempfile
5
+ import pdfplumber
6
  import faiss
7
  import numpy as np
8
  import pandas as pd
 
10
  from openai import OpenAI
11
  from dotenv import load_dotenv
12
 
13
+ # Load GROQ API key from .env
14
  load_dotenv()
15
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
16
 
17
+ # Setup GROQ LLM client
18
  client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
19
 
20
  # Constants
 
22
  LLM_MODEL = "llama3-8b-8192"
23
  embedder = SentenceTransformer(EMBEDDING_MODEL)
24
 
25
# --- Streamlit page chrome ---
st.set_page_config(layout="wide", page_title="🧸 ToyShop Assistant")
st.title("🧸 ToyShop RAG-Based Assistant")
27
+
28
+ # --- Load and process uploaded files ---
29
+
30
def extract_pdf_text(file):
    """Extract all text from a PDF file-like object.

    Concatenates the text of every page (newline-separated) and returns
    the result stripped of surrounding whitespace.

    Args:
        file: A file-like object (or path) accepted by ``pdfplumber.open``.

    Returns:
        The extracted text as a single string; "" if no page has text.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            # (e.g. scanned images); the original `text += None + "\n"`
            # raised TypeError on such pages. Skip them instead.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts).strip()
36
+
37
def load_json_orders(json_file):
    """Parse an uploaded JSON file into a list of order records.

    A top-level JSON array is returned as-is; a top-level object is
    flattened to the list of its values; any other top-level value
    (string, number, etc.) yields an empty list.
    """
    payload = json.load(json_file)
    if isinstance(payload, dict):
        return list(payload.values())
    if isinstance(payload, list):
        return payload
    return []
 
45
 
46
def build_index(text_chunks):
    """Embed ``text_chunks`` and build a FAISS L2 index over them.

    Returns ``(index, text_chunks)`` so callers can map search hits
    back to the original chunk strings by position.
    """
    embeddings = np.array(embedder.encode(text_chunks))
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(embeddings)
    return faiss_index, text_chunks
51
 
52
def ask_llm(context, query):
    """Answer ``query`` with the GROQ-hosted LLM, grounded on ``context``.

    Sends one user message containing the retrieved knowledge-base
    chunks plus the question, and returns the stripped completion text.
    """
    user_message = (
        f"You are a helpful assistant for an online toy shop.\n\n"
        f"Knowledge base:\n{context}\n\nQuestion: {query}"
    )
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": user_message}],
        model=LLM_MODEL,
    )
    return completion.choices[0].message.content.strip()
59
+
60
# --- File upload UI ---

st.subheader("πŸ“ Upload Customer Orders (JSON)")
orders_file = st.file_uploader("Upload JSON file", type="json")

st.subheader("πŸ“š Upload FAQ / Product Info / Return Policy (PDFs)")
pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)

order_chunks, pdf_chunks = [], []

# --- Process files ---

if orders_file:
    try:
        orders = load_json_orders(orders_file)
        # One JSON-serialized chunk per order so retrieval returns whole records.
        order_chunks = [json.dumps(order, ensure_ascii=False) for order in orders]
        df = pd.DataFrame(orders)
        st.success(f"βœ… Loaded {len(order_chunks)} customer order records.")
        st.dataframe(df, use_container_width=True)
    except Exception as e:
        st.error(f"❌ Error loading JSON: {e}")

if pdf_files:
    for pdf_file in pdf_files:
        try:
            text = extract_pdf_text(pdf_file)
            # Chunk by paragraph; drop blank/whitespace-only chunks so they
            # don't pollute the embedding index (the original split kept
            # empty strings produced by consecutive blank lines).
            pdf_chunks.extend(
                chunk.strip() for chunk in text.split("\n\n") if chunk.strip()
            )
        except Exception as e:
            st.error(f"❌ Failed to read {pdf_file.name}: {e}")

# Single knowledge base: structured orders + unstructured PDF paragraphs.
combined_chunks = order_chunks + pdf_chunks
91
 
92
# --- Question Answering ---

if combined_chunks:
    index, sources = build_index(combined_chunks)

    st.subheader("❓ Ask a Question")
    user_query = st.text_input("What would you like to know?")

    if user_query:
        query_vector = embedder.encode([user_query])
        # Never ask FAISS for more neighbours than the index holds: with
        # k > ntotal it pads the result with -1 labels, and sources[-1]
        # would silently duplicate the last chunk in the context.
        k = min(5, len(sources))
        D, I = index.search(query_vector, k=k)
        context = "\n---\n".join(sources[i] for i in I[0] if i >= 0)

        with st.spinner("Thinking..."):
            try:
                answer = ask_llm(context, user_query)
                st.markdown("### 🧠 Answer")
                st.write(answer)
            except Exception as e:
                st.error(f"❌ GROQ API Error: {e}")
else:
    # Either upload is enough to build the knowledge base (see
    # combined_chunks above), so the message must not demand both.
    st.info("πŸ“‚ Please upload JSON orders and/or PDFs to begin.")