# RAG-PDF / app.py — Streamlit RAG assistant for an online toy shop
# (Hugging Face Space page-header residue removed from this file.)
import streamlit as st
import os
import json
import pdfplumber
import faiss
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# NOTE(review): if GROQ_API_KEY is unset this stays None and requests will
# fail later at call time — consider failing fast here with a clear error.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Setup GROQ client — Groq exposes an OpenAI-compatible API, so the OpenAI
# SDK is pointed at Groq's base URL.
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
# Constants
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # local embedding model
LLM_MODEL = "llama3-8b-8192"  # Groq-hosted chat model
# Loaded once at import time; reused for both document and query embeddings.
embedder = SentenceTransformer(EMBEDDING_MODEL)
# Streamlit UI
st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
st.title("🧸 ToyShop RAG-Based Assistant")
# --- Helper Functions ---
def extract_pdf_text(file):
    """Extract and concatenate the text of every page of a PDF.

    Pages with no extractable text are skipped; the result is stripped of
    leading/trailing whitespace.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)
    return "\n".join(page_texts).strip()
# --- Order flattening and lookup helpers ---
def flatten_order(order):
    """Render one order dict as newline-separated "key: value" lines.

    Nested dict/list values are JSON-encoded inline so they stay on a single
    searchable line. When the order carries both "order_id" and "status", a
    customer-facing natural-language summary is appended so the LLM can quote
    it directly. Non-dict input yields an empty string.
    """
    if not isinstance(order, dict):
        return ""
    flat = []
    for k, v in order.items():
        if isinstance(v, (dict, list)):
            flat.append(f"{k}: {json.dumps(v, ensure_ascii=False)}")
        else:
            flat.append(f"{k}: {v}")
    # Add a friendly natural language summary for the assistant
    if "order_id" in order and "status" in order:
        items = order.get("items") or []
        # Fix: tolerate malformed item entries instead of raising KeyError
        # on a missing 'name'/'quantity'.
        item_bits = [
            f"{item.get('name', 'Unknown item')} (x{item.get('quantity', 1)})"
            for item in items
            if isinstance(item, dict)
        ]
        items_text = ", ".join(item_bits) if item_bits else "Not available"
        summary = f"""
Dear {order.get("customer_name", "Customer")},
Here are the complete details of your order **#{order['order_id']}**:
- **Status**: {order['status']}
- **Items**: {items_text}
- **Total**: {order.get('total', 'N/A')}
- **Date**: {order.get('date', 'N/A')}
- **Shipping Address**: {order.get('shipping_address', 'N/A')}
We hope this helps! Let us know if you need anything else.
Thanks for shopping with us! 😊
""".strip()
        flat.append(summary)
    return "\n".join(flat)
# NOTE(review): a byte-identical duplicate of get_order_by_id was defined
# here; it was shadowed at import time by the second definition further down
# (after preprocess_query). Removed so the file has exactly one definition —
# behavior is unchanged.
def load_json_orders(json_file):
    """Parse an uploaded JSON file into a list of order dicts.

    Accepts either a JSON object (its values are taken as orders) or a JSON
    array. Any other top-level type — or a parse failure — yields an empty
    list; parse errors are surfaced in the Streamlit UI.
    """
    try:
        payload = json.load(json_file)
    except Exception as e:
        st.error(f"❌ Error parsing JSON: {e}")
        return []
    if isinstance(payload, dict):
        candidates = list(payload.values())
    elif isinstance(payload, list):
        candidates = payload
    else:
        return []
    # Drop anything that is not an order-shaped dict.
    return [entry for entry in candidates if isinstance(entry, dict)]
def build_index(chunks):
    """Embed *chunks* and store them in a flat L2 FAISS index.

    Returns (index, chunks) so callers can map search hits back to the
    original text.
    """
    embeddings = np.array(embedder.encode(chunks))
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embeddings)
    return flat_index, chunks
def ask_llm(context, query):
    """Send *context* plus the user *query* to the Groq LLM; return its reply."""
    prompt = f"""You are a helpful assistant for an online toy shop.
Knowledge base:
{context}
Question: {query}
"""
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content.strip()
def preprocess_query(q):
    """Normalize a user query for retrieval: fold "order_id" to "order" and
    remaining underscores to spaces."""
    normalized = q.replace("order_id", "order")
    return normalized.replace("_", " ")
def get_order_by_id(orders, query):
    """Return the flattened order whose id is mentioned in *query*, else None.

    Recognizes patterns like "order 105", "order #105", "order_id: 105".
    Matching is case-insensitive ("Order 105" works too) — the original
    pattern silently missed capitalized queries.
    """
    match = re.search(r"order(?:_id)?\s*[:#]?\s*(\d+)", query, re.IGNORECASE)
    if not match:
        return None
    wanted = match.group(1)
    for order in orders:
        if str(order.get("order_id")) == wanted:
            return flatten_order(order)
    return None
# --- Uploads ---
# Fix: these user-facing strings contained mojibake (UTF-8 emoji bytes
# decoded as cp1252, e.g. "πŸ“"); restored the intended emoji.
st.subheader("📁 Upload Customer Orders (JSON)")
orders_file = st.file_uploader("Upload JSON file", type="json")
st.subheader("📚 Upload FAQs / Product Info / Return Policy (PDFs)")
pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)
order_chunks, pdf_chunks = [], []
orders = []
# --- Handle JSON Orders ---
if orders_file:
    orders = load_json_orders(orders_file)
    if orders:
        order_chunks = [flatten_order(o) for o in orders]
        st.success(f"✅ Loaded {len(order_chunks)} valid orders.")
        # Tabular preview; fall back to raw JSON when normalization fails.
        try:
            df = pd.json_normalize(orders)
            st.dataframe(df, use_container_width=True)
        except Exception:
            st.warning("⚠️ Unable to normalize JSON. Showing raw preview.")
            st.json(orders)
# --- Handle PDFs ---
if pdf_files:
    for file in pdf_files:
        try:
            text = extract_pdf_text(file)
            # Paragraph-level chunks: split on blank lines.
            pdf_chunks.extend(text.split("\n\n"))
            st.success(f"📄 Processed: {file.name}")
        except Exception as e:
            st.error(f"❌ Error in {file.name}: {e}")
# --- Build Index & Q&A ---
combined_chunks = order_chunks + pdf_chunks
if combined_chunks:
    index, sources = build_index(combined_chunks)
    st.subheader("❓ Ask a Question")
    user_query = st.text_input("What would you like to know?", placeholder="e.g., What is the status of order 105?")
    if user_query:
        pre_q = preprocess_query(user_query)
        # Fast path: an exact order-id lookup beats semantic search.
        direct_match = get_order_by_id(orders, user_query)
        if direct_match:
            context = direct_match
        else:
            # Semantic retrieval: top-5 nearest chunks joined as context.
            query_vector = embedder.encode([pre_q])
            distances, indices = index.search(query_vector, k=5)
            context = "\n---\n".join([sources[i] for i in indices[0]])
        # Fix: spinner/info strings contained mojibake ("πŸ€”", "πŸ“‚");
        # restored the intended emoji.
        with st.spinner("🤔 Thinking..."):
            try:
                answer = ask_llm(context, user_query)
                st.markdown("### 🧠 Answer")
                st.write(answer)
            except Exception as e:
                st.error(f"❌ GROQ Error: {e}")
else:
    st.info("📂 Please upload orders (JSON) and info files (PDF) to get started.")