# RAG-PDF / app.py
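# Streamlit RAG assistant for a toy shop: flattens uploaded order JSON and
# splits uploaded PDFs into chunks, embeds everything with MiniLM, indexes it
# in FAISS, and answers questions via Groq's Llama 3 8B model.
# Run locally with: streamlit run app.py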
import streamlit as st
import os
import json
import pdfplumber
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Set up the Groq client
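# Groq serves an OpenAI-compatible API, so the standard OpenAI client works
# once base_url points at Groq's endpoint.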
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
# Constants
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama3-8b-8192"
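# The embedding model is downloaded and cached by sentence-transformers on
# first use, then loaded from the local cache on later runs.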
embedder = SentenceTransformer(EMBEDDING_MODEL)
# Streamlit UI
st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
st.title("🧸 ToyShop RAG-Based Assistant")
def extract_pdf_text(file):
    """Extract plain text from every page of an uploaded PDF."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                text += content + "\n"
    return text.strip()
def flatten_order(order):
    """Flatten a single order dict into one "key: value" line per field."""
    flat = []
    if isinstance(order, dict):
        for k, v in order.items():
            if isinstance(v, (dict, list)):
                # Nested structures are serialized as JSON so they stay on one line
                flat.append(f"{k}: {json.dumps(v, ensure_ascii=False)}")
            else:
                flat.append(f"{k}: {v}")
    return "\n".join(flat)
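# flatten_order example (hypothetical order, not real data):
#   {"order_id": 105, "status": "shipped"}  ->  "order_id: 105\nstatus: shipped"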
def load_json_orders(json_file):
    """Parse the uploaded JSON file and return a list of order dicts."""
    try:
        data = json.load(json_file)
        if isinstance(data, dict):
            orders = list(data.values())
        elif isinstance(data, list):
            orders = data
        else:
            return []
        valid_orders = [o for o in orders if isinstance(o, dict)]
        return valid_orders
    except Exception as e:
        st.error(f"❌ Error parsing JSON: {e}")
        return []
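# load_json_orders accepts either a list of order objects or a dict keyed by
# order id (illustrative shapes only):
#   [{"order_id": 105, "status": "shipped"}, ...]
#   {"105": {"status": "shipped"}, ...}
# Entries that are not dicts are dropped.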
def build_index(chunks):
    """Embed the text chunks and build an in-memory FAISS L2 index over them."""
    vectors = embedder.encode(chunks)
    index = faiss.IndexFlatL2(vectors.shape[1])
    # FAISS expects a float32 matrix
    index.add(np.array(vectors, dtype="float32"))
    return index, chunks
def ask_llm(context, query):
    """Ask the LLM to answer the question using the retrieved context."""
    prompt = f"""You are a helpful assistant for an online toy shop.
Knowledge base:
{context}
Question: {query}
"""
    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()
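# ask_llm is single-turn: the retrieved chunks are pasted into the prompt as
# a "knowledge base" above the question, and no chat history is kept between
# questions.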
# Uploads
st.subheader("📁 Upload Customer Orders (JSON)")
orders_file = st.file_uploader("Upload JSON file", type="json")
st.subheader("📚 Upload FAQs / Product Info / Return Policy (PDFs)")
pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)
order_chunks, pdf_chunks = [], []
# Handle JSON orders
if orders_file:
    orders = load_json_orders(orders_file)
    if orders:
        order_chunks = [flatten_order(o) for o in orders]
        st.success(f"✅ Loaded {len(order_chunks)} valid orders.")
        try:
            df = pd.json_normalize(orders)
            st.dataframe(df, use_container_width=True)
        except Exception:
            st.warning("⚠️ Unable to normalize JSON. Showing raw preview.")
            st.json(orders)
# Handle PDFs
if pdf_files:
    for file in pdf_files:
        try:
            text = extract_pdf_text(file)
            pdf_chunks.extend(text.split("\n\n"))
            st.success(f"📄 Processed: {file.name}")
        except Exception as e:
            st.error(f"❌ Error in {file.name}: {e}")
# Combine chunks from both sources and build the index
combined_chunks = order_chunks + pdf_chunks
if combined_chunks:
    index, sources = build_index(combined_chunks)
    st.subheader("❓ Ask a Question")
    user_query = st.text_input("What would you like to know?", placeholder="e.g., What is the status of order 105?")
    if user_query:
        # Embed the question and retrieve the closest chunks from the index
        query_vector = embedder.encode([user_query])
        # Don't request more neighbours than there are chunks
        D, I = index.search(np.array(query_vector, dtype="float32"), k=min(5, len(sources)))
        context = "\n---\n".join([sources[i] for i in I[0]])
        with st.spinner("🤔 Thinking..."):
            try:
                answer = ask_llm(context, user_query)
                st.markdown("### 🧠 Answer")
                st.write(answer)
            except Exception as e:
                st.error(f"❌ Groq Error: {e}")
else:
    st.info("📂 Please upload orders (JSON) and info files (PDF) to get started.")