Spaces:

masadonline
/

RAG-PDF

Sleeping

File size: 5,884 Bytes

36c0c0f
b02d98a
1b72738
 
b02d98a
 
4dbf41f
92d0c75
b02d98a
85e6257
36c0c0f
0b8ee3b
12fd03c
0b8ee3b
85e6257
ab4f2f9
8a8a6d6
85e6257
b02d98a
8a8a6d6
 
85e6257
8a8a6d6
b02d98a
013dd9f
1b72738
 
 
92d0c75
 
1b72738
 
 
 
013dd9f
 
 
1b72738
 
0831006
 
013dd9f
 
 
 
 
 
 
 
0831006
 
92d0c75
0831006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
013dd9f
 
0831006
 
 
 
 
 
 
 
 
 
1b72738
8a8a6d6
 
013dd9f
 
 
 
 
 
 
 
8a8a6d6
013dd9f
 
d899598
013dd9f
 
b02d98a
 
013dd9f
b02d98a
 
013dd9f
 
 
 
 
 
 
85e6257
b02d98a
 
 
1b72738
 
92d0c75
 
 
 
 
 
 
 
 
 
 
 
 
1b72738
 
b02d98a
8a8a6d6
1b72738
36c0c0f
1b72738
92d0c75
1b72738
92d0c75
1b72738
8a8a6d6
 
013dd9f
 
8a8a6d6
 
 
 
013dd9f
8a8a6d6
 
92d0c75
1b72738
013dd9f
1b72738
013dd9f
 
 
1b72738
013dd9f
4dbf41f
92d0c75
1b72738
12a98fd
1b72738
 
36c0c0f
1b72738
013dd9f
36c0c0f
1b72738
92d0c75
 
 
 
 
 
 
 
 
1b72738
8a8a6d6
1b72738
 
 
 
 
013dd9f
1b72738
013dd9f

import streamlit as st
import os
import json
import pdfplumber
import faiss
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables.
# load_dotenv() must run before os.getenv() so values from a local .env file
# are visible; GROQ_API_KEY is None when the variable is not set anywhere.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Setup GROQ client.
# The OpenAI SDK client is reused here and pointed at Groq's
# OpenAI-compatible endpoint through base_url.
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")

# Constants
# EMBEDDING_MODEL: sentence-transformers model used to embed chunks and queries.
# LLM_MODEL: model name passed to the chat-completions call in ask_llm().
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama3-8b-8192"
# The embedding model is instantiated at module level, i.e. every time this
# script is executed.
# NOTE(review): Streamlit presumably re-runs the script on each interaction,
# so caching this object may be worthwhile - confirm before changing.
embedder = SentenceTransformer(EMBEDDING_MODEL)

# Streamlit UI
# set_page_config() is the first Streamlit call in this file; keep it ahead of
# any other st.* call when editing.
st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
st.title("🧸 ToyShop RAG-Based Assistant")

# --- Helper Functions ---

def extract_pdf_text(file):
    """Return the concatenated text of all pages of a PDF.

    Args:
        file: Anything ``pdfplumber.open`` accepts (path or file-like object).

    Returns:
        The text of every page that yields text, each followed by a newline,
        with leading/trailing whitespace of the whole result removed. Pages
        for which ``extract_text()`` returns a falsy value are skipped.
    """
    with pdfplumber.open(file) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Consume the generator while the PDF is still open.
        combined = "".join(
            f"{page_text}\n" for page_text in page_texts if page_text
        )
    return combined.strip()

# --- Order formatting and lookup helpers ---

def _describe_order_items(items):
    """Render an order's item list as ``"Name (xQty), ..."`` for the summary."""
    if not items or not isinstance(items, (list, tuple)):
        return "Not available"

    parts = []
    for item in items:
        if not isinstance(item, dict):
            # Tolerate plain strings/numbers instead of raising TypeError.
            parts.append(str(item))
            continue
        name = item.get("name", "Unknown item")
        quantity = item.get("quantity")
        parts.append(str(name) if quantity is None else f"{name} (x{quantity})")
    return ", ".join(parts)


def _order_summary(order):
    """Build the customer-facing natural-language summary of one order."""
    summary_lines = [
        f"Dear {order.get('customer_name', 'Customer')},",
        "",
        f"Here are the complete details of your order **#{order['order_id']}**:",
        f"- **Status**: {order['status']}",
        f"- **Items**: {_describe_order_items(order.get('items'))}",
        f"- **Total**: {order.get('total', 'N/A')}",
        f"- **Date**: {order.get('date', 'N/A')}",
        f"- **Shipping Address**: {order.get('shipping_address', 'N/A')}",
        "",
        "We hope this helps! Let us know if you need anything else.",
        "",
        "Thanks for shopping with us! 😊",
    ]
    return "\n".join(summary_lines)


def flatten_order(order):
    """Turn one order record into a plain-text chunk for retrieval/prompting.

    Every top-level key is emitted as a ``key: value`` line; nested dicts and
    lists are serialised as JSON (non-ASCII characters kept as-is). When the
    order has both ``order_id`` and ``status``, a friendly natural-language
    summary is appended after the key/value lines.

    Malformed item entries (missing ``name``/``quantity``, non-dict items, or
    an ``items`` value that is not a list) no longer raise; they are rendered
    on a best-effort basis.

    Args:
        order: The order record; anything that is not a dict yields ``""``.

    Returns:
        The newline-joined text representation of the order.
    """
    if not isinstance(order, dict):
        return ""

    lines = []
    for key, value in order.items():
        if isinstance(value, (dict, list)):
            lines.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
        else:
            lines.append(f"{key}: {value}")

    # Add a friendly natural language summary for the assistant.
    if "order_id" in order and "status" in order:
        lines.append(_order_summary(order))
    return "\n".join(lines)

def get_order_by_id(orders, query):
    """Look up the order whose ID is mentioned in a free-text query.

    Recognises phrasings such as ``order 105``, ``Order #105``,
    ``order_id: 105`` and ``order id 105``. Matching is case-insensitive so
    that capitalised user input ("Order 105") is handled as well.

    NOTE: a function with the same name is defined again further down in this
    file; at runtime that later definition is the one in effect.

    Args:
        orders: Iterable of order dicts (each may carry an ``order_id`` key).
        query: The user's question.

    Returns:
        The flattened text of the first order whose ``order_id`` (compared as
        a string) equals the number found in the query, or ``None`` when the
        query names no order or no order matches.
    """
    match = re.search(
        r"order(?:[\s_]*id)?\s*[:#]?\s*(\d+)", query, flags=re.IGNORECASE
    )
    if match is None:
        return None

    wanted_id = match.group(1)
    for order in orders:
        # IDs may be stored as int or str; compare on the string form.
        if str(order.get("order_id")) == wanted_id:
            return flatten_order(order)
    return None


def load_json_orders(json_file):
    """Parse an uploaded JSON document into a list of order dicts.

    Supported top-level shapes:
      * a list of orders;
      * a mapping of arbitrary keys to orders (e.g. keyed by order ID);
      * a wrapper mapping whose values are lists of orders
        (e.g. ``{"orders": [...]}``);
      * a single order object (a mapping that itself has ``order_id``).

    Entries that are not dicts are dropped.

    Args:
        json_file: A readable file-like object containing JSON.

    Returns:
        A list of order dicts; empty when the file cannot be parsed (an error
        is shown in the UI in that case) or has an unsupported structure.
    """
    try:
        data = json.load(json_file)
    except Exception as e:
        # UI boundary: report any read/parse failure instead of crashing.
        st.error(f"❌ Error parsing JSON: {e}")
        return []

    if isinstance(data, list):
        candidates = data
    elif isinstance(data, dict):
        if "order_id" in data:
            # The document is one order, not a collection of orders.
            candidates = [data]
        else:
            candidates = []
            for value in data.values():
                if isinstance(value, list):
                    # Unwrap containers such as {"orders": [...]}.
                    candidates.extend(value)
                else:
                    candidates.append(value)
    else:
        return []

    return [entry for entry in candidates if isinstance(entry, dict)]

def build_index(chunks):
    """Embed text chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: The texts to embed with the module-level ``embedder``.

    Returns:
        A ``(index, chunks)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the very same ``chunks`` object that was passed in.
    """
    embeddings = embedder.encode(chunks)
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(np.array(embeddings))
    return flat_index, chunks

def ask_llm(context, query):
    """Ask the chat model a question, supplying retrieved context.

    Args:
        context: Text inserted under "Knowledge base:" in the prompt.
        query: The user's question, inserted after "Question:".

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the response message carries no text content.

    Raises:
        Whatever the client raises for transport/API failures; the caller is
        expected to handle those.
    """
    prompt = f"""You are a helpful assistant for an online toy shop.

Knowledge base:
{context}

Question: {query}
"""
    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content is optional in the API response; guard against None
    # so an empty reply does not surface as an AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

def preprocess_query(q):
    """Normalise a query before embedding.

    Replaces the literal ``order_id`` with ``order`` and then turns every
    remaining underscore into a space.
    """
    without_id_suffix = q.replace("order_id", "order")
    # Splitting on "_" and re-joining with " " swaps each underscore for a space.
    return " ".join(without_id_suffix.split("_"))

def get_order_by_id(orders, query):
    """Look up the order whose ID is mentioned in a free-text query.

    Recognises phrasings such as ``order 105``, ``Order #105``,
    ``order_id: 105`` and ``order id 105``. Matching is case-insensitive so
    that capitalised user input ("Order 105") is handled as well.

    NOTE: this definition replaces an earlier function of the same name
    defined above in this file.

    Args:
        orders: Iterable of order dicts (each may carry an ``order_id`` key).
        query: The user's question.

    Returns:
        The flattened text of the first order whose ``order_id`` (compared as
        a string) equals the number found in the query, or ``None`` when the
        query names no order or no order matches.
    """
    match = re.search(
        r"order(?:[\s_]*id)?\s*[:#]?\s*(\d+)", query, flags=re.IGNORECASE
    )
    if match is None:
        return None

    wanted_id = match.group(1)
    for order in orders:
        # IDs may be stored as int or str; compare on the string form.
        if str(order.get("order_id")) == wanted_id:
            return flatten_order(order)
    return None

# --- Uploads ---
# Two uploaders: a single JSON file with orders, and any number of PDFs.
st.subheader("📁 Upload Customer Orders (JSON)")
orders_file = st.file_uploader(
    "Upload JSON file",
    type="json",
)

st.subheader("📚 Upload FAQs / Product Info / Return Policy (PDFs)")
pdf_files = st.file_uploader(
    "Upload one or more PDFs",
    type="pdf",
    accept_multiple_files=True,
)

# State filled in by the handlers below; all start empty.
orders = []
order_chunks = []
pdf_chunks = []

# --- Handle JSON Orders ---
# Parse the upload (if any); `orders` keeps its empty default otherwise.
if orders_file:
    orders = load_json_orders(orders_file)

if orders:
    # One text chunk per order, used later for retrieval.
    order_chunks = list(map(flatten_order, orders))
    st.success(f"✅ Loaded {len(order_chunks)} valid orders.")
    try:
        orders_table = pd.json_normalize(orders)
        st.dataframe(orders_table, use_container_width=True)
    except Exception:
        # Best effort: fall back to the raw JSON when a table cannot be built.
        st.warning("⚠️ Unable to normalize JSON. Showing raw preview.")
        st.json(orders)

# --- Handle PDFs ---
# Each PDF is split on blank lines into chunks. Blank chunks are discarded so
# that a PDF without extractable text does not add an empty entry to the index.
if pdf_files:
    for file in pdf_files:
        try:
            text = extract_pdf_text(file)
        except Exception as e:
            # Best effort: report the failing file and keep processing the rest.
            st.error(f"❌ Error in {file.name}: {e}")
            continue

        file_chunks = [part.strip() for part in text.split("\n\n") if part.strip()]
        if file_chunks:
            pdf_chunks.extend(file_chunks)
            st.success(f"📄 Processed: {file.name}")
        else:
            st.warning(f"⚠️ No extractable text found in: {file.name}")

# --- Build Index & Q&A ---
# All order and PDF chunks share one vector index. A query that names a known
# order ID is answered from that order directly; anything else goes through
# nearest-neighbour retrieval.
combined_chunks = order_chunks + pdf_chunks

if combined_chunks:
    index, sources = build_index(combined_chunks)

    st.subheader("❓ Ask a Question")
    user_query = st.text_input("What would you like to know?", placeholder="e.g., What is the status of order 105?")

    if user_query:
        pre_q = preprocess_query(user_query)
        direct_match = get_order_by_id(orders, user_query)

        if direct_match:
            context = direct_match
        else:
            query_vector = embedder.encode([pre_q])
            # Never ask for more neighbours than there are chunks: FAISS pads
            # missing results with -1, which would index the last chunk.
            top_k = min(5, len(sources))
            _, neighbor_ids = index.search(query_vector, k=top_k)
            context = "\n---\n".join(
                sources[i] for i in neighbor_ids[0] if 0 <= i < len(sources)
            )

        with st.spinner("🤔 Thinking..."):
            try:
                answer = ask_llm(context, user_query)
                st.markdown("### 🧠 Answer")
                st.write(answer)
            except Exception as e:
                # UI boundary: surface API/transport failures to the user.
                st.error(f"❌ GROQ Error: {e}")
else:
    st.info("📂 Please upload orders (JSON) and info files (PDF) to get started.")