# RAG-PDF / app.py
import streamlit as st
import pdfplumber
import os
import tempfile
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (expects GROQ_API_KEY in the environment or a .env file)
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Set up the Groq client; Groq exposes an OpenAI-compatible endpoint,
# so the standard OpenAI SDK works against it
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
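
# Fail fast when the key is missing; otherwise the first query fails later with
# an opaque authentication error. (Minimal guard, added here as a sketch.)
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set; add it to the environment or a .env file.")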
# Constants
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama3-8b-8192"
embedder = SentenceTransformer(EMBEDDING_MODEL)
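
# Optional (sketch, recent Streamlit): cache the embedding model across reruns so it
# is not re-instantiated every time the script re-executes on user interaction:
#
#   @st.cache_resource
#   def load_embedder():
#       return SentenceTransformer(EMBEDDING_MODEL)
#   embedder = load_embedder()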
def extract_rows_from_pdf(pdf_file_path):
    """Extract non-empty table rows from every page of the PDF (header rows skipped)."""
    rows = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                for row in table[1:]:  # skip each table's header row
                    cleaned = [str(cell).strip() if cell else "" for cell in row]
                    if any(cleaned):  # drop fully empty rows
                        rows.append(cleaned)
    return rows
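
# Illustrative (hypothetical data): a table row such as
#   ["27", "Teddy Bear", "2024-05-01", "Shipped"]
# comes back as a list of stripped strings; build_index later joins it with " | ".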
def build_index(chunks):
    """Join each row with ' | ', embed it, and build a flat L2 FAISS index."""
    text_chunks = [" | ".join(chunk) for chunk in chunks]
    vectors = embedder.encode(text_chunks)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(np.array(vectors, dtype="float32"))  # FAISS requires float32
    return index, text_chunks
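
# Note: IndexFlatL2 ranks neighbors by Euclidean distance. Sentence embeddings are
# often compared by cosine similarity instead; an equivalent FAISS sketch would be:
#
#   faiss.normalize_L2(vectors)                  # in-place unit-normalization (float32)
#   index = faiss.IndexFlatIP(vectors.shape[1])  # inner product == cosine on unit vectors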
def ask_llm(context, query):
    """Send the retrieved rows plus the user's question to the Groq-hosted model."""
    prompt = (
        "You are a helpful assistant for an online toy shop.\n\n"
        f"Here is the order data:\n{context}\n\nQuestion: {query}"
    )
    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
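
# Sketch of a more constrained variant (the system message and temperature are
# illustrative additions, not part of the original call):
#
#   response = client.chat.completions.create(
#       model=LLM_MODEL,
#       messages=[
#           {"role": "system", "content": "Answer only from the provided order data."},
#           {"role": "user", "content": prompt},
#       ],
#       temperature=0,
#   )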
# Streamlit UI
st.set_page_config(page_title="🧸 ToyShop Order Status Assistant", layout="wide")
st.title("📦 ToyShop Order Status Assistant")
uploaded_file = st.file_uploader("Upload a Customer Order PDF", type="pdf")
if uploaded_file:
    # Write the upload to a temp file so pdfplumber can open it by path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    st.success("✅ File uploaded successfully")
    rows = extract_rows_from_pdf(pdf_path)

    if not rows:
        st.error("❌ No tabular data found in the PDF.")
    else:
        st.info(f"📄 Extracted {len(rows)} order records.")

        # Display the records as a table; fall back to plain text if the rows are ragged
        try:
            df = pd.DataFrame(rows)
            st.subheader("📋 Extracted Order Records")
            st.dataframe(df, use_container_width=True)
        except Exception:
            st.text_area("Extracted Rows", "\n".join([" | ".join(r) for r in rows]), height=300)

        index, text_chunks = build_index(rows)
        query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")

        if query:
            # Embed the question and retrieve the closest rows from the index
            query_vec = embedder.encode([query])
            k = min(3, len(text_chunks))  # never request more neighbors than there are rows
            D, I = index.search(np.array(query_vec, dtype="float32"), k)
            context = "\n".join([text_chunks[i] for i in I[0]])

            with st.spinner("Generating answer..."):
                try:
                    answer = ask_llm(context, query)
                    st.markdown("### 🧠 Answer")
                    st.write(answer)
                except Exception as e:
                    st.error(f"LLM Error: {str(e)}")