import streamlit as st
import pdfplumber
import os
import tempfile
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Set up the Groq client (OpenAI-compatible endpoint)
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")

# Constants
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama3-8b-8192"

embedder = SentenceTransformer(EMBEDDING_MODEL)
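
# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings; build_index() below
# picks up that width from the encoded vectors rather than hard-coding it.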

def extract_rows_from_pdf(pdf_file_path):
    rows = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                for row in table[1:]:  # skip header
                    cleaned = [str(cell).strip() if cell else "" for cell in row]
                    if any(cleaned):  # skip empty rows
                        rows.append(cleaned)
    return rows
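
# Illustrative example (an assumed layout, not enforced anywhere): each extracted
# row is a list of cell strings such as ["27", "Teddy Bear", "2", "Shipped"].
# pdfplumber returns None for blank cells, which the cleanup above maps to "".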

def build_index(chunks):
    text_chunks = [" | ".join(chunk) for chunk in chunks]
    vectors = embedder.encode(text_chunks)
    index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2 search; width taken from the embeddings
    index.add(np.array(vectors))
    return index, text_chunks
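
# Alternative sketch (not used above): MiniLM embeddings are often compared by
# cosine similarity rather than raw L2 distance. Assuming that is preferred,
# normalize at encode time and switch to an inner-product index:
#
# def build_cosine_index(chunks):
#     text_chunks = [" | ".join(chunk) for chunk in chunks]
#     vectors = embedder.encode(text_chunks, normalize_embeddings=True)
#     index = faiss.IndexFlatIP(vectors.shape[1])
#     index.add(np.array(vectors, dtype="float32"))
#     return index, text_chunks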

def ask_llm(context, query):
    prompt = (
        "You are a helpful assistant for an online toy shop.\n\n"
        f"Here is the order data:\n{context}\n\nQuestion: {query}"
    )
    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
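
# Variant sketch (an assumption, not what the app does): pinning the persona in a
# system message and setting temperature to 0 tends to keep answers grounded in
# the retrieved rows. Both are standard chat.completions parameters.
#
# def ask_llm_grounded(context, query):
#     response = client.chat.completions.create(
#         model=LLM_MODEL,
#         messages=[
#             {"role": "system",
#              "content": "You are a helpful assistant for an online toy shop. "
#                         "Answer only from the provided order data."},
#             {"role": "user", "content": f"Order data:\n{context}\n\nQuestion: {query}"},
#         ],
#         temperature=0,
#     )
#     return response.choices[0].message.content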

# Streamlit UI
st.set_page_config(page_title="🧸 ToyShop Order Status Assistant", layout="wide")
st.title("📦 ToyShop Order Status Assistant")

uploaded_file = st.file_uploader("Upload a Customer Order PDF", type="pdf")

if uploaded_file:
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    st.success("✅ File uploaded successfully")
    rows = extract_rows_from_pdf(pdf_path)

    if not rows:
        st.error("❌ No tabular data found in the PDF.")
    else:
        st.info(f"📄 Extracted {len(rows)} order records.")

        # Display records as a table (if the columns look uniform)
        try:
            df = pd.DataFrame(rows)
            st.subheader("📋 Extracted Order Records")
            st.dataframe(df, use_container_width=True)
        except Exception:
            st.text_area("Extracted Rows", "\n".join([" | ".join(r) for r in rows]), height=300)
        index, text_chunks = build_index(rows)

        query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")
        if query:
            query_vec = embedder.encode([query])
            k = min(3, len(text_chunks))  # never request more neighbors than stored rows
            D, I = index.search(query_vec, k=k)
            context = "\n".join([text_chunks[i] for i in I[0]])

            with st.spinner("Generating answer..."):
                try:
                    answer = ask_llm(context, query)
                    st.markdown("### 🧠 Answer")
                    st.write(answer)
                except Exception as e:
                    st.error(f"LLM Error: {str(e)}")