Spaces:

masadonline
/

RAG-PDF

Sleeping

App Files Files Community

masadonline committed on May 18

Commit

12fd03c

verified ·

1 Parent(s): 750403e

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -22

app.py CHANGED Viewed

@@ -9,39 +9,39 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer
 from openai import OpenAI
 from dotenv import load_dotenv
-# Load GROQ API key from .env
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 # Setup GROQ LLM client
 client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
-# Constants
-EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 LLM_MODEL = "llama3-8b-8192"
-embedder = SentenceTransformer(EMBEDDING_MODEL)
 st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
 st.title("🧸 ToyShop RAG-Based Assistant")
-# --- Load and process uploaded files ---
 def extract_pdf_text(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
-            text += page.extract_text() + "\n"
     return text.strip()
 def load_json_orders(json_file):
     data = json.load(json_file)
-    if isinstance(data, list):
-        return data
-    elif isinstance(data, dict):
-        return list(data.values())
-    else:
-        return []
 def build_index(text_chunks):
     vectors = embedder.encode(text_chunks)
@@ -57,8 +57,7 @@ def ask_llm(context, query):
     )
     return response.choices[0].message.content.strip()
-# --- File upload UI ---
 st.subheader("📁 Upload Customer Orders (JSON)")
 orders_file = st.file_uploader("Upload JSON file", type="json")
@@ -67,30 +66,28 @@ pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multi
 order_chunks, pdf_chunks = [], []
-# --- Process files ---
 if orders_file:
     try:
         orders = load_json_orders(orders_file)
         order_chunks = [json.dumps(order, ensure_ascii=False) for order in orders]
-        df = pd.DataFrame(orders)
         st.success(f"✅ Loaded {len(order_chunks)} customer order records.")
-        st.dataframe(df, use_container_width=True)
     except Exception as e:
         st.error(f"❌ Error loading JSON: {e}")
 if pdf_files:
     for pdf_file in pdf_files:
         try:
             text = extract_pdf_text(pdf_file)
-            pdf_chunks.extend(text.split("\n\n"))  # chunk by paragraph
         except Exception as e:
             st.error(f"❌ Failed to read {pdf_file.name}: {e}")
 combined_chunks = order_chunks + pdf_chunks
-# --- Question Answering ---
 if combined_chunks:
     index, sources = build_index(combined_chunks)

 from sentence_transformers import SentenceTransformer
 from openai import OpenAI
 from dotenv import load_dotenv
+import torch
+# Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 # Setup GROQ LLM client
 client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
+# Load embedding model with device specification
+device = "cuda" if torch.cuda.is_available() else "cpu"
+embedder = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)
+embedder.to(device)
+# LLM model name
 LLM_MODEL = "llama3-8b-8192"
+# Streamlit setup
 st.set_page_config(page_title="🧸 ToyShop Assistant", layout="wide")
 st.title("🧸 ToyShop RAG-Based Assistant")
 def extract_pdf_text(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
     return text.strip()
 def load_json_orders(json_file):
     data = json.load(json_file)
+    return data if isinstance(data, list) else list(data.values())
 def build_index(text_chunks):
     vectors = embedder.encode(text_chunks)
     )
     return response.choices[0].message.content.strip()
+# File upload
 st.subheader("📁 Upload Customer Orders (JSON)")
 orders_file = st.file_uploader("Upload JSON file", type="json")
 order_chunks, pdf_chunks = [], []
+# Handle JSON
 if orders_file:
     try:
         orders = load_json_orders(orders_file)
         order_chunks = [json.dumps(order, ensure_ascii=False) for order in orders]
         st.success(f"✅ Loaded {len(order_chunks)} customer order records.")
+        st.dataframe(pd.DataFrame(orders), use_container_width=True)
     except Exception as e:
         st.error(f"❌ Error loading JSON: {e}")
+# Handle PDFs
 if pdf_files:
     for pdf_file in pdf_files:
         try:
             text = extract_pdf_text(pdf_file)
+            pdf_chunks.extend(text.split("\n\n"))  # simple paragraph chunking
         except Exception as e:
             st.error(f"❌ Failed to read {pdf_file.name}: {e}")
+# Build index if we have content
 combined_chunks = order_chunks + pdf_chunks
 if combined_chunks:
     index, sources = build_index(combined_chunks)