masadonline commited on
Commit
4dbf41f
·
verified ·
1 Parent(s): 4adc539

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import tempfile
5
  import faiss
6
  import numpy as np
 
7
  from sentence_transformers import SentenceTransformer
8
  from openai import OpenAI
9
  from dotenv import load_dotenv
@@ -20,26 +21,25 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "llama3-8b-8192"
21
  embedder = SentenceTransformer(EMBEDDING_MODEL)
22
 
23
- # Extract table rows from PDF
24
  def extract_rows_from_pdf(pdf_file_path):
25
  rows = []
26
  with pdfplumber.open(pdf_file_path) as pdf:
27
  for page in pdf.pages:
28
  tables = page.extract_tables()
29
  for table in tables:
30
- for row in table[1:]:
31
- cleaned = " | ".join([str(cell).strip() for cell in row if cell is not None])
32
- rows.append(cleaned)
 
33
  return rows
34
 
35
- # Build FAISS index
36
  def build_index(chunks):
37
- vectors = embedder.encode(chunks)
 
38
  index = faiss.IndexFlatL2(vectors.shape[1])
39
  index.add(np.array(vectors))
40
- return index, vectors
41
 
42
- # Ask LLM
43
  def ask_llm(context, query):
44
  prompt = f"You are a helpful assistant for an online toy shop.\n\nHere is the order data:\n{context}\n\nQuestion: {query}"
45
  response = client.chat.completions.create(
@@ -61,21 +61,28 @@ if uploaded_file:
61
 
62
  st.success("βœ… File uploaded successfully")
63
 
64
- # Process file
65
  rows = extract_rows_from_pdf(pdf_path)
66
  if not rows:
67
  st.error("❌ No tabular data found in the PDF.")
68
  else:
69
- st.info(f"πŸ“„ Extracted {len(rows)} rows of order data.")
70
 
71
- index, _ = build_index(rows)
 
 
 
 
 
 
 
 
72
 
73
  query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")
74
 
75
  if query:
76
  query_vec = embedder.encode([query])
77
  D, I = index.search(query_vec, k=3)
78
- context = "\n".join([rows[i] for i in I[0]])
79
 
80
  with st.spinner("Generating answer..."):
81
  try:
 
4
  import tempfile
5
  import faiss
6
  import numpy as np
7
+ import pandas as pd
8
  from sentence_transformers import SentenceTransformer
9
  from openai import OpenAI
10
  from dotenv import load_dotenv
 
21
  LLM_MODEL = "llama3-8b-8192"
22
  embedder = SentenceTransformer(EMBEDDING_MODEL)
23
 
 
24
def extract_rows_from_pdf(pdf_file_path):
    """Extract table rows from every page of a PDF.

    Parameters
    ----------
    pdf_file_path : str
        Path to the PDF file to parse.

    Returns
    -------
    list[list[str]]
        One list of stripped cell strings per non-empty data row.
        The first row of each table is assumed to be a header and is
        skipped.
    """
    rows = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table[1:]:  # skip header row
                    # Test `is not None`, not truthiness, so falsy-but-real
                    # cell values (e.g. a literal 0) are kept instead of
                    # being silently blanked out.
                    cleaned = [
                        str(cell).strip() if cell is not None else ""
                        for cell in row
                    ]
                    if any(cleaned):  # skip fully empty rows
                        rows.append(cleaned)
    return rows
35
 
 
36
def build_index(chunks):
    """Build a FAISS L2 index over extracted table rows.

    Parameters
    ----------
    chunks : list[list[str]]
        Rows of cell strings, as produced by ``extract_rows_from_pdf``.

    Returns
    -------
    tuple
        ``(index, text_chunks)`` where ``index`` is a populated
        ``faiss.IndexFlatL2`` and ``text_chunks`` is the list of
        " | "-joined row strings, index-aligned so that
        ``text_chunks[i]`` corresponds to index vector ``i``.
    """
    text_chunks = [" | ".join(chunk) for chunk in chunks]
    vectors = embedder.encode(text_chunks)
    # faiss requires float32 input; np.asarray enforces the dtype and,
    # unlike np.array, avoids an extra copy when encode() already
    # returned a float32 ndarray.
    vectors = np.asarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, text_chunks
42
 
 
43
  def ask_llm(context, query):
44
  prompt = f"You are a helpful assistant for an online toy shop.\n\nHere is the order data:\n{context}\n\nQuestion: {query}"
45
  response = client.chat.completions.create(
 
61
 
62
  st.success("βœ… File uploaded successfully")
63
 
 
64
  rows = extract_rows_from_pdf(pdf_path)
65
  if not rows:
66
  st.error("❌ No tabular data found in the PDF.")
67
  else:
68
+ st.info(f"πŸ“„ Extracted {len(rows)} order records.")
69
 
70
+ # Display records as table (if columns look uniform)
71
+ try:
72
+ df = pd.DataFrame(rows)
73
+ st.subheader("πŸ“‹ Extracted Order Records")
74
+ st.dataframe(df, use_container_width=True)
75
+ except:
76
+ st.text_area("Extracted Rows", "\n".join([" | ".join(r) for r in rows]), height=300)
77
+
78
+ index, text_chunks = build_index(rows)
79
 
80
  query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")
81
 
82
  if query:
83
  query_vec = embedder.encode([query])
84
  D, I = index.search(query_vec, k=3)
85
+ context = "\n".join([text_chunks[i] for i in I[0]])
86
 
87
  with st.spinner("Generating answer..."):
88
  try: