masadonline commited on
Commit
4dbf41f
·
verified ·
1 Parent(s): 4adc539

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import tempfile
5
  import faiss
6
  import numpy as np
 
7
  from sentence_transformers import SentenceTransformer
8
  from openai import OpenAI
9
  from dotenv import load_dotenv
@@ -20,26 +21,25 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "llama3-8b-8192"
21
  embedder = SentenceTransformer(EMBEDDING_MODEL)
22
 
23
- # Extract table rows from PDF
24
  def extract_rows_from_pdf(pdf_file_path):
25
  rows = []
26
  with pdfplumber.open(pdf_file_path) as pdf:
27
  for page in pdf.pages:
28
  tables = page.extract_tables()
29
  for table in tables:
30
- for row in table[1:]:
31
- cleaned = " | ".join([str(cell).strip() for cell in row if cell is not None])
32
- rows.append(cleaned)
 
33
  return rows
34
 
35
- # Build FAISS index
36
  def build_index(chunks):
37
- vectors = embedder.encode(chunks)
 
38
  index = faiss.IndexFlatL2(vectors.shape[1])
39
  index.add(np.array(vectors))
40
- return index, vectors
41
 
42
- # Ask LLM
43
  def ask_llm(context, query):
44
  prompt = f"You are a helpful assistant for an online toy shop.\n\nHere is the order data:\n{context}\n\nQuestion: {query}"
45
  response = client.chat.completions.create(
@@ -61,21 +61,28 @@ if uploaded_file:
61
 
62
  st.success("βœ… File uploaded successfully")
63
 
64
- # Process file
65
  rows = extract_rows_from_pdf(pdf_path)
66
  if not rows:
67
  st.error("❌ No tabular data found in the PDF.")
68
  else:
69
- st.info(f"πŸ“„ Extracted {len(rows)} rows of order data.")
70
 
71
- index, _ = build_index(rows)
 
 
 
 
 
 
 
 
72
 
73
  query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")
74
 
75
  if query:
76
  query_vec = embedder.encode([query])
77
  D, I = index.search(query_vec, k=3)
78
- context = "\n".join([rows[i] for i in I[0]])
79
 
80
  with st.spinner("Generating answer..."):
81
  try:
 
4
  import tempfile
5
  import faiss
6
  import numpy as np
7
+ import pandas as pd
8
  from sentence_transformers import SentenceTransformer
9
  from openai import OpenAI
10
  from dotenv import load_dotenv
 
21
  LLM_MODEL = "llama3-8b-8192"
22
  embedder = SentenceTransformer(EMBEDDING_MODEL)
23
 
 
24
def extract_rows_from_pdf(pdf_file_path):
    """Extract table rows from every page of a PDF.

    Parameters
    ----------
    pdf_file_path : str
        Path to the PDF file to parse.

    Returns
    -------
    list[list[str]]
        One list of stripped cell strings per non-empty data row.
        The first row of each table is assumed to be a header and is
        skipped.
    """
    rows = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table[1:]:  # skip header row
                    # Test `is not None`, not truthiness, so falsy-but-real
                    # cell values (e.g. a literal 0) are kept instead of
                    # being silently blanked out.
                    cleaned = [
                        str(cell).strip() if cell is not None else ""
                        for cell in row
                    ]
                    if any(cleaned):  # skip fully empty rows
                        rows.append(cleaned)
    return rows
35
 
 
36
def build_index(chunks):
    """Build a FAISS L2 index over extracted table rows.

    Parameters
    ----------
    chunks : list[list[str]]
        Rows of cell strings, as produced by ``extract_rows_from_pdf``.

    Returns
    -------
    tuple
        ``(index, text_chunks)`` where ``index`` is a populated
        ``faiss.IndexFlatL2`` and ``text_chunks`` is the list of
        " | "-joined row strings, index-aligned so that
        ``text_chunks[i]`` corresponds to index vector ``i``.
    """
    text_chunks = [" | ".join(chunk) for chunk in chunks]
    vectors = embedder.encode(text_chunks)
    # faiss requires float32 input; np.asarray enforces the dtype and,
    # unlike np.array, avoids an extra copy when encode() already
    # returned a float32 ndarray.
    vectors = np.asarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, text_chunks
42
 
 
43
  def ask_llm(context, query):
44
  prompt = f"You are a helpful assistant for an online toy shop.\n\nHere is the order data:\n{context}\n\nQuestion: {query}"
45
  response = client.chat.completions.create(
 
61
 
62
  st.success("βœ… File uploaded successfully")
63
 
 
64
  rows = extract_rows_from_pdf(pdf_path)
65
  if not rows:
66
  st.error("❌ No tabular data found in the PDF.")
67
  else:
68
+ st.info(f"πŸ“„ Extracted {len(rows)} order records.")
69
 
70
+ # Display records as table (if columns look uniform)
71
+ try:
72
+ df = pd.DataFrame(rows)
73
+ st.subheader("πŸ“‹ Extracted Order Records")
74
+ st.dataframe(df, use_container_width=True)
75
+ except:
76
+ st.text_area("Extracted Rows", "\n".join([" | ".join(r) for r in rows]), height=300)
77
+
78
+ index, text_chunks = build_index(rows)
79
 
80
  query = st.text_input("Ask a question (e.g., 'What is the status of order 27?')")
81
 
82
  if query:
83
  query_vec = embedder.encode([query])
84
  D, I = index.search(query_vec, k=3)
85
+ context = "\n".join([text_chunks[i] for i in I[0]])
86
 
87
  with st.spinner("Generating answer..."):
88
  try: