sunbal7 committed on
Commit
021fae5
·
verified ·
1 Parent(s): 6c784e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import os
2
  import fitz # PyMuPDF for PDF processing
3
  import faiss
@@ -8,18 +12,16 @@ from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
  from dotenv import load_dotenv
10
 
11
-
12
-
13
-
14
  # Load API key
15
  load_dotenv()
16
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
17
 
18
  # Initialize Groq client
19
- client = Groq(api_key= GROQ_API_KEY)
20
 
21
  # Load sentence transformer model for embedding
22
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
23
  def extract_text_from_pdf(pdf_path):
24
  """Extract text from a PDF file using PyMuPDF."""
25
  doc = fitz.open(pdf_path)
@@ -27,13 +29,7 @@ def extract_text_from_pdf(pdf_path):
27
  for page in doc:
28
  text += page.get_text("text") + "\n"
29
  return text.strip()
30
- def extract_text_from_pdf(pdf_path):
31
- """Extract text from a PDF file using PyMuPDF."""
32
- doc = fitz.open(pdf_path)
33
- text = ""
34
- for page in doc:
35
- text += page.get_text("text") + "\n"
36
- return text.strip()
37
  def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
38
  """Split text into chunks of specified size with overlap."""
39
  text_splitter = RecursiveCharacterTextSplitter(
@@ -42,6 +38,7 @@ def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
42
  )
43
  chunks = text_splitter.split_text(text)
44
  return chunks
 
45
  def create_faiss_index(chunks):
46
  """Generate embeddings for text chunks and store them in FAISS."""
47
  embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
@@ -51,6 +48,7 @@ def create_faiss_index(chunks):
51
  index.add(embeddings) # Add embeddings to FAISS index
52
 
53
  return index, embeddings, chunks
 
54
  def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
55
  """Retrieve the most relevant text chunks using FAISS."""
56
  query_embedding = embedding_model.encode([query], convert_to_numpy=True)
@@ -58,6 +56,7 @@ def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
58
 
59
  results = [chunks[idx] for idx in indices[0]]
60
  return results
 
61
  def query_groq_api(query, context):
62
  """Send the query along with retrieved context to Groq API."""
63
  prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"
@@ -68,8 +67,8 @@ def query_groq_api(query, context):
68
  )
69
 
70
  return chat_completion.choices[0].message.content
71
- import streamlit as st
72
 
 
73
  st.title("📚 RAG-based PDF Query Application")
74
  st.write("Upload a PDF and ask questions!")
75
 
@@ -106,5 +105,4 @@ if uploaded_file is not None:
106
  st.subheader("Answer:")
107
  st.write(response)
108
  else:
109
- st.warning("Please enter a question.")
110
-
 
1
+
2
+ ### `app.py`
3
+
4
+ ```python
5
  import os
6
  import fitz # PyMuPDF for PDF processing
7
  import faiss
 
12
  from groq import Groq
13
  from dotenv import load_dotenv
14
 
 
 
 
15
  # Load API key
16
  load_dotenv()
17
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
18
 
19
  # Initialize Groq client
20
+ client = Groq(api_key=GROQ_API_KEY)
21
 
22
  # Load sentence transformer model for embedding
23
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
24
+
25
  def extract_text_from_pdf(pdf_path):
26
  """Extract text from a PDF file using PyMuPDF."""
27
  doc = fitz.open(pdf_path)
 
29
  for page in doc:
30
  text += page.get_text("text") + "\n"
31
  return text.strip()
32
+
 
 
 
 
 
 
33
  def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
34
  """Split text into chunks of specified size with overlap."""
35
  text_splitter = RecursiveCharacterTextSplitter(
 
38
  )
39
  chunks = text_splitter.split_text(text)
40
  return chunks
41
+
42
  def create_faiss_index(chunks):
43
  """Generate embeddings for text chunks and store them in FAISS."""
44
  embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
 
48
  index.add(embeddings) # Add embeddings to FAISS index
49
 
50
  return index, embeddings, chunks
51
+
52
  def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
53
  """Retrieve the most relevant text chunks using FAISS."""
54
  query_embedding = embedding_model.encode([query], convert_to_numpy=True)
 
56
 
57
  results = [chunks[idx] for idx in indices[0]]
58
  return results
59
+
60
  def query_groq_api(query, context):
61
  """Send the query along with retrieved context to Groq API."""
62
  prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"
 
67
  )
68
 
69
  return chat_completion.choices[0].message.content
 
70
 
71
+ # Streamlit UI
72
  st.title("📚 RAG-based PDF Query Application")
73
  st.write("Upload a PDF and ask questions!")
74
 
 
105
  st.subheader("Answer:")
106
  st.write(response)
107
  else:
108
+ st.warning("Please enter a question.")