sunbal7 committed on
Commit
1c7a288
·
verified ·
1 Parent(s): 8b3ba7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py CHANGED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import faiss
import fitz  # PyMuPDF for PDF processing
import numpy as np
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
10
+
11
+
12
+
13
+
14
# Read environment variables from a local .env file (if one exists) so the
# Groq API key does not have to be hard-coded in this script.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Groq client shared by every call to query_groq_api.
client = Groq(api_key=GROQ_API_KEY)


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once per server process.

    Streamlit re-executes this script on every user interaction; caching the
    model as a resource avoids reloading its weights on each rerun.
    """
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Sentence-transformer model used to embed both document chunks and queries.
embedding_model = _load_embedding_model()
23
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, in page order, with
        leading and trailing whitespace stripped.
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file) even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()
37
def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Break ``text`` into overlapping pieces for embedding.

    Args:
        text: The full document text to split.
        chunk_size: Size limit handed to the splitter for each chunk.
        chunk_overlap: Overlap handed to the splitter for consecutive chunks.

    Returns:
        The list of chunk strings produced by RecursiveCharacterTextSplitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
45
def create_faiss_index(chunks):
    """Embed the text chunks and store the vectors in a FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A tuple ``(index, embeddings, chunks)``: the populated FAISS index,
        the embedding matrix that was added to it, and the input chunks.

    Raises:
        ValueError: If ``chunks`` is empty, because no embedding dimension
            can be derived to size the index.
    """
    # Fail early with a clear message instead of an opaque error when the
    # embedding dimension is read from an empty result below.
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean) distance
    index.add(embeddings)  # Add embeddings to FAISS index

    return index, embeddings, chunks
54
def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
    """Retrieve the text chunks most relevant to ``query`` using FAISS.

    Args:
        query: The user's question.
        index: FAISS index holding one vector per entry of ``chunks``.
        embeddings: Embedding matrix of the chunks; not used by the lookup
            and accepted only to keep the existing call signature.
        chunks: Text chunks, in the same order they were added to ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, in the order returned by the index search.
        Returns an empty list when there are no chunks or ``top_k`` is not
        positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would otherwise index the *last* chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)

    # Defensive filter in case the index holds fewer vectors than ``chunks``.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
61
def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Send the query along with retrieved context to the Groq chat API.

    Args:
        query: The user's question.
        context: Retrieved document text the answer should be based on.
        model: Identifier of the Groq chat model to use. Defaults to the
            model this application has always used.

    Returns:
        The content of the first choice in the chat completion response.
    """
    prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )

    return chat_completion.choices[0].message.content
71
+ import streamlit as st
72
+
73
# Streamlit UI: upload a PDF, index its text, then answer questions about it.
st.title("📚 RAG-based PDF Query Application")
st.write("Upload a PDF and ask questions!")

# File upload
uploaded_file = st.file_uploader("Upload PDF", type="pdf")

if uploaded_file is not None:
    # Write the upload to a uniquely named temporary file: a fixed file name
    # in the working directory would be shared (and overwritten) by
    # concurrent sessions and would never be cleaned up.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getbuffer())
        pdf_path = tmp_file.name

    # Process the PDF
    st.write("Processing PDF...")
    try:
        text = extract_text_from_pdf(pdf_path)
    finally:
        # Best-effort cleanup; a failure to delete must not mask the result
        # (or the error) of the extraction itself.
        try:
            os.remove(pdf_path)
        except OSError:
            pass

    chunks = create_text_chunks(text)
    if not chunks:
        # E.g. a scanned PDF with no text layer: there is nothing to index.
        st.error("No extractable text was found in this PDF.")
        st.stop()

    index, embeddings, chunk_texts = create_faiss_index(chunks)

    st.success("PDF processed! Now you can ask questions.")

    # User query
    query = st.text_input("Ask a question about the PDF:")

    if st.button("Get Answer"):
        if query:
            # Retrieve the chunks most similar to the question
            relevant_chunks = retrieve_similar_chunks(query, index, embeddings, chunk_texts)
            context = "\n\n".join(relevant_chunks)

            # Ask the Groq model to answer using the retrieved context
            response = query_groq_api(query, context)

            st.subheader("Answer:")
            st.write(response)
        else:
            st.warning("Please enter a question.")
110
+