sunbal7 committed
Commit 3acced2 · verified · 1 Parent(s): 99e6cea

Update app.py

Files changed (1)
  1. app.py +101 -36
app.py CHANGED
@@ -4,11 +4,11 @@ import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
- from langchain_community.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
- import tempfile
+ from langchain_community.llms import HuggingFaceEndpoint
+ import requests
import os
- import base64
+ import json

# Page configuration
st.set_page_config(
@@ -72,29 +72,52 @@ st.markdown("""
        from { opacity: 0; }
        to { opacity: 1; }
    }
+
+     .spinner {
+         display: flex;
+         justify-content: center;
+         align-items: center;
+         height: 100px;
+     }
</style>
""", unsafe_allow_html=True)

# Initialize session state
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
- if 'qa_chain' not in st.session_state:
-     st.session_state.qa_chain = None
+ if 'vector_store' not in st.session_state:
+     st.session_state.vector_store = None
if 'pages' not in st.session_state:
    st.session_state.pages = []
+ if 'history' not in st.session_state:
+     st.session_state.history = []

- # Load models with caching
+ # Load embedding model with caching
@st.cache_resource
def load_embedding_model():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

- @st.cache_resource
- def load_qa_model():
-     return HuggingFaceHub(
-         repo_id="google/flan-t5-xxl",
-         model_kwargs={"temperature": 0.5, "max_length": 512},
-         huggingfacehub_api_token=os.getenv("HF_API_KEY")
-     )
+ def query_hf_inference_api(prompt, model="google/flan-t5-xxl", max_tokens=200):
+     """Query Hugging Face Inference API directly"""
+     API_URL = f"https://api-inference.huggingface.co/models/{model}"
+     headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
+     payload = {
+         "inputs": prompt,
+         "parameters": {
+             "max_new_tokens": max_tokens,
+             "temperature": 0.5,
+             "do_sample": False
+         }
+     }
+
+     try:
+         response = requests.post(API_URL, headers=headers, json=payload)
+         response.raise_for_status()
+         result = response.json()
+         return result[0]['generated_text'] if result else ""
+     except Exception as e:
+         st.error(f"Error querying model: {str(e)}")
+         return ""

def process_pdf(pdf_file):
    """Extract text from PDF and create vector store"""
@@ -103,8 +126,9 @@ def process_pdf(pdf_file):
        text = ""
        st.session_state.pages = []
        for page in doc:
-             text += page.get_text()
-             st.session_state.pages.append(page.get_text())
+             page_text = page.get_text()
+             text += page_text
+             st.session_state.pages.append(page_text)

    with st.spinner("🔍 Processing text..."):
        text_splitter = RecursiveCharacterTextSplitter(
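The extraction loop above is plain PyMuPDF and works standalone; a minimal sketch, with the file path as a hypothetical placeholder:

import fitz  # PyMuPDF

doc = fitz.open("example.pdf")  # hypothetical path
pages = [page.get_text() for page in doc]  # one plain-text string per page
text = "".join(pages)
doc.close()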
@@ -115,19 +139,44 @@ def process_pdf(pdf_file):
        chunks = text_splitter.split_text(text)

        embeddings = load_embedding_model()
-         vector_store = FAISS.from_texts(chunks, embeddings)
- 
-         qa_model = load_qa_model()
-         st.session_state.qa_chain = RetrievalQA.from_chain_type(
-             llm=qa_model,
-             chain_type="stuff",
-             retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
-             return_source_documents=True
-         )
+         st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)

    st.session_state.pdf_processed = True
    st.success("✅ PDF processed successfully!")

+ def ask_question(question):
+     """Answer a question using the vector store and Hugging Face API"""
+     if not st.session_state.vector_store:
+         return "PDF not processed yet", []
+
+     # Find relevant passages
+     docs = st.session_state.vector_store.similarity_search(question, k=3)
+     context = "\n\n".join([doc.page_content for doc in docs])
+
+     # Format prompt for the model
+     prompt = f"""
+     Based on the following context, answer the question.
+     If the answer isn't in the context, say "I don't know".
+
+     Context:
+     {context}
+
+     Question: {question}
+     Answer:
+     """
+
+     # Query the model
+     answer = query_hf_inference_api(prompt)
+
+     # Add to history
+     st.session_state.history.append({
+         "question": question,
+         "answer": answer,
+         "sources": [doc.page_content for doc in docs]
+     })
+
+     return answer, docs
+
def generate_qa_for_chapter(start_page, end_page):
    """Generate Q&A for specific chapter pages"""
    if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
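The retrieval half of ask_question is standard LangChain and can be reproduced in isolation; a minimal sketch using the same embedding model, with toy chunks standing in for PDF text:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = FAISS.from_texts(
    ["FAISS indexes dense vectors for similarity search.",
     "Streamlit turns Python scripts into web apps.",
     "MiniLM produces 384-dimensional sentence embeddings."],
    embeddings,
)
docs = store.similarity_search("What is FAISS used for?", k=3)  # same k as the app
context = "\n\n".join(doc.page_content for doc in docs)
print(context)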
@@ -144,17 +193,20 @@ def generate_qa_for_chapter(start_page, end_page):
    chunks = text_splitter.split_text(chapter_text)

    qa_pairs = []
-     qa_model = load_qa_model()

    with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
        for i, chunk in enumerate(chunks):
            if i % 2 == 0:  # Generate question
-                 prompt = f"Generate a study question based on: {chunk[:500]}"
-                 question = qa_model(prompt)[:120] + "?"
+                 prompt = f"Based on this text, generate one study question: {chunk[:500]}"
+                 question = query_hf_inference_api(prompt, max_tokens=100)
+                 if question and not question.endswith("?"):
+                     question += "?"
+                 qa_pairs.append((question, ""))  # store the question so the answer branch can fill it in
            else:  # Generate answer
-                 prompt = f"Answer the question: {qa_pairs[-1][0]} using context: {chunk[:500]}"
-                 answer = qa_model(prompt)
-                 qa_pairs[-1] = (qa_pairs[-1][0], answer)
+                 if qa_pairs:  # Ensure we have a question to answer
+                     prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
+                     answer = query_hf_inference_api(prompt, max_tokens=200)
+                     qa_pairs[-1] = (qa_pairs[-1][0], answer)

    return qa_pairs

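The even/odd chunk pairing in generate_qa_for_chapter is easier to see with the model stubbed out; a minimal offline sketch (stub_llm is a hypothetical stand-in for query_hf_inference_api):

def stub_llm(prompt, max_tokens=100):
    # Hypothetical model: returns a canned question or answer
    return "A short answer." if prompt.startswith("Answer") else "What does this chunk cover"

chunks = ["chunk 0", "chunk 1", "chunk 2", "chunk 3"]
qa_pairs = []
for i, chunk in enumerate(chunks):
    if i % 2 == 0:  # even chunk -> generate a question
        question = stub_llm(f"Based on this text, generate one study question: {chunk}")
        qa_pairs.append((question + "?", ""))
    elif qa_pairs:  # odd chunk -> answer the most recent question
        answer = stub_llm(f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk}")
        qa_pairs[-1] = (qa_pairs[-1][0], answer)

print(qa_pairs)  # [(question, answer), (question, answer)]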
@@ -175,8 +227,8 @@ if pdf_file:
    # Navigation tabs
    selected_tab = option_menu(
        None,
-         ["Ask Questions", "Generate Chapter Q&A"],
-         icons=["chat", "book"],
+         ["Ask Questions", "Generate Chapter Q&A", "History"],
+         icons=["chat", "book", "clock-history"],
        menu_icon="cast",
        default_index=0,
        orientation="horizontal",
@@ -194,11 +246,11 @@ if pdf_file:

        if user_question:
            with st.spinner("🤔 Thinking..."):
-                 result = st.session_state.qa_chain({"query": user_question})
-                 st.markdown(f"<div class='card'><b>Answer:</b> {result['result']}</div>", unsafe_allow_html=True)
+                 answer, docs = ask_question(user_question)
+                 st.markdown(f"<div class='card'><b>Answer:</b> {answer}</div>", unsafe_allow_html=True)

                with st.expander("🔍 See source passages"):
-                     for i, doc in enumerate(result["source_documents"]):
+                     for i, doc in enumerate(docs):
                        st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")

    # Chapter Q&A Generation Tab
@@ -224,11 +276,24 @@ if pdf_file:
                """, unsafe_allow_html=True)
            else:
                st.warning("No Q&A pairs generated. Try a different page range.")
+
+     # History Tab
+     elif selected_tab == "History":
+         st.markdown("### ⏳ Question History")
+         if not st.session_state.history:
+             st.info("No questions asked yet.")
+         else:
+             for i, item in enumerate(reversed(st.session_state.history)):
+                 with st.expander(f"Q{i+1}: {item['question']}"):
+                     st.markdown(f"**Answer:** {item['answer']}")
+                     st.markdown("**Source Passages:**")
+                     for j, source in enumerate(item['sources']):
+                         st.markdown(f"{j+1}. {source[:500]}...")

# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 20px;">
-     Built with ❤️ for students | PDF Study Assistant v1.0
+     Built with ❤️ for students | PDF Study Assistant v2.0
</div>
""", unsafe_allow_html=True)
 