sunbal7 committed
Commit e6bfac3 · verified · 1 Parent(s): 6c06b5f

Update app.py

Files changed (1):
  1. app.py +136 -380
app.py CHANGED
@@ -1,423 +1,179 @@
  import streamlit as st
- from streamlit_option_menu import option_menu
- import fitz  # PyMuPDF
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain_community.vectorstores import FAISS
- import requests
- import os
- import time
-
- # Page configuration
  st.set_page_config(
-     page_title="PDF Study Assistant",
      page_icon="📚",
      layout="wide",
      initial_sidebar_state="collapsed"
  )

- # Custom CSS for colorful design
  st.markdown("""
- <style>
- :root {
-     --primary: #ff4b4b;
-     --secondary: #ff9a3d;
-     --accent1: #ffcb74;
-     --accent2: #3a86ff;
-     --background: #f0f2f6;
-     --card: #ffffff;
- }
-
- .stApp {
-     background: linear-gradient(135deg, var(--background) 0%, #e0e5ec 100%);
- }
-
- .stButton>button {
-     background: linear-gradient(to right, var(--secondary), var(--primary));
-     color: white;
-     border-radius: 12px;
-     padding: 8px 20px;
-     font-weight: 600;
- }
-
- .stTextInput>div>div>input {
-     border-radius: 12px;
-     border: 2px solid var(--accent2);
-     padding: 10px;
- }
-
- .card {
-     background: var(--card);
-     border-radius: 15px;
-     box-shadow: 0 8px 16px rgba(0,0,0,0.1);
-     padding: 20px;
-     margin-bottom: 20px;
- }
-
- .header {
-     background: linear-gradient(to right, var(--accent2), var(--primary));
-     -webkit-background-clip: text;
-     -webkit-text-fill-color: transparent;
-     text-align: center;
-     margin-bottom: 30px;
- }
-
- .tab-content {
-     animation: fadeIn 0.5s ease-in-out;
- }
-
- .error {
-     background-color: #ffebee;
-     border-left: 4px solid #f44336;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- .info {
-     background-color: #e3f2fd;
-     border-left: 4px solid #2196f3;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- .success {
-     background-color: #e8f5e9;
-     border-left: 4px solid #4caf50;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- @keyframes fadeIn {
-     from { opacity: 0; }
-     to { opacity: 1; }
- }
- </style>
  """, unsafe_allow_html=True)

  # Initialize session state
- if 'pdf_processed' not in st.session_state:
-     st.session_state.pdf_processed = False
- if 'vector_store' not in st.session_state:
      st.session_state.vector_store = None
- if 'pages' not in st.session_state:
-     st.session_state.pages = []
- if 'history' not in st.session_state:
-     st.session_state.history = []
- if 'token_valid' not in st.session_state:
-     st.session_state.token_valid = None
-
- # Load embedding model with caching
- @st.cache_resource
- def load_embedding_model():
-     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
- def check_token_validity():
-     """Check if the token is valid by making a simple API call"""
-     if not os.getenv("HF_API_KEY"):
-         return False
-
-     try:
-         headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
-         response = requests.get("https://huggingface.co/api/whoami", headers=headers)
-         return response.status_code == 200
-     except:
-         return False
-
- def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base"):
-     """Query Hugging Face Inference API with better error handling"""
-     API_URL = f"https://api-inference.huggingface.co/models/{model}"
-     headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} if os.getenv('HF_API_KEY') else {}
-
-     payload = {
-         "inputs": prompt,
-         "parameters": {
-             "max_new_tokens": max_tokens,
-             "temperature": 0.5,
-             "do_sample": False
-         }
-     }
-
-     try:
-         response = requests.post(API_URL, headers=headers, json=payload)
-
-         if response.status_code == 200:
-             result = response.json()
-             return result[0]['generated_text'] if result else ""
-
-         elif response.status_code == 403:
-             # Detailed debug information
-             st.session_state.token_valid = check_token_validity()
-
-             debug_info = f"""
-             <div class="error">
-                 <h4>403 Forbidden Error</h4>
-                 <p>Token is set: <strong>{'Yes' if os.getenv('HF_API_KEY') else 'No'}</strong></p>
-                 <p>Token valid: <strong>{'Yes' if st.session_state.token_valid else 'No'}</strong></p>
-                 <p>Model: {model}</p>
-                 <p>Possible solutions:</p>
-                 <ol>
-                     <li>Visit the <a href="https://huggingface.co/{model}" target="_blank">model page</a> and click "Agree and access repository"</li>
-                     <li>Ensure your token has "read" permissions</li>
-                     <li>Wait 5-10 minutes after accepting terms</li>
-                     <li>Try a different model using the dropdown below</li>
-                 </ol>
-             </div>
-             """
-             st.markdown(debug_info, unsafe_allow_html=True)
-             return ""
-
-         elif response.status_code == 429:
-             st.warning("Rate limit exceeded. Waiting and retrying...")
-             time.sleep(3)
-             return query_hf_inference_api(prompt, max_tokens, model)
-
-         else:
-             st.error(f"API Error {response.status_code}: {response.text[:200]}")
-             return ""
-
-     except Exception as e:
-         st.error(f"Connection error: {str(e)}")
-         return ""

  def process_pdf(pdf_file):
-     """Extract text from PDF and create vector store"""
-     with st.spinner("📖 Reading PDF..."):
-         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-         text = ""
-         st.session_state.pages = []
-         for page in doc:
-             page_text = page.get_text()
-             text += page_text
-             st.session_state.pages.append(page_text)
-
-     with st.spinner("🔍 Processing text..."):
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=200,
-             length_function=len
-         )
-         chunks = text_splitter.split_text(text)
-
-         embeddings = load_embedding_model()
-         st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)
-
-     st.session_state.pdf_processed = True
-     st.success("✅ PDF processed successfully!")
-
- def ask_question(question, model_choice):
-     """Answer a question using the vector store and Hugging Face API"""
-     if not st.session_state.vector_store:
-         return "PDF not processed yet", []
-
-     # Find relevant passages
-     docs = st.session_state.vector_store.similarity_search(question, k=3)
-     context = "\n\n".join([doc.page_content[:500] for doc in docs])
-
-     # Format prompt for the model
-     prompt = f"""
-     Based on the following context, answer the question.
-     If the answer isn't in the context, say "I don't know".
-
-     Context:
      {context}

      Question: {question}
-     Answer:
-     """
-
-     # Query the model
-     answer = query_hf_inference_api(prompt, model=model_choice)
-
-     # Add to history
-     st.session_state.history.append({
-         "question": question,
-         "answer": answer,
-         "sources": [doc.page_content for doc in docs],
-         "model": model_choice
-     })
-
-     return answer, docs
-
- def generate_qa_for_chapter(start_page, end_page, model_choice):
-     """Generate Q&A for specific chapter pages"""
-     if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
-         st.error("Invalid page range")
-         return []
-
-     chapter_text = "\n".join(st.session_state.pages[start_page-1:end_page])
-
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=800,
-         chunk_overlap=100,
-         length_function=len
      )
-     chunks = text_splitter.split_text(chapter_text)
-
-     qa_pairs = []
-
-     with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
-         for i, chunk in enumerate(chunks):
-             if i % 2 == 0:  # Generate question
-                 prompt = f"Based on this text, generate one study question: {chunk[:500]}"
-                 question = query_hf_inference_api(prompt, model=model_choice, max_tokens=100)
-                 if question and not question.endswith("?"):
-                     question += "?"
-                 if question:  # Only add if we got a valid question
-                     qa_pairs.append((question, ""))
-             else:  # Generate answer
-                 if qa_pairs:  # Ensure we have a question to answer
-                     prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
-                     answer = query_hf_inference_api(prompt, model=model_choice, max_tokens=200)
-                     qa_pairs[-1] = (qa_pairs[-1][0], answer)
-
-     return qa_pairs
-
- # App header
- st.markdown("<h1 class='header'>📚 PDF Study Assistant</h1>", unsafe_allow_html=True)
-
- # Model selection
- MODEL_OPTIONS = {
-     "google/flan-t5-base": "T5 Base (Recommended)",
-     "google/flan-t5-large": "T5 Large (Requires Auth)",
-     "mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation",
-     "declare-lab/flan-alpaca-base": "Alpaca Base"
- }
-
- # Debug info panel
- with st.expander("🔧 Debug Information", expanded=False):
-     st.subheader("Hugging Face Token Status")
-
-     # Check token validity
-     token_valid = check_token_validity()
-     st.session_state.token_valid = token_valid
-
-     col1, col2 = st.columns(2)
-     with col1:
-         st.write(f"Token is set: {'✅ Yes' if os.getenv('HF_API_KEY') else '❌ No'}")
-     with col2:
-         st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}")
-
-     if os.getenv('HF_API_KEY'):
-         st.markdown("""
-         <div class="info">
-             <p>Your token is set but we're still having issues. Try these steps:</p>
-             <ol>
-                 <li>Visit the model page for your selected model</li>
-                 <li>Click "Agree and access repository"</li>
-                 <li>Wait 5-10 minutes for changes to propagate</li>
-                 <li>Try a different model from the dropdown</li>
-             </ol>
-         </div>
-         """, unsafe_allow_html=True)
-     else:
-         st.markdown("""
-         <div class="error">
-             <p>Token is not set! Add it in your Space secrets:</p>
-             <ol>
-                 <li>Go to your Space → Settings → Secrets</li>
-                 <li>Add <code>HF_API_KEY</code> with your token</li>
-                 <li>Redeploy the Space</li>
-             </ol>
-             <p>Get your token: <a href="https://huggingface.co/settings/tokens" target="_blank">https://huggingface.co/settings/tokens</a></p>
-         </div>
-         """, unsafe_allow_html=True)
-
- # PDF Upload Section (FIXED LABEL ERROR)
- with st.container():
-     st.subheader("📤 Upload Your Textbook/Notes")
-     # Fixed empty label issue by adding a space and hiding it
-     pdf_file = st.file_uploader(
-         "Upload PDF",
-         type="pdf",
-         label_visibility="collapsed"
-     )
-
- # Main content
- if pdf_file:
-     if not st.session_state.pdf_processed:
-         process_pdf(pdf_file)
-
-     if st.session_state.pdf_processed:
-         # Model selection
-         st.subheader("Model Selection")
-         model_choice = st.selectbox(
-             "Choose AI model:",
-             options=list(MODEL_OPTIONS.keys()),
-             format_func=lambda x: MODEL_OPTIONS[x],
-             help="Some models require accepting terms on Hugging Face"
-         )
-
-         # Navigation tabs
-         selected_tab = option_menu(
-             None,
-             ["Ask Questions", "Generate Chapter Q&A", "History"],
-             icons=["chat", "book", "clock-history"],
-             menu_icon="cast",
-             default_index=0,
-             orientation="horizontal",
-             styles={
-                 "container": {"padding": "0!important", "background-color": "#f9f9f9"},
-                 "nav-link": {"font-size": "16px", "font-weight": "bold"},
-                 "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
-             }
-         )
-
-         # Question Answering Tab
-         if selected_tab == "Ask Questions":
-             st.markdown("### 💬 Ask Questions About Your Document")
-             user_question = st.text_input("Type your question here:", key="user_question")
-
-             if user_question:
-                 with st.spinner("🤔 Thinking..."):
-                     answer, docs = ask_question(user_question, model_choice)
-                 if answer:
-                     st.markdown(f"<div class='card'><b>Answer:</b> {answer}</div>", unsafe_allow_html=True)
-
-                     with st.expander("🔍 See source passages"):
-                         for i, doc in enumerate(docs):
-                             st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
-
-         # Chapter Q&A Generation Tab
-         elif selected_tab == "Generate Chapter Q&A":
-             st.markdown("### 📝 Generate Q&A for Specific Chapter")
-             col1, col2 = st.columns(2)
-             with col1:
-                 start_page = st.number_input("Start Page", min_value=1, max_value=len(st.session_state.pages), value=1)
-             with col2:
-                 end_page = st.number_input("End Page", min_value=1, max_value=len(st.session_state.pages), value=min(5, len(st.session_state.pages)))
-
-             if st.button("Generate Q&A", key="generate_qa"):
-                 qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice)
-
-                 if qa_pairs:
-                     st.markdown(f"<h4>📖 Generated Questions for Pages {start_page}-{end_page}</h4>", unsafe_allow_html=True)
-                     for i, (question, answer) in enumerate(qa_pairs):
-                         st.markdown(f"""
-                         <div class='card'>
-                             <b>Q{i+1}:</b> {question}<br>
-                             <b>A{i+1}:</b> {answer}
-                         </div>
-                         """, unsafe_allow_html=True)
-                 else:
-                     st.warning("No Q&A pairs generated. Try a different page range.")
-
-         # History Tab
-         elif selected_tab == "History":
-             st.markdown("### ⏳ Question History")
-             if not st.session_state.history:
-                 st.info("No questions asked yet.")
-             else:
-                 for i, item in enumerate(reversed(st.session_state.history)):
-                     with st.expander(f"Q{i+1}: {item['question']} ({MODEL_OPTIONS.get(item['model'], item['model'])})"):
-                         st.markdown(f"**Answer:** {item['answer']}")
-                         st.markdown("**Source Passages:**")
-                         for j, source in enumerate(item['sources']):
-                             st.markdown(f"{j+1}. {source[:500]}...")

  # Footer
  st.markdown("---")
- st.markdown("""
- <div style="text-align: center; padding: 20px;">
-     Built with ❤️ for students | PDF Study Assistant v4.1
- </div>
- """, unsafe_allow_html=True)

  import streamlit as st
+ import os
+ import tempfile
+ from langchain_community.document_loaders import PyPDFLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.chat_models import ChatOllama
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ import base64
+
+ # Set page config
  st.set_page_config(
+     page_title="EduQuery - Smart PDF Assistant",
      page_icon="📚",
      layout="wide",
      initial_sidebar_state="collapsed"
  )

+ # Custom CSS for colorful UI
+ def local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+
+ local_css("style.css")
+
+ # Header with gradient
  st.markdown("""
+ <div class="header">
+     <h1>📚 EduQuery</h1>
+     <p>Smart PDF Assistant for Students</p>
+ </div>
  """, unsafe_allow_html=True)

  # Initialize session state
+ if "vector_store" not in st.session_state:
      st.session_state.vector_store = None
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Model selection
+ MODEL_NAME = "nous-hermes2"  # Best open-source model for instruction following
+
+ # PDF Processing
  def process_pdf(pdf_file):
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+         tmp_file.write(pdf_file.getvalue())
+         tmp_path = tmp_file.name
+
+     loader = PyPDFLoader(tmp_path)
+     docs = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_documents(docs)
+
+     embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+     vector_store = FAISS.from_documents(chunks, embeddings)
+
+     os.unlink(tmp_path)
+     return vector_store
+
+ # RAG Setup
+ def setup_qa_chain(vector_store):
+     llm = ChatOllama(model=MODEL_NAME, temperature=0.3)
+
+     custom_prompt = """
+     You are an expert academic assistant. Answer the question based only on the following context:
      {context}

      Question: {question}

+     Provide a clear, concise answer with page number references. If unsure, say "I couldn't find this information in the document".
+     """
+
+     prompt = PromptTemplate(
+         template=custom_prompt,
+         input_variables=["context", "question"]
+     )
+
+     retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+
+     qa_chain = (
+         {"context": retriever, "question": RunnablePassthrough()}
+         | prompt
+         | llm
+         | StrOutputParser()
      )

+     return qa_chain
+
+ # Generate questions from chapter
+ def generate_chapter_questions(vector_store, chapter_title):
+     llm = ChatOllama(model=MODEL_NAME, temperature=0.7)
+
+     prompt = PromptTemplate(
+         input_variables=["chapter_title"],
+         template="""
+         You are an expert educator. Generate 5 important questions and answers about '{chapter_title}'
+         that would help students understand key concepts. Format as:
+
+         Q1: [Question]
+         A1: [Answer with page reference]
+
+         Q2: [Question]
+         A2: [Answer with page reference]
+         ..."""
+     )
+
+     chain = prompt | llm | StrOutputParser()
+     return chain.invoke({"chapter_title": chapter_title})
+
+ # File upload section
+ st.subheader("📤 Upload Your Textbook/Notes")
+ uploaded_file = st.file_uploader("", type="pdf", accept_multiple_files=False)
+
+ if uploaded_file:
+     with st.spinner("Processing PDF..."):
+         st.session_state.vector_store = process_pdf(uploaded_file)
+     st.success("PDF processed successfully! You can now ask questions.")
+
+ # Main content columns
+ col1, col2 = st.columns([1, 2])
+
+ # Chapter-based Q&A Generator
+ with col1:
+     st.subheader("🔍 Generate Chapter Questions")
+     chapter_title = st.text_input("Enter chapter title/section name:")
+
+     if st.button("Generate Q&A") and chapter_title and st.session_state.vector_store:
+         with st.spinner(f"Generating questions about {chapter_title}..."):
+             questions = generate_chapter_questions(
+                 st.session_state.vector_store,
+                 chapter_title
+             )
+             st.markdown(f"<div class='qa-box'>{questions}</div>", unsafe_allow_html=True)
+     elif chapter_title and not st.session_state.vector_store:
+         st.warning("Please upload a PDF first")
+
+ # Chat interface
+ with col2:
+     st.subheader("💬 Ask Anything About the Document")
+
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     if prompt := st.chat_input("Your question..."):
+         if not st.session_state.vector_store:
+             st.warning("Please upload a PDF first")
+             st.stop()
+
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 qa_chain = setup_qa_chain(st.session_state.vector_store)
+                 response = qa_chain.invoke(prompt)
+                 st.markdown(response)
+         st.session_state.messages.append({"role": "assistant", "content": response})

  # Footer
  st.markdown("---")
+ st.markdown(
+     """
+     <div class="footer">
+         <p>EduQuery - Helping students learn smarter • Powered by Nous-Hermes2 and LangChain</p>
+     </div>
+     """,
+     unsafe_allow_html=True
+ )
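
Note: the new app.py loads its stylesheet with local_css("style.css"), but this commit changes only app.py; if style.css is absent from the Space, open(file_name) raises FileNotFoundError at startup. A minimal defensive sketch (the guard and the FALLBACK_CSS values are illustrative assumptions, not part of the commit; only the class names .header, .qa-box, and .footer come from the markup app.py actually emits):

import os
import streamlit as st

# Placeholder styles for the classes referenced in app.py's HTML; a real
# style.css shipped with the Space would take precedence over these.
FALLBACK_CSS = """
.header { text-align: center; padding: 16px; }
.qa-box { background: #ffffff; border-radius: 12px; padding: 16px; }
.footer { text-align: center; color: #888888; }
"""

def local_css(file_name):
    # Prefer the project's stylesheet; fall back to the placeholder so a
    # missing style.css does not crash the app at import time.
    if os.path.exists(file_name):
        with open(file_name) as f:
            css = f.read()
    else:
        css = FALLBACK_CSS
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)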
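
Note also that ChatOllama is only a client: it assumes an Ollama server is reachable (default http://localhost:11434) and that the model has already been pulled, e.g. with `ollama pull nous-hermes2`. A small preflight sketch using Ollama's /api/tags endpoint, which lists locally pulled models; the helper itself is an assumption, not part of the commit:

import requests

def ollama_model_ready(model, base_url="http://localhost:11434"):
    # /api/tags returns {"models": [{"name": "nous-hermes2:latest", ...}, ...]}
    try:
        resp = requests.get(f"{base_url}/api/tags", timeout=2)
        resp.raise_for_status()
    except requests.RequestException:
        return False  # server not running or unreachable
    names = (m.get("name", "") for m in resp.json().get("models", []))
    return any(n.split(":")[0] == model for n in names)

Calling this once at startup and surfacing st.error(...) when it returns False would fail fast instead of erroring on the first chat message.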
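
Finally, in setup_qa_chain the retriever's output (a list of Document objects) is piped straight into the prompt's {context} slot, so the model sees the documents' default string form, metadata included. The chain works as committed, but the common LCEL variant joins only the page contents. A sketch of that alternative, reusing the prompt, llm, and retriever names defined in setup_qa_chain; format_docs is illustrative and not in the commit:

def format_docs(docs):
    # Keep only the retrieved text, dropping Document metadata from the prompt.
    return "\n\n".join(doc.page_content for doc in docs)

qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)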