import streamlit as st from streamlit_option_menu import option_menu import fitz # PyMuPDF from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS import requests import os import time # Page configuration st.set_page_config( page_title="PDF Study Assistant", page_icon="📚", layout="wide", initial_sidebar_state="collapsed" ) # Custom CSS for colorful design st.markdown(""" """, unsafe_allow_html=True) # Initialize session state if 'pdf_processed' not in st.session_state: st.session_state.pdf_processed = False if 'vector_store' not in st.session_state: st.session_state.vector_store = None if 'pages' not in st.session_state: st.session_state.pages = [] if 'history' not in st.session_state: st.session_state.history = [] if 'token_valid' not in st.session_state: st.session_state.token_valid = None # Load embedding model with caching @st.cache_resource def load_embedding_model(): return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") def check_token_validity(): """Check if the token is valid by making a simple API call""" if not os.getenv("HF_API_KEY"): return False try: headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} response = requests.get("https://huggingface.co/api/whoami", headers=headers) return response.status_code == 200 except: return False def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base"): """Query Hugging Face Inference API with better error handling""" API_URL = f"https://api-inference.huggingface.co/models/{model}" headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} if os.getenv('HF_API_KEY') else {} payload = { "inputs": prompt, "parameters": { "max_new_tokens": max_tokens, "temperature": 0.5, "do_sample": False } } try: response = requests.post(API_URL, headers=headers, json=payload) if response.status_code == 200: result = response.json() return result[0]['generated_text'] if result else "" elif response.status_code == 403: # Detailed debug information st.session_state.token_valid = check_token_validity() debug_info = f"""

403 Forbidden Error

Token is set: {'Yes' if os.getenv('HF_API_KEY') else 'No'}

Token valid: {'Yes' if st.session_state.token_valid else 'No'}

Model: {model}

Possible solutions:

  1. Visit the model page and click "Agree and access repository"
  2. Ensure your token has "read" permissions
  3. Wait 5-10 minutes after accepting terms
  4. Try a different model using the dropdown below
""" st.markdown(debug_info, unsafe_allow_html=True) return "" elif response.status_code == 429: st.warning("Rate limit exceeded. Waiting and retrying...") time.sleep(3) return query_hf_inference_api(prompt, max_tokens, model) else: st.error(f"API Error {response.status_code}: {response.text[:200]}") return "" except Exception as e: st.error(f"Connection error: {str(e)}") return "" def process_pdf(pdf_file): """Extract text from PDF and create vector store""" with st.spinner("📖 Reading PDF..."): doc = fitz.open(stream=pdf_file.read(), filetype="pdf") text = "" st.session_state.pages = [] for page in doc: page_text = page.get_text() text += page_text st.session_state.pages.append(page_text) with st.spinner("🔍 Processing text..."): text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, length_function=len ) chunks = text_splitter.split_text(text) embeddings = load_embedding_model() st.session_state.vector_store = FAISS.from_texts(chunks, embeddings) st.session_state.pdf_processed = True st.success("✅ PDF processed successfully!") def ask_question(question, model_choice): """Answer a question using the vector store and Hugging Face API""" if not st.session_state.vector_store: return "PDF not processed yet", [] # Find relevant passages docs = st.session_state.vector_store.similarity_search(question, k=3) context = "\n\n".join([doc.page_content[:500] for doc in docs]) # Format prompt for the model prompt = f""" Based on the following context, answer the question. If the answer isn't in the context, say "I don't know". Context: {context} Question: {question} Answer: """ # Query the model answer = query_hf_inference_api(prompt, model=model_choice) # Add to history st.session_state.history.append({ "question": question, "answer": answer, "sources": [doc.page_content for doc in docs], "model": model_choice }) return answer, docs def generate_qa_for_chapter(start_page, end_page, model_choice): """Generate Q&A for specific chapter pages""" if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page: st.error("Invalid page range") return [] chapter_text = "\n".join(st.session_state.pages[start_page-1:end_page]) text_splitter = RecursiveCharacterTextSplitter( chunk_size=800, chunk_overlap=100, length_function=len ) chunks = text_splitter.split_text(chapter_text) qa_pairs = [] with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."): for i, chunk in enumerate(chunks): if i % 2 == 0: # Generate question prompt = f"Based on this text, generate one study question: {chunk[:500]}" question = query_hf_inference_api(prompt, model=model_choice, max_tokens=100) if question and not question.endswith("?"): question += "?" if question: # Only add if we got a valid question qa_pairs.append((question, "")) else: # Generate answer if qa_pairs: # Ensure we have a question to answer prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}" answer = query_hf_inference_api(prompt, model=model_choice, max_tokens=200) qa_pairs[-1] = (qa_pairs[-1][0], answer) return qa_pairs # App header st.markdown("

📚 PDF Study Assistant

", unsafe_allow_html=True) # Model selection MODEL_OPTIONS = { "google/flan-t5-base": "T5 Base (Recommended)", "google/flan-t5-large": "T5 Large (Requires Auth)", "mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation", "declare-lab/flan-alpaca-base": "Alpaca Base" } # Debug info panel with st.expander("🔧 Debug Information", expanded=False): st.subheader("Hugging Face Token Status") # Check token validity token_valid = check_token_validity() st.session_state.token_valid = token_valid col1, col2 = st.columns(2) with col1: st.write(f"Token is set: {'✅ Yes' if os.getenv('HF_API_KEY') else '❌ No'}") with col2: st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}") if os.getenv('HF_API_KEY'): st.markdown("""

Your token is set but we're still having issues. Try these steps:

  1. Visit the model page for your selected model
  2. Click "Agree and access repository"
  3. Wait 5-10 minutes for changes to propagate
  4. Try a different model from the dropdown
""", unsafe_allow_html=True) else: st.markdown("""

Token is not set! Add it in your Space secrets:

  1. Go to your Space → Settings → Secrets
  2. Add HF_API_KEY with your token
  3. Redeploy the Space

Get your token: https://huggingface.co/settings/tokens

""", unsafe_allow_html=True) # PDF Upload Section (FIXED LABEL ERROR) with st.container(): st.subheader("📤 Upload Your Textbook/Notes") # Fixed empty label issue by adding a space and hiding it pdf_file = st.file_uploader( "Upload PDF", type="pdf", label_visibility="collapsed" ) # Main content if pdf_file: if not st.session_state.pdf_processed: process_pdf(pdf_file) if st.session_state.pdf_processed: # Model selection st.subheader("Model Selection") model_choice = st.selectbox( "Choose AI model:", options=list(MODEL_OPTIONS.keys()), format_func=lambda x: MODEL_OPTIONS[x], help="Some models require accepting terms on Hugging Face" ) # Navigation tabs selected_tab = option_menu( None, ["Ask Questions", "Generate Chapter Q&A", "History"], icons=["chat", "book", "clock-history"], menu_icon="cast", default_index=0, orientation="horizontal", styles={ "container": {"padding": "0!important", "background-color": "#f9f9f9"}, "nav-link": {"font-size": "16px", "font-weight": "bold"}, "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"}, } ) # Question Answering Tab if selected_tab == "Ask Questions": st.markdown("### 💬 Ask Questions About Your Document") user_question = st.text_input("Type your question here:", key="user_question") if user_question: with st.spinner("🤔 Thinking..."): answer, docs = ask_question(user_question, model_choice) if answer: st.markdown(f"
Answer: {answer}
", unsafe_allow_html=True) with st.expander("🔍 See source passages"): for i, doc in enumerate(docs): st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...") # Chapter Q&A Generation Tab elif selected_tab == "Generate Chapter Q&A": st.markdown("### 📝 Generate Q&A for Specific Chapter") col1, col2 = st.columns(2) with col1: start_page = st.number_input("Start Page", min_value=1, max_value=len(st.session_state.pages), value=1) with col2: end_page = st.number_input("End Page", min_value=1, max_value=len(st.session_state.pages), value=min(5, len(st.session_state.pages))) if st.button("Generate Q&A", key="generate_qa"): qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice) if qa_pairs: st.markdown(f"

📖 Generated Questions for Pages {start_page}-{end_page}

", unsafe_allow_html=True) for i, (question, answer) in enumerate(qa_pairs): st.markdown(f"""
Q{i+1}: {question}
A{i+1}: {answer}
""", unsafe_allow_html=True) else: st.warning("No Q&A pairs generated. Try a different page range.") # History Tab elif selected_tab == "History": st.markdown("### ⏳ Question History") if not st.session_state.history: st.info("No questions asked yet.") else: for i, item in enumerate(reversed(st.session_state.history)): with st.expander(f"Q{i+1}: {item['question']} ({MODEL_OPTIONS.get(item['model'], item['model'])})"): st.markdown(f"**Answer:** {item['answer']}") st.markdown("**Source Passages:**") for j, source in enumerate(item['sources']): st.markdown(f"{j+1}. {source[:500]}...") # Footer st.markdown("---") st.markdown("""
Built with ❤️ for students | PDF Study Assistant v4.1
""", unsafe_allow_html=True)