import streamlit as st
from streamlit_option_menu import option_menu
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import requests
import os
import time
# Page configuration
st.set_page_config(
    page_title="PDF Study Assistant",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS for colorful design
st.markdown("""
""", unsafe_allow_html=True)
# Initialize session state
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
if 'pages' not in st.session_state:
    st.session_state.pages = []
if 'history' not in st.session_state:
    st.session_state.history = []
if 'token_valid' not in st.session_state:
    st.session_state.token_valid = None
# Load embedding model with caching
@st.cache_resource
def load_embedding_model():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
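# Note: st.cache_resource keeps a single embedding model instance alive
# across Streamlit reruns and sessions, so the sentence-transformer weights
# are loaded only once per process rather than on every user interaction.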
def check_token_validity():
    """Check if the token is valid by making a simple API call."""
    if not os.getenv("HF_API_KEY"):
        return False
    try:
        headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
        response = requests.get("https://huggingface.co/api/whoami", headers=headers, timeout=10)
        return response.status_code == 200
    except requests.RequestException:
        return False
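# Suggestion (not in the original flow): this check hits the network on every
# rerun of the debug panel; wrapping it in st.cache_data(ttl=300) would cache
# the result for five minutes and avoid repeated round-trips.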
def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base", retries=2):
    """Query the Hugging Face Inference API with better error handling."""
    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} if os.getenv('HF_API_KEY') else {}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.5,
            "do_sample": False
        }
    }
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        if response.status_code == 200:
            result = response.json()
            return result[0]['generated_text'] if result else ""
        elif response.status_code == 403:
            # Detailed debug information
            st.session_state.token_valid = check_token_validity()
            debug_info = f"""
**403 Forbidden Error**

- Token is set: {'Yes' if os.getenv('HF_API_KEY') else 'No'}
- Token valid: {'Yes' if st.session_state.token_valid else 'No'}
- Model: {model}

Possible solutions:
- Visit the model page and click "Agree and access repository"
- Ensure your token has "read" permissions
- Wait 5-10 minutes after accepting terms
- Try a different model using the dropdown below
"""
            st.markdown(debug_info)
            return ""
        elif response.status_code == 429:
            # Cap retries so a persistent rate limit can't recurse forever
            if retries > 0:
                st.warning("Rate limit exceeded. Waiting and retrying...")
                time.sleep(3)
                return query_hf_inference_api(prompt, max_tokens, model, retries - 1)
            st.error("Rate limit exceeded and retries exhausted. Try again later.")
            return ""
        else:
            st.error(f"API Error {response.status_code}: {response.text[:200]}")
            return ""
    except Exception as e:
        st.error(f"Connection error: {str(e)}")
        return ""
def process_pdf(pdf_file):
    """Extract text from the PDF and build the vector store."""
    with st.spinner("📖 Reading PDF..."):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        st.session_state.pages = []
        for page in doc:
            page_text = page.get_text()
            text += page_text
            st.session_state.pages.append(page_text)
        doc.close()
    with st.spinner("🔍 Processing text..."):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        embeddings = load_embedding_model()
        st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)
    st.session_state.pdf_processed = True
    st.success("✅ PDF processed successfully!")
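# Chunking note: chunk_overlap=200 means consecutive 1000-character chunks
# share 200 characters, so a sentence that straddles a chunk boundary still
# appears intact in at least one chunk and stays retrievable.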
def ask_question(question, model_choice):
    """Answer a question using the vector store and Hugging Face API."""
    if not st.session_state.vector_store:
        return "PDF not processed yet", []
    # Find relevant passages
    docs = st.session_state.vector_store.similarity_search(question, k=3)
    context = "\n\n".join([doc.page_content[:500] for doc in docs])
    # Format prompt for the model
    prompt = f"""Based on the following context, answer the question.
If the answer isn't in the context, say "I don't know".

Context:
{context}

Question: {question}

Answer:"""
    # Query the model
    answer = query_hf_inference_api(prompt, model=model_choice)
    # Add to history
    st.session_state.history.append({
        "question": question,
        "answer": answer,
        "sources": [doc.page_content for doc in docs],
        "model": model_choice
    })
    return answer, docs
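# Retrieval note: similarity_search() embeds the question with the same
# MiniLM model used at indexing time and returns the k nearest chunks from
# the FAISS index (exact nearest-neighbor search by default); those chunks
# become the prompt context above.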
def generate_qa_for_chapter(start_page, end_page, model_choice):
    """Generate Q&A pairs for a specific range of chapter pages."""
    if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
        st.error("Invalid page range")
        return []
    chapter_text = "\n".join(st.session_state.pages[start_page-1:end_page])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len
    )
    chunks = text_splitter.split_text(chapter_text)
    qa_pairs = []
    with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
        for i, chunk in enumerate(chunks):
            if i % 2 == 0:  # Generate a question from even-indexed chunks
                prompt = f"Based on this text, generate one study question: {chunk[:500]}"
                question = query_hf_inference_api(prompt, model=model_choice, max_tokens=100)
                if question and not question.endswith("?"):
                    question += "?"
                if question:  # Only add if we got a valid question
                    qa_pairs.append((question, ""))
            else:  # Answer the most recent question using the next chunk
                if qa_pairs:  # Ensure we have a question to answer
                    prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
                    answer = query_hf_inference_api(prompt, model=model_choice, max_tokens=200)
                    qa_pairs[-1] = (qa_pairs[-1][0], answer)
    return qa_pairs
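# Pairing note: even-indexed chunks seed questions and the following
# odd-indexed chunk supplies the answer context, so each Q&A pair draws on
# two adjacent ~800-character windows of the chapter. If the chapter splits
# into an odd number of chunks, the final question is left unanswered.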
# App header
st.markdown("", unsafe_allow_html=True)
# Model selection
MODEL_OPTIONS = {
    "google/flan-t5-base": "T5 Base (Recommended)",
    "google/flan-t5-large": "T5 Large (Requires Auth)",
    "mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation",
    "declare-lab/flan-alpaca-base": "Alpaca Base"
}
# Debug info panel
with st.expander("🔧 Debug Information", expanded=False):
    st.subheader("Hugging Face Token Status")
    # Check token validity
    token_valid = check_token_validity()
    st.session_state.token_valid = token_valid
    col1, col2 = st.columns(2)
    with col1:
        st.write(f"Token is set: {'✅ Yes' if os.getenv('HF_API_KEY') else '❌ No'}")
    with col2:
        st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}")
    if os.getenv('HF_API_KEY'):
        st.markdown("""
Your token is set but we're still having issues. Try these steps:
- Visit the model page for your selected model
- Click "Agree and access repository"
- Wait 5-10 minutes for changes to propagate
- Try a different model from the dropdown
""")
    else:
        st.markdown("""
Token is not set! Add it in your Space secrets:
- Go to your Space → Settings → Secrets
- Add `HF_API_KEY` with your token
- Redeploy the Space

Get your token: https://huggingface.co/settings/tokens
""")
# PDF Upload Section
with st.container():
    st.subheader("📤 Upload Your Textbook/Notes")
    # Give the uploader a real label (Streamlit warns on empty labels) but
    # hide it, since the subheader above already describes the widget
    pdf_file = st.file_uploader(
        "Upload PDF",
        type="pdf",
        label_visibility="collapsed"
    )
# Main content
if pdf_file:
    if not st.session_state.pdf_processed:
        process_pdf(pdf_file)
    if st.session_state.pdf_processed:
        # Model selection
        st.subheader("Model Selection")
        model_choice = st.selectbox(
            "Choose AI model:",
            options=list(MODEL_OPTIONS.keys()),
            format_func=lambda x: MODEL_OPTIONS[x],
            help="Some models require accepting terms on Hugging Face"
        )
        # Navigation tabs
        selected_tab = option_menu(
            None,
            ["Ask Questions", "Generate Chapter Q&A", "History"],
            icons=["chat", "book", "clock-history"],
            menu_icon="cast",
            default_index=0,
            orientation="horizontal",
            styles={
                "container": {"padding": "0!important", "background-color": "#f9f9f9"},
                "nav-link": {"font-size": "16px", "font-weight": "bold"},
                "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
            }
        )
        # Question Answering Tab
        if selected_tab == "Ask Questions":
            st.markdown("### 💬 Ask Questions About Your Document")
            user_question = st.text_input("Type your question here:", key="user_question")
            if user_question:
                with st.spinner("🤔 Thinking..."):
                    answer, docs = ask_question(user_question, model_choice)
                if answer:
                    st.markdown(f"**Answer:** {answer}")
                    with st.expander("🔍 See source passages"):
                        for i, doc in enumerate(docs):
                            st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
# Chapter Q&A Generation Tab
elif selected_tab == "Generate Chapter Q&A":
st.markdown("### 📝 Generate Q&A for Specific Chapter")
col1, col2 = st.columns(2)
with col1:
start_page = st.number_input("Start Page", min_value=1, max_value=len(st.session_state.pages), value=1)
with col2:
end_page = st.number_input("End Page", min_value=1, max_value=len(st.session_state.pages), value=min(5, len(st.session_state.pages)))
if st.button("Generate Q&A", key="generate_qa"):
qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice)
if qa_pairs:
st.markdown(f"📖 Generated Questions for Pages {start_page}-{end_page}
", unsafe_allow_html=True)
for i, (question, answer) in enumerate(qa_pairs):
st.markdown(f"""
Q{i+1}: {question}
A{i+1}: {answer}
""", unsafe_allow_html=True)
else:
st.warning("No Q&A pairs generated. Try a different page range.")
        # History Tab
        elif selected_tab == "History":
            st.markdown("### ⏳ Question History")
            if not st.session_state.history:
                st.info("No questions asked yet.")
            else:
                for i, item in enumerate(reversed(st.session_state.history)):
                    with st.expander(f"Q{i+1}: {item['question']} ({MODEL_OPTIONS.get(item['model'], item['model'])})"):
                        st.markdown(f"**Answer:** {item['answer']}")
                        st.markdown("**Source Passages:**")
                        for j, source in enumerate(item['sources']):
                            st.markdown(f"{j+1}. {source[:500]}...")
# Footer
st.markdown("---")
st.markdown("""
Built with ❤️ for students | PDF Study Assistant v4.1
""", unsafe_allow_html=True)