Spaces:

sunbal7
/

PDFQueryApplication

Sleeping

App Files Files Community

PDFQueryApplication / app.py

sunbal7

Update app.py

6c06b5f verified 2 months ago

raw

history blame

15.2 kB

	import html
	import os
	import time

	import fitz  # PyMuPDF
	import requests
	import streamlit as st
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from streamlit_option_menu import option_menu

	# Page configuration: browser-tab title and icon, wide layout, and a
	# sidebar that starts collapsed.
	st.set_page_config(
	page_title="PDF Study Assistant",
	page_icon="📚",
	layout="wide",
	initial_sidebar_state="collapsed"
	)

	# Custom CSS for colorful design, injected as raw HTML. It defines:
	#   - the colour palette as CSS variables on :root
	#   - the gradient page background (.stApp)
	#   - button and text-input styling
	#   - helper classes used by the HTML snippets in this script:
	#     .card, .header, .tab-content, .error, .info, .success
	#   - the fadeIn keyframe animation referenced by .tab-content
	st.markdown("""
	<style>
	:root {
	--primary: #ff4b4b;
	--secondary: #ff9a3d;
	--accent1: #ffcb74;
	--accent2: #3a86ff;
	--background: #f0f2f6;
	--card: #ffffff;
	}

	.stApp {
	background: linear-gradient(135deg, var(--background) 0%, #e0e5ec 100%);
	}

	.stButton>button {
	background: linear-gradient(to right, var(--secondary), var(--primary));
	color: white;
	border-radius: 12px;
	padding: 8px 20px;
	font-weight: 600;
	}

	.stTextInput>div>div>input {
	border-radius: 12px;
	border: 2px solid var(--accent2);
	padding: 10px;
	}

	.card {
	background: var(--card);
	border-radius: 15px;
	box-shadow: 0 8px 16px rgba(0,0,0,0.1);
	padding: 20px;
	margin-bottom: 20px;
	}

	.header {
	background: linear-gradient(to right, var(--accent2), var(--primary));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	text-align: center;
	margin-bottom: 30px;
	}

	.tab-content {
	animation: fadeIn 0.5s ease-in-out;
	}

	.error {
	background-color: #ffebee;
	border-left: 4px solid #f44336;
	padding: 10px;
	margin: 10px 0;
	}

	.info {
	background-color: #e3f2fd;
	border-left: 4px solid #2196f3;
	padding: 10px;
	margin: 10px 0;
	}

	.success {
	background-color: #e8f5e9;
	border-left: 4px solid #4caf50;
	padding: 10px;
	margin: 10px 0;
	}

	@keyframes fadeIn {
	from { opacity: 0; }
	to { opacity: 1; }
	}
	</style>
	""", unsafe_allow_html=True)

	# Seed the per-session state on first run; keys that already exist are
	# left untouched so values survive Streamlit reruns.
	session_defaults = {
	    "pdf_processed": False,
	    "vector_store": None,
	    "pages": [],
	    "history": [],
	    "token_valid": None,
	}
	for state_key, initial_value in session_defaults.items():
	    if state_key not in st.session_state:
	        st.session_state[state_key] = initial_value

	# Embedding model loader, wrapped in Streamlit's resource cache
	@st.cache_resource
	def load_embedding_model():
	    """Return the HuggingFace sentence-embedding model used to index PDF text.

	    The function is decorated with ``st.cache_resource``, so the model
	    object is handed out through Streamlit's resource cache.
	    """
	    model_id = "sentence-transformers/all-MiniLM-L6-v2"
	    embedder = HuggingFaceEmbeddings(model_name=model_id)
	    return embedder

	def check_token_validity(timeout=10):
	    """Check whether the ``HF_API_KEY`` token is accepted by Hugging Face.

	    Args:
	        timeout: Seconds to wait for the whoami endpoint before giving up,
	            so a stalled connection cannot hang the app.

	    Returns:
	        True only when the token is set and the whoami endpoint answers
	        with HTTP 200. False when the token is missing or empty, the
	        endpoint answers with any other status, or the request fails.
	    """
	    # Read the environment once so the check and the header agree.
	    token = os.getenv("HF_API_KEY")
	    if not token:
	        return False

	    try:
	        response = requests.get(
	            "https://huggingface.co/api/whoami",
	            headers={"Authorization": f"Bearer {token}"},
	            timeout=timeout,
	        )
	    except (requests.RequestException, ValueError):
	        # Best-effort check: network failures and malformed token values
	        # (e.g. characters that cannot be encoded in a header) count as
	        # "not valid" rather than crashing the page.
	        return False
	    return response.status_code == 200

	def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base",
	                           max_retries=3, timeout=60):
	    """Send a prompt to the Hugging Face Inference API and return the generated text.

	    Errors are reported in the Streamlit UI rather than raised.

	    Args:
	        prompt: Text sent as the model input.
	        max_tokens: Value passed as ``max_new_tokens``.
	        model: Model repository id appended to the inference endpoint URL.
	        max_retries: How many times a rate-limited (HTTP 429) request is
	            retried before giving up. Retries were previously unbounded.
	        timeout: Seconds to wait for each HTTP request.

	    Returns:
	        The ``generated_text`` of the first result, or an empty string when
	        the request fails, is rejected, or returns an unexpected payload.
	    """
	    api_url = f"https://api-inference.huggingface.co/models/{model}"
	    token = os.getenv("HF_API_KEY")
	    headers = {"Authorization": f"Bearer {token}"} if token else {}

	    payload = {
	        "inputs": prompt,
	        "parameters": {
	            "max_new_tokens": max_tokens,
	            "temperature": 0.5,
	            "do_sample": False,
	        },
	    }

	    # Bounded retry loop: only HTTP 429 is retried, with a fixed pause.
	    attempts = max(0, max_retries) + 1
	    for attempt in range(attempts):
	        try:
	            response = requests.post(
	                api_url, headers=headers, json=payload, timeout=timeout
	            )
	        except requests.RequestException as exc:
	            st.error(f"Connection error: {str(exc)}")
	            return ""

	        if response.status_code != 429 or attempt == attempts - 1:
	            break
	        st.warning("Rate limit exceeded. Waiting and retrying...")
	        time.sleep(3)

	    if response.status_code == 200:
	        try:
	            result = response.json()
	        except ValueError:
	            st.error(f"API Error 200: {response.text[:200]}")
	            return ""
	        if not result:
	            return ""
	        # The expected payload is a list whose first item holds the text.
	        if isinstance(result, list) and isinstance(result[0], dict):
	            return result[0].get("generated_text", "")
	        st.error(f"Unexpected API response: {str(result)[:200]}")
	        return ""

	    if response.status_code == 403:
	        # Re-check the token so the message can say whether the token
	        # itself or access to this particular model is the problem.
	        st.session_state.token_valid = check_token_validity()
	        token_set = "Yes" if token else "No"
	        token_ok = "Yes" if st.session_state.token_valid else "No"

	        # Assembled without leading whitespace: Markdown treats lines
	        # indented by four spaces or a tab as a code block, which would
	        # show the HTML source instead of rendering it.
	        debug_info = (
	            '<div class="error">'
	            "<h4>403 Forbidden Error</h4>"
	            f"<p>Token is set: <strong>{token_set}</strong></p>"
	            f"<p>Token valid: <strong>{token_ok}</strong></p>"
	            f"<p>Model: {model}</p>"
	            "<p>Possible solutions:</p>"
	            "<ol>"
	            f'<li>Visit the <a href="https://huggingface.co/{model}" target="_blank">model page</a>'
	            ' and click "Agree and access repository"</li>'
	            '<li>Ensure your token has "read" permissions</li>'
	            "<li>Wait 5-10 minutes after accepting terms</li>"
	            "<li>Try a different model using the dropdown below</li>"
	            "</ol>"
	            "</div>"
	        )
	        st.markdown(debug_info, unsafe_allow_html=True)
	        return ""

	    # Any other status, including a 429 that outlived its retries.
	    st.error(f"API Error {response.status_code}: {response.text[:200]}")
	    return ""

	def process_pdf(pdf_file):
	    """Extract text from an uploaded PDF and build the FAISS vector store.

	    On success this stores the per-page text in ``st.session_state.pages``,
	    the index in ``st.session_state.vector_store`` and sets
	    ``st.session_state.pdf_processed`` to True. When the PDF yields no
	    text, an error is shown and ``pdf_processed`` stays False.

	    Args:
	        pdf_file: File-like object whose ``read()`` returns the PDF bytes.
	    """
	    with st.spinner("📖 Reading PDF..."):
	        # Rewind first so a retry after an earlier read still sees the data.
	        if hasattr(pdf_file, "seek"):
	            pdf_file.seek(0)
	        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
	        try:
	            page_texts = [page.get_text() for page in doc]
	        finally:
	            # Release the document even if text extraction fails.
	            doc.close()
	        st.session_state.pages = page_texts
	        text = "".join(page_texts)

	    with st.spinner("🔍 Processing text..."):
	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200,
	            length_function=len,
	        )
	        chunks = text_splitter.split_text(text)

	        if not chunks:
	            # Nothing to index (e.g. a scanned, image-only PDF); building
	            # a vector store from an empty list would fail.
	            st.session_state.vector_store = None
	            st.session_state.pdf_processed = False
	            st.error("No extractable text found in this PDF.")
	            return

	        embeddings = load_embedding_model()
	        st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)

	    st.session_state.pdf_processed = True
	    st.success("✅ PDF processed successfully!")

	def ask_question(question, model_choice):
	    """Answer ``question`` from the indexed PDF using the chosen model.

	    Retrieves the three most similar passages from the vector store, builds
	    a prompt from the first 500 characters of each, queries the model and
	    records the exchange in ``st.session_state.history``.

	    Returns:
	        A ``(answer, passages)`` tuple. When no vector store exists yet the
	        tuple is ``("PDF not processed yet", [])``.
	    """
	    store = st.session_state.vector_store
	    if not store:
	        return "PDF not processed yet", []

	    # Retrieve the passages most similar to the question
	    passages = store.similarity_search(question, k=3)
	    excerpts = (passage.page_content[:500] for passage in passages)
	    context = "\n\n".join(excerpts)

	    # Prompt template sent to the model
	    prompt = f"""
	Based on the following context, answer the question.
	If the answer isn't in the context, say "I don't know".

	Context:
	{context}

	Question: {question}
	Answer:
	"""

	    answer = query_hf_inference_api(prompt, model=model_choice)

	    # Record the exchange, with full source passages, for the History tab
	    history_entry = {
	        "question": question,
	        "answer": answer,
	        "sources": [passage.page_content for passage in passages],
	        "model": model_choice,
	    }
	    st.session_state.history.append(history_entry)

	    return answer, passages

	def generate_qa_for_chapter(start_page, end_page, model_choice):
	    """Generate question/answer pairs for a 1-based, inclusive page range.

	    The selected pages are joined and split into chunks. For every other
	    chunk a study question is generated and then answered from that same
	    chunk. Previously the answer was drawn from the following chunk, and
	    a failed question could let a later answer overwrite an earlier pair.

	    Args:
	        start_page: First page of the range (1-based).
	        end_page: Last page of the range (inclusive).
	        model_choice: Model id passed through to the inference API.

	    Returns:
	        A list of ``(question, answer)`` tuples; empty when the range is
	        invalid or no question could be generated.
	    """
	    page_count = len(st.session_state.pages)
	    if start_page < 1 or end_page > page_count or start_page > end_page:
	        st.error("Invalid page range")
	        return []

	    chapter_text = "\n".join(st.session_state.pages[start_page - 1:end_page])

	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=800,
	        chunk_overlap=100,
	        length_function=len,
	    )
	    chunks = text_splitter.split_text(chapter_text)

	    qa_pairs = []

	    with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
	        # Sampling every other chunk keeps the number of API calls close to
	        # one per chunk, as before.
	        for chunk in chunks[::2]:
	            excerpt = chunk[:500]

	            question_prompt = f"Based on this text, generate one study question: {excerpt}"
	            question = query_hf_inference_api(
	                question_prompt, model=model_choice, max_tokens=100
	            )
	            question = (question or "").strip()
	            if not question:
	                # No usable question for this chunk; skip it entirely.
	                continue
	            if not question.endswith("?"):
	                question += "?"

	            answer_prompt = f"Answer this question: {question} using this context: {excerpt}"
	            answer = query_hf_inference_api(
	                answer_prompt, model=model_choice, max_tokens=200
	            )
	            qa_pairs.append((question, answer))

	    return qa_pairs

	# App header, rendered as raw HTML with the 'header' CSS class
	st.markdown("<h1 class='header'>📚 PDF Study Assistant</h1>", unsafe_allow_html=True)

	# Selectable models: Hugging Face model id -> label shown in the dropdown
	MODEL_OPTIONS = {
	"google/flan-t5-base": "T5 Base (Recommended)",
	"google/flan-t5-large": "T5 Large (Requires Auth)",
	"mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation",
	"declare-lab/flan-alpaca-base": "Alpaca Base"
	}

	# Debug info panel: shows whether HF_API_KEY is set and accepted, plus
	# troubleshooting steps.
	with st.expander("🔧 Debug Information", expanded=False):
	    st.subheader("Hugging Face Token Status")

	    token_is_set = bool(os.getenv("HF_API_KEY"))

	    # Once a token has been confirmed valid in this session, reuse that
	    # result instead of making a network call on every rerun. A missing
	    # or rejected token is still re-checked so a fix is picked up.
	    token_valid = st.session_state.get("token_valid")
	    if not token_valid:
	        token_valid = check_token_validity()
	        st.session_state.token_valid = token_valid

	    col1, col2 = st.columns(2)
	    with col1:
	        st.write(f"Token is set: {'✅ Yes' if token_is_set else '❌ No'}")
	    with col2:
	        st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}")

	    # The HTML below is assembled without leading whitespace: Markdown
	    # treats lines indented by four spaces or a tab as a code block.
	    if token_is_set:
	        st.markdown(
	            '<div class="info">'
	            "<p>Your token is set but we're still having issues. Try these steps:</p>"
	            "<ol>"
	            "<li>Visit the model page for your selected model</li>"
	            '<li>Click "Agree and access repository"</li>'
	            "<li>Wait 5-10 minutes for changes to propagate</li>"
	            "<li>Try a different model from the dropdown</li>"
	            "</ol>"
	            "</div>",
	            unsafe_allow_html=True,
	        )
	    else:
	        st.markdown(
	            '<div class="error">'
	            "<p>Token is not set! Add it in your Space secrets:</p>"
	            "<ol>"
	            "<li>Go to your Space → Settings → Secrets</li>"
	            "<li>Add <code>HF_API_KEY</code> with your token</li>"
	            "<li>Redeploy the Space</li>"
	            "</ol>"
	            '<p>Get your token: <a href="https://huggingface.co/settings/tokens" target="_blank">'
	            "https://huggingface.co/settings/tokens</a></p>"
	            "</div>",
	            unsafe_allow_html=True,
	        )

	# PDF upload section
	with st.container():
	    st.subheader("📤 Upload Your Textbook/Notes")
	    # The uploader gets a non-empty label that is collapsed from view.
	    uploader_options = {"type": "pdf", "label_visibility": "collapsed"}
	    pdf_file = st.file_uploader("Upload PDF", **uploader_options)

	# Main content: index the uploaded PDF, then render the model picker and
	# the three tabs (ask a question, generate chapter Q&A, history).
	if pdf_file:
	    # Identify the upload by name and size so that replacing the file
	    # triggers re-indexing; otherwise the first PDF's index would be
	    # reused for every later upload in the session.
	    upload_signature = (pdf_file.name, pdf_file.size)
	    if st.session_state.get("processed_upload") != upload_signature:
	        st.session_state.pdf_processed = False

	    if not st.session_state.pdf_processed:
	        process_pdf(pdf_file)
	        if st.session_state.pdf_processed:
	            st.session_state.processed_upload = upload_signature

	    if st.session_state.pdf_processed:
	        # Model selection
	        st.subheader("Model Selection")
	        model_choice = st.selectbox(
	            "Choose AI model:",
	            options=list(MODEL_OPTIONS.keys()),
	            format_func=lambda x: MODEL_OPTIONS[x],
	            help="Some models require accepting terms on Hugging Face",
	        )

	        # Navigation tabs
	        selected_tab = option_menu(
	            None,
	            ["Ask Questions", "Generate Chapter Q&A", "History"],
	            icons=["chat", "book", "clock-history"],
	            menu_icon="cast",
	            default_index=0,
	            orientation="horizontal",
	            styles={
	                "container": {"padding": "0!important", "background-color": "#f9f9f9"},
	                "nav-link": {"font-size": "16px", "font-weight": "bold"},
	                "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
	            },
	        )

	        # Question Answering Tab
	        if selected_tab == "Ask Questions":
	            st.markdown("### 💬 Ask Questions About Your Document")
	            user_question = st.text_input("Type your question here:", key="user_question")

	            if user_question:
	                with st.spinner("🤔 Thinking..."):
	                    answer, docs = ask_question(user_question, model_choice)
	                if answer:
	                    # Model output is untrusted text: escape it before
	                    # placing it inside raw HTML.
	                    st.markdown(
	                        f"<div class='card'><b>Answer:</b> {html.escape(answer)}</div>",
	                        unsafe_allow_html=True,
	                    )

	                    with st.expander("🔍 See source passages"):
	                        for i, doc in enumerate(docs, start=1):
	                            st.markdown(f"Passage {i}: {doc.page_content[:500]}...")

	        # Chapter Q&A Generation Tab
	        elif selected_tab == "Generate Chapter Q&A":
	            st.markdown("### 📝 Generate Q&A for Specific Chapter")
	            # Keep the upper bound at least 1 so the inputs stay valid even
	            # if no pages were extracted.
	            last_page = max(1, len(st.session_state.pages))
	            col1, col2 = st.columns(2)
	            with col1:
	                start_page = st.number_input(
	                    "Start Page", min_value=1, max_value=last_page, value=1
	                )
	            with col2:
	                end_page = st.number_input(
	                    "End Page", min_value=1, max_value=last_page, value=min(5, last_page)
	                )

	            if st.button("Generate Q&A", key="generate_qa"):
	                qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice)

	                if qa_pairs:
	                    st.markdown(
	                        f"<h4>📖 Generated Questions for Pages {start_page}-{end_page}</h4>",
	                        unsafe_allow_html=True,
	                    )
	                    for i, (question, answer) in enumerate(qa_pairs, start=1):
	                        # Built on one line with escaped model output, so
	                        # the card renders as HTML and cannot inject markup.
	                        card_html = (
	                            "<div class='card'>"
	                            f"<b>Q{i}:</b> {html.escape(question)}<br>"
	                            f"<b>A{i}:</b> {html.escape(answer)}"
	                            "</div>"
	                        )
	                        st.markdown(card_html, unsafe_allow_html=True)
	                else:
	                    st.warning("No Q&A pairs generated. Try a different page range.")

	        # History Tab (most recent question first)
	        elif selected_tab == "History":
	            st.markdown("### ⏳ Question History")
	            if not st.session_state.history:
	                st.info("No questions asked yet.")
	            else:
	                for i, item in enumerate(reversed(st.session_state.history), start=1):
	                    model_label = MODEL_OPTIONS.get(item['model'], item['model'])
	                    with st.expander(f"Q{i}: {item['question']} ({model_label})"):
	                        st.markdown(f"Answer: {item['answer']}")
	                        st.markdown("Source Passages:")
	                        for j, source in enumerate(item['sources'], start=1):
	                            st.markdown(f"{j}. {source[:500]}...")

	# Footer: horizontal rule followed by a centred credit line.
	st.markdown("---")
	# A plain "|" separator is used: "\|" is an invalid escape sequence in a
	# normal string literal and would show a stray backslash inside the HTML.
	st.markdown(
	    '<div style="text-align: center; padding: 20px;">'
	    "Built with ❤️ for students | PDF Study Assistant v4.1"
	    "</div>",
	    unsafe_allow_html=True,
	)