# Source: Hugging Face Space "sunbal7/PDFQueryApplication" (status: Sleeping)
# File size: 12,525 Bytes

import html
import os
import time

import fitz  # PyMuPDF
import requests
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from streamlit_option_menu import option_menu

# Page configuration: browser-tab title/icon, wide layout, sidebar collapsed.
_PAGE_SETTINGS = {
    "page_title": "PDF Study Assistant",
    "page_icon": "📚",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS for colorful design.
# The stylesheet is kept in a named constant and injected once as raw HTML;
# it defines the colour variables plus the .card/.header/.error/.info classes
# that the HTML snippets elsewhere in this script refer to.
_CUSTOM_CSS = """
<style>
    :root {
        --primary: #ff4b4b;
        --secondary: #ff9a3d;
        --accent1: #ffcb74;
        --accent2: #3a86ff;
        --background: #f0f2f6;
        --card: #ffffff;
    }
    
    .stApp {
        background: linear-gradient(135deg, var(--background) 0%, #e0e5ec 100%);
    }
    
    .stButton>button {
        background: linear-gradient(to right, var(--secondary), var(--primary));
        color: white;
        border-radius: 12px;
        padding: 8px 20px;
        font-weight: 600;
    }
    
    .stTextInput>div>div>input {
        border-radius: 12px;
        border: 2px solid var(--accent2);
        padding: 10px;
    }
    
    .card {
        background: var(--card);
        border-radius: 15px;
        box-shadow: 0 8px 16px rgba(0,0,0,0.1);
        padding: 20px;
        margin-bottom: 20px;
    }
    
    .header {
        background: linear-gradient(to right, var(--accent2), var(--primary));
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        text-align: center;
        margin-bottom: 30px;
    }
    
    .tab-content {
        animation: fadeIn 0.5s ease-in-out;
    }
    
    .error {
        background-color: #ffebee;
        border-left: 4px solid #f44336;
        padding: 10px;
    }
    
    .info {
        background-color: #e3f2fd;
        border-left: 4px solid #2196f3;
        padding: 10px;
    }
    
    @keyframes fadeIn {
        from { opacity: 0; }
        to { opacity: 1; }
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Initialize session state: only keys that are not present yet are assigned,
# so values already stored in the session are left untouched.
_SESSION_DEFAULTS = {
    "pdf_processed": False,  # True once process_pdf() has built the index
    "vector_store": None,    # FAISS store created by process_pdf()
    "pages": [],             # per-page text of the current PDF
    "history": [],           # question/answer/sources records
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Load embedding model with caching
@st.cache_resource
def load_embedding_model():
    """Return the HuggingFace sentence-embedding model used to index PDF text.

    The function is wrapped in ``st.cache_resource``; it takes no arguments and
    always requests the same model name.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

def query_hf_inference_api(prompt, max_tokens=200, max_retries=3):
    """Query the Hugging Face Inference API and return the generated text.

    Args:
        prompt: Text sent to the model as ``inputs``.
        max_tokens: Value sent as the ``max_new_tokens`` generation parameter.
        max_retries: Number of additional attempts made after an HTTP 429
            (rate limit) response before giving up.

    Returns:
        The ``generated_text`` field of the first result, or ``""`` when the
        request fails, the response has an unexpected shape, or the retry
        budget is exhausted. HTTP and network failures are reported through
        ``st.error``/``st.warning`` rather than raised.
    """
    MODEL = "google/flan-t5-large"  # Smaller, freely accessible model
    API_URL = f"https://api-inference.huggingface.co/models/{MODEL}"
    api_key = os.getenv('HF_API_KEY')
    # Without a key the request is sent with no Authorization header at all.
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.5,
            "do_sample": False
        }
    }

    # Iterate instead of recursing so sustained rate limiting cannot grow the
    # call stack (or block the app) without bound.
    for attempt in range(max_retries + 1):
        try:
            # The timeout keeps a stalled connection from hanging the app.
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        except requests.exceptions.RequestException as e:
            st.error(f"Connection error: {str(e)}")
            return ""

        if response.status_code == 200:
            try:
                result = response.json()
            except ValueError:
                st.error("API Error: response was not valid JSON")
                return ""
            # Expected shape: non-empty list of dicts. Anything else (empty
            # list, error object, ...) yields an empty answer instead of a
            # KeyError/TypeError.
            if isinstance(result, list) and result and isinstance(result[0], dict):
                return result[0].get("generated_text", "")
            return ""

        if response.status_code == 403:
            st.error("403 Forbidden: Please check your Hugging Face API token and model access")
            st.markdown("""
            <div class="info">
                <h4>How to fix this:</h4>
                <ol>
                    <li>Get your free Hugging Face token from <a href="https://huggingface.co/settings/tokens" target="_blank">https://huggingface.co/settings/tokens</a></li>
                    <li>Add it to your Space secrets as <code>HF_API_KEY</code></li>
                    <li>Accept terms for the model: <a href="https://huggingface.co/google/flan-t5-large" target="_blank">https://huggingface.co/google/flan-t5-large</a></li>
                </ol>
            </div>
            """, unsafe_allow_html=True)
            return ""

        if response.status_code == 429:
            if attempt >= max_retries:
                st.error("Rate limit exceeded. Please try again later.")
                return ""
            st.warning("Rate limit exceeded. Waiting and retrying...")
            time.sleep(5)  # Wait 5 seconds before retrying
            continue

        st.error(f"API Error {response.status_code}: {response.text[:200]}")
        return ""

    # Only reachable when max_retries is negative (no attempt was made).
    return ""

def process_pdf(pdf_file):
    """Extract text from an uploaded PDF and index it in a FAISS vector store.

    Args:
        pdf_file: File-like object holding the PDF bytes (the caller passes the
            object returned by ``st.file_uploader``).

    Side effects on ``st.session_state``:
        * ``pages`` is replaced with the list of per-page text.
        * ``vector_store`` is set to a FAISS store built from the text chunks.
        * ``pdf_processed`` is set to ``True`` on success.
        When the PDF yields no text chunks, an error is shown, ``vector_store``
        is cleared and ``pdf_processed`` is left ``False``.
    """
    with st.spinner("📖 Reading PDF..."):
        # getvalue() returns the whole buffer regardless of the current stream
        # position, so a repeated call on the same upload still sees the data;
        # fall back to read() for plain file objects.
        if hasattr(pdf_file, "getvalue"):
            pdf_bytes = pdf_file.getvalue()
        else:
            pdf_bytes = pdf_file.read()

        # The context manager closes the document once the text is extracted.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text() for page in doc]

        st.session_state.pages = page_texts
        text = "".join(page_texts)

    with st.spinner("🔍 Processing text..."):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # An index cannot be built from zero chunks (e.g. a PDF made only of
        # scanned images), so stop here with a readable message.
        if not chunks:
            st.session_state.vector_store = None
            st.session_state.pdf_processed = False
            st.error("No extractable text found in this PDF. It may contain only scanned images.")
            return

        embeddings = load_embedding_model()
        st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)

    st.session_state.pdf_processed = True
    st.success("✅ PDF processed successfully!")

def ask_question(question):
    """Answer ``question`` from the indexed PDF via the Hugging Face API.

    Looks up the three most similar passages in the session's vector store,
    sends them with the question to ``query_hf_inference_api`` and records the
    exchange in ``st.session_state.history``.

    Returns:
        A ``(answer, docs)`` tuple. When no vector store exists yet, the tuple
        is ``("PDF not processed yet", [])`` and nothing is recorded.
    """
    store = st.session_state.vector_store
    if not store:
        return "PDF not processed yet", []

    # Retrieve supporting passages and flatten them into one context string.
    docs = store.similarity_search(question, k=3)
    passages = [hit.page_content for hit in docs]
    context = "\n\n".join(passages)

    # Prompt template for the model
    prompt = f"""
    Based on the following context, answer the question. 
    If the answer isn't in the context, say "I don't know".
    
    Context:
    {context}
    
    Question: {question}
    Answer:
    """

    answer = query_hf_inference_api(prompt)

    # Keep a record of the exchange for the History tab.
    record = {
        "question": question,
        "answer": answer,
        "sources": passages,
    }
    st.session_state.history.append(record)

    return answer, docs

def generate_qa_for_chapter(start_page, end_page):
    """Generate study question/answer pairs for a 1-based, inclusive page range.

    Args:
        start_page: First page of the range (1-based).
        end_page: Last page of the range (inclusive).

    Returns:
        A list of ``(question, answer)`` tuples. The list is empty when the
        page range is invalid (an error is shown) or when no question could be
        generated. ``answer`` may be ``""`` if the answer request failed.
    """
    pages = st.session_state.pages
    if start_page < 1 or end_page > len(pages) or start_page > end_page:
        st.error("Invalid page range")
        return []

    chapter_text = "\n".join(pages[start_page-1:end_page])

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len
    )
    chunks = text_splitter.split_text(chapter_text)

    qa_pairs = []

    with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
        # Every other chunk seeds one question, which keeps the number of
        # questions (and API requests) in line with the chunk count. The answer
        # is generated from the SAME excerpt the question came from, so the
        # model is never asked to answer from unrelated text, and a failed
        # question can no longer cause a later answer to overwrite an earlier
        # pair.
        for chunk in chunks[::2]:
            excerpt = chunk[:500]

            prompt = f"Based on this text, generate one study question: {excerpt}"
            question = query_hf_inference_api(prompt, max_tokens=100).strip()
            if not question:
                # No usable question for this excerpt; skip it entirely.
                continue
            if not question.endswith("?"):
                question += "?"

            prompt = f"Answer this question: {question} using this context: {excerpt}"
            answer = query_hf_inference_api(prompt, max_tokens=200)
            qa_pairs.append((question, answer))

    return qa_pairs

# App header (styled by the .header CSS class)
_TITLE_HTML = "<h1 class='header'>📚 PDF Study Assistant</h1>"
st.markdown(_TITLE_HTML, unsafe_allow_html=True)

# API Token Instructions: shown only while HF_API_KEY is unset or empty.
_token_missing = not os.getenv("HF_API_KEY")
if _token_missing:
    st.markdown("""
    <div class="info">
        <h4>Setup Required:</h4>
        <p>This app requires a free Hugging Face API token to work:</p>
        <ol>
            <li>Get your token from <a href="https://huggingface.co/settings/tokens" target="_blank">https://huggingface.co/settings/tokens</a></li>
            <li>Add it to your Space secrets as <code>HF_API_KEY</code></li>
            <li>Accept terms for the model: <a href="https://huggingface.co/google/flan-t5-large" target="_blank">google/flan-t5-large</a></li>
        </ol>
    </div>
    """, unsafe_allow_html=True)

# PDF Upload Section
with st.container():
    st.subheader("📤 Upload Your Textbook/Notes")
    # The widget keeps a real (non-empty) label for accessibility; it stays
    # hidden on screen because label_visibility is "collapsed".
    pdf_file = st.file_uploader("Upload PDF", type="pdf", label_visibility="collapsed")

# Main content
if pdf_file:
    # Re-index when a different file is uploaded. Without this check the flag
    # set by the first upload would stay True and every later PDF would be
    # answered from the first PDF's vector store and pages.
    # NOTE: identity is (name, size); a replaced file with the same name and
    # byte size is not detected.
    upload_id = (pdf_file.name, pdf_file.size)
    if st.session_state.get("processed_upload_id") != upload_id:
        st.session_state.processed_upload_id = upload_id
        st.session_state.pdf_processed = False

    if not st.session_state.pdf_processed:
        process_pdf(pdf_file)
    
    if st.session_state.pdf_processed:
        # Navigation tabs
        selected_tab = option_menu(
            None,
            ["Ask Questions", "Generate Chapter Q&A", "History"],
            icons=["chat", "book", "clock-history"],
            menu_icon="cast",
            default_index=0,
            orientation="horizontal",
            styles={
                "container": {"padding": "0!important", "background-color": "#f9f9f9"},
                "nav-link": {"font-size": "16px", "font-weight": "bold"},
                "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
            }
        )
        
        # Question Answering Tab
        if selected_tab == "Ask Questions":
            st.markdown("### 💬 Ask Questions About Your Document")
            user_question = st.text_input("Type your question here:", key="user_question")
            
            if user_question:
                with st.spinner("🤔 Thinking..."):
                    answer, docs = ask_question(user_question)
                    if answer:
                        # The answer is model output derived from the uploaded
                        # document; escape it before embedding it in raw HTML.
                        safe_answer = html.escape(answer)
                        st.markdown(f"<div class='card'><b>Answer:</b> {safe_answer}</div>", unsafe_allow_html=True)
                        
                        with st.expander("🔍 See source passages"):
                            for i, doc in enumerate(docs):
                                st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
        
        # Chapter Q&A Generation Tab
        elif selected_tab == "Generate Chapter Q&A":
            st.markdown("### 📝 Generate Q&A for Specific Chapter")
            page_count = len(st.session_state.pages)
            col1, col2 = st.columns(2)
            with col1:
                start_page = st.number_input("Start Page", min_value=1, max_value=page_count, value=1)
            with col2:
                # Default to the first five pages, or fewer for short PDFs.
                end_page = st.number_input("End Page", min_value=1, max_value=page_count, value=min(5, page_count))
            
            if st.button("Generate Q&A", key="generate_qa"):
                qa_pairs = generate_qa_for_chapter(start_page, end_page)
                
                if qa_pairs:
                    st.markdown(f"<h4>📖 Generated Questions for Pages {start_page}-{end_page}</h4>", unsafe_allow_html=True)
                    for i, (question, answer) in enumerate(qa_pairs):
                        # Escape model output before placing it in raw HTML.
                        safe_question = html.escape(question)
                        safe_answer = html.escape(answer)
                        st.markdown(f"""
                        <div class='card'>
                            <b>Q{i+1}:</b> {safe_question}<br>
                            <b>A{i+1}:</b> {safe_answer}
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.warning("No Q&A pairs generated. Try a different page range.")
                    
        # History Tab
        elif selected_tab == "History":
            st.markdown("### ⏳ Question History")
            if not st.session_state.history:
                st.info("No questions asked yet.")
            else:
                # Newest entry first; numbering follows display order.
                for i, item in enumerate(reversed(st.session_state.history)):
                    with st.expander(f"Q{i+1}: {item['question']}"):
                        st.markdown(f"**Answer:** {item['answer']}")
                        st.markdown("**Source Passages:**")
                        for j, source in enumerate(item['sources']):
                            st.markdown(f"{j+1}. {source[:500]}...")

# Footer: horizontal rule followed by a centered credit line.
_FOOTER_HTML = """
<div style="text-align: center; padding: 20px;">
    Built with ❤️ for students | PDF Study Assistant v3.0
</div>
"""
st.markdown("---")
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)