sunbal7 committed
Commit e6bfac3 · verified · 1 Parent(s): 6c06b5f

Update app.py

Files changed (1):
  1. app.py +136 -380
app.py CHANGED
@@ -1,423 +1,179 @@
  import streamlit as st
- from streamlit_option_menu import option_menu
- import fitz  # PyMuPDF
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain_community.vectorstores import FAISS
- import requests
- import os
- import time
-
- # Page configuration
  st.set_page_config(
-     page_title="PDF Study Assistant",
      page_icon="📚",
      layout="wide",
      initial_sidebar_state="collapsed"
  )

- # Custom CSS for colorful design
  st.markdown("""
- <style>
- :root {
-     --primary: #ff4b4b;
-     --secondary: #ff9a3d;
-     --accent1: #ffcb74;
-     --accent2: #3a86ff;
-     --background: #f0f2f6;
-     --card: #ffffff;
- }
-
- .stApp {
-     background: linear-gradient(135deg, var(--background) 0%, #e0e5ec 100%);
- }
-
- .stButton>button {
-     background: linear-gradient(to right, var(--secondary), var(--primary));
-     color: white;
-     border-radius: 12px;
-     padding: 8px 20px;
-     font-weight: 600;
- }
-
- .stTextInput>div>div>input {
-     border-radius: 12px;
-     border: 2px solid var(--accent2);
-     padding: 10px;
- }
-
- .card {
-     background: var(--card);
-     border-radius: 15px;
-     box-shadow: 0 8px 16px rgba(0,0,0,0.1);
-     padding: 20px;
-     margin-bottom: 20px;
- }
-
- .header {
-     background: linear-gradient(to right, var(--accent2), var(--primary));
-     -webkit-background-clip: text;
-     -webkit-text-fill-color: transparent;
-     text-align: center;
-     margin-bottom: 30px;
- }
-
- .tab-content {
-     animation: fadeIn 0.5s ease-in-out;
- }
-
- .error {
-     background-color: #ffebee;
-     border-left: 4px solid #f44336;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- .info {
-     background-color: #e3f2fd;
-     border-left: 4px solid #2196f3;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- .success {
-     background-color: #e8f5e9;
-     border-left: 4px solid #4caf50;
-     padding: 10px;
-     margin: 10px 0;
- }
-
- @keyframes fadeIn {
-     from { opacity: 0; }
-     to { opacity: 1; }
- }
- </style>
  """, unsafe_allow_html=True)

  # Initialize session state
- if 'pdf_processed' not in st.session_state:
-     st.session_state.pdf_processed = False
- if 'vector_store' not in st.session_state:
      st.session_state.vector_store = None
- if 'pages' not in st.session_state:
-     st.session_state.pages = []
- if 'history' not in st.session_state:
-     st.session_state.history = []
- if 'token_valid' not in st.session_state:
-     st.session_state.token_valid = None
-
- # Load embedding model with caching
- @st.cache_resource
- def load_embedding_model():
-     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
- def check_token_validity():
-     """Check if the token is valid by making a simple API call"""
-     if not os.getenv("HF_API_KEY"):
-         return False
-
-     try:
-         headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
-         response = requests.get("https://huggingface.co/api/whoami", headers=headers)
-         return response.status_code == 200
-     except:
-         return False
-
- def query_hf_inference_api(prompt, max_tokens=200, model="google/flan-t5-base"):
-     """Query Hugging Face Inference API with better error handling"""
-     API_URL = f"https://api-inference.huggingface.co/models/{model}"
-     headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"} if os.getenv('HF_API_KEY') else {}
-
-     payload = {
-         "inputs": prompt,
-         "parameters": {
-             "max_new_tokens": max_tokens,
-             "temperature": 0.5,
-             "do_sample": False
-         }
-     }
-
-     try:
-         response = requests.post(API_URL, headers=headers, json=payload)
-
-         if response.status_code == 200:
-             result = response.json()
-             return result[0]['generated_text'] if result else ""
-
-         elif response.status_code == 403:
-             # Detailed debug information
-             st.session_state.token_valid = check_token_validity()
-
-             debug_info = f"""
-             <div class="error">
-                 <h4>403 Forbidden Error</h4>
-                 <p>Token is set: <strong>{'Yes' if os.getenv('HF_API_KEY') else 'No'}</strong></p>
-                 <p>Token valid: <strong>{'Yes' if st.session_state.token_valid else 'No'}</strong></p>
-                 <p>Model: {model}</p>
-                 <p>Possible solutions:</p>
-                 <ol>
-                     <li>Visit the <a href="https://huggingface.co/{model}" target="_blank">model page</a> and click "Agree and access repository"</li>
-                     <li>Ensure your token has "read" permissions</li>
-                     <li>Wait 5-10 minutes after accepting terms</li>
-                     <li>Try a different model using the dropdown below</li>
-                 </ol>
-             </div>
-             """
-             st.markdown(debug_info, unsafe_allow_html=True)
-             return ""
-
-         elif response.status_code == 429:
-             st.warning("Rate limit exceeded. Waiting and retrying...")
-             time.sleep(3)
-             return query_hf_inference_api(prompt, max_tokens, model)
-
-         else:
-             st.error(f"API Error {response.status_code}: {response.text[:200]}")
-             return ""
-
-     except Exception as e:
-         st.error(f"Connection error: {str(e)}")
-         return ""

  def process_pdf(pdf_file):
-     """Extract text from PDF and create vector store"""
-     with st.spinner("📖 Reading PDF..."):
-         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-         text = ""
-         st.session_state.pages = []
-         for page in doc:
-             page_text = page.get_text()
-             text += page_text
-             st.session_state.pages.append(page_text)
-
-     with st.spinner("🔍 Processing text..."):
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=200,
-             length_function=len
-         )
-         chunks = text_splitter.split_text(text)
-
-         embeddings = load_embedding_model()
-         st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)
-
-     st.session_state.pdf_processed = True
-     st.success("✅ PDF processed successfully!")
-
- def ask_question(question, model_choice):
-     """Answer a question using the vector store and Hugging Face API"""
-     if not st.session_state.vector_store:
-         return "PDF not processed yet", []
-
-     # Find relevant passages
-     docs = st.session_state.vector_store.similarity_search(question, k=3)
-     context = "\n\n".join([doc.page_content[:500] for doc in docs])
-
-     # Format prompt for the model
-     prompt = f"""
-     Based on the following context, answer the question.
-     If the answer isn't in the context, say "I don't know".
-
-     Context:
      {context}

      Question: {question}
-     Answer:
-     """
-
-     # Query the model
-     answer = query_hf_inference_api(prompt, model=model_choice)
-
-     # Add to history
-     st.session_state.history.append({
-         "question": question,
-         "answer": answer,
-         "sources": [doc.page_content for doc in docs],
-         "model": model_choice
-     })
-
-     return answer, docs
-
- def generate_qa_for_chapter(start_page, end_page, model_choice):
-     """Generate Q&A for specific chapter pages"""
-     if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
-         st.error("Invalid page range")
-         return []
-
-     chapter_text = "\n".join(st.session_state.pages[start_page-1:end_page])
-
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=800,
-         chunk_overlap=100,
-         length_function=len
      )
-     chunks = text_splitter.split_text(chapter_text)
-
-     qa_pairs = []
-
-     with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
-         for i, chunk in enumerate(chunks):
-             if i % 2 == 0:  # Generate question
-                 prompt = f"Based on this text, generate one study question: {chunk[:500]}"
-                 question = query_hf_inference_api(prompt, model=model_choice, max_tokens=100)
-                 if question and not question.endswith("?"):
-                     question += "?"
-                 if question:  # Only add if we got a valid question
-                     qa_pairs.append((question, ""))
-             else:  # Generate answer
-                 if qa_pairs:  # Ensure we have a question to answer
-                     prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
-                     answer = query_hf_inference_api(prompt, model=model_choice, max_tokens=200)
-                     qa_pairs[-1] = (qa_pairs[-1][0], answer)
-
-     return qa_pairs
-
- # App header
- st.markdown("<h1 class='header'>📚 PDF Study Assistant</h1>", unsafe_allow_html=True)
-
- # Model selection
- MODEL_OPTIONS = {
-     "google/flan-t5-base": "T5 Base (Recommended)",
-     "google/flan-t5-large": "T5 Large (Requires Auth)",
-     "mrm8488/t5-base-finetuned-question-generation-ap": "Question Generation",
-     "declare-lab/flan-alpaca-base": "Alpaca Base"
- }
-
- # Debug info panel
- with st.expander("🔧 Debug Information", expanded=False):
-     st.subheader("Hugging Face Token Status")
-
-     # Check token validity
-     token_valid = check_token_validity()
-     st.session_state.token_valid = token_valid
-
-     col1, col2 = st.columns(2)
-     with col1:
-         st.write(f"Token is set: {'✅ Yes' if os.getenv('HF_API_KEY') else '❌ No'}")
-     with col2:
-         st.write(f"Token is valid: {'✅ Yes' if token_valid else '❌ No'}")
-
-     if os.getenv('HF_API_KEY'):
-         st.markdown("""
-         <div class="info">
-             <p>Your token is set but we're still having issues. Try these steps:</p>
-             <ol>
-                 <li>Visit the model page for your selected model</li>
-                 <li>Click "Agree and access repository"</li>
-                 <li>Wait 5-10 minutes for changes to propagate</li>
-                 <li>Try a different model from the dropdown</li>
-             </ol>
-         </div>
-         """, unsafe_allow_html=True)
-     else:
-         st.markdown("""
-         <div class="error">
-             <p>Token is not set! Add it in your Space secrets:</p>
-             <ol>
-                 <li>Go to your Space → Settings → Secrets</li>
-                 <li>Add <code>HF_API_KEY</code> with your token</li>
-                 <li>Redeploy the Space</li>
-             </ol>
-             <p>Get your token: <a href="https://huggingface.co/settings/tokens" target="_blank">https://huggingface.co/settings/tokens</a></p>
-         </div>
-         """, unsafe_allow_html=True)
-
- # PDF Upload Section (FIXED LABEL ERROR)
- with st.container():
-     st.subheader("📤 Upload Your Textbook/Notes")
-     # Fixed empty label issue by adding a space and hiding it
-     pdf_file = st.file_uploader(
-         "Upload PDF",
-         type="pdf",
-         label_visibility="collapsed"
-     )
-
- # Main content
- if pdf_file:
-     if not st.session_state.pdf_processed:
-         process_pdf(pdf_file)
-
-     if st.session_state.pdf_processed:
-         # Model selection
-         st.subheader("Model Selection")
-         model_choice = st.selectbox(
-             "Choose AI model:",
-             options=list(MODEL_OPTIONS.keys()),
-             format_func=lambda x: MODEL_OPTIONS[x],
-             help="Some models require accepting terms on Hugging Face"
-         )
-
-         # Navigation tabs
-         selected_tab = option_menu(
-             None,
-             ["Ask Questions", "Generate Chapter Q&A", "History"],
-             icons=["chat", "book", "clock-history"],
-             menu_icon="cast",
-             default_index=0,
-             orientation="horizontal",
-             styles={
-                 "container": {"padding": "0!important", "background-color": "#f9f9f9"},
-                 "nav-link": {"font-size": "16px", "font-weight": "bold"},
-                 "nav-link-selected": {"background": "linear-gradient(to right, #3a86ff, #ff4b4b)"},
-             }
-         )
-
-         # Question Answering Tab
-         if selected_tab == "Ask Questions":
-             st.markdown("### 💬 Ask Questions About Your Document")
-             user_question = st.text_input("Type your question here:", key="user_question")
-
-             if user_question:
-                 with st.spinner("🤔 Thinking..."):
-                     answer, docs = ask_question(user_question, model_choice)
-                 if answer:
-                     st.markdown(f"<div class='card'><b>Answer:</b> {answer}</div>", unsafe_allow_html=True)
-
-                     with st.expander("🔍 See source passages"):
-                         for i, doc in enumerate(docs):
-                             st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
-
-         # Chapter Q&A Generation Tab
-         elif selected_tab == "Generate Chapter Q&A":
-             st.markdown("### 📝 Generate Q&A for Specific Chapter")
-             col1, col2 = st.columns(2)
-             with col1:
-                 start_page = st.number_input("Start Page", min_value=1, max_value=len(st.session_state.pages), value=1)
-             with col2:
-                 end_page = st.number_input("End Page", min_value=1, max_value=len(st.session_state.pages), value=min(5, len(st.session_state.pages)))
-
-             if st.button("Generate Q&A", key="generate_qa"):
-                 qa_pairs = generate_qa_for_chapter(start_page, end_page, model_choice)
-
-                 if qa_pairs:
-                     st.markdown(f"<h4>📖 Generated Questions for Pages {start_page}-{end_page}</h4>", unsafe_allow_html=True)
-                     for i, (question, answer) in enumerate(qa_pairs):
-                         st.markdown(f"""
-                         <div class='card'>
-                             <b>Q{i+1}:</b> {question}<br>
-                             <b>A{i+1}:</b> {answer}
-                         </div>
-                         """, unsafe_allow_html=True)
-                 else:
-                     st.warning("No Q&A pairs generated. Try a different page range.")
-
-         # History Tab
-         elif selected_tab == "History":
-             st.markdown("### ⏳ Question History")
-             if not st.session_state.history:
-                 st.info("No questions asked yet.")
-             else:
-                 for i, item in enumerate(reversed(st.session_state.history)):
-                     with st.expander(f"Q{i+1}: {item['question']} ({MODEL_OPTIONS.get(item['model'], item['model'])})"):
-                         st.markdown(f"**Answer:** {item['answer']}")
-                         st.markdown("**Source Passages:**")
-                         for j, source in enumerate(item['sources']):
-                             st.markdown(f"{j+1}. {source[:500]}...")

  # Footer
  st.markdown("---")
- st.markdown("""
- <div style="text-align: center; padding: 20px;">
-     Built with ❤️ for students | PDF Study Assistant v4.1
- </div>
- """, unsafe_allow_html=True)

  import streamlit as st
+ import os
+ import tempfile
+ from langchain_community.document_loaders import PyPDFLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.chat_models import ChatOllama
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ import base64
+
+ # Set page config
  st.set_page_config(
+     page_title="EduQuery - Smart PDF Assistant",
      page_icon="📚",
      layout="wide",
      initial_sidebar_state="collapsed"
  )

+ # Custom CSS for colorful UI
+ def local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+
+ local_css("style.css")
+
+ # Header with gradient
  st.markdown("""
+ <div class="header">
+     <h1>📚 EduQuery</h1>
+     <p>Smart PDF Assistant for Students</p>
+ </div>
  """, unsafe_allow_html=True)

  # Initialize session state
+ if "vector_store" not in st.session_state:
      st.session_state.vector_store = None
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Model selection
+ MODEL_NAME = "nous-hermes2"  # Best open-source model for instruction following
+
+ # PDF Processing
  def process_pdf(pdf_file):
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+         tmp_file.write(pdf_file.getvalue())
+         tmp_path = tmp_file.name
+
+     loader = PyPDFLoader(tmp_path)
+     docs = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_documents(docs)
+
+     embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+     vector_store = FAISS.from_documents(chunks, embeddings)
+
+     os.unlink(tmp_path)
+     return vector_store
+
+ # RAG Setup
+ def setup_qa_chain(vector_store):
+     llm = ChatOllama(model=MODEL_NAME, temperature=0.3)
+
+     custom_prompt = """
+     You are an expert academic assistant. Answer the question based only on the following context:
      {context}

      Question: {question}

+     Provide a clear, concise answer with page number references. If unsure, say "I couldn't find this information in the document".
+     """
+
+     prompt = PromptTemplate(
+         template=custom_prompt,
+         input_variables=["context", "question"]
+     )
+
+     retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+
+     qa_chain = (
+         {"context": retriever, "question": RunnablePassthrough()}
+         | prompt
+         | llm
+         | StrOutputParser()
      )

+     return qa_chain
+
+ # Generate questions from chapter
+ def generate_chapter_questions(vector_store, chapter_title):
+     llm = ChatOllama(model=MODEL_NAME, temperature=0.7)
+
+     prompt = PromptTemplate(
+         input_variables=["chapter_title"],
+         template="""
+         You are an expert educator. Generate 5 important questions and answers about '{chapter_title}'
+         that would help students understand key concepts. Format as:
+
+         Q1: [Question]
+         A1: [Answer with page reference]
+
+         Q2: [Question]
+         A2: [Answer with page reference]
+         ..."""
+     )
+
+     chain = prompt | llm | StrOutputParser()
+     return chain.invoke({"chapter_title": chapter_title})
+
+ # File upload section
+ st.subheader("📤 Upload Your Textbook/Notes")
+ uploaded_file = st.file_uploader("", type="pdf", accept_multiple_files=False)
+
+ if uploaded_file:
+     with st.spinner("Processing PDF..."):
+         st.session_state.vector_store = process_pdf(uploaded_file)
+     st.success("PDF processed successfully! You can now ask questions.")
+
+ # Main content columns
+ col1, col2 = st.columns([1, 2])
+
+ # Chapter-based Q&A Generator
+ with col1:
+     st.subheader("🔍 Generate Chapter Questions")
+     chapter_title = st.text_input("Enter chapter title/section name:")
+
+     if st.button("Generate Q&A") and chapter_title and st.session_state.vector_store:
+         with st.spinner(f"Generating questions about {chapter_title}..."):
+             questions = generate_chapter_questions(
+                 st.session_state.vector_store,
+                 chapter_title
+             )
+             st.markdown(f"<div class='qa-box'>{questions}</div>", unsafe_allow_html=True)
+     elif chapter_title and not st.session_state.vector_store:
+         st.warning("Please upload a PDF first")
+
+ # Chat interface
+ with col2:
+     st.subheader("💬 Ask Anything About the Document")
+
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     if prompt := st.chat_input("Your question..."):
+         if not st.session_state.vector_store:
+             st.warning("Please upload a PDF first")
+             st.stop()
+
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 qa_chain = setup_qa_chain(st.session_state.vector_store)
+                 response = qa_chain.invoke(prompt)
+                 st.markdown(response)
+         st.session_state.messages.append({"role": "assistant", "content": response})

  # Footer
  st.markdown("---")
+ st.markdown(
+     """
+     <div class="footer">
+         <p>EduQuery - Helping students learn smarter • Powered by Nous-Hermes2 and LangChain</p>
+     </div>
+     """,
+     unsafe_allow_html=True
+ )
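
Note: the new app.py loads its stylesheet with local_css("style.css"), but this commit changes only app.py; if style.css is absent from the Space, open(file_name) raises FileNotFoundError at startup. A minimal defensive sketch (the guard and the FALLBACK_CSS values are illustrative assumptions, not part of the commit; only the class names .header, .qa-box, and .footer come from the markup app.py actually emits):

import os
import streamlit as st

# Placeholder styles for the classes referenced in app.py's HTML; a real
# style.css shipped with the Space would take precedence over these.
FALLBACK_CSS = """
.header { text-align: center; padding: 16px; }
.qa-box { background: #ffffff; border-radius: 12px; padding: 16px; }
.footer { text-align: center; color: #888888; }
"""

def local_css(file_name):
    # Prefer the project's stylesheet; fall back to the placeholder so a
    # missing style.css does not crash the app at import time.
    if os.path.exists(file_name):
        with open(file_name) as f:
            css = f.read()
    else:
        css = FALLBACK_CSS
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)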
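
Note also that ChatOllama is only a client: it assumes an Ollama server is reachable (default http://localhost:11434) and that the model has already been pulled, e.g. with `ollama pull nous-hermes2`. A small preflight sketch using Ollama's /api/tags endpoint, which lists locally pulled models; the helper itself is an assumption, not part of the commit:

import requests

def ollama_model_ready(model, base_url="http://localhost:11434"):
    # /api/tags returns {"models": [{"name": "nous-hermes2:latest", ...}, ...]}
    try:
        resp = requests.get(f"{base_url}/api/tags", timeout=2)
        resp.raise_for_status()
    except requests.RequestException:
        return False  # server not running or unreachable
    names = (m.get("name", "") for m in resp.json().get("models", []))
    return any(n.split(":")[0] == model for n in names)

Calling this once at startup and surfacing st.error(...) when it returns False would fail fast instead of erroring on the first chat message.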
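
Finally, in setup_qa_chain the retriever's output (a list of Document objects) is piped straight into the prompt's {context} slot, so the model sees the documents' default string form, metadata included. The chain works as committed, but the common LCEL variant joins only the page contents. A sketch of that alternative, reusing the prompt, llm, and retriever names defined in setup_qa_chain; format_docs is illustrative and not in the commit:

def format_docs(docs):
    # Keep only the retrieved text, dropping Document metadata from the prompt.
    return "\n\n".join(doc.page_content for doc in docs)

qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)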