import re

import faiss
import gradio as gr
import PyPDF2
from sentence_transformers import SentenceTransformer


def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file for Hugging Face Spaces.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    pdf_text = ""
    try:
        # Depending on the Gradio version, the upload component hands back
        # either a filepath string or a tempfile object with a .name attribute.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                # extract_text() may yield an empty string for image-only pages
                pdf_text += (page.extract_text() or "") + "\n"
    except Exception as e:
        return f"Error processing PDF: {str(e)}"

    return pdf_text


def preprocess_text(text):
    """
    Preprocess the text into structured question-answer pairs.
    """
    # Split text into sections, one per question
    sections = []
    current_section = []
    for line in text.split('\n'):
        line = line.strip()
        if line.startswith('Question'):
            if current_section:
                sections.append(' '.join(current_section))
            current_section = [line]
        elif line:
            current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    # Clean each section into a structured format
    structured_sections = []
    for section in sections:
        # Remove trailing page numbers and other irrelevant text
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        structured_sections.append(section.strip())

    return structured_sections


def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """
    Create and return a QA system with the processed text.
    """
    # Process text into structured sections
    text_chunks = preprocess_text(pdf_text)

    # Create embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(text_chunks)

    # Build a FAISS index that scores by cosine similarity:
    # L2-normalize the vectors, then rank by inner product
    dimension = embeddings.shape[1]
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    return model, index, text_chunks


def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """
    Query the QA system and return the best match with a confidence score.
    """
    # Encode and normalize the question so inner product equals cosine similarity
    question_embedding = model.encode([question])
    faiss.normalize_L2(question_embedding)

    # Search for the single most similar chunk
    k = 1
    similarities, indices = index.search(question_embedding, k)

    best_idx = indices[0][0]
    similarity_score = similarities[0][0]  # Cosine similarity score

    if similarity_score >= similarity_threshold:
        matched_text = text_chunks[best_idx]
        # Extract just the question number for reference
        question_num = re.search(r'Question \d+:', matched_text)
        question_num = question_num.group(0) if question_num else "Matching section"
        return {
            'question': question_num,
            'full_text': matched_text,
            'confidence': float(similarity_score),
            'found_answer': True
        }
    else:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(similarity_score),
            'found_answer': False
        }
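# A minimal sketch of exercising the retrieval pipeline directly, without the
# UI (useful when testing locally). The filename "interview_questions.pdf" is
# a hypothetical example path, not part of this app:
#
#   text = extract_text_from_pdf("interview_questions.pdf")
#   model, index, chunks = create_qa_system(text)
#   result = query_qa_system("How should I describe my weaknesses?", model, index, chunks)
#   print(result['confidence'], result['full_text'])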
# Global variables to store the model, index, and text chunks
global_model = None
global_index = None
global_text_chunks = None


def upload_file(file):
    global global_model, global_index, global_text_chunks
    if file is not None:
        try:
            # Extract text from the PDF
            pdf_text = extract_text_from_pdf(file)
            if isinstance(pdf_text, str) and pdf_text.startswith("Error"):
                return pdf_text
            # Initialize the QA system
            global_model, global_index, global_text_chunks = create_qa_system(pdf_text)
            return "✅ Document processed successfully! You can now ask questions."
        except Exception as e:
            return f"❌ Error processing document: {str(e)}"
    else:
        return "❌ Please upload a PDF file."


def answer_question(question):
    global global_model, global_index, global_text_chunks
    if global_model is None or global_index is None or global_text_chunks is None:
        return "Please upload and process a document first."
    if not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, global_model, global_index, global_text_chunks)
    if result['found_answer']:
        response = f"Found matching section (confidence: {result['confidence']:.2f}):\n\n{result['full_text']}"
    else:
        response = f"{result['full_text']}\nBest match confidence: {result['confidence']:.2f}"
    return response
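# An optional variant, not wired into the UI: retrieve the top-k matches
# instead of only the best one, e.g. to show alternative sections to the user.
# The name query_qa_system_topk and its k parameter are illustrative additions.
def query_qa_system_topk(question, model, index, text_chunks, k=3):
    """Return up to k (chunk, cosine similarity) pairs, best match first."""
    question_embedding = model.encode([question])
    faiss.normalize_L2(question_embedding)
    similarities, indices = index.search(question_embedding, k)
    return [
        (text_chunks[idx], float(score))
        for idx, score in zip(indices[0], similarities[0])
        if idx != -1  # FAISS pads with -1 when fewer than k vectors exist
    ]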
# Custom CSS for professional styling
custom_css = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
    padding: 20px !important;
    background-color: #f8f9fa !important;
}
.main-header {
    text-align: center;
    margin-bottom: 2rem;
    padding: 2rem;
    background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
    color: white;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
    font-size: 2.5rem;
    margin-bottom: 1rem;
    font-weight: 600;
}
.main-header p {
    font-size: 1.1rem;
    opacity: 0.9;
}
.upload-section {
    background: white;
    padding: 2rem;
    border-radius: 10px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
    margin-bottom: 2rem;
}
.qa-section {
    background: white;
    padding: 2rem;
    border-radius: 10px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.status-box {
    margin-top: 1rem;
    padding: 1rem;
    border-radius: 8px;
    background: #f0f9ff;
    border: 1px solid #bae6fd;
}
.custom-button {
    background: #2563eb !important;
    color: white !important;
    border-radius: 8px !important;
    padding: 0.75rem 1.5rem !important;
    font-weight: 500 !important;
}
.custom-button:hover {
    background: #1d4ed8 !important;
}
.answer-box {
    background: #f8fafc !important;
    border: 1px solid #e2e8f0 !important;
    border-radius: 8px !important;
    font-family: 'Source Code Pro', monospace !important;
}
.section-title {
    color: #1e293b;
    font-size: 1.25rem;
    font-weight: 600;
    margin-bottom: 1rem;
}
/* Responsive design */
@media (max-width: 768px) {
    .gradio-container {
        padding: 10px !important;
    }
    .main-header {
        padding: 1.5rem;
    }
    .main-header h1 {
        font-size: 2rem;
    }
}
"""

# Create the enhanced Gradio interface
with gr.Blocks(title="Q&A Assistant", css=custom_css) as demo:
    # Header section
    with gr.Row(elem_classes=["main-header"]):
        with gr.Column():
            gr.Markdown("# Q&A Assistant")
            gr.Markdown("AI-powered interview preparation companion. Upload your PDF and get instant, relevant answers to your queries.")

    # Upload section
    with gr.Row():
        with gr.Column(elem_classes=["upload-section"]):
            gr.Markdown("### 📁 Document Upload", elem_classes=["section-title"])
            with gr.Row():
                pdf_upload = gr.File(
                    label="Upload your interview questions PDF",
                    file_types=[".pdf"],
                    elem_classes=["file-upload"]
                )
            with gr.Row():
                upload_button = gr.Button("Initialize Q&A System", elem_classes=["custom-button"])
            with gr.Row():
                status_text = gr.Textbox(
                    label="System Status",
                    value="Upload a PDF to begin",
                    elem_classes=["status-box"]
                )

    # Q&A section
    with gr.Row():
        with gr.Column(elem_classes=["qa-section"]):
            gr.Markdown("### 💡 Ask Questions", elem_classes=["section-title"])
            with gr.Row():
                question_input = gr.Textbox(
                    label="What would you like to know?",
                    placeholder="e.g., What are the common behavioral questions?",
                    lines=2
                )
            with gr.Row():
                submit_button = gr.Button("Get Answer", elem_classes=["custom-button"])
            with gr.Row():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    elem_classes=["answer-box"]
                )

    # Information section
    with gr.Row():
        gr.Markdown("""
Made with ❤️ for interview preparation success
""") # Set up events upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text) submit_button.click(answer_question, inputs=question_input, outputs=answer_output) # Launch the app if __name__ == "__main__": demo.launch()