# Hugging Face Spaces page header captured with this file (Space status: Sleeping).
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import re | |
import gradio as gr | |
import PyPDF2 | |
import tempfile | |
import os | |
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of an uploaded PDF.

    Args:
        pdf_file: The upload to read. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object whose ``name``
            attribute holds the path. ``None`` means nothing was uploaded.

    Returns:
        The text of all pages in order, each page followed by a newline.
        Failures are reported as a message string instead of an exception:
        "Please upload a PDF file." when ``pdf_file`` is ``None``, or a
        string starting with "Error processing PDF:" when reading fails.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    try:
        # Accept a bare path as well as a file wrapper exposing it as `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # `or ""` keeps a page that yields no text from breaking the
            # concatenation; every page still contributes its newline.
            page_texts = [
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            ]
    except Exception as e:
        # Callers display this string to the user, so report rather than raise.
        return f"Error processing PDF: {str(e)}"

    return "".join(page_texts)
def preprocess_text(text):
    """
    Split raw document text into cleaned, question-delimited sections.

    A new section starts at every line that begins with "Question"; the
    non-blank lines that follow are joined onto it with single spaces.
    Any text before the first such line forms its own leading section.
    Each section then has a trailing number (such as a page number) and
    the markers "TRAPS:", "BEST ANSWER:" and "PASSABLE ANSWER:" removed.

    Args:
        text: Raw document text with newline-separated lines.

    Returns:
        A list of cleaned section strings in document order. Sections
        that are empty after cleaning are omitted.
    """
    # Group lines into sections, opening a new one at each "Question" line.
    sections = []
    current_section = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('Question') and current_section:
            sections.append(' '.join(current_section))
            current_section = []
        current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    # Clean each section and keep only those with remaining content.
    structured_sections = []
    for section in sections:
        # Remove a trailing number (e.g. a page number) and the answer markers.
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        section = section.strip()
        # A section made up only of a number would otherwise survive as "".
        if section:
            structured_sections.append(section)
    return structured_sections
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """
    Build the retrieval components for a document's text.

    The text is split into sections with ``preprocess_text``, each section
    is embedded with a ``SentenceTransformer`` model, and the L2-normalized
    embeddings are stored in a FAISS inner-product index.

    Args:
        pdf_text: Raw text of the document.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A ``(model, index, text_chunks)`` tuple: the loaded model, the
        populated FAISS index, and the list of section strings in the
        order they were added to the index.

    Raises:
        ValueError: If no sections could be extracted from ``pdf_text``.
    """
    # Process text into structured sections
    text_chunks = preprocess_text(pdf_text)
    if not text_chunks:
        # Fail before loading the model; an empty batch has no embedding
        # dimension to build the index from.
        raise ValueError("No text sections could be extracted from the document.")

    # Create embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(text_chunks)

    # Normalize in place so the inner product equals cosine similarity
    faiss.normalize_L2(embeddings)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    return model, index, text_chunks
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """
    Look up the single section that best matches a question.

    The question is encoded with ``model``, L2-normalized, and searched
    against ``index`` for its top match. The match is accepted only when
    its score reaches ``similarity_threshold``.

    Args:
        question: The question text.
        model: Encoder exposing ``encode(list_of_str)``.
        index: Search index exposing ``search(vectors, k)``.
        text_chunks: Section strings, positioned as in ``index``.
        similarity_threshold: Minimum score for a match to be accepted.

    Returns:
        A dict with keys 'question', 'full_text', 'confidence' and
        'found_answer'. On a match, 'question' is the "Question N:" label
        found in the section (or "Matching section") and 'full_text' is
        the section; otherwise 'question' is None and 'full_text' is a
        not-found message. 'confidence' is always the best score as float.
    """
    # Embed the query and normalize it in place
    query_vector = model.encode([question])
    faiss.normalize_L2(query_vector)

    # Only the top hit is requested
    top_k = 1
    scores, positions = index.search(query_vector, top_k)
    best_score = scores[0][0]
    best_position = positions[0][0]

    is_match = best_score >= similarity_threshold
    if not is_match:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(best_score),
            'found_answer': False
        }

    section_text = text_chunks[best_position]
    # Use the section's "Question N:" label as a short reference when present
    label_match = re.search(r'Question \d+:', section_text)
    if label_match:
        label = label_match.group(0)
    else:
        label = "Matching section"
    return {
        'question': label,
        'full_text': section_text,
        'confidence': float(best_score),
        'found_answer': True
    }
# Module-level QA state (model, index, text chunks); every entry starts as None
global_model = global_index = global_text_chunks = None
def upload_file(file):
    """
    Build the QA system from an uploaded PDF and report the outcome.

    On success the module-level ``global_model``, ``global_index`` and
    ``global_text_chunks`` are replaced with the newly built components.

    Args:
        file: The uploaded PDF as accepted by ``extract_text_from_pdf``,
            or ``None`` when nothing was uploaded.

    Returns:
        A status message string. Failures are reported in the message
        rather than raised.
    """
    global global_model, global_index, global_text_chunks

    if file is None:
        return "❌ Please upload a PDF file."

    try:
        # Extract text from PDF
        pdf_text = extract_text_from_pdf(file)
        if isinstance(pdf_text, str):
            # Match the extractor's full error prefix so a document whose
            # own text begins with "Error" is not mistaken for a failure.
            if pdf_text.startswith("Error processing PDF:"):
                return pdf_text
            if not pdf_text.strip():
                return "❌ No extractable text was found in this PDF."

        # Initialize QA system
        global_model, global_index, global_text_chunks = create_qa_system(pdf_text)
        return "✅ Document processed successfully! You can now ask questions."
    except Exception as e:
        # UI boundary: surface the failure as a status message.
        return f"❌ Error processing document: {str(e)}"
def answer_question(question):
    """
    Answer a question from the currently loaded document.

    Reads the module-level ``global_model``, ``global_index`` and
    ``global_text_chunks`` and delegates the lookup to ``query_qa_system``.

    Args:
        question: The question text; ``None`` or blank input is rejected.

    Returns:
        A display string: a prompt to upload a document or enter a
        question, the matched section with its confidence, or the
        not-found message with the best match confidence.
    """
    if global_model is None or global_index is None or global_text_chunks is None:
        return "Please upload and process a document first."

    # `not question` also covers None, which has no .strip()
    if not question or not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, global_model, global_index, global_text_chunks)
    if result['found_answer']:
        return f"Found matching section (confidence: {result['confidence']:.2f}):\n\n{result['full_text']}"
    return f"{result['full_text']}\nBest match confidence: {result['confidence']:.2f}"
# Custom CSS for professional styling.
# Passed to gr.Blocks(css=...) below. The class selectors correspond to the
# elem_classes values assigned to components in the layout; .gradio-container
# is not assigned in this file (presumably Gradio's own root element).
custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
padding: 20px !important;
background-color: #f8f9fa !important;
}
.main-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
font-size: 2.5rem;
margin-bottom: 1rem;
font-weight: 600;
}
.main-header p {
font-size: 1.1rem;
opacity: 0.9;
}
.upload-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
margin-bottom: 2rem;
}
.qa-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.status-box {
margin-top: 1rem;
padding: 1rem;
border-radius: 8px;
background: #f0f9ff;
border: 1px solid #bae6fd;
}
.custom-button {
background: #2563eb !important;
color: white !important;
border-radius: 8px !important;
padding: 0.75rem 1.5rem !important;
font-weight: 500 !important;
}
.custom-button:hover {
background: #1d4ed8 !important;
}
.answer-box {
background: #f8fafc !important;
border: 1px solid #e2e8f0 !important;
border-radius: 8px !important;
font-family: 'Source Code Pro', monospace !important;
}
.section-title {
color: #1e293b;
font-size: 1.25rem;
font-weight: 600;
margin-bottom: 1rem;
}
/* Responsive design */
@media (max-width: 768px) {
.gradio-container {
padding: 10px !important;
}
.main-header {
padding: 1.5rem;
}
.main-header h1 {
font-size: 2rem;
}
}
"""
# Create the enhanced Gradio interface.
# Layout: header, upload section (file picker, button, status box),
# Q&A section (question box, button, answer box), footer; the two button
# click events are wired at the end.
with gr.Blocks(title="Q&A Assistant", css=custom_css) as demo:
    # Header Section
    with gr.Row(elem_classes=["main-header"]):
        with gr.Column():
            gr.Markdown("# Q&A Assistant")
            gr.Markdown("AI-powered interview preparation companion. Upload your PDF and get instant, relevant answers to your queries.")

    # Upload Section
    with gr.Row():
        with gr.Column(elem_classes=["upload-section"]):
            gr.Markdown("### 📄 Document Upload", elem_classes=["section-title"])
            with gr.Row():
                pdf_upload = gr.File(
                    label="Upload your interview questions PDF",
                    file_types=[".pdf"],
                    elem_classes=["file-upload"]
                )
            with gr.Row():
                upload_button = gr.Button("Initialize Q&A System", elem_classes=["custom-button"])
            with gr.Row():
                status_text = gr.Textbox(
                    label="System Status",
                    value="Upload a PDF to begin",
                    elem_classes=["status-box"]
                )

    # Q&A Section
    with gr.Row():
        with gr.Column(elem_classes=["qa-section"]):
            gr.Markdown("### 💡 Ask Questions", elem_classes=["section-title"])
            with gr.Row():
                question_input = gr.Textbox(
                    label="What would you like to know ?",
                    placeholder="e.g., What are the common behavioral questions?",
                    lines=2
                )
            with gr.Row():
                submit_button = gr.Button("Get Answer", elem_classes=["custom-button"])
            with gr.Row():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    elem_classes=["answer-box"]
                )

    # Information Section
    with gr.Row():
        # The HTML lines are kept unindented inside the string literal.
        gr.Markdown("""
<div style="text-align: center; padding: 2rem; color: #64748b; font-size: 0.9rem;">
Made with ❤️ for interview preparation success
</div>
""")

    # Set up events: each button passes its input component's value to the
    # handler and writes the returned string to the output component.
    upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(answer_question, inputs=question_input, outputs=answer_output)
# Launch the app
# The guard limits the launch to direct execution of this file as a script;
# importing the module does not call demo.launch().
if __name__ == "__main__":
    demo.launch()