# PDF Q&A assistant — Hugging Face Space app.py by TKM03 (commit 26862d5, verified)
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re
import gradio as gr
import PyPDF2
import tempfile
import os
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page from an uploaded PDF.

    Parameters
    ----------
    pdf_file : str | file-like | None
        Either a filesystem path (newer Gradio versions pass the path
        string directly) or a tempfile-like object exposing the path
        via its ``.name`` attribute (older Gradio versions).

    Returns
    -------
    str
        The concatenated page text (one page per line group), or a
        human-readable error/prompt message on failure.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    # Support both a plain path string and a file object with `.name`;
    # the original always dereferenced `.name` and crashed on strings.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pdf_text = ""
    try:
        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                # extract_text() may return None on image-only pages;
                # coalesce to "" so concatenation cannot raise TypeError.
                pdf_text += (page.extract_text() or "") + "\n"
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
    return pdf_text
def preprocess_text(text):
    """
    Split raw PDF text into cleaned, question-delimited sections.

    A new section begins at every line starting with 'Question' and
    extends until the next such line. Trailing page numbers and the
    marker labels (TRAPS:, BEST ANSWER:, PASSABLE ANSWER:) are removed
    from each section before it is returned.
    """
    def _clean(section):
        # Drop a trailing page number, then blank out the marker labels.
        section = re.sub(r'\d+\s*$', '', section)
        return re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section).strip()

    sections = []
    buffer = []
    for raw in text.split('\n'):
        line = raw.strip()
        if line.startswith('Question'):
            # A new question starts: flush whatever was collected so far.
            if buffer:
                sections.append(' '.join(buffer))
            buffer = [line]
        elif line:
            buffer.append(line)
    if buffer:
        sections.append(' '.join(buffer))

    return [_clean(s) for s in sections]
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """
    Build a semantic-search QA system over the extracted PDF text.

    Parameters
    ----------
    pdf_text : str
        Raw text extracted from the PDF.
    model_name : str
        SentenceTransformer model id used to embed the text chunks.

    Returns
    -------
    tuple
        (model, index, text_chunks): the embedding model, a FAISS
        inner-product index over normalized embeddings (== cosine
        similarity), and the list of preprocessed text sections.

    Raises
    ------
    ValueError
        If no question sections could be extracted from the text —
        fail fast with a clear message instead of an obscure shape
        error when indexing an empty embedding array.
    """
    text_chunks = preprocess_text(pdf_text)
    if not text_chunks:
        raise ValueError("No question sections found in the document.")
    model = SentenceTransformer(model_name)
    # FAISS requires C-contiguous float32 input; coerce defensively since
    # encode() output dtype/layout can vary with model and settings.
    embeddings = np.ascontiguousarray(model.encode(text_chunks), dtype=np.float32)
    dimension = embeddings.shape[1]
    # Normalize vectors so the inner-product index yields cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    return model, index, text_chunks
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """
    Find the single best-matching text chunk for a free-form question.

    Returns a dict with keys 'question' (the "Question N:" label when
    one is present, else a generic label, else None), 'full_text',
    'confidence' (cosine similarity of the best match), and
    'found_answer' (False when the score falls below the threshold).
    """
    # Embed the query and normalize it so the inner-product search
    # below yields cosine similarities.
    query_vec = model.encode([question])
    faiss.normalize_L2(query_vec)
    similarities, indices = index.search(query_vec, 1)  # top-1 match only
    score = float(similarities[0][0])
    if score < similarity_threshold:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': score,
            'found_answer': False
        }
    best_chunk = text_chunks[indices[0][0]]
    # Surface the "Question N:" label (when present) for reference.
    label = re.search(r'Question \d+:', best_chunk)
    return {
        'question': label.group(0) if label else "Matching section",
        'full_text': best_chunk,
        'confidence': score,
        'found_answer': True
    }
# Module-level QA-system state: populated by upload_file() and read by
# answer_question(). All three remain None until a document has been
# processed successfully.
global_model = None
global_index = None
global_text_chunks = None
def upload_file(file):
    """
    Process an uploaded PDF and initialize the module-level QA system.

    Returns a status string describing success or the failure reason;
    any exception raised while building the system is reported in the
    status text rather than propagated.
    """
    global global_model, global_index, global_text_chunks
    if file is None:
        return "❌ Please upload a PDF file."
    try:
        pdf_text = extract_text_from_pdf(file)
        # extract_text_from_pdf signals failure via an "Error..." string.
        if isinstance(pdf_text, str) and pdf_text.startswith("Error"):
            return pdf_text
        global_model, global_index, global_text_chunks = create_qa_system(pdf_text)
    except Exception as e:
        return f"❌ Error processing document: {str(e)}"
    return "βœ… Document processed successfully! You can now ask questions."
def answer_question(question):
    """
    Answer a user question against the currently loaded document.

    Returns a formatted response string; prompts the user to upload a
    document or enter a question when either is missing.
    """
    global global_model, global_index, global_text_chunks
    system_ready = all(
        part is not None
        for part in (global_model, global_index, global_text_chunks)
    )
    if not system_ready:
        return "Please upload and process a document first."
    if not question.strip():
        return "Please enter a question."
    result = query_qa_system(question, global_model, global_index, global_text_chunks)
    if not result['found_answer']:
        return f"{result['full_text']}\nBest match confidence: {result['confidence']:.2f}"
    return f"Found matching section (confidence: {result['confidence']:.2f}):\n\n{result['full_text']}"
# Custom CSS injected into the Gradio Blocks app below: overall page
# layout, gradient header banner, card-style upload/Q&A sections,
# button and answer-box styling, and a mobile breakpoint at 768px.
custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
padding: 20px !important;
background-color: #f8f9fa !important;
}
.main-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
font-size: 2.5rem;
margin-bottom: 1rem;
font-weight: 600;
}
.main-header p {
font-size: 1.1rem;
opacity: 0.9;
}
.upload-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
margin-bottom: 2rem;
}
.qa-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.status-box {
margin-top: 1rem;
padding: 1rem;
border-radius: 8px;
background: #f0f9ff;
border: 1px solid #bae6fd;
}
.custom-button {
background: #2563eb !important;
color: white !important;
border-radius: 8px !important;
padding: 0.75rem 1.5rem !important;
font-weight: 500 !important;
}
.custom-button:hover {
background: #1d4ed8 !important;
}
.answer-box {
background: #f8fafc !important;
border: 1px solid #e2e8f0 !important;
border-radius: 8px !important;
font-family: 'Source Code Pro', monospace !important;
}
.section-title {
color: #1e293b;
font-size: 1.25rem;
font-weight: 600;
margin-bottom: 1rem;
}
/* Responsive design */
@media (max-width: 768px) {
.gradio-container {
padding: 10px !important;
}
.main-header {
padding: 1.5rem;
}
.main-header h1 {
font-size: 2rem;
}
}
"""
# Create the enhanced Gradio interface: a header banner, an upload/init
# section, a question/answer section, a footer, and the click wiring
# that connects the buttons to upload_file() and answer_question().
with gr.Blocks(title="Q&A Assistant", css=custom_css) as demo:
    # Header Section — gradient banner with title and tagline
    with gr.Row(elem_classes=["main-header"]):
        with gr.Column():
            gr.Markdown("# Q&A Assistant")
            gr.Markdown("AI-powered interview preparation companion. Upload your PDF and get instant, relevant answers to your queries.")
    # Upload Section — PDF picker, init button, and status readout
    with gr.Row():
        with gr.Column(elem_classes=["upload-section"]):
            gr.Markdown("### πŸ“ Document Upload", elem_classes=["section-title"])
            with gr.Row():
                pdf_upload = gr.File(
                    label="Upload your interview questions PDF",
                    file_types=[".pdf"],
                    elem_classes=["file-upload"]
                )
            with gr.Row():
                upload_button = gr.Button("Initialize Q&A System", elem_classes=["custom-button"])
            with gr.Row():
                status_text = gr.Textbox(
                    label="System Status",
                    value="Upload a PDF to begin",
                    elem_classes=["status-box"]
                )
    # Q&A Section — question input, submit button, and answer display
    with gr.Row():
        with gr.Column(elem_classes=["qa-section"]):
            gr.Markdown("### πŸ’‘ Ask Questions", elem_classes=["section-title"])
            with gr.Row():
                question_input = gr.Textbox(
                    label="What would you like to know ?",
                    placeholder="e.g., What are the common behavioral questions?",
                    lines=2
                )
            with gr.Row():
                submit_button = gr.Button("Get Answer", elem_classes=["custom-button"])
            with gr.Row():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    elem_classes=["answer-box"]
                )
    # Information Section — static footer
    with gr.Row():
        gr.Markdown("""
        <div style="text-align: center; padding: 2rem; color: #64748b; font-size: 0.9rem;">
            Made with ❀️ for interview preparation success
        </div>
        """)
    # Set up events: wire buttons to the handlers defined above
    upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(answer_question, inputs=question_input, outputs=answer_output)
# Launch the app only when run as a script (not on import)
if __name__ == "__main__":
    demo.launch()