# PDF Q&A Assistant — Hugging Face Spaces app (Gradio UI, FAISS retrieval,
# sentence-transformers embeddings).
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re
import gradio as gr
import PyPDF2
import tempfile
import os
def extract_text_from_pdf(pdf_file):
    """Extract all page text from an uploaded PDF.

    Args:
        pdf_file: Either a plain filesystem path (str) — what newer Gradio
            versions pass for a ``gr.File`` value — or a tempfile-like
            wrapper exposing ``.name`` (older Gradio). ``None`` means no
            file was uploaded.

    Returns:
        The concatenated text of every page (one page per line group), or a
        human-readable error string beginning with "Error" / "Please" on
        failure. Callers (see upload_file) detect failure by that prefix.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    # Accept both a raw path string and a file-like object with a .name
    # attribute; the original code assumed .name and crashed on str paths.
    path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", pdf_file)
    pdf_text = ""
    try:
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                # extract_text() may return None for image-only pages.
                pdf_text += (page.extract_text() or "") + "\n"
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
    return pdf_text
def preprocess_text(text):
    """Split raw document text into cleaned question/answer sections.

    A new section starts at every line beginning with 'Question'; blank
    lines are dropped and all other lines are appended to the current
    section. Each section is then cleaned of trailing page numbers and of
    the 'TRAPS:' / 'BEST ANSWER:' / 'PASSABLE ANSWER:' markers.

    Args:
        text: Raw extracted PDF text.

    Returns:
        list[str]: One cleaned string per section (empty list for empty input).
    """
    sections = []
    buffer = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped.startswith('Question'):
            # Flush the previous section before starting a new one.
            if buffer:
                sections.append(' '.join(buffer))
            buffer = [stripped]
        elif stripped:
            buffer.append(stripped)
    if buffer:
        sections.append(' '.join(buffer))

    cleaned = []
    for chunk in sections:
        chunk = re.sub(r'\d+\s*$', '', chunk)  # drop a trailing page number
        chunk = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', chunk)
        cleaned.append(chunk.strip())
    return cleaned
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """Build the retrieval components for a processed document.

    Args:
        pdf_text: Raw document text (split into sections internally).
        model_name: SentenceTransformer checkpoint used for embeddings.

    Returns:
        (model, index, text_chunks): the embedding model, a FAISS
        inner-product index over L2-normalized vectors (i.e. cosine
        similarity), and the list of text sections indexed.
    """
    chunks = preprocess_text(pdf_text)
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(chunks)
    # Unit-length rows make inner product equal cosine similarity.
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return encoder, index, chunks
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """Find the single best-matching document section for a question.

    Args:
        question: Free-form user question.
        model: SentenceTransformer used to embed the question.
        index: FAISS inner-product index over normalized chunk embeddings.
        text_chunks: Sections backing the index, in index order.
        similarity_threshold: Minimum cosine similarity to accept a match.

    Returns:
        dict with keys 'question' (matched question label or None),
        'full_text', 'confidence' (float cosine score) and 'found_answer'.
    """
    query_vec = model.encode([question])
    faiss.normalize_L2(query_vec)  # match the normalization used at index time
    scores, ids = index.search(query_vec, 1)  # single nearest neighbour
    top_score = float(scores[0][0])
    top_id = ids[0][0]

    if top_score < similarity_threshold:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': top_score,
            'found_answer': False
        }

    chunk = text_chunks[top_id]
    # Pull out the "Question N:" label, if present, for display.
    label_match = re.search(r'Question \d+:', chunk)
    return {
        'question': label_match.group(0) if label_match else "Matching section",
        'full_text': chunk,
        'confidence': top_score,
        'found_answer': True
    }
# Module-level QA-system state: populated by upload_file() and read by
# answer_question(). All three stay None until a document is processed.
global_model = None  # SentenceTransformer encoder
global_index = None  # FAISS inner-product index over document chunks
global_text_chunks = None  # list of preprocessed document sections
def upload_file(file):
    """Gradio handler: process an uploaded PDF and initialise the QA system.

    Stores the model, index and chunks in the module-level globals so
    answer_question can use them.

    Args:
        file: The gr.File value (path string or tempfile wrapper), or None.

    Returns:
        A human-readable status string for the UI.
    """
    global global_model, global_index, global_text_chunks
    if file is None:
        return "❌ Please upload a PDF file."
    try:
        pdf_text = extract_text_from_pdf(file)
        # extract_text_from_pdf signals failure by returning an error string.
        if isinstance(pdf_text, str) and pdf_text.startswith("Error"):
            return pdf_text
        global_model, global_index, global_text_chunks = create_qa_system(pdf_text)
        # Original source had this literal broken across two lines (syntax
        # error) with a mojibake 'β' marker; fixed to a single line + emoji.
        return "✅ Document processed successfully! You can now ask questions."
    except Exception as e:
        return f"❌ Error processing document: {str(e)}"
def answer_question(question):
    """Gradio handler: answer a question against the loaded document.

    Requires upload_file() to have initialised the global QA system first.

    Args:
        question: User's question text.

    Returns:
        A formatted answer string (or a prompt to upload / type a question).
    """
    global global_model, global_index, global_text_chunks
    system_ready = (global_model is not None
                    and global_index is not None
                    and global_text_chunks is not None)
    if not system_ready:
        return "Please upload and process a document first."
    if not question.strip():
        return "Please enter a question."

    hit = query_qa_system(question, global_model, global_index, global_text_chunks)
    confidence = hit['confidence']
    if not hit['found_answer']:
        return f"{hit['full_text']}\nBest match confidence: {confidence:.2f}"
    return f"Found matching section (confidence: {confidence:.2f}):\n\n{hit['full_text']}"
# Custom CSS injected into the Gradio UI via gr.Blocks(css=custom_css).
# NOTE: this is a runtime string sent to the browser — the class names below
# (.main-header, .upload-section, .qa-section, .status-box, .custom-button,
# .answer-box, .section-title) must match the elem_classes used in the UI.
custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
padding: 20px !important;
background-color: #f8f9fa !important;
}
.main-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
font-size: 2.5rem;
margin-bottom: 1rem;
font-weight: 600;
}
.main-header p {
font-size: 1.1rem;
opacity: 0.9;
}
.upload-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
margin-bottom: 2rem;
}
.qa-section {
background: white;
padding: 2rem;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.status-box {
margin-top: 1rem;
padding: 1rem;
border-radius: 8px;
background: #f0f9ff;
border: 1px solid #bae6fd;
}
.custom-button {
background: #2563eb !important;
color: white !important;
border-radius: 8px !important;
padding: 0.75rem 1.5rem !important;
font-weight: 500 !important;
}
.custom-button:hover {
background: #1d4ed8 !important;
}
.answer-box {
background: #f8fafc !important;
border: 1px solid #e2e8f0 !important;
border-radius: 8px !important;
font-family: 'Source Code Pro', monospace !important;
}
.section-title {
color: #1e293b;
font-size: 1.25rem;
font-weight: 600;
margin-bottom: 1rem;
}
/* Responsive design */
@media (max-width: 768px) {
.gradio-container {
padding: 10px !important;
}
.main-header {
padding: 1.5rem;
}
.main-header h1 {
font-size: 2rem;
}
}
"""
# Build the Gradio UI: header, PDF-upload controls, a Q&A panel, and a
# footer. Mojibake emoji in the user-facing strings ('π', 'π‘', 'β€οΈ')
# have been repaired to 📄 / 💡 / ❤️, and a trailing extraction artifact
# on the launch line was removed.
with gr.Blocks(title="Q&A Assistant", css=custom_css) as demo:
    # Header section
    with gr.Row(elem_classes=["main-header"]):
        with gr.Column():
            gr.Markdown("# Q&A Assistant")
            gr.Markdown("AI-powered interview preparation companion. Upload your PDF and get instant, relevant answers to your queries.")
    # Upload section
    with gr.Row():
        with gr.Column(elem_classes=["upload-section"]):
            gr.Markdown("### 📄 Document Upload", elem_classes=["section-title"])
            with gr.Row():
                pdf_upload = gr.File(
                    label="Upload your interview questions PDF",
                    file_types=[".pdf"],
                    elem_classes=["file-upload"]
                )
            with gr.Row():
                upload_button = gr.Button("Initialize Q&A System", elem_classes=["custom-button"])
            with gr.Row():
                status_text = gr.Textbox(
                    label="System Status",
                    value="Upload a PDF to begin",
                    elem_classes=["status-box"]
                )
    # Q&A section
    with gr.Row():
        with gr.Column(elem_classes=["qa-section"]):
            gr.Markdown("### 💡 Ask Questions", elem_classes=["section-title"])
            with gr.Row():
                question_input = gr.Textbox(
                    label="What would you like to know ?",
                    placeholder="e.g., What are the common behavioral questions?",
                    lines=2
                )
            with gr.Row():
                submit_button = gr.Button("Get Answer", elem_classes=["custom-button"])
            with gr.Row():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    elem_classes=["answer-box"]
                )
    # Footer / information section
    with gr.Row():
        gr.Markdown("""
        <div style="text-align: center; padding: 2rem; color: #64748b; font-size: 0.9rem;">
        Made with ❤️ for interview preparation success
        </div>
        """)
    # Wire UI events to the backend handlers.
    upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(answer_question, inputs=question_input, outputs=answer_output)

# Launch the app when run as a script (Spaces executes this module directly).
if __name__ == "__main__":
    demo.launch()