Spaces:
Sleeping
Sleeping
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import re | |
import gradio as gr | |
def preprocess_text(text):
    """
    Preprocess the text into structured question-answer sections.

    The text is split into sections at every line that starts with
    ``Question``; the non-empty lines of a section are joined with single
    spaces. Any text that precedes the first such line forms a section of
    its own. Each section then has a trailing run of digits removed and the
    markers ``TRAPS:``, ``BEST ANSWER:`` and ``PASSABLE ANSWER:`` replaced by
    a space.

    Args:
        text: Raw document text with newline-separated lines.

    Returns:
        A list of cleaned, non-empty section strings in document order.
        Sections that are empty after cleaning (for example a lone page
        number before the first question) are dropped.
    """
    # Group the non-empty lines into sections, opening a new section at each
    # line that begins with "Question".
    sections = []
    current_section = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        if line.startswith('Question') and current_section:
            sections.append(' '.join(current_section))
            current_section = []
        current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    # Clean every section and keep only those that still contain text.
    structured_sections = []
    for section in sections:
        # Remove trailing digits (page numbers). Note this also removes a
        # legitimate number if the section happens to end with one.
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        section = section.strip()
        if section:
            structured_sections.append(section)
    return structured_sections
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.4):
    """
    Look up the stored text chunk that best matches a question.

    Args:
        question: The question string to search for.
        model: Object whose ``encode`` method turns a list of strings into
            an embedding array.
        index: Search index queried through ``index.search``.
        text_chunks: Sequence of chunk strings, addressed by the positions
            the index returns.
        similarity_threshold: Minimum score the best hit must reach to be
            reported as an answer.

    Returns:
        A dict with the keys ``question``, ``full_text``, ``confidence`` and
        ``found_answer``. On a hit, ``question`` is the ``Question <n>:``
        label found in the chunk (or ``"Matching section"``) and
        ``full_text`` is the chunk itself; otherwise ``question`` is ``None``
        and ``full_text`` is an explanatory message.
    """
    # Embed the question and scale it to unit length in place. Against the
    # normalised inner-product index built by initialize_qa_system, the
    # resulting score is a cosine similarity.
    query_vector = model.encode([question])
    faiss.normalize_L2(query_vector)

    # Only the single nearest chunk is requested.
    scores, positions = index.search(query_vector, 1)
    top_position = positions[0][0]
    top_score = scores[0][0]

    if top_score >= similarity_threshold:
        chunk = text_chunks[top_position]
        # Use the chunk's "Question <n>:" label as a short reference.
        label_match = re.search(r'Question \d+:', chunk)
        if label_match:
            label = label_match.group(0)
        else:
            label = "Matching section"
        return {
            'question': label,
            'full_text': chunk,
            'confidence': float(top_score),
            'found_answer': True
        }

    # Best hit is below the threshold: report that nothing relevant was found.
    return {
        'question': None,
        'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
        'confidence': float(top_score),
        'found_answer': False
    }
def _failed_qa_system(status):
    """Return a QA-system dict with no components and the given status text."""
    return {
        'model': None,
        'index': None,
        'text_chunks': None,
        'status': status
    }


# Function to handle PDF file upload and initialization
def initialize_qa_system(pdf_file):
    """
    Build the QA system (model, index and text chunks) from an uploaded PDF.

    Args:
        pdf_file: The uploaded file, either an object exposing the file path
            as ``.name`` or the path itself. ``None`` means nothing was
            uploaded.

    Returns:
        A dict with the keys ``model``, ``index``, ``text_chunks`` and
        ``status``. On failure the first three are ``None`` and ``status``
        describes the problem; this function does not raise.
    """
    if pdf_file is None:
        return _failed_qa_system("Please upload a PDF file first.")

    try:
        from PyPDF2 import PdfReader

        # Accept both an upload object carrying ``.name`` and a plain path.
        pdf_path = getattr(pdf_file, 'name', pdf_file)
        pdf_reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        pdf_text = "".join(text + "\n" for text in page_texts if text)

        # Process text; stop early when there is nothing to index, since an
        # empty embedding array has no second dimension to read below.
        text_chunks = preprocess_text(pdf_text)
        if not text_chunks:
            return _failed_qa_system("Error: no text could be extracted from the PDF.")

        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(text_chunks)

        # Normalise in place so the inner-product index scores by cosine
        # similarity.
        dimension = embeddings.shape[1]
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings)

        return {
            'model': model,
            'index': index,
            'text_chunks': text_chunks,
            'status': f"System initialized with {len(text_chunks)} text chunks from your PDF!"
        }
    except Exception as e:
        # UI boundary: report any failure as status text instead of raising.
        return _failed_qa_system(f"Error: {e}")
# Module-level state shared by the Gradio callbacks. upload_file replaces this
# dict with the result of initialize_qa_system; all slots are None until then.
qa_system = {
    'model': None,        # sentence-embedding model
    'index': None,        # FAISS index over the chunk embeddings
    'text_chunks': None,  # list of preprocessed text sections
}
# Gradio callback for the "Initialize Q&A System" button
def upload_file(pdf_file):
    """
    Rebuild the global QA system from the uploaded PDF.

    The global ``qa_system`` is replaced by whatever ``initialize_qa_system``
    returns, including on failure.

    Returns:
        The status message of the initialisation attempt.
    """
    global qa_system
    qa_system = initialize_qa_system(pdf_file)
    return qa_system['status']
# Function to handle questions
def answer_question(question):
    """
    Answer a question using the QA system stored in the global ``qa_system``.

    Args:
        question: The question text entered by the user.

    Returns:
        A display string: a prompt to upload a PDF or enter a question when
        either is missing, otherwise the matched answer (or a not-found
        message) together with the match confidence.
    """
    # Compare against None rather than relying on the truthiness of the
    # third-party model and index objects.
    if (qa_system['model'] is None or qa_system['index'] is None
            or not qa_system['text_chunks']):
        return "Please upload a PDF file first."
    if not question or not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, qa_system['model'], qa_system['index'], qa_system['text_chunks'])
    confidence = result['confidence']
    full_text = result['full_text']

    if not result['found_answer']:
        # Show the not-found message untouched; it has no "Answer:" marker.
        return f"{full_text}\nBest match confidence: {confidence:.2f}"

    # Keep only the text after the first "Answer:" marker. When the marker is
    # absent, show the whole section; str.find would return -1 here and a
    # slice based on it would cut off the first characters.
    _, marker, after_marker = full_text.partition('Answer:')
    answer = after_marker.strip() if marker else full_text.strip()
    return f"Match (confidence: {confidence:.2f}):\n\n{answer}"
# Build the Gradio UI: upload area, question area, answer area.
with gr.Blocks(title="Interview Q&A Assistant") as demo:
    gr.Markdown(value="# Interview Q&A Assistant")
    gr.Markdown(
        value="Upload your interview questions PDF and ask questions to get the most relevant sections."
    )

    # Upload controls and initialisation status.
    with gr.Row():
        with gr.Column():
            pdf_upload = gr.File(label="Upload PDF File")
            upload_button = gr.Button(value="Initialize Q&A System")
            status_text = gr.Textbox(
                label="Status",
                value="Upload a PDF to begin",
            )

    # Question entry.
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(
                label="Ask a question about interview preparation",
            )
            submit_button = gr.Button(value="Get Answer")

    # Answer display.
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", lines=10)

    # Wire the buttons to their callbacks.
    upload_button.click(
        fn=upload_file,
        inputs=pdf_upload,
        outputs=status_text,
    )
    submit_button.click(
        fn=answer_question,
        inputs=question_input,
        outputs=answer_output,
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)