# Residue from the hosting site's file viewer: Space status "Sleeping"; file size 5,540 bytes; revisions c1d4062, 94fd8fc.
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re
import gradio as gr
def preprocess_text(text):
    """
    Split raw document text into cleaned, question-anchored chunks.

    A new chunk starts at every line that begins with ``Question``; the
    non-blank lines that follow are joined onto it with single spaces. Any
    non-blank text that precedes the first such line forms its own chunk.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty chunk strings in document order. Each chunk has
        one trailing run of digits removed and the labels ``TRAPS:``,
        ``BEST ANSWER:`` and ``PASSABLE ANSWER:`` replaced by a space.
    """
    if not text:
        return []

    # Group the non-blank lines into sections, opening a new section at
    # every line that starts with "Question".
    sections = []
    current_section = []
    for line in text.split('\n'):
        line = line.strip()
        if line.startswith('Question'):
            if current_section:
                sections.append(' '.join(current_section))
            current_section = [line]
        elif line:
            current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    structured_sections = []
    for section in sections:
        # Remove a trailing number (treated as a page number) and the labels.
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        section = section.strip()
        # A section made up only of a number or a label is empty after
        # cleaning; skip it so it does not become a chunk of its own.
        if section:
            structured_sections.append(section)
    return structured_sections
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.4):
    """
    Look up the single chunk that best matches a question.

    Args:
        question: The user's question text.
        model: Object whose ``encode`` method turns a list of strings into
            an embedding array.
        index: Search index exposing ``search(vectors, k)``.
        text_chunks: Chunk strings, addressed by the ids the index returns.
        similarity_threshold: Minimum score for the top hit to be accepted.

    Returns:
        A dict with the keys ``question``, ``full_text``, ``confidence`` and
        ``found_answer``. When the top score is below the threshold,
        ``question`` is ``None`` and ``full_text`` holds a fallback message.
    """
    # Embed the question and L2-normalise it in place.
    query_vec = model.encode([question])
    faiss.normalize_L2(query_vec)

    # Only the top hit is requested.
    # NOTE(review): the score is treated as a cosine similarity, which
    # assumes an inner-product index over normalised vectors - confirm.
    scores, ids = index.search(query_vec, 1)
    top_score = scores[0][0]
    top_idx = ids[0][0]

    if not top_score >= similarity_threshold:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(top_score),
            'found_answer': False
        }

    chunk = text_chunks[top_idx]
    # Use the "Question N:" prefix as a short reference label when present.
    label_match = re.search(r'Question \d+:', chunk)
    if label_match:
        label = label_match.group(0)
    else:
        label = "Matching section"
    return {
        'question': label,
        'full_text': chunk,
        'confidence': float(top_score),
        'found_answer': True
    }
# Function to handle PDF file upload and initialization
def initialize_qa_system(pdf_file):
    """
    Build the QA components (model, index, chunks) from an uploaded PDF.

    Args:
        pdf_file: The upload to read: either a filesystem path string or an
            object whose ``name`` attribute is the path. ``None`` (nothing
            uploaded) is reported as an error status.

    Returns:
        A dict with the keys ``model``, ``index``, ``text_chunks`` and
        ``status``. On any failure the first three are ``None`` and
        ``status`` starts with ``"Error: "``; no exception is raised.
    """
    def _failure(message):
        # Uniform error result so callers can always read the same keys.
        return {
            'model': None,
            'index': None,
            'text_chunks': None,
            'status': f"Error: {message}"
        }

    if pdf_file is None:
        return _failure("No PDF file was provided.")

    try:
        # Imported here so a missing PyPDF2 is reported through the status.
        from PyPDF2 import PdfReader

        # The upload may arrive as a plain path string or as a file-like
        # wrapper that carries the path in ``.name``.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        pdf_reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        pdf_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        # Process text and create embeddings
        text_chunks = preprocess_text(pdf_text)
        if not text_chunks:
            # With nothing to embed there is no embedding dimension to
            # build the index from; report it instead of failing later.
            return _failure("No extractable text was found in the PDF.")
        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(text_chunks)

        # Inner-product index over L2-normalised embeddings.
        dimension = embeddings.shape[1]
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings)
        return {
            'model': model,
            'index': index,
            'text_chunks': text_chunks,
            'status': f"System initialized with {len(text_chunks)} text chunks from your PDF!"
        }
    except Exception as e:
        # UI boundary: surface the failure as a status string, not a crash.
        return _failure(str(e))
# Module-level holder for the active QA components; every slot is None
# until a PDF has been processed.
qa_system = {
    'model': None,        # embedding model
    'index': None,        # search index over the chunk embeddings
    'text_chunks': None,  # chunk strings
}
# Function to handle file upload
def upload_file(pdf_file):
    """
    Rebuild the global QA system from an uploaded PDF.

    The module-level ``qa_system`` is replaced with whatever
    ``initialize_qa_system`` returns, including its error result.

    Args:
        pdf_file: The uploaded file, passed straight through.

    Returns:
        The ``status`` string of the initialization result.
    """
    global qa_system
    qa_system = initialize_qa_system(pdf_file)
    return qa_system['status']
# Function to handle questions
def _extract_answer(full_text, marker='Answer:'):
    """Return the stripped text after the first ``marker``, or all of it if the marker is absent."""
    _, found, tail = full_text.partition(marker)
    # Slicing from ``find(marker) + len(marker)`` when ``find`` returns -1
    # would cut off the first characters, so only slice on a real match.
    return (tail if found else full_text).strip()


def answer_question(question):
    """
    Answer a question from the currently loaded QA system.

    Args:
        question: The question text entered by the user.

    Returns:
        A display string: a prompt to upload a PDF when the system is not
        initialized, a prompt to enter a question when the input is blank,
        otherwise the matched text (or the fallback message) together with
        the confidence score.
    """
    model = qa_system.get('model')
    index = qa_system.get('index')
    text_chunks = qa_system.get('text_chunks')
    # Explicit None checks avoid depending on the truthiness of the model
    # and index objects; an empty chunk list still counts as not loaded.
    if model is None or index is None or not text_chunks:
        return "Please upload a PDF file first."
    if not question or not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, model, index, text_chunks)
    answer = _extract_answer(result['full_text'])
    if result['found_answer']:
        return f"Match (confidence: {result['confidence']:.2f}):\n\n{answer}"
    return f"{answer}\nBest match confidence: {result['confidence']:.2f}"
# Build the Gradio UI: an upload panel, a question panel and an answer box.
with gr.Blocks(title="Interview Q&A Assistant") as demo:
    gr.Markdown(value="# Interview Q&A Assistant")
    gr.Markdown(
        value="Upload your interview questions PDF and ask questions to get the most relevant sections."
    )

    # Upload panel: file picker, initialize button and status line.
    with gr.Row():
        with gr.Column():
            pdf_upload = gr.File(label="Upload PDF File")
            upload_button = gr.Button(value="Initialize Q&A System")
            status_text = gr.Textbox(label="Status", value="Upload a PDF to begin")

    # Question panel: free-text question and submit button.
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about interview preparation")
            submit_button = gr.Button(value="Get Answer")

    # Answer display.
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", lines=10)

    # Wire the buttons to their handlers.
    upload_button.click(fn=upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(fn=answer_question, inputs=question_input, outputs=answer_output)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|