Spaces:

TKM03
/

PDF_based_chatbot

Sleeping

App Files Files Community

PDF_based_chatbot / app.py

TKM03

Update app.py

d77f957 verified 7 months ago

raw

history blame

5.54 kB


	import numpy as np
	from sentence_transformers import SentenceTransformer
	import faiss
	import re
	import gradio as gr

	def preprocess_text(text):
	    """
	    Split raw document text into cleaned, question-led sections.

	    A new section starts at every line that begins with 'Question'; the
	    non-blank lines that follow are joined onto it with single spaces.
	    Non-blank lines that appear before the first such line form a section
	    of their own.

	    Each section is then cleaned: a trailing run of digits (intended to
	    drop page numbers) is removed, and each of the labels 'TRAPS:',
	    'BEST ANSWER:' and 'PASSABLE ANSWER:' is replaced by a single space.

	    Args:
	        text: The full document text, with lines separated by newlines.

	    Returns:
	        A list of cleaned section strings in document order. Sections
	        that are empty after cleaning are omitted.
	    """
	    # Group the lines into sections, each opened by a "Question..." line.
	    sections = []
	    current_section = []
	    for line in text.split('\n'):
	        line = line.strip()
	        if line.startswith('Question'):
	            if current_section:
	                sections.append(' '.join(current_section))
	            current_section = [line]
	        elif line:
	            current_section.append(line)
	    if current_section:
	        sections.append(' '.join(current_section))

	    structured_sections = []
	    for section in sections:
	        # Drop a trailing number (page number) before touching the labels.
	        section = re.sub(r'\d+\s*$', '', section)
	        # A plain "|" is regex alternation, so every label is removed on
	        # its own. Escaping it as "\|" would only match the three labels
	        # joined by literal pipe characters, i.e. never in practice.
	        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
	        section = section.strip()
	        # A section that was nothing but a page number is now empty and
	        # carries no content worth indexing.
	        if section:
	            structured_sections.append(section)

	    return structured_sections

	def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.4):
	    """
	    Find the stored text chunk that best matches a question.

	    The question is encoded with ``model``, L2-normalised in place, and
	    searched against ``index`` for its single nearest neighbour.

	    Args:
	        question: The user's question text.
	        model: Object whose ``encode`` method turns a list of strings
	            into an array of embeddings.
	        index: Object whose ``search(vectors, k)`` method returns a
	            ``(scores, indices)`` pair.
	        text_chunks: Sequence of chunk strings, addressed by the indices
	            that ``index.search`` returns.
	        similarity_threshold: Minimum score for the best hit to count as
	            an answer.

	    Returns:
	        A dict with the keys 'question', 'full_text', 'confidence' and
	        'found_answer'. When the best score reaches the threshold,
	        'full_text' is the matched chunk and 'question' is its
	        "Question N:" label (or "Matching section" if it has none);
	        otherwise 'question' is None and 'full_text' is a fixed
	        not-found message.
	    """
	    query_vec = model.encode([question])
	    faiss.normalize_L2(query_vec)

	    # Only the single closest chunk is requested.
	    scores, hits = index.search(query_vec, 1)
	    top_hit = hits[0][0]
	    # NOTE(review): treated as a cosine similarity, which assumes the
	    # index holds L2-normalised vectors under an inner-product metric.
	    top_score = scores[0][0]

	    # Written as "not >=" rather than "<" so that a NaN score still falls
	    # into the not-found branch.
	    if not (top_score >= similarity_threshold):
	        return {
	            'question': None,
	            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
	            'confidence': float(top_score),
	            'found_answer': False
	        }

	    chunk = text_chunks[top_hit]
	    # Keep just the "Question N:" label as a short reference to the match.
	    label_match = re.search(r'Question \d+:', chunk)
	    label = "Matching section" if label_match is None else label_match.group(0)

	    return {
	        'question': label,
	        'full_text': chunk,
	        'confidence': float(top_score),
	        'found_answer': True
	    }

	# Function to handle PDF file upload and initialization
	def initialize_qa_system(pdf_file):
	    """
	    Build the QA components (model, index, text chunks) from a PDF.

	    The PDF text is extracted page by page, split into chunks with
	    ``preprocess_text``, embedded with the "all-MiniLM-L6-v2" sentence
	    transformer, L2-normalised and stored in a FAISS inner-product index.

	    Args:
	        pdf_file: The uploaded file: either an object whose ``name``
	            attribute is the path of the PDF, or the path itself.
	            ``None`` (nothing uploaded) is reported as an error.

	    Returns:
	        A dict with the keys 'model', 'index', 'text_chunks' and
	        'status'. On success the first three hold the built components
	        and 'status' reports the number of chunks. On any failure they
	        are all None and 'status' is "Error: " followed by the reason.
	        This function does not raise; failures are reported via 'status'.
	    """
	    def failure(message):
	        # Single place that defines the shape of a failed result.
	        return {
	            'model': None,
	            'index': None,
	            'text_chunks': None,
	            'status': f"Error: {message}"
	        }

	    # Clicking the button with nothing uploaded passes None; say so
	    # plainly instead of surfacing an AttributeError message.
	    if pdf_file is None:
	        return failure("No PDF file was uploaded.")

	    try:
	        from PyPDF2 import PdfReader

	        # Accept both a file-like object exposing its path as ``.name``
	        # and a plain path string.
	        pdf_path = getattr(pdf_file, 'name', pdf_file)
	        pdf_reader = PdfReader(pdf_path)
	        page_texts = (page.extract_text() for page in pdf_reader.pages)
	        # Pages without extractable text are skipped.
	        pdf_text = "".join(text + "\n" for text in page_texts if text)

	        # Process text and create embeddings
	        text_chunks = preprocess_text(pdf_text)
	        if not text_chunks:
	            # Without this guard the failure shows up later as an
	            # obscure indexing error on the empty embeddings array.
	            return failure("No readable text could be extracted from the PDF.")

	        model = SentenceTransformer("all-MiniLM-L6-v2")
	        embeddings = model.encode(text_chunks)

	        # Create index: vectors are normalised in place so that the
	        # inner product computed by IndexFlatIP is a cosine similarity.
	        dimension = embeddings.shape[1]
	        faiss.normalize_L2(embeddings)
	        index = faiss.IndexFlatIP(dimension)
	        index.add(embeddings)

	        return {
	            'model': model,
	            'index': index,
	            'text_chunks': text_chunks,
	            'status': f"System initialized with {len(text_chunks)} text chunks from your PDF!"
	        }
	    except Exception as e:
	        # UI boundary: any failure (missing dependency, unreadable PDF,
	        # model download error) is turned into a status message.
	        return failure(str(e))

	# Shared module-level state holding the QA system components; every slot
	# starts out as None.
	qa_system = dict.fromkeys(('model', 'index', 'text_chunks'))

	# Function to handle file upload
	def upload_file(pdf_file):
	    """
	    Rebuild the shared QA system from an uploaded PDF.

	    Replaces the module-level ``qa_system`` with the result of
	    ``initialize_qa_system(pdf_file)`` and returns that result's
	    'status' message.
	    """
	    global qa_system
	    # Keep a local reference so the returned status always belongs to
	    # this call's result.
	    qa_system = components = initialize_qa_system(pdf_file)
	    return components['status']

	# Function to handle questions
	def answer_question(question):
	    """
	    Answer a question from the currently loaded PDF.

	    Args:
	        question: The user's question text.

	    Returns:
	        A display string. If no QA system has been built yet, a prompt to
	        upload a PDF. Otherwise the answer text of the best match with
	        its confidence, or the not-found message followed by the best
	        match confidence. The answer text is the part of the result after
	        the first 'Answer:' marker, or the whole text when the marker is
	        absent.
	    """
	    # Snapshot the shared state once so all reads see the same system.
	    system = qa_system

	    if system['model'] is None or system['index'] is None or not system['text_chunks']:
	        return "Please upload a PDF file first."

	    result = query_qa_system(question, system['model'], system['index'], system['text_chunks'])
	    full_text = result['full_text']

	    # Only cut at the marker when it is really present. Using
	    # find() + len() unconditionally turns the -1 "not found" result into
	    # offset 6 and silently drops the first six characters of the text
	    # (this always affected the not-found message).
	    _, marker, after_marker = full_text.partition('Answer:')
	    answer = (after_marker if marker else full_text).strip()

	    if result['found_answer']:
	        return f"Match (confidence: {result['confidence']:.2f}):\n\n{answer}"
	    else:
	        return f"{answer}\nBest match confidence: {result['confidence']:.2f}"

	# Create the Gradio interface
	# Build the Gradio UI: a PDF upload area with a status box, a question
	# box with a submit button, and a multi-line answer box.
	# NOTE(review): the indentation of this file was lost in this copy, so the
	# nesting of the `with` containers below is not recoverable from here.
	# Presumably each Row/Column encloses the components listed right after
	# it and the event wiring sits inside the Blocks context; confirm against
	# the original app.py before running.
	with gr.Blocks(title="Interview Q&A Assistant") as demo:
	gr.Markdown("# Interview Q&A Assistant")
	gr.Markdown("Upload your interview questions PDF and ask questions to get the most relevant sections.")

	# Upload area: file picker, initialise button and a status text box.
	with gr.Row():
	with gr.Column():
	pdf_upload = gr.File(label="Upload PDF File")
	upload_button = gr.Button("Initialize Q&A System")
	status_text = gr.Textbox(label="Status", value="Upload a PDF to begin")

	# Question area: free-text question and a submit button.
	with gr.Row():
	with gr.Column():
	question_input = gr.Textbox(label="Ask a question about interview preparation")
	submit_button = gr.Button("Get Answer")

	# Answer area: a 10-line text box that receives the formatted answer.
	with gr.Row():
	answer_output = gr.Textbox(label="Answer", lines=10)

	# Set up events
	# upload_file receives the uploaded file; its returned status string is
	# shown in status_text.
	upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text)
	# answer_question receives the question text; its returned string is
	# shown in answer_output.
	submit_button.click(answer_question, inputs=question_input, outputs=answer_output)

	# Launch the app
	# Only when executed as a script, not when imported.
	# NOTE(review): share=True requests a public share link from Gradio;
	# confirm this is wanted where the app is hosted.
	if __name__ == "__main__":
	demo.launch(share=True)