import gradio as gr
import os
import re
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
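
# Requires: gradio, PyPDF2, numpy, sentence-transformers, scikit-learn
# (the sklearn import above is provided by the scikit-learn package).
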
class PDFAnalyzer:
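    """Embed a PDF's text as ~400-word chunks and answer questions by
    returning the chunk most cosine-similar to the question embedding."""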

    def __init__(self):
        self.text_chunks = []
        self.embeddings = None
        self.active_doc = None
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
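        # (The model is downloaded from the Hugging Face hub on first use,
        # then cached locally for subsequent runs.)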

    def process_pdf(self, filepath):
        """Handle the PDF processing pipeline: extract, chunk, embed."""
        try:
            if not filepath.lower().endswith('.pdf'):
                return False, "Invalid file format - PDF required"

            text = self._extract_text(filepath)
            if not text.strip():
                # Guards against image-only PDFs, which would otherwise
                # produce an empty chunk list and break later queries.
                return False, "No extractable text found in PDF"
            self.text_chunks = self._chunk_text(text)
            self.embeddings = self.model.encode(self.text_chunks)
            self.active_doc = os.path.basename(filepath)
            return True, f"Loaded {self.active_doc} ({len(self.text_chunks)} chunks)"

        except PyPDF2.errors.PdfReadError:
            return False, "Error reading PDF - file may be corrupted"
        except Exception as e:
            return False, f"Processing error: {str(e)}"

    def _extract_text(self, filepath):
        """Extract text from every page of the PDF document."""
        text = ""
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                # extract_text() may return None (e.g. scanned, image-only pages).
                text += page.extract_text() or ""
        return text

    def _chunk_text(self, text, chunk_size=400):
        """Group sentences into chunks of roughly chunk_size words."""
        # Split on sentence-ending punctuation; the negative lookbehinds try
        # to avoid splitting inside abbreviations such as "U.S." or "Dr.".
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        chunks = []
        current_chunk = []
        count = 0

        for sentence in sentences:
            current_chunk.append(sentence)
            count += len(sentence.split())
            if count >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                count = 0

        # Keep any trailing sentences that didn't reach the size threshold.
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def query_document(self, question):
        """Return the document chunk most similar to the question."""
        if not self.active_doc:
            return "No active document. Please upload a PDF first."

        # Rank all chunks by cosine similarity to the question embedding.
        question_embed = self.model.encode(question)
        similarities = cosine_similarity([question_embed], self.embeddings)[0]
        best_match = np.argmax(similarities)
        return self.text_chunks[best_match]
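
# Example usage outside the UI (hypothetical file name):
#   analyzer = PDFAnalyzer()
#   ok, msg = analyzer.process_pdf("example.pdf")
#   if ok:
#       print(analyzer.query_document("What is the main topic?"))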


def create_interface():
    analyzer = PDFAnalyzer()

    def process_file(file):
        # gr.File(type="filepath") passes the callback a path string (or None),
        # not a file object, so the path is used directly.
        if file is None:
            return "❌ No file selected"
        success, message = analyzer.process_pdf(file)
        return f"✅ {message}" if success else f"❌ {message}"

    def respond(message, history):
        # Ignore empty submissions.
        if not message.strip():
            return "", history
        if analyzer.active_doc:
            response = analyzer.query_document(message)
        else:
            response = "Please upload a PDF document first"
        history.append((message, response))
        # Returning "" clears the textbox; the updated history refreshes the chat.
        return "", history

    def clear_chat():
        # Rebind analyzer so the old document's chunks and embeddings are dropped.
        nonlocal analyzer
        analyzer = PDFAnalyzer()
        # Reset: empty chat history, cleared file picker, default status banner.
        return [], None, "❌ No document loaded"

    with gr.Blocks(title="PDF Analysis Assistant", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📄 PDF Analysis Assistant")
        gr.Markdown("Upload a PDF document and ask questions about its content.")

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload PDF", type="filepath")
                status_output = gr.Markdown("❌ No document loaded")
                upload_btn = gr.Button("Process Document")

            with gr.Column(scale=2):
                chatbot = gr.Chatbot(label="Conversation")
                msg = gr.Textbox(label="Your Question")
                clear_btn = gr.Button("Clear Chat")

        # Wire events: upload button updates the status text; submitting the
        # textbox clears it and updates the chat; clear resets everything.
        upload_btn.click(
            process_file,
            inputs=file_input,
            outputs=status_output
        )

        msg.submit(
            respond,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot]
        )

        clear_btn.click(
            clear_chat,
            outputs=[chatbot, file_input, status_output]
        )

    return app


if __name__ == "__main__":
    app = create_interface()
    app.launch()
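
# To run: save as e.g. app.py and execute `python app.py`; Gradio serves
# the interface at http://127.0.0.1:7860 by default.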