File size: 4,740 Bytes
63f2fae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import gradio as gr
import os
import re
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ----------------------------
# PDF Processing Engine
# ----------------------------
class PDFAnalyzer:
    """Semantic search over a single PDF document.

    Extracts text with PyPDF2, splits it into roughly ``chunk_size``-word
    chunks at sentence boundaries, embeds the chunks with a
    SentenceTransformer model, and answers queries by returning the chunk
    whose embedding has the highest cosine similarity to the question.
    """

    def __init__(self):
        self.text_chunks = []   # list[str]: document chunks, in order
        self.embeddings = None  # np.ndarray of chunk embeddings, or None
        self.active_doc = None  # basename of the loaded PDF, or None
        # NOTE: downloads/loads the model on first use; may be slow.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def process_pdf(self, filepath):
        """Load and index a PDF file.

        Args:
            filepath: Path to a ``.pdf`` file.

        Returns:
            Tuple ``(success, message)`` where ``message`` is a
            user-facing status string.
        """
        try:
            if not filepath.lower().endswith('.pdf'):
                return False, "Invalid file format - PDF required"
            text = self._extract_text(filepath)
            # Guard: scanned / image-only PDFs yield no extractable text;
            # without this we would embed a single empty chunk and later
            # return it as a "match".
            if not text.strip():
                return False, "No extractable text found in PDF"
            self.text_chunks = self._chunk_text(text)
            self.embeddings = self.model.encode(self.text_chunks)
            self.active_doc = os.path.basename(filepath)
            return True, f"Loaded {self.active_doc} ({len(self.text_chunks)} chunks)"
        except PyPDF2.errors.PdfReadError:
            return False, "Error reading PDF - file may be corrupted"
        except Exception as e:
            # Catch-all so a single bad file never crashes the UI.
            return False, f"Processing error: {str(e)}"

    def _extract_text(self, filepath):
        """Concatenate the text of every page of the PDF at *filepath*."""
        text = ""
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                # extract_text() may return None for pages with no text layer.
                text += page.extract_text() or ""
        return text

    def _chunk_text(self, text, chunk_size=400):
        """Split *text* into chunks of at least *chunk_size* words.

        Sentences (heuristic regex split on ``.``/``?`` followed by
        whitespace, avoiding common abbreviations) are accumulated until
        the word count reaches ``chunk_size``; a final partial chunk is
        kept so no text is dropped.
        """
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        chunks = []
        current_chunk = []
        count = 0
        for sentence in sentences:
            current_chunk.append(sentence)
            count += len(sentence.split())
            if count >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                count = 0
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def query_document(self, question):
        """Return the document chunk most similar to *question*.

        Returns a user-facing error string when no document is loaded.
        """
        if not self.active_doc:
            return "No active document. Please upload a PDF first."
        question_embed = self.model.encode(question)
        similarities = cosine_similarity([question_embed], self.embeddings)[0]
        best_match = np.argmax(similarities)
        return self.text_chunks[best_match]
# ----------------------------
# Gradio Interface
# ----------------------------
def create_interface():
    """Build the Gradio Blocks app wiring a PDFAnalyzer to the chat UI.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    analyzer = PDFAnalyzer()

    def process_file(file):
        """Index the uploaded PDF and return a status string."""
        if file is None:
            # Button clicked with nothing uploaded.
            return "❌ No document loaded"
        # gr.File(type="filepath") passes a str path; older Gradio versions
        # pass a tempfile-like object with a .name attribute — handle both.
        path = file if isinstance(file, str) else file.name
        success, message = analyzer.process_pdf(path)
        return f"✅ {message}" if success else f"❌ {message}"

    def respond(message, history):
        """Answer *message* from the loaded document, appending to history."""
        if analyzer.active_doc:
            response = analyzer.query_document(message)
            history.append((message, response))
            return history, history
        # No document yet: prompt the user instead of querying.
        history.append((message, "Please upload a PDF document first"))
        return history, history

    def clear_chat():
        """Reset the analyzer and all UI state."""
        nonlocal analyzer
        analyzer = PDFAnalyzer()
        # None clears the File component; the original returned [] which
        # Gradio tolerates but None is the documented "clear" value.
        return [], None, "❌ No document loaded"

    with gr.Blocks(title="PDF Analysis Assistant", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📄 PDF Analysis Assistant")
        gr.Markdown("Upload a PDF document and ask questions about its content")
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload PDF", type="filepath")
                status_output = gr.Markdown("❌ No document loaded")
                upload_btn = gr.Button("Process Document")
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(label="Conversation")
                msg = gr.Textbox(label="Your Question")
                clear_btn = gr.Button("Clear Chat")

        # Event handling
        upload_btn.click(
            process_file,
            inputs=file_input,
            outputs=status_output
        )
        msg.submit(
            respond,
            inputs=[msg, chatbot],
            outputs=[chatbot, chatbot]
        )
        clear_btn.click(
            clear_chat,
            outputs=[chatbot, file_input, status_output]
        )
    return app
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()