gaur3009 committed on
Commit
63f2fae
·
verified ·
1 Parent(s): 8ee7a99

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import PyPDF2
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
+ # ----------------------------
10
+ # PDF Processing Engine
11
+ # ----------------------------
12
+
13
class PDFAnalyzer:
    """Index one PDF at a time and return the chunk most similar to a question.

    The document text is split into sentence-aligned chunks, each chunk is
    embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer model, and a
    question is answered by returning the chunk whose embedding has the
    highest cosine similarity to the question's embedding.
    """

    # Whitespace following a "." or "?" is treated as a sentence boundary,
    # unless the preceding characters look like an abbreviation: the first
    # lookbehind skips forms such as "e.g." / "i.e.", the second skips a
    # capital + lowercase + dot such as "Mr." / "Dr.".
    _SENTENCE_BOUNDARY = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    )

    def __init__(self):
        """Start with no document loaded and load the embedding model."""
        self.text_chunks = []    # chunk strings of the active document
        self.embeddings = None   # one embedding per entry of text_chunks
        self.active_doc = None   # base file name of the active document
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def process_pdf(self, filepath):
        """Load, chunk and embed the PDF at ``filepath``.

        The analyzer's state is replaced only after every step has succeeded,
        so a failed upload leaves the previously loaded document (if any)
        intact and consistent.

        Args:
            filepath: Path of the file to load; must end in ``.pdf``
                (case-insensitive).

        Returns:
            A ``(success, message)`` tuple. ``message`` is a human-readable
            status line; no exception is propagated to the caller.
        """
        if not filepath:
            return False, "No file provided"

        try:
            if not filepath.lower().endswith('.pdf'):
                return False, "Invalid file format - PDF required"

            text = self._extract_text(filepath)
            chunks = self._chunk_text(text)
            if not chunks:
                # Scanned/image-only PDFs yield no text; refusing here avoids
                # activating a document that can only answer with "".
                return False, "No extractable text found in PDF"
            embeddings = self.model.encode(chunks)

        except PyPDF2.errors.PdfReadError:
            return False, "Error reading PDF - file may be corrupted"
        except Exception as e:
            # UI boundary: report the failure instead of crashing the handler.
            return False, f"Processing error: {str(e)}"

        # Commit the new document atomically now that nothing can fail.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_doc = os.path.basename(filepath)
        return True, f"Loaded {self.active_doc} ({len(self.text_chunks)} chunks)"

    def _extract_text(self, filepath):
        """Return the text of every page of the PDF, one page per line break.

        Pages for which ``extract_text()`` returns nothing contribute an empty
        string. Pages are joined with a newline so that the last word of one
        page is not fused with the first word of the next.
        """
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            return "\n".join(page.extract_text() or "" for page in reader.pages)

    def _chunk_text(self, text, chunk_size=400):
        """Group the sentences of ``text`` into chunks of about ``chunk_size`` words.

        Sentences are never split: a chunk is closed as soon as its running
        word count reaches ``chunk_size``, so a chunk may exceed that size.
        Blank fragments are dropped, which means empty or whitespace-only
        text produces an empty list.

        Args:
            text: Full document text.
            chunk_size: Word count at which the current chunk is closed.

        Returns:
            List of chunk strings, each made of sentences joined by a space.
        """
        chunks = []
        current_chunk = []
        count = 0

        for raw_sentence in self._SENTENCE_BOUNDARY.split(text or ""):
            sentence = raw_sentence.strip()
            if not sentence:
                continue
            current_chunk.append(sentence)
            count += len(sentence.split())
            if count >= chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                count = 0

        # Flush the trailing partial chunk.
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def query_document(self, question):
        """Return the document chunk most similar to ``question``.

        Args:
            question: Free-text question.

        Returns:
            The best-matching chunk, or an explanatory message when no
            document is loaded or the question is blank.
        """
        if not self.active_doc or self.embeddings is None or not self.text_chunks:
            return "No active document. Please upload a PDF first."
        if not question or not question.strip():
            return "Please enter a question."

        question_embed = self.model.encode(question)
        similarities = cosine_similarity([question_embed], self.embeddings)[0]
        best_match = int(np.argmax(similarities))
        return self.text_chunks[best_match]
74
+
75
+ # ----------------------------
76
+ # Gradio Interface
77
+ # ----------------------------
78
+
79
def create_interface():
    """Build the Gradio Blocks UI for uploading a PDF and querying it.

    A single ``PDFAnalyzer`` is created here and captured by the event
    handlers below, so all handlers of this interface share it.
    NOTE(review): presumably that also means it is shared by every browser
    session served by this app - confirm if per-user isolation is needed.

    Returns:
        The configured ``gr.Blocks`` application (not yet launched).
    """
    analyzer = PDFAnalyzer()
    no_document_status = "❌ No document loaded"

    def process_file(file):
        """Index the uploaded file and return a status line for the UI."""
        if file is None:
            # The button was pressed before a file was selected.
            return "❌ Please select a PDF file first"

        # The File component may hand over a plain path string or an object
        # exposing the path as ``.name``; accept both.
        path = file if isinstance(file, str) else file.name
        success, message = analyzer.process_pdf(path)
        return f"✅ {message}" if success else f"❌ {message}"

    def respond(message, history):
        """Append the answer to ``message`` to the chat and clear the textbox."""
        history = list(history or [])

        # Ignore blank submissions instead of adding empty chat turns.
        if not message or not message.strip():
            return history, ""

        if analyzer.active_doc:
            response = analyzer.query_document(message)
        else:
            response = "Please upload a PDF document first"

        history.append((message, response))
        return history, ""

    def clear_chat():
        """Forget the active document and reset chat, file input and status."""
        # Reset the analyzer in place: constructing a new PDFAnalyzer would
        # reload the embedding model on every click.
        analyzer.text_chunks = []
        analyzer.embeddings = None
        analyzer.active_doc = None
        return [], None, no_document_status

    with gr.Blocks(title="PDF Analysis Assistant", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📄 PDF Analysis Assistant")
        gr.Markdown("Upload a PDF document and ask questions about its content")

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload PDF", type="filepath")
                status_output = gr.Markdown(no_document_status)
                upload_btn = gr.Button("Process Document")

            with gr.Column(scale=2):
                chatbot = gr.Chatbot(label="Conversation")
                msg = gr.Textbox(label="Your Question")
                clear_btn = gr.Button("Clear Chat")

        # Event handling
        upload_btn.click(
            process_file,
            inputs=file_input,
            outputs=status_output,
        )

        # The second output empties the question box after each submission.
        msg.submit(
            respond,
            inputs=[msg, chatbot],
            outputs=[chatbot, msg],
        )

        clear_btn.click(
            clear_chat,
            outputs=[chatbot, file_input, status_output],
        )

    return app
140
+
141
if __name__ == "__main__":
    # Script entry point: build the Blocks UI and start the Gradio server.
    create_interface().launch()