Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ import PyPDF2
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+import json

 class PDFAnalyzer:
     def __init__(self):
@@ -19,9 +20,17 @@ class PDFAnalyzer:
             self.text_chunks = self._chunk_text(text)
             self.embeddings = self.model.encode(self.text_chunks)
             self.active_doc = os.path.basename(filepath)
-            return
+            return json.dumps({
+                "status": 200,
+                "message": f"Document {self.active_doc} processed successfully",
+                "document_id": hash(self.active_doc)
+            })
         except Exception as e:
-            return
+            return json.dumps({
+                "status": 500,
+                "error": str(e),
+                "message": "Document processing failed"
+            })

     def _extract_text(self, filepath):
         with open(filepath, 'rb') as f:
@@ -31,24 +40,45 @@
         return [text[i:i+500] for i in range(0, len(text), 500)]

     def query(self, question):
-        if not self.active_doc:
-            return
+        if not self.active_doc:
+            return json.dumps({
+                "status": 400,
+                "message": "No document uploaded",
+                "results": []
+            })

         ques_emb = self.model.encode(question)
-
+        similarities = cosine_similarity([ques_emb], self.embeddings)[0]
+        best_idx = np.argmax(similarities)
+        confidence = float(similarities[best_idx])
+
         full_answer = self.text_chunks[best_idx]
+        sentences = re.split(r'(?<=[.!?]) +', full_answer)

-        #
-
+        # Generate API-like response
+        return json.dumps({
+            "status": 200,
+            "message": "Success",
+            "results": [{
+                "text": self._format_answer(full_answer, question),
+                "confidence": confidence,
+                "document_id": hash(self.active_doc),
+                "metadata": {
+                    "chunk_index": best_idx,
+                    "document": self.active_doc
+                }
+            }]
+        })
+
+    def _format_answer(self, text, question):
+        # Extract focused answer with 100-word context
+        sentences = re.split(r'(?<=[.!?]) +', text)
         question_words = set(question.lower().split())

-        # Find the most relevant sentence
-        sentences = re.split(r'(?<=[.!?]) +', full_answer)
         best_sentence = max(sentences,
-
-
+            key=lambda s: len(set(s.lower().split()) & question_words),
+            default="")

-        # Get 50 words before and after the best sentence
         all_words = ' '.join(sentences).split()
         try:
             start = max(0, all_words.index(best_sentence.split()[0]) - 50)
@@ -57,35 +87,44 @@
             start = 0
             end = 100

-
-
-        return precise_answer
+        return ' '.join(all_words[start:end]) + ("..." if end < len(all_words) else "")

 def create_app():
     analyzer = PDFAnalyzer()

+    def format_response(response):
+        try:
+            data = json.loads(response)
+            if data['status'] != 200:
+                return f"Error: {data.get('message', 'Unknown error')}"
+
+            result = data['results'][0]
+            return f"**Answer** ({result['confidence']:.2f} confidence):\n{result['text']}"
+        except:
+            return "Error processing response"
+
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# π PDF QA Assistant")
+        gr.Markdown("# π PDF QA Assistant (Cohere-style API)")

         with gr.Row():
             with gr.Column(scale=1):
                 pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
-                status = gr.Markdown("**Status:**
+                status = gr.Markdown("**Status:** Idle")
                 gr.Button("Process PDF").click(
-                    lambda f: analyzer.process_pdf(f.name) if f else "
+                    lambda f: analyzer.process_pdf(f.name) if f else json.dumps({"status": 400, "error": "No file"}),
                     inputs=pdf_upload,
                     outputs=status
                 )

             with gr.Column(scale=2):
                 chatbot = gr.Chatbot(height=400)
-                question = gr.Textbox(label="
+                question = gr.Textbox(label="Query", placeholder="Enter your question...")
                 question.submit(
-                    lambda q,h: h + [(q, analyzer.query(q))],
+                    lambda q,h: h + [(q, format_response(analyzer.query(q)))],
                     inputs=[question, chatbot],
                     outputs=chatbot
                 )
-                gr.Button("Clear").click(
+                gr.Button("Clear Session").click(
                     lambda: [None, None, "**Status:** Session cleared"],
                     outputs=[chatbot, pdf_upload, status]
                 )
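The heart of the reworked query() path is a plain nearest-neighbour lookup: the question is embedded, every stored chunk embedding is scored with cosine similarity, and the best-scoring chunk plus its score become the answer text and the reported confidence. A self-contained sketch of that scoring step, with toy vectors standing in for SentenceTransformer output (everything below is illustrative and not part of the commit):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-ins for model.encode(question) and model.encode(text_chunks)
ques_emb = np.array([0.2, 0.9, 0.1])
chunk_embs = np.array([
    [0.9, 0.1, 0.0],   # chunk 0
    [0.1, 0.8, 0.2],   # chunk 1 (closest to the question)
    [0.0, 0.2, 0.9],   # chunk 2
])

similarities = cosine_similarity([ques_emb], chunk_embs)[0]
best_idx = int(np.argmax(similarities))      # index of the best-matching chunk
confidence = float(similarities[best_idx])   # cosine score reported as "confidence"
print(best_idx, round(confidence, 3))        # -> 1 0.987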
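The new _format_answer() helper then narrows the winning 500-character chunk to roughly 100 words around the sentence that shares the most words with the question. A standalone sketch of that windowing logic; the chunk text and question are made up, and the end-of-window arithmetic, which the diff hides inside the try block, is assumed here to be start + 100:

import re

chunk = ("The warranty covers parts and labor for two years. "
         "Refunds are issued within 30 days of purchase. "
         "Shipping costs are not refundable.")
question = "How long do refunds take?"

# Sentence with the largest word overlap with the question
sentences = re.split(r'(?<=[.!?]) +', chunk)
question_words = set(question.lower().split())
best = max(sentences,
           key=lambda s: len(set(s.lower().split()) & question_words),
           default="")

# Window of about 100 words starting 50 words before that sentence
all_words = ' '.join(sentences).split()
try:
    start = max(0, all_words.index(best.split()[0]) - 50)
    end = start + 100   # assumed; not visible in the diff
except (ValueError, IndexError):
    start, end = 0, 100

print(' '.join(all_words[start:end]) + ("..." if end < len(all_words) else ""))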
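With this change, both process_pdf() and query() return a JSON string in a Cohere-style envelope (status, message, and results entries carrying text, confidence, document_id, and metadata) rather than plain text, and the new format_response() helper in create_app() unpacks it before the answer reaches the Chatbot. A minimal consumption sketch of that envelope; the payload values are illustrative only, not real output:

import json

# Example payload mirroring the keys built in query(); values are made up
sample = json.dumps({
    "status": 200,
    "message": "Success",
    "results": [{
        "text": "Refunds are issued within 30 days of purchase...",
        "confidence": 0.83,
        "document_id": 1234567890,
        "metadata": {"chunk_index": 4, "document": "policy.pdf"}
    }]
})

data = json.loads(sample)
if data["status"] == 200 and data["results"]:
    top = data["results"][0]
    print(f"{top['confidence']:.2f}  {top['text']}")
else:
    print(data.get("message", "Unknown error"))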