Update app.py
Browse files
app.py
CHANGED
@@ -1,532 +1,3 @@
|
|
1 |
-
'''
|
2 |
-
|
3 |
-
import gradio as gr
|
4 |
-
import os
|
5 |
-
import re
|
6 |
-
import json
|
7 |
-
import torch
|
8 |
-
import numpy as np
|
9 |
-
import logging
|
10 |
-
from typing import Dict, List, Tuple, Optional
|
11 |
-
from tqdm import tqdm
|
12 |
-
from pydantic import BaseModel
|
13 |
-
import pprint
|
14 |
-
from transformers import (
|
15 |
-
AutoTokenizer,
|
16 |
-
AutoModelForSeq2SeqLM,
|
17 |
-
AutoModelForQuestionAnswering,
|
18 |
-
pipeline,
|
19 |
-
LogitsProcessor,
|
20 |
-
LogitsProcessorList,
|
21 |
-
PreTrainedModel,
|
22 |
-
PreTrainedTokenizer
|
23 |
-
)
|
24 |
-
from sentence_transformers import SentenceTransformer, CrossEncoder
|
25 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
26 |
-
from rank_bm25 import BM25Okapi
|
27 |
-
import PyPDF2
|
28 |
-
from sklearn.cluster import KMeans
|
29 |
-
import spacy
|
30 |
-
|
31 |
-
# Root logger: emit INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Startup banner so the running revision is visible in the console output.
print('====================== VERSION 6 (Force Use Of GPU)======================')
|
37 |
-
|
38 |
-
|
39 |
-
class ConfidenceCalibrator(LogitsProcessor):
    """Rescale logits by a fixed temperature before they become probabilities.

    Every score is divided by ``calibration_factor``.  A factor below 1.0
    widens the gaps between logits, so a following softmax is sharper (more
    confident); a factor above 1.0 narrows them, so the softmax is flatter.
    """

    def __init__(self, calibration_factor: float = 0.9):
        """Store the temperature used by ``__call__``.

        Args:
            calibration_factor: Strictly positive divisor applied to every
                logit.

        Raises:
            ValueError: If ``calibration_factor`` is zero, negative or NaN.
        """
        # `not x > 0` (rather than `x <= 0`) also rejects NaN.
        if not calibration_factor > 0:
            raise ValueError(
                f"calibration_factor must be > 0, got {calibration_factor!r}"
            )
        self.calibration_factor = calibration_factor

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores`` divided by the calibration factor.

        ``input_ids`` is accepted to satisfy the processor call signature and
        is not read.
        """
        return scores / self.calibration_factor
|
48 |
-
|
49 |
-
|
50 |
-
class DocumentResult(BaseModel):
    """One scored statement extracted from a document.

    Used as the uniform record type for summary output.
    """

    # The statement text itself.
    content: str
    # Score attached to the statement by the producer.
    confidence: float
    # Page the statement is attributed to.
    source_page: int
    # Passages retrieved as backing for the statement.
    supporting_evidence: List[str]
|
56 |
-
|
57 |
-
|
58 |
-
class OptimalModelSelector:
    """Pick, load and cache the top-scoring checkpoint for a task.

    Each registry maps a short name to ``(hub_checkpoint, score)``.  The
    entry with the highest score wins; loaded models are cached by short
    name so repeated requests do not reload weights.
    """

    def __init__(self):
        """Populate the static registries and start with an empty cache."""
        self.qa_models = {
            "deberta-v3": ("deepset/deberta-v3-large-squad2", 0.87),
            "minilm": ("deepset/minilm-uncased-squad2", 0.84),
            "roberta": ("deepset/roberta-base-squad2", 0.82)
        }
        self.summarization_models = {
            "bart": ("facebook/bart-large-cnn", 0.85),
            "pegasus": ("google/pegasus-xsum", 0.83)
        }
        # short name -> (model, tokenizer)
        self.current_models = {}

    def get_best_model(self, task_type: str) -> "Tuple[PreTrainedModel, PreTrainedTokenizer, float]":
        """Return ``(model, tokenizer, score)`` for the best registry entry.

        Args:
            task_type: Any string containing ``"qa"`` selects the question
                answering registry; everything else selects summarization.

        Returns:
            The cached or freshly loaded model, its tokenizer, and the
            registry score of the chosen entry as a float.
        """
        is_qa = "qa" in task_type
        model_map = self.qa_models if is_qa else self.summarization_models

        # items() yields (name, (checkpoint, score)); unpack the inner tuple
        # so that best_score is the float, not the whole (checkpoint, score)
        # pair.
        best_model_name, (checkpoint, best_score) = max(
            model_map.items(), key=lambda entry: entry[1][1]
        )

        if best_model_name not in self.current_models:
            logging.info(f"Loading {best_model_name} for {task_type}")
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model_cls = AutoModelForQuestionAnswering if is_qa else AutoModelForSeq2SeqLM
            model = model_cls.from_pretrained(checkpoint).eval()

            # Half precision is a GPU memory/speed optimisation; on CPU keep
            # the default dtype, where fp16 support is limited.
            if torch.cuda.is_available():
                model = model.half().to('cuda')
            else:
                model = model.to('cpu')
            self.current_models[best_model_name] = (model, tokenizer)

        model, tokenizer = self.current_models[best_model_name]
        return model, tokenizer, best_score
|
88 |
-
|
89 |
-
|
90 |
-
class PDFAugmentedRetriever:
    """Rank a fixed list of passages against a query.

    Combines a BM25 lexical score with a cross-encoder relevance score.
    """

    def __init__(self, document_texts: List[str]):
        """Index ``document_texts``; each passage keeps its list position as id."""
        self.documents = list(enumerate(document_texts))
        texts = [text for _, text in self.documents]
        # Whitespace tokenisation, case preserved.
        self.bm25 = BM25Okapi([text.split() for text in texts])
        self.encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        # Fitted here but not consulted by retrieve().
        self.tfidf = TfidfVectorizer(stop_words='english').fit(texts)

    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[int, str, float]]:
        """Return the ``top_k`` best passages for ``query``, best first.

        Args:
            query: Free-text query.
            top_k: Maximum number of passages to return.  Zero or a negative
                value yields an empty list.

        Returns:
            ``(passage_index, passage_text, combined_score)`` tuples sorted by
            descending combined score.
        """
        # A slice of [-0:] would select everything, so handle this up front
        # (and skip the cross-encoder pass entirely).
        if top_k <= 0:
            return []

        # Lexical signal
        bm25_scores = np.asarray(self.bm25.get_scores(query.split()), dtype=float)

        # Cross-encoder signal, one (query, passage) pair per passage
        pairs = [(query, doc) for _, doc in self.documents]
        semantic_scores = np.asarray(self.encoder.predict(pairs), dtype=float)

        # Fixed 40/60 weighting.
        # NOTE(review): the two scores are summed without normalisation and
        # may live on different scales - confirm this is intended.
        combined_scores = 0.4 * bm25_scores + 0.6 * semantic_scores

        top_indices = np.argsort(combined_scores)[-top_k:][::-1]
        return [
            (self.documents[i][0], self.documents[i][1], float(combined_scores[i]))
            for i in top_indices
        ]
|
113 |
-
|
114 |
-
|
115 |
-
class DetailedExplainer:
    """Extract key concepts from a text and generate an explanation for each."""

    # Named-entity labels that count as concepts.
    _CONCEPT_ENTITY_LABELS = frozenset(
        {"PERSON", "ORG", "GPE", "NORP", "EVENT", "WORK_OF_ART"}
    )

    def __init__(self,
                 explanation_model: str = "google/flan-t5-large",
                 device: int = 0):
        """Load the generation pipeline and the spaCy model.

        Args:
            explanation_model: Checkpoint used for both model and tokenizer of
                the text2text-generation pipeline.
            device: Pipeline device index (-1 for CPU, >= 0 for a CUDA
                ordinal).  A CUDA index is downgraded to -1 when CUDA is not
                available.
        """
        # generation pipeline for deep explanations
        self.explainer = pipeline(
            "text2text-generation",
            model=explanation_model,
            tokenizer=explanation_model,
            device=self._resolve_device(device)
        )
        # spaCy model for concept extraction
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _resolve_device(device):
        """Return ``device``, or -1 if it names a CUDA ordinal but CUDA is absent."""
        if isinstance(device, int) and device >= 0 and not torch.cuda.is_available():
            return -1
        return device

    def extract_concepts(self, text: str) -> list:
        """Return the unique concepts found in ``text``.

        Concepts are multi-token noun chunks whose root is not a stop word,
        followed by named entities with a label in
        ``_CONCEPT_ENTITY_LABELS``.  Duplicates are dropped and the order of
        first appearance is kept, so the result is deterministic.
        """
        doc = self.nlp(text)
        chunk_texts = [
            chunk.text.strip()
            for chunk in doc.noun_chunks
            if len(chunk) > 1 and not chunk.root.is_stop
        ]
        entity_texts = [
            ent.text.strip()
            for ent in doc.ents
            if ent.label_ in self._CONCEPT_ENTITY_LABELS
        ]
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(chunk_texts + entity_texts))

    def explain_concept(self, concept: str, context: str, min_accuracy: float = 0.50) -> str:
        """Generate an explanation of ``concept`` grounded in ``context``.

        Args:
            concept: The concept to explain.
            context: Passage appended to the prompt.
            min_accuracy: Fraction rendered into the prompt as a percentage.
                It is only an instruction to the model; nothing here measures
                or enforces it.

        Returns:
            The generated text with surrounding whitespace removed.
        """
        # round() rather than int(): 0.57 * 100 is 56.99..., which int()
        # would render as 56%.
        percent = round(min_accuracy * 100)
        prompt = (
            f"Explain the concept '{concept}' in depth using the following context. "
            f"Aim for at least {percent}% accuracy."
            f"\nContext:\n{context}\n"
        )
        result = self.explainer(
            prompt,
            max_length=200,
            min_length=80,
            do_sample=False
        )
        return result[0]["generated_text"].strip()

    def explain_text(self, text: str, context: str) -> dict:
        """Explain every concept extracted from ``text``.

        Returns:
            {
                'concepts': [list of extracted concepts],
                'explanations': {concept: explanation, ...}
            }
        """
        concepts = self.extract_concepts(text)
        explanations = {
            concept: self.explain_concept(concept, context)
            for concept in concepts
        }
        return {"concepts": concepts, "explanations": explanations}
|
183 |
-
|
184 |
-
|
185 |
-
class AdvancedPDFAnalyzer:
    """PDF analysis engine: text extraction, extractive summary and QA.

    Each answer and summary sentence carries a heuristic confidence value
    computed from model probabilities and retrieval scores.
    """

    def __init__(self):
        """Load the QA model, summarizer, calibrator and explainer."""
        self.logger = logging.getLogger("PDFAnalyzer")
        self.model_selector = OptimalModelSelector()
        self._verify_dependencies()

        # Use the GPU whenever one is available
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        if use_cuda:
            print("[INFO] Using GPU for inference.")
        else:
            print("[INFO] Using CPU for inference.")

        # Best-scoring QA checkpoint from the selector
        self.qa_model, self.qa_tokenizer, _ = self.model_selector.get_best_model("qa")
        self.qa_model = self.qa_model.to(self.device)

        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if use_cuda else -1,
            framework="pt"
        )

        # Confidence calibration setup
        self.logits_processor = LogitsProcessorList([
            ConfidenceCalibrator(calibration_factor=0.85)
        ])

        # Concept explainer used by answer_question
        self.detailed_explainer = DetailedExplainer(
            device=0 if use_cuda else -1
        )

    def _verify_dependencies(self):
        """Raise ImportError if the imported PyPDF2 lacks ``PdfReader``."""
        # A missing attribute raises AttributeError, not ImportError, so test
        # for it explicitly instead of relying on try/except ImportError.
        if not hasattr(PyPDF2, "PdfReader"):
            raise ImportError("PyPDF2 required: pip install pypdf2")

    def extract_text_with_metadata(self, file_path: str) -> List[Dict]:
        """Extract the text of every non-empty page together with metadata.

        Args:
            file_path: Path of the PDF to read.

        Returns:
            One dict per page with extractable text:
            ``{'content': cleaned_text, 'metadata': {'source', 'page',
            'char_count', 'word_count'}}``.  ``page`` is 1-based; the counts
            are taken from the raw text before cleaning.

        Raises:
            ValueError: If no page yields any text.
        """
        self.logger.info(f"Processing {file_path}")
        documents = []

        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)

            for page_number, page in enumerate(tqdm(reader.pages), start=1):
                # Best effort: a page that fails to parse is logged and
                # skipped so one bad page does not lose the whole document.
                try:
                    text = page.extract_text()
                    if not text or not text.strip():
                        continue

                    metadata = {
                        'source': os.path.basename(file_path),
                        'page': page_number,
                        'char_count': len(text),
                        'word_count': len(text.split()),
                    }
                    documents.append({
                        'content': self._clean_text(text),
                        'metadata': metadata
                    })
                except Exception as e:
                    self.logger.warning(f"Page {page_number} error: {str(e)}")

        if not documents:
            raise ValueError("No extractable content found in PDF")

        return documents

    def _clean_text(self, text: str) -> str:
        """Normalise raw page text.

        Control characters become spaces, whitespace runs collapse to one
        space, and words split by a hyphen followed by whitespace are rejoined.
        """
        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', ' ', text)  # Remove control chars
        text = re.sub(r'\s+', ' ', text)  # Standardize whitespace
        text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)  # Fix hyphenated words
        return text.strip()

    def analyze_document(self, file_path: str) -> Dict:
        """Run extraction and summarisation for one PDF.

        Returns:
            ``{'document_metadata': [...], 'summary': [DocumentResult, ...],
            'avg_confidence': float}``.  ``avg_confidence`` is 0.0 when the
            summary is empty.
        """
        documents = self.extract_text_with_metadata(file_path)
        text_chunks = [doc['content'] for doc in documents]

        # One retriever over all page texts
        retriever = PDFAugmentedRetriever(text_chunks)

        summary = self._generate_summary_with_confidence(
            "\n".join(text_chunks),
            retriever
        )

        # np.mean([]) is NaN (with a warning); report 0.0 instead.
        confidences = [s.confidence for s in summary]
        avg_confidence = float(np.mean(confidences)) if confidences else 0.0

        return {
            'document_metadata': [doc['metadata'] for doc in documents],
            'summary': summary,
            'avg_confidence': avg_confidence
        }

    def _generate_summary_with_confidence(self, text: str, retriever: "PDFAugmentedRetriever") -> "List[DocumentResult]":
        """Build an extractive summary: one sentence per sentence cluster.

        Sentences are obtained by splitting on ``'. '`` and keeping those
        with more than six words.  They are clustered on TF-IDF features and
        the highest-scoring sentence of every non-empty cluster is kept.

        Returns:
            One ``DocumentResult`` per selected sentence, or ``[]`` when no
            sentence qualifies.  ``source_page`` is always 0.
        """
        sentences = [s.strip() for s in text.split('. ') if len(s.split()) > 6]
        if not sentences:
            return []

        # Cluster sentences into topics
        vectorizer = TfidfVectorizer(max_features=500)
        X = vectorizer.fit_transform(sentences)

        # Select most representative sentence per topic
        summary_sentences = []
        for cluster in self._cluster_text(X, n_clusters=min(5, len(sentences))):
            if not cluster:
                # max() over an empty cluster would raise ValueError.
                continue
            cluster_sents = [sentences[i] for i in cluster]
            sentence_scores = self._cross_validate_sentences(cluster_sents)
            best_sentence = max(zip(cluster_sents, sentence_scores), key=lambda x: x[1])
            summary_sentences.append(best_sentence)

        return [
            DocumentResult(
                content=sent,
                # Scale the score by 1.1 and cap it at 0.95
                confidence=min(0.95, score * 1.1),
                source_page=0,
                supporting_evidence=self._find_supporting_evidence(sent, retriever)
            )
            for sent, score in summary_sentences
        ]

    def answer_question(self, question: str, documents: List[Dict]) -> Dict:
        """Answer ``question`` from ``documents`` and explain the answer.

        The three best-ranked pages are each run through the QA model; the
        candidate with the highest confidence is returned.

        Args:
            question: Natural-language question.
            documents: Page dicts as produced by ``extract_text_with_metadata``.

        Returns:
            ``{'answer', 'confidence', 'context', 'page_number',
            'explanations'}`` for the best candidate; the answer is prefixed
            with ``[Low Confidence]`` when confidence is below 0.85.  When
            there is nothing to search, returns ``{'answer': 'No confident
            answer found', 'confidence': 0.0, 'explanations': {}}``.
        """
        no_answer = {"answer": "No confident answer found", "confidence": 0.0, "explanations": {}}
        if not documents:
            # Nothing to index; skip building a retriever over an empty corpus.
            return no_answer

        # Create searchable index
        retriever = PDFAugmentedRetriever([doc['content'] for doc in documents])

        # Retrieve relevant context
        relevant_contexts = retriever.retrieve(question, top_k=3)

        # Built once for all candidates.
        # NOTE(review): this uses the default factor, not self.logits_processor
        # (0.85) created in __init__ - confirm which one is intended.
        calibrator = LogitsProcessorList([ConfidenceCalibrator()])

        answers = []
        for page_idx, context, similarity_score in relevant_contexts:
            inputs = self.qa_tokenizer(
                question,
                context,
                add_special_tokens=True,
                return_tensors="pt",
                max_length=512,
                truncation="only_second"
            )
            # Move inputs to the correct device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.qa_model(**inputs)

            start_logits = calibrator(inputs['input_ids'], outputs.start_logits)
            end_logits = calibrator(inputs['input_ids'], outputs.end_logits)

            start_prob = torch.nn.functional.softmax(start_logits, dim=-1)
            end_prob = torch.nn.functional.softmax(end_logits, dim=-1)

            # Most likely start token, then the most likely end at or after it
            max_start_score, max_start_idx = torch.max(start_prob, dim=-1)
            start_idx = max_start_idx.item()
            max_end_score, max_end_idx = torch.max(end_prob[0, start_idx:], dim=-1)
            end_idx = max_end_idx.item() + start_idx

            # NOTE(review): similarity_score is the raw retriever score and
            # is not clamped here, so this product may fall outside [0, 1].
            confidence = float((max_start_score * max_end_score) * 0.9 * similarity_score)

            answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
            answer = self.qa_tokenizer.decode(answer_tokens, skip_special_tokens=True)

            answers.append({
                "answer": answer,
                "confidence": confidence,
                "context": context,
                "page_number": documents[page_idx]['metadata']['page'],
            })

        if not answers:
            return no_answer

        best_answer = max(answers, key=lambda x: x['confidence'])

        # Only the winning candidate is returned, so only it is explained;
        # this avoids running the generator for discarded candidates.
        # Result contains 'concepts' and 'explanations'.
        best_answer["explanations"] = self.detailed_explainer.explain_text(
            best_answer["answer"], best_answer["context"]
        )

        # Flag answers below the confidence threshold
        if best_answer['confidence'] < 0.85:
            best_answer['answer'] = f"[Low Confidence] {best_answer['answer']}"

        return best_answer

    def _cluster_text(self, X, n_clusters=5):
        """Group the rows of ``X`` with KMeans.

        Returns:
            A list with one list of row indices per cluster.  When ``X`` has
            fewer rows than ``n_clusters`` every row forms its own cluster.
        """
        if X.shape[0] < n_clusters:
            return [[i] for i in range(X.shape[0])]
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        clusters = [[] for _ in range(n_clusters)]
        for idx, label in enumerate(labels):
            clusters[label].append(idx)
        return clusters

    def _cross_validate_sentences(self, sentences: List[str]) -> List[float]:
        """Score each sentence by the sum of its TF-IDF weights.

        The TF-IDF model is fitted on ``sentences`` themselves with English
        stop words removed.

        Returns:
            One float per sentence, in input order; all zeros when no
            vocabulary can be built from the sentences.
        """
        if not sentences:
            return []
        vectorizer = TfidfVectorizer(stop_words='english')
        try:
            tfidf_matrix = vectorizer.fit_transform(sentences)
        except ValueError:
            # Raised when the vocabulary is empty (e.g. only stop words).
            return [0.0] * len(sentences)
        # Row sums come back as an (n, 1) matrix; flatten to plain floats.
        return np.asarray(tfidf_matrix.sum(axis=1)).ravel().tolist()

    def _find_supporting_evidence(self, sentence: str, retriever, top_k: int = 2) -> List[str]:
        """Return the text of the ``top_k`` passages ``retriever`` ranks best for ``sentence``."""
        results = retriever.retrieve(sentence, top_k=top_k)
        return [context for _, context, _ in results]
|
427 |
-
|
428 |
-
|
429 |
-
def _print_concept_explanations(qa_result):
    """Print the per-concept explanations attached to an answer dict, if any."""
    print("\nConcepts explained in detail:")
    explained = qa_result.get('explanations')
    if explained:
        for concept in explained['concepts']:
            explanation = explained['explanations'].get(concept, "")
            print(f"\n>> {concept}:\n{explanation}\n")


if __name__ == "__main__":
    # Interactive console session: the user questions the document first,
    # then the program quizzes the user on summary sentences.
    analyzer = AdvancedPDFAnalyzer()
    file_path = input("Enter PDF file path (default: example.pdf): ").strip() or "example.pdf"
    documents = analyzer.extract_text_with_metadata(file_path)

    print("\nYou can now ask questions about the document. Type 'exit' to stop.")
    while True:
        user_question = input("\nAsk a question (or type 'exit'): ").strip()
        if user_question.lower() in ["exit", "quit"]:
            break
        qa_result = analyzer.answer_question(user_question, documents)
        confidence = qa_result['confidence']
        print(f"AI Answer: {qa_result['answer']} (Confidence: {confidence:.2f})")

        # Confidence bands; values in [0.7, 0.85) print only the answer line.
        if confidence >= 0.85:
            print("\n[Info] High confidence in answer, you can trust the response.")
            pprint.pprint(qa_result)
            _print_concept_explanations(qa_result)
        elif 0.60 <= confidence < 0.7:
            print(f"\n[Warning] Confidence below 0.7 , confidence {confidence}, Use the Quandans AI responses for reference only and confirm with the document. \n")
            # pprint is the module here; the function is pprint.pprint.
            pprint.pprint(qa_result)  # Print the full QA result for debugging
            _print_concept_explanations(qa_result)
        elif confidence < 0.60:
            print(f"[Warning] Low confidence in answer confidence:{confidence} . Consider rephrasing your question or checking the document.")

    # Now the model asks the user questions
    print("\nNow the model will ask you questions about the document. Type 'exit' to stop.")
    # Summary sentences double as quiz questions
    contents = [doc['content'] for doc in documents]
    summary = analyzer._generate_summary_with_confidence(
        "\n".join(contents),
        PDFAugmentedRetriever(contents)
    )

    # Loaded lazily on the first answer and then reused for every question.
    similarity_model = None
    for i, doc_result in enumerate(summary):
        question = f"What is the meaning of: '{doc_result.content}'?"
        print(f"\nQuestion {i + 1}: {question}")
        user_answer = input("Your answer: ").strip()
        if user_answer.lower() in ["exit", "quit"]:
            break
        # Cosine similarity between the user's answer and the source sentence.
        # Best effort: any failure is reported and the quiz continues.
        try:
            if similarity_model is None:
                similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
            emb_user = similarity_model.encode([user_answer])[0]
            emb_correct = similarity_model.encode([doc_result.content])[0]
            similarity = np.dot(emb_user, emb_correct) / (np.linalg.norm(emb_user) * np.linalg.norm(emb_correct))
            print(f"Your answer similarity score: {similarity:.2f}")
        except Exception as e:
            print(f"Could not evaluate answer similarity: {e}")

    print("Session ended.")
|
496 |
-
|
497 |
-
|
498 |
-
# Module-level singletons shared by every Gradio request: the analyzer
# (models are loaded in its constructor) and the parsed pages of the PDF.
analyzer = AdvancedPDFAnalyzer()
# Change the path below to serve a different document.
documents = analyzer.extract_text_with_metadata("example.pdf")
|
501 |
-
|
502 |
-
def ask_question_gradio(question: str):
    """Answer ``question`` against the loaded PDF and format it as Markdown.

    Args:
        question: Text typed by the user; ``None``, empty and
            whitespace-only input are rejected.

    Returns:
        A Markdown string with the answer, its confidence and one bullet per
        explained concept; a prompt to enter a question when the input is
        blank; or an error line when answering fails.
    """
    # `not question` also covers None, which has no .strip().
    if not question or not question.strip():
        return "Please enter a valid question."
    # UI boundary: report any failure to the user instead of raising.
    try:
        result = analyzer.answer_question(question, documents)
        answer = result['answer']
        confidence = result['confidence']
        concept_notes = result.get("explanations", {}).get("explanations", {})
        explanation = "\n\n".join(
            f"🔹 {concept}: {desc}"
            for concept, desc in concept_notes.items()
        )
        return f"📌 **Answer**: {answer}\n\n🔒 **Confidence**: {confidence:.2f}\n\n📘 **Explanations**:\n{explanation}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
|
516 |
-
|
517 |
-
# Web front end: one textbox routed through ask_question_gradio, with the
# reply rendered as Markdown.
demo = gr.Interface(
    title="Quandans AI - Ask Questions",
    description="Enter a question based on the loaded PDF document. The system will provide an answer with confidence and concept explanations.",
    fn=ask_question_gradio,
    inputs=gr.Textbox(label="Ask a question about the PDF"),
    outputs=gr.Markdown(label="Answer"),
)

# Start serving as soon as the module is executed.
demo.launch()
|
527 |
-
|
528 |
-
'''
|
529 |
-
|
530 |
import os
|
531 |
import re
|
532 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import re
|
3 |
import json
|