Spaces:

AmelC
/

Quandans

Sleeping

App Files Community

AmelC committed on Jul 11

Commit

ddad505

verified ·

1 Parent(s): 707f8ee

Update app.py

Browse files

Files changed (1) hide show

app.py +0 -529

app.py CHANGED Viewed

@@ -1,532 +1,3 @@
-'''
-import gradio as gr
-import os
-import re
-import json
-import torch
-import numpy as np
-import logging
-from typing import Dict, List, Tuple, Optional
-from tqdm import tqdm
-from pydantic import BaseModel
-import pprint
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-    AutoModelForQuestionAnswering,
-    pipeline,
-    LogitsProcessor,
-    LogitsProcessorList,
-    PreTrainedModel,
-    PreTrainedTokenizer
-)
-from sentence_transformers import SentenceTransformer, CrossEncoder
-from sklearn.feature_extraction.text import TfidfVectorizer
-from rank_bm25 import BM25Okapi
-import PyPDF2
-from sklearn.cluster import KMeans
-import spacy
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s"
-)
-print('====================== VERSION 6 (Force Use Of GPU)======================')
-class ConfidenceCalibrator(LogitsProcessor):
-    """Calibrates model confidence scores during generation"""
-    def __init__(self, calibration_factor: float = 0.9):
-        self.calibration_factor = calibration_factor
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        # Apply temperature scaling to smooth probability distribution
-        scores = scores / self.calibration_factor
-        return scores
-class DocumentResult(BaseModel):
-    """Structured output format for consistent results"""
-    content: str
-    confidence: float
-    source_page: int
-    supporting_evidence: List[str]
-class OptimalModelSelector:
-    """Dynamically selects best performing model for each task"""
-    def __init__(self):
-        self.qa_models = {
-            "deberta-v3": ("deepset/deberta-v3-large-squad2", 0.87),
-            "minilm": ("deepset/minilm-uncased-squad2", 0.84),
-            "roberta": ("deepset/roberta-base-squad2", 0.82)
-        }
-        self.summarization_models = {
-            "bart": ("facebook/bart-large-cnn", 0.85),
-            "pegasus": ("google/pegasus-xsum", 0.83)
-        }
-        self.current_models = {}
-    def get_best_model(self, task_type: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer, float]:
-        """Returns model with highest validation score for given task"""
-        model_map = self.qa_models if "qa" in task_type else self.summarization_models
-        best_model_name, best_score = max(model_map.items(), key=lambda x: x[1][1])
-        if best_model_name not in self.current_models:
-            logging.info(f"Loading {best_model_name} for {task_type}")
-            tokenizer = AutoTokenizer.from_pretrained(model_map[best_model_name][0])
-            model = (AutoModelForQuestionAnswering if "qa" in task_type
-                     else AutoModelForSeq2SeqLM).from_pretrained(model_map[best_model_name][0])
-            # Switch to eval mode and cast weights to half precision (fp16) before moving to the target device
-            model = model.eval().half().to('cuda' if torch.cuda.is_available() else 'cpu')
-            self.current_models[best_model_name] = (model, tokenizer)
-        return *self.current_models[best_model_name], best_score
-class PDFAugmentedRetriever:
-    """Enhanced context retrieval with hybrid search"""
-    def __init__(self, document_texts: List[str]):
-        self.documents = [(i, text) for i, text in enumerate(document_texts)]
-        self.bm25 = BM25Okapi([text.split() for _, text in self.documents])
-        self.encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-        self.tfidf = TfidfVectorizer(stop_words='english').fit([text for _, text in self.documents])
-    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[int, str, float]]:
-        """Hybrid retrieval combining lexical and semantic search"""
-        # BM25 (lexical search)
-        bm25_scores = self.bm25.get_scores(query.split())
-        # Semantic similarity
-        semantic_scores = self.encoder.predict([(query, doc) for _, doc in self.documents])
-        # Combine scores with fixed weights (0.4 lexical BM25, 0.6 cross-encoder)
-        combined_scores = 0.4 * bm25_scores + 0.6 * np.array(semantic_scores)
-        # Get top passages
-        top_indices = np.argsort(combined_scores)[-top_k:][::-1]
-        return [(self.documents[i][0], self.documents[i][1], float(combined_scores[i]))
-                for i in top_indices]
-class DetailedExplainer:
-    """
-    Extracts key concepts from a text and explains each in depth.
-    """
-    def __init__(self,
-                 explanation_model: str = "google/flan-t5-large",
-                 device: int = 0):
-        # generation pipeline for deep explanations
-        self.explainer = pipeline(
-            "text2text-generation",
-            model=explanation_model,
-            tokenizer=explanation_model,
-            device=device
-        )
-        # spaCy model for concept extraction
-        self.nlp = spacy.load("en_core_web_sm")
-    def extract_concepts(self, text: str) -> list:
-        """
-        Use noun chunks and named entities to identify concepts.
-        Returns a list of unique concept strings.
-        """
-        doc = self.nlp(text)
-        concepts = set()
-        for chunk in doc.noun_chunks:
-            if len(chunk) > 1 and not chunk.root.is_stop:
-                concepts.add(chunk.text.strip())
-        for ent in doc.ents:
-            if ent.label_ in ["PERSON", "ORG", "GPE", "NORP", "EVENT", "WORK_OF_ART"]:
-                concepts.add(ent.text.strip())
-        return list(concepts)
-    # The min_accuracy parameter is inserted into the prompt as a target percentage,
-    # asking the model for a sufficiently detailed explanation; it is not measured or enforced.
-    # This is useful for complex concepts where a simple explanation may not suffice.
-    #min_accuracy = 0.7  # Default minimum accuracy threshold
-    def explain_concept(self, concept: str, context: str, min_accuracy: float = 0.50) -> str:
-        """
-        Generate an explanation for a single concept using context.
-        Ensures at least `min_accuracy` via introspective prompt calibration.
-        """
-        prompt = (
-            f"Explain the concept '{concept}' in depth using the following context. "
-            f"Aim for at least {int(min_accuracy * 100)}% accuracy."
-            f"\nContext:\n{context}\n"
-        )
-        result = self.explainer(
-            prompt,
-            max_length=200,
-            min_length=80,
-            do_sample=False
-        )
-        return result[0]["generated_text"].strip()
-    def explain_text(self, text: str, context: str) -> dict:
-        """
-        For each concept in text, produce a detailed explanation.
-        Returns:
-          {
-            'concepts': [list of extracted concepts],
-            'explanations': {concept: explanation, ...}
-          }
-        """
-        concepts = self.extract_concepts(text)
-        explanations = {}
-        for concept in concepts:
-            explanations[concept] = self.explain_concept(concept, context)
-        return {"concepts": concepts, "explanations": explanations}
-class AdvancedPDFAnalyzer:
-    """
-    High-precision PDF analysis engine with confidence calibration
-    Confidence scores are empirically validated to reach 0.9+ on benchmark datasets
-    """
-    def __init__(self):
-        """Initialize with optimized model selection and retrieval"""
-        self.logger = logging.getLogger("PDFAnalyzer")
-        self.model_selector = OptimalModelSelector()
-        self._verify_dependencies()
-        # Force use of GPU if available
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        if torch.cuda.is_available():
-            print("[INFO] Using GPU for inference.")
-        else:
-            print("[INFO] Using CPU for inference.")
-        # Initialize with highest confidence models
-        self.qa_model, self.qa_tokenizer, _ = self.model_selector.get_best_model("qa")
-        self.qa_model = self.qa_model.to(self.device)
-        self.summarizer = pipeline(
-            "summarization",
-            model="facebook/bart-large-cnn",
-            device=0 if torch.cuda.is_available() else -1,
-            framework="pt"
-        )
-        # Confidence calibration setup
-        self.logits_processor = LogitsProcessorList([
-            ConfidenceCalibrator(calibration_factor=0.85)
-        ])
-        # Initialize the detailed explainer here
-        self.detailed_explainer = DetailedExplainer(
-            device=0 if torch.cuda.is_available() else -1
-        )
-    def _verify_dependencies(self):
-        """Check for critical dependencies"""
-        try:
-            PyPDF2.PdfReader
-        except ImportError:
-            raise ImportError("PyPDF2 required: pip install pypdf2")
-    def extract_text_with_metadata(self, file_path: str) -> List[Dict]:
-        """Extract text with page-level metadata and structural info"""
-        self.logger.info(f"Processing {file_path}")
-        documents = []
-        with open(file_path, 'rb') as f:
-            reader = PyPDF2.PdfReader(f)
-            for i, page in enumerate(tqdm(reader.pages)):
-                try:
-                    text = page.extract_text()
-                    if not text or not text.strip():
-                        continue
-                    # Add document context
-                    page_number = i + 1
-                    metadata = {
-                        'source': os.path.basename(file_path),
-                        'page': page_number,
-                        'char_count': len(text),
-                        'word_count': len(text.split()),
-                    }
-                    documents.append({
-                        'content': self._clean_text(text),
-                        'metadata': metadata
-                    })
-                except Exception as e:
-                    self.logger.warning(f"Page {i + 1} error: {str(e)}")
-        if not documents:
-            raise ValueError("No extractable content found in PDF")
-        return documents
-    def _clean_text(self, text: str) -> str:
-        """Advanced text normalization with document structure preservation"""
-        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', ' ', text)  # Remove control chars
-        text = re.sub(r'\s+', ' ', text)  # Standardize whitespace
-        text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)  # Fix hyphenated words
-        return text.strip()
-    def analyze_document(self, file_path: str) -> Dict:
-        """Full document analysis pipeline with confidence scoring"""
-        documents = self.extract_text_with_metadata(file_path)
-        text_chunks = [doc['content'] for doc in documents]
-        # Initialize retriever with document chunks
-        retriever = PDFAugmentedRetriever(text_chunks)
-        # Generate summary with confidence
-        summary = self._generate_summary_with_confidence(
-            "\n".join(text_chunks),
-            retriever
-        )
-        return {
-            'document_metadata': [doc['metadata'] for doc in documents],
-            'summary': summary,
-            'avg_confidence': np.mean([s.confidence for s in summary])
-        }
-    def _generate_summary_with_confidence(self, text: str, retriever: PDFAugmentedRetriever) -> List[DocumentResult]:
-        """Generates summary with calibrated confidence scores"""
-        sentences = [s.strip() for s in text.split('. ') if len(s.split()) > 6]
-        if not sentences:
-            return []
-        # Cluster sentences into topics
-        vectorizer = TfidfVectorizer(max_features=500)
-        X = vectorizer.fit_transform(sentences)
-        # Select most representative sentence per topic
-        summary_sentences = []
-        for cluster in self._cluster_text(X, n_clusters=min(5, len(sentences))):
-            cluster_sents = [sentences[i] for i in cluster]
-            sentence_scores = self._cross_validate_sentences(cluster_sents)
-            best_sentence = max(zip(cluster_sents, sentence_scores), key=lambda x: x[1])
-            summary_sentences.append(best_sentence)
-        # Format with confidence
-        return [
-            DocumentResult(
-                content=sent,
-                confidence=min(0.95, score * 1.1),  # Calibrated boost
-                source_page=0,
-                supporting_evidence=self._find_supporting_evidence(sent, retriever)
-            )
-            for sent, score in summary_sentences
-        ]
-    def answer_question(self, question: str, documents: List[Dict]) -> Dict:
-        """High-confidence QA with evidence retrieval and detailed explanations"""
-        # Create searchable index
-        retriever = PDFAugmentedRetriever([doc['content'] for doc in documents])
-        # Retrieve relevant context
-        relevant_contexts = retriever.retrieve(question, top_k=3)
-        answers = []
-        for page_idx, context, similarity_score in relevant_contexts:
-            # Prepare QA inputs dynamically
-            inputs = self.qa_tokenizer(
-                question,
-                context,
-                add_special_tokens=True,
-                return_tensors="pt",
-                max_length=512,
-                truncation="only_second"
-            )
-            # Move inputs to the correct device
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-            # Get model output with calibration
-            with torch.no_grad():
-                outputs = self.qa_model(**inputs)
-                start_logits = outputs.start_logits
-                end_logits = outputs.end_logits
-                # Apply confidence calibration
-                logits_processor = LogitsProcessorList([ConfidenceCalibrator()])
-                start_logits = logits_processor(inputs['input_ids'], start_logits)
-                end_logits = logits_processor(inputs['input_ids'], end_logits)
-                start_prob = torch.nn.functional.softmax(start_logits, dim=-1)
-                end_prob = torch.nn.functional.softmax(end_logits, dim=-1)
-                # Get best answer span
-                max_start_score, max_start_idx = torch.max(start_prob, dim=-1)
-                max_start_idx_int = max_start_idx.item()
-                max_end_score, max_end_idx = torch.max(end_prob[0, max_start_idx_int:], dim=-1)
-                max_end_idx_int = max_end_idx.item() + max_start_idx_int
-                confidence = float((max_start_score * max_end_score) * 0.9 * similarity_score)
-                answer_tokens = inputs["input_ids"][0][max_start_idx_int:max_end_idx_int + 1]
-                answer = self.qa_tokenizer.decode(answer_tokens, skip_special_tokens=True)
-                # Generate detailed explanations for concepts in answer
-                explanations_result = self.detailed_explainer.explain_text(answer, context)
-                answers.append({
-                    "answer": answer,
-                    "confidence": confidence,
-                    "context": context,
-                    "page_number": documents[page_idx]['metadata']['page'],
-                    "explanations": explanations_result  # contains 'concepts' and 'explanations'
-                })
-        # Select best answer with confidence validation
-        if not answers:
-            return {"answer": "No confident answer found", "confidence": 0.0, "explanations": {}}
-        best_answer = max(answers, key=lambda x: x['confidence'])
-        # Enforce minimum confidence threshold
-        if best_answer['confidence'] < 0.85:
-            best_answer['answer'] = f"[Low Confidence] {best_answer['answer']}"
-        return best_answer
-    def _cluster_text(self, X, n_clusters=5):
-        """
-        Cluster sentences using KMeans and return indices for each cluster.
-        Returns a list of lists, where each sublist contains indices of sentences in that cluster.
-        """
-        if X.shape[0] < n_clusters:
-            # Not enough sentences to cluster, return each as its own cluster
-            return [[i] for i in range(X.shape[0])]
-        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-        labels = kmeans.fit_predict(X)
-        clusters = [[] for _ in range(n_clusters)]
-        for idx, label in enumerate(labels):
-            clusters[label].append(idx)
-        return clusters
-    def _cross_validate_sentences(self, sentences: List[str]) -> List[float]:
-        """
-        Assigns a relevance/confidence score to each sentence in the cluster.
-        Here, we use the sum of TF-IDF weights as a proxy for importance.
-        """
-        if not sentences:
-            return []
-        vectorizer = TfidfVectorizer(stop_words='english')
-        tfidf_matrix = vectorizer.fit_transform(sentences)
-        # Score: sum of TF-IDF weights for each sentence
-        scores = tfidf_matrix.sum(axis=1)
-        # Flatten to 1D list of floats
-        return [float(s) for s in scores]
-    def _find_supporting_evidence(self, sentence: str, retriever, top_k: int = 2) -> List[str]:
-        """
-        Finds supporting evidence for a summary sentence using the retriever.
-        Returns a list of the most relevant document passages.
-        """
-        results = retriever.retrieve(sentence, top_k=top_k)
-        return [context for _, context, _ in results]
-if __name__ == "__main__":
-    analyzer = AdvancedPDFAnalyzer()
-    file_path = input("Enter PDF file path (default: example.pdf): ").strip() or "example.pdf"
-    documents = analyzer.extract_text_with_metadata(file_path)
-    print("\nYou can now ask questions about the document. Type 'exit' to stop.")
-    while True:
-        user_question = input("\nAsk a question (or type 'exit'): ").strip()
-        if user_question.lower() in ["exit", "quit"]:
-            break
-        qa_result = analyzer.answer_question(user_question, documents)
-        print(f"AI Answer: {qa_result['answer']} (Confidence: {qa_result['confidence']:.2f})")
-        ## Check confidence level
-        if qa_result['confidence'] >= 0.85:
-            print("\n[Info] High confidence in answer, you can trust the response.")
-            pprint.pprint(qa_result)
-            print("\nConcepts explained in detail:")
-            if 'explanations' in qa_result and qa_result['explanations']:
-                for concept in qa_result['explanations']['concepts']:
-                    explanation = qa_result['explanations']['explanations'].get(concept, "")
-                    print(f"\n>> {concept}:\n{explanation}\n")
-        if qa_result['confidence'] < 0.7 and qa_result['confidence'] >= 0.60:
-            # Print warning for confidence below 0.7
-            print(f"\n[Warning] Confidence below 0.7 , confidence {qa_result['confidence']}, Use the Quandans AI responses for reference only and confirm with the document. \n")
-            pprint(qa_result) #Print the full QA result for debugging
-            print("\nConcepts explained in detail:")
-            if 'explanations' in qa_result and qa_result['explanations']:
-                for concept in qa_result['explanations']['concepts']:
-                    explanation = qa_result['explanations']['explanations'].get(concept, "")
-                    print(f"\n>> {concept}:\n{explanation}\n")
-        if qa_result['confidence'] < 0.60:
-            print(f"[Warning] Low confidence in answer confidence:{qa_result['confidence']} . Consider rephrasing your question or checking the document.")
-        # Print detailed explanations for each concept
-        '''
-        if 'explanations' in qa_result and qa_result['explanations']:
-            print("\nConcepts explained in detail:")
-            for concept in qa_result['explanations']['concepts']:
-                explanation = qa_result['explanations']['explanations'].get(concept, "")
-                print(f"\n>> {concept}:\n{explanation}")
-        '''
-    # Now the model asks the user questions
-    print("\nNow the model will ask you questions about the document. Type 'exit' to stop.")
-    # Generate questions from the document (use summary sentences as questions)
-    summary = analyzer._generate_summary_with_confidence(
-        "\n".join([doc['content'] for doc in documents]),
-        PDFAugmentedRetriever([doc['content'] for doc in documents])
-    )
-    for i, doc_result in enumerate(summary):
-        question = f"What is the meaning of: '{doc_result.content}'?"
-        print(f"\nQuestion {i + 1}: {question}")
-        user_answer = input("Your answer: ").strip()
-        if user_answer.lower() in ["exit", "quit"]:
-            break
-        # Use sentence transformer for similarity
-        try:
-            model = SentenceTransformer('all-MiniLM-L6-v2')
-            correct = doc_result.content
-            emb_user = model.encode([user_answer])[0]
-            emb_correct = model.encode([correct])[0]
-            similarity = np.dot(emb_user, emb_correct) / (np.linalg.norm(emb_user) * np.linalg.norm(emb_correct))
-            print(f"Your answer similarity score: {similarity:.2f}")
-        except Exception as e:
-            print(f"Could not evaluate answer similarity: {e}")
-    print("Session ended.")
-# Initialize analyzer once
-analyzer = AdvancedPDFAnalyzer()
-documents = analyzer.extract_text_with_metadata("example.pdf")  # Change path if needed
-def ask_question_gradio(question: str):
-    if not question.strip():
-        return "Please enter a valid question."
-    try:
-        result = analyzer.answer_question(question, documents)
-        answer = result['answer']
-        confidence = result['confidence']
-        explanation = "\n\n".join(
-            f"🔹 {concept}: {desc}"
-            for concept, desc in result.get("explanations", {}).get("explanations", {}).items()
-        )
-        return f"📌 **Answer**: {answer}\n\n🔒 **Confidence**: {confidence:.2f}\n\n📘 **Explanations**:\n{explanation}"
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
-# Gradio Interface
-demo = gr.Interface(
-    fn=ask_question_gradio,
-    inputs=gr.Textbox(label="Ask a question about the PDF"),
-    outputs=gr.Markdown(label="Answer"),
-    title="Quandans AI - Ask Questions",
-    description="Enter a question based on the loaded PDF document. The system will provide an answer with confidence and concept explanations."
-)
-demo.launch()
-'''
 import os
 import re
 import json

 import os
 import re
 import json