Spaces:

priyanshu23456
/

pdfassistant

Sleeping

App Files Files Community

priyanshu23456 commited on Apr 11

Commit

a987525

verified ·

1 Parent(s): f800f49

Update app.py

Browse files

Files changed (1) hide show

app.py +219 -82

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
-from flask_cors import CORS  # ✅ Add this line
 import os
 import torch
 import fitz  # PyMuPDF
@@ -12,6 +12,11 @@ import faiss
 import numpy as np
 import tempfile
 from PIL import Image
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
@@ -19,16 +24,65 @@ os.environ["HF_HOME"] = "/tmp"
 os.environ["XDG_CACHE_HOME"] = "/tmp"
 app = Flask(__name__)
-CORS(app)  # ✅ Enable CORS for all routes
 UPLOAD_FOLDER = "/tmp/uploads"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Improved OCR function
 def ocr_pdf(pdf_path):
     try:
         # Use a higher DPI for better quality
         images = convert_from_path(
             pdf_path,
@@ -39,17 +93,20 @@ def ocr_pdf(pdf_path):
         )
         text = ""
-        for img in images:
             # Preprocess the image for better OCR results
             preprocessed = preprocess_image_for_ocr(img)
             # Use tesseract with more options
-            text += pytesseract.image_to_string(
                 preprocessed,
                 config='--psm 1 --oem 3 -l eng'  # Page segmentation mode 1 (auto), OCR Engine mode 3 (default)
             )
         return text
     except Exception as e:
-        print(f"OCR error: {str(e)}")
         return ""
 # Image preprocessing function for better OCR
@@ -66,27 +123,38 @@ def preprocess_image_for_ocr(img):
 # Improved extract_text function with better text detection
 def extract_text(pdf_path):
-    doc = fitz.open(pdf_path)
-    text = ""
-    for page in doc:
-        page_text = page.get_text()
-        text += page_text
-    # Check if the text is meaningful (more sophisticated check)
-    words = text.split()
-    unique_words = set(word.lower() for word in words if len(word) > 2)
-    # If we don't have enough meaningful text, try OCR
-    if len(unique_words) < 20 or len(text.strip()) < 100:
-        ocr_text = ocr_pdf(pdf_path)
-        # If OCR gave us more text, use it
-        if len(ocr_text.strip()) > len(text.strip()):
-            text = ocr_text
-    return text
-# ✅ Split into chunks
 def split_into_chunks(text, max_tokens=300, overlap=50):
     sentences = text.split('.')
     chunks, current = [], ''
     for sentence in sentences:
@@ -102,60 +170,95 @@ def split_into_chunks(text, max_tokens=300, overlap=50):
                 current = sentence
     if current:
         chunks.append(current.strip())
     return chunks
-# ✅ Setup FAISS
 def setup_faiss(chunks):
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    embeddings = embedder.encode(chunks)
-    dim = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dim)
-    index.add(embeddings)
-    return index, embeddings, chunks
-# ✅ QA pipeline
 def answer_with_qa_pipeline(chunks, question):
-    qa_pipeline = pipeline(
-        "question-answering",
-        model="distilbert-base-cased-distilled-squad",
-        tokenizer="distilbert-base-cased",
-        device=0 if device == "cuda" else -1
-    )
-    context = " ".join(chunks[:5])
     try:
         result = qa_pipeline(question=question, context=context)
         return result["answer"]
-    except:
         return ""
-# Modify your answer_with_generation function like this:
 def answer_with_generation(index, embeddings, chunks, question):
-    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-    # Fix for meta tensor error - load model with device_map="auto"
-    model = AutoModelForCausalLM.from_pretrained(
-        "distilgpt2",
-        device_map="auto",  # This handles device placement automatically
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # Use fp16 if possible
-    )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        model.config.pad_token_id = model.config.eos_token_id
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    q_embedding = embedder.encode([question])
-    _, top_k_indices = index.search(q_embedding, k=3)
-    relevant_chunks = [chunks[i] for i in top_k_indices[0]]
-    context = " ".join(relevant_chunks)
-    prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
-    # Handle inputs without explicit device placement
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-    # Let the model handle device placement internally
     try:
         output = model.generate(
             **inputs,
             max_new_tokens=300,
@@ -165,15 +268,19 @@ def answer_with_generation(index, embeddings, chunks, question):
             num_beams=3,
             no_repeat_ngram_size=2
         )
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
         if "Detailed answer:" in answer:
-            return answer.split("Detailed answer:")[-1].strip()
         return answer.strip()
     except Exception as e:
-        print(f"Generation error: {str(e)}")
         return "I couldn't generate a good answer based on the PDF content."
-# ✅ API route
 @app.route('/')
 def home():
     return jsonify({"message": "PDF QA API is running!"})
@@ -182,28 +289,58 @@ def home():
 def ask():
     file = request.files.get("pdf")
     question = request.form.get("question", "")
     if not file or not question:
         return jsonify({"error": "Both PDF file and question are required"}), 400
-    filename = secure_filename(file.filename)
-    filepath = os.path.join(UPLOAD_FOLDER, filename)
-    file.save(filepath)
     try:
-        # 🧠 Process PDF and generate answer
         text = extract_text(filepath)
         chunks = split_into_chunks(text)
-        answer = answer_with_qa_pipeline(chunks, question)
-        if len(answer.strip()) < 20:
-            index, embeddings, chunks = setup_faiss(chunks)
-            answer = answer_with_generation(index, embeddings, chunks, question)
         return jsonify({"answer": answer})
     except Exception as e:
-        return jsonify({"error": str(e)}), 500
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)

 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
+from flask_cors import CORS
 import os
 import torch
 import fitz  # PyMuPDF
 import numpy as np
 import tempfile
 from PIL import Image
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["XDG_CACHE_HOME"] = "/tmp"
 app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
 UPLOAD_FOLDER = "/tmp/uploads"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {device}")
+# Global model variables
+embedder = None
+qa_pipeline = None
+tokenizer = None
+model = None
+# Initialize models once on startup
+def initialize_models():
+    global embedder, qa_pipeline, tokenizer, model
+    try:
+        logger.info("Loading SentenceTransformer model...")
+        embedder = SentenceTransformer("all-MiniLM-L6-v2")
+        logger.info("Loading QA pipeline...")
+        qa_pipeline = pipeline(
+            "question-answering",
+            model="distilbert-base-cased-distilled-squad",
+            tokenizer="distilbert-base-cased",
+            device=0 if device == "cuda" else -1
+        )
+        logger.info("Loading language model...")
+        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+        model = AutoModelForCausalLM.from_pretrained(
+            "distilgpt2",
+            device_map="auto",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = model.config.eos_token_id
+        logger.info("Models initialized successfully")
+    except Exception as e:
+        logger.error(f"Error initializing models: {str(e)}")
+        raise
+# Cleanup function for temporary files
+def cleanup_temp_files(filepath):
+    try:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+            logger.info(f"Removed temporary file: {filepath}")
+    except Exception as e:
+        logger.warning(f"Failed to clean up file {filepath}: {str(e)}")
 # Improved OCR function
 def ocr_pdf(pdf_path):
     try:
+        logger.info(f"Starting OCR for {pdf_path}")
         # Use a higher DPI for better quality
         images = convert_from_path(
             pdf_path,
         )
         text = ""
+        for i, img in enumerate(images):
+            logger.info(f"Processing page {i+1} of {len(images)}")
             # Preprocess the image for better OCR results
             preprocessed = preprocess_image_for_ocr(img)
             # Use tesseract with more options
+            page_text = pytesseract.image_to_string(
                 preprocessed,
                 config='--psm 1 --oem 3 -l eng'  # Page segmentation mode 1 (auto), OCR Engine mode 3 (default)
             )
+            text += page_text
+        logger.info(f"OCR completed with {len(text)} characters extracted")
         return text
     except Exception as e:
+        logger.error(f"OCR error: {str(e)}")
         return ""
 # Image preprocessing function for better OCR
 # Improved extract_text function with better text detection
 def extract_text(pdf_path):
+    try:
+        logger.info(f"Extracting text from {pdf_path}")
+        doc = fitz.open(pdf_path)
+        text = ""
+        for page_num, page in enumerate(doc):
+            page_text = page.get_text()
+            text += page_text
+            logger.info(f"Extracted {len(page_text)} characters from page {page_num+1}")
+        # Check if the text is meaningful (more sophisticated check)
+        words = text.split()
+        unique_words = set(word.lower() for word in words if len(word) > 2)
+        logger.info(f"PDF text extraction: {len(text)} chars, {len(words)} words, {len(unique_words)} unique words")
+        # If we don't have enough meaningful text, try OCR
+        if len(unique_words) < 20 or len(text.strip()) < 100:
+            logger.info("Text extraction yielded insufficient results, trying OCR...")
+            ocr_text = ocr_pdf(pdf_path)
+            # If OCR gave us more text, use it
+            if len(ocr_text.strip()) > len(text.strip()):
+                logger.info(f"Using OCR result: {len(ocr_text)} chars (better than {len(text)} chars)")
+                text = ocr_text
+        return text
+    except Exception as e:
+        logger.error(f"Text extraction error: {str(e)}")
+        return ""
+# Split into chunks
 def split_into_chunks(text, max_tokens=300, overlap=50):
+    logger.info(f"Splitting text into chunks (max_tokens={max_tokens}, overlap={overlap})")
     sentences = text.split('.')
     chunks, current = [], ''
     for sentence in sentences:
                 current = sentence
     if current:
         chunks.append(current.strip())
+    logger.info(f"Split text into {len(chunks)} chunks")
     return chunks
+# Setup FAISS
 def setup_faiss(chunks):
+    try:
+        logger.info("Setting up FAISS index")
+        global embedder
+        if embedder is None:
+            embedder = SentenceTransformer("all-MiniLM-L6-v2")
+        embeddings = embedder.encode(chunks)
+        dim = embeddings.shape[1]
+        index = faiss.IndexFlatL2(dim)
+        index.add(embeddings)
+        logger.info(f"FAISS index created with {len(chunks)} chunks and dimension {dim}")
+        return index, embeddings, chunks
+    except Exception as e:
+        logger.error(f"FAISS setup error: {str(e)}")
+        raise
+# QA pipeline
 def answer_with_qa_pipeline(chunks, question):
     try:
+        logger.info(f"Answering with QA pipeline: '{question}'")
+        global qa_pipeline
+        if qa_pipeline is None:
+            logger.info("QA pipeline not initialized, creating now...")
+            qa_pipeline = pipeline(
+                "question-answering",
+                model="distilbert-base-cased-distilled-squad",
+                tokenizer="distilbert-base-cased",
+                device=0 if device == "cuda" else -1
+            )
+        # Limit context size to avoid token length issues
+        context = " ".join(chunks[:5])
+        if len(context) > 5000:  # Approx token limit
+            context = context[:5000]
         result = qa_pipeline(question=question, context=context)
+        logger.info(f"QA pipeline answer: '{result['answer']}' (score: {result['score']})")
         return result["answer"]
+    except Exception as e:
+        logger.error(f"QA pipeline error: {str(e)}")
         return ""
+# Generation-based answering
 def answer_with_generation(index, embeddings, chunks, question):
     try:
+        logger.info(f"Answering with generation model: '{question}'")
+        global tokenizer, model
+        if tokenizer is None or model is None:
+            logger.info("Generation models not initialized, creating now...")
+            tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+            model = AutoModelForCausalLM.from_pretrained(
+                "distilgpt2",
+                device_map="auto",
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+            )
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+                model.config.pad_token_id = model.config.eos_token_id
+        # Get embeddings for question
+        q_embedding = embedder.encode([question])
+        # Find relevant chunks
+        _, top_k_indices = index.search(q_embedding, k=3)
+        relevant_chunks = [chunks[i] for i in top_k_indices[0]]
+        context = " ".join(relevant_chunks)
+        # Limit context size to avoid token length issues
+        if len(context) > 4000:
+            context = context[:4000]
+        # Create prompt
+        prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
+        # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+        # Move inputs to the right device if needed
+        if torch.cuda.is_available():
+            inputs = {k: v.to('cuda') for k, v in inputs.items()}
+        # Generate answer
         output = model.generate(
             **inputs,
             max_new_tokens=300,
             num_beams=3,
             no_repeat_ngram_size=2
         )
+        # Decode and format answer
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
         if "Detailed answer:" in answer:
+            answer = answer.split("Detailed answer:")[-1].strip()
+        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
         return answer.strip()
     except Exception as e:
+        logger.error(f"Generation error: {str(e)}")
         return "I couldn't generate a good answer based on the PDF content."
+# API route
 @app.route('/')
 def home():
     return jsonify({"message": "PDF QA API is running!"})
 def ask():
     file = request.files.get("pdf")
     question = request.form.get("question", "")
+    filepath = None
     if not file or not question:
         return jsonify({"error": "Both PDF file and question are required"}), 400
     try:
+        filename = secure_filename(file.filename)
+        filepath = os.path.join(UPLOAD_FOLDER, filename)
+        file.save(filepath)
+        logger.info(f"Processing file: {filename}, Question: '{question}'")
+        # Process PDF and generate answer
         text = extract_text(filepath)
+        if not text.strip():
+            return jsonify({"error": "Could not extract text from the PDF"}), 400
         chunks = split_into_chunks(text)
+        if not chunks:
+            return jsonify({"error": "PDF content couldn't be processed"}), 400
+        try:
+            answer = answer_with_qa_pipeline(chunks, question)
+        except Exception as e:
+            logger.warning(f"QA pipeline failed: {str(e)}")
+            answer = ""
+        # If QA pipeline didn't give a good answer, try generation
+        if not answer or len(answer.strip()) < 20:
+            try:
+                logger.info("QA pipeline answer insufficient, trying generation...")
+                index, embeddings, chunks = setup_faiss(chunks)
+                answer = answer_with_generation(index, embeddings, chunks, question)
+            except Exception as e:
+                logger.error(f"Generation fallback failed: {str(e)}")
+                return jsonify({"error": "Failed to generate answer from PDF content"}), 500
         return jsonify({"answer": answer})
     except Exception as e:
+        logger.error(f"Error processing request: {str(e)}")
+        return jsonify({"error": f"An error occurred processing your request: {str(e)}"}), 500
+    finally:
+        # Always clean up, even if errors occur
+        if filepath:
+            cleanup_temp_files(filepath)
 if __name__ == "__main__":
+    try:
+        # Initialize models at startup
+        initialize_models()
+        logger.info("Starting Flask application")
+        app.run(host="0.0.0.0", port=7860)
+    except Exception as e:
+        logger.critical(f"Failed to start application: {str(e)}")