Spaces:

priyanshu23456
/

pdfassistant

Sleeping

App Files Files Community

priyanshu23456 committed on 17 days ago

Commit

e87b8a7

verified ·

1 Parent(s): 324a36a

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -126

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 import os
@@ -6,12 +6,14 @@ import torch
 import fitz  # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 import tempfile
 from PIL import Image
 import logging
@@ -73,83 +75,6 @@ def initialize_models():
         logger.error(f"Error initializing models: {str(e)}")
         raise
-# Generation-based answering
-def answer_with_generation(index, embeddings, chunks, question):
-    try:
-        logger.info(f"Answering with generation model: '{question}'")
-        global tokenizer, model
-        if tokenizer is None or model is None:
-            logger.info("Generation models not initialized, creating now...")
-            model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map="cpu",
-                low_cpu_mem_usage=True
-            )
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-                model.config.pad_token_id = model.config.eos_token_id
-        # Get embeddings for question
-        q_embedding = embedder.encode([question])
-        # Find relevant chunks
-        _, top_k_indices = index.search(q_embedding, k=3)
-        relevant_chunks = [chunks[i] for i in top_k_indices[0]]
-        context = " ".join(relevant_chunks)
-        # Limit context size
-        if len(context) > 2000:
-            context = context[:2000]
-        # Create prompt
-        prompt = f"""<|im_start|>system
-You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
-<|im_end|>
-<|im_start|>user
-**Context**: {context}
-**Question**: {question}
-**Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
-        # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-        # Move inputs to CPU
-        inputs = {k: v.to('cpu') for k, v in inputs.items()}
-        # Generate answer
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=2,
-            no_repeat_ngram_size=2
-        )
-        # Decode and format answer
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "<|im_end|>" in answer:
-            answer = answer.split("<|im_end|>")[1].strip()
-        elif "Instruction" in answer:
-            answer = answer.split("Instruction")[1].strip()
-        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
-        return answer.strip()
-    except Exception as e:
-        logger.error(f"Generation error: {str(e)}")
-        return "I couldn't generate a good answer based on the PDF content."
 # Cleanup function for temporary files
 def cleanup_temp_files(filepath):
     try:
@@ -297,19 +222,21 @@ def answer_with_qa_pipeline(chunks, question):
         logger.error(f"QA pipeline error: {str(e)}")
         return ""
-# Generation-based answering
-def answer_with_generation(index, embeddings, chunks, question):
     try:
-        logger.info(f"Answering with generation model: '{question}'")
         global tokenizer, model
         if tokenizer is None or model is None:
             logger.info("Generation models not initialized, creating now...")
-            tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
             model = AutoModelForCausalLM.from_pretrained(
-                "distilgpt2",
-                device_map="auto",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
             )
             if tokenizer.pad_token is None:
@@ -324,41 +251,47 @@ def answer_with_generation(index, embeddings, chunks, question):
         relevant_chunks = [chunks[i] for i in top_k_indices[0]]
         context = " ".join(relevant_chunks)
-        # Limit context size to avoid token length issues
-        if len(context) > 4000:
-            context = context[:4000]
         # Create prompt
-        prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
         # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-        # Move inputs to the right device if needed
-        if torch.cuda.is_available():
-            inputs = {k: v.to('cuda') for k, v in inputs.items()}
-        # Generate answer
-        output = model.generate(
             **inputs,
             max_new_tokens=300,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
-            num_beams=3,
             no_repeat_ngram_size=2
         )
-        # Decode and format answer
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "Detailed answer:" in answer:
-            answer = answer.split("Detailed answer:")[-1].strip()
-        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
-        return answer.strip()
     except Exception as e:
-        logger.error(f"Generation error: {str(e)}")
-        return "I couldn't generate a good answer based on the PDF content."
 # API route
 @app.route('/')
@@ -369,6 +302,7 @@ def home():
 def ask():
     file = request.files.get("pdf")
     question = request.form.get("question", "")
     filepath = None
     if not file or not question:
@@ -379,9 +313,9 @@ def ask():
         filepath = os.path.join(UPLOAD_FOLDER, filename)
         file.save(filepath)
-        logger.info(f"Processing file: {filename}, Question: '{question}'")
-        # Process PDF and generate answer
         text = extract_text(filepath)
         if not text.strip():
             return jsonify({"error": "Could not extract text from the PDF"}), 400
@@ -389,33 +323,137 @@ def ask():
         chunks = split_into_chunks(text)
         if not chunks:
             return jsonify({"error": "PDF content couldn't be processed"}), 400
-        try:
-            answer = answer_with_qa_pipeline(chunks, question)
-        except Exception as e:
-            logger.warning(f"QA pipeline failed: {str(e)}")
-            answer = ""
-        # If QA pipeline didn't give a good answer, try generation
-        if not answer or len(answer.strip()) < 20:
             try:
-                logger.info("QA pipeline answer insufficient, trying generation...")
-                index, embeddings, chunks = setup_faiss(chunks)
-                answer = answer_with_generation(index, embeddings, chunks, question)
             except Exception as e:
-                logger.error(f"Generation fallback failed: {str(e)}")
-                return jsonify({"error": "Failed to generate answer from PDF content"}), 500
-        return jsonify({"answer": answer})
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         return jsonify({"error": f"An error occurred processing your request: {str(e)}"}), 500
     finally:
-        # Always clean up, even if errors occur
-        if filepath:
             cleanup_temp_files(filepath)
 if __name__ == "__main__":
     try:
         # Initialize models at startup

+from flask import Flask, request, jsonify, Response
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 import os
 import fitz  # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 import tempfile
 from PIL import Image
+import threading
+import json
 import logging
         logger.error(f"Error initializing models: {str(e)}")
         raise
 # Cleanup function for temporary files
 def cleanup_temp_files(filepath):
     try:
         logger.error(f"QA pipeline error: {str(e)}")
         return ""
+# Generation-based answering with streaming support
+def generate_streaming_answer(index, embeddings, chunks, question, streamer):
     try:
+        logger.info(f"Generating streaming answer for: '{question}'")
         global tokenizer, model
         if tokenizer is None or model is None:
             logger.info("Generation models not initialized, creating now...")
+            model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map="cpu",
+                low_cpu_mem_usage=True
             )
             if tokenizer.pad_token is None:
         relevant_chunks = [chunks[i] for i in top_k_indices[0]]
         context = " ".join(relevant_chunks)
+        # Limit context size
+        if len(context) > 2000:
+            context = context[:2000]
         # Create prompt
+        prompt = f"""<|im_start|>system
+You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
+<|im_end|>
+<|im_start|>user
+**Context**: {context}
+**Question**: {question}
+**Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
         # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        # Move inputs to CPU
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+        # Generate answer using the streamer
+        generate_kwargs = dict(
             **inputs,
+            streamer=streamer,
             max_new_tokens=300,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
+            num_beams=2,
             no_repeat_ngram_size=2
         )
+        # Generate the answer (this will stream through the streamer)
+        model.generate(**generate_kwargs)
     except Exception as e:
+        logger.error(f"Streaming generation error: {str(e)}")
+        # If an error occurs during streaming, push an error message to the streamer
+        try:
+            streamer.put("I encountered an error while generating the response.")
+        except:
+            pass
 # API route
 @app.route('/')
 def ask():
     file = request.files.get("pdf")
     question = request.form.get("question", "")
+    streaming = request.form.get("streaming", "true").lower() == "true"
     filepath = None
     if not file or not question:
         filepath = os.path.join(UPLOAD_FOLDER, filename)
         file.save(filepath)
+        logger.info(f"Processing file: {filename}, Question: '{question}', Streaming: {streaming}")
+        # Process PDF and extract text
         text = extract_text(filepath)
         if not text.strip():
             return jsonify({"error": "Could not extract text from the PDF"}), 400
         chunks = split_into_chunks(text)
         if not chunks:
             return jsonify({"error": "PDF content couldn't be processed"}), 400
+        # Set up FAISS for semantic search
+        index, embeddings, chunks = setup_faiss(chunks)
+        # For non-streaming responses, use the regular approach
+        if not streaming:
             try:
+                answer = answer_with_qa_pipeline(chunks, question)
+                if not answer or len(answer.strip()) < 20:
+                    answer = answer_with_generation(index, embeddings, chunks, question)
+                return jsonify({"answer": answer})
             except Exception as e:
+                logger.error(f"Error generating answer: {str(e)}")
+                return jsonify({"error": f"An error occurred: {str(e)}"}), 500
+        # For streaming responses, use SSE
+        else:
+            try:
+                # Create a streamer for the text generation
+                streamer = TextIteratorStreamer(
+                    tokenizer, skip_prompt=True, skip_special_tokens=True
+                )
+                # Start generation in a separate thread
+                thread = threading.Thread(
+                    target=generate_streaming_answer,
+                    args=(index, embeddings, chunks, question, streamer)
+                )
+                thread.start()
+                # Stream responses as Server-Sent Events (SSE)
+                def generate():
+                    for new_text in streamer:
+                        yield f"data: {json.dumps({'response': new_text})}\n\n"
+                    yield "data: [DONE]\n\n"
+                # Cleanup will happen in a separate thread after the response is complete
+                cleanup_thread = threading.Thread(
+                    target=cleanup_temp_files,
+                    args=(filepath,)
+                )
+                cleanup_thread.daemon = True
+                cleanup_thread.start()
+                return Response(generate(), mimetype="text/event-stream")
+            except Exception as e:
+                logger.error(f"Error in streaming setup: {str(e)}")
+                return jsonify({"error": f"An error occurred: {str(e)}"}), 500
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         return jsonify({"error": f"An error occurred processing your request: {str(e)}"}), 500
     finally:
+        # For non-streaming responses, clean up immediately
+        # For streaming, we clean up in a separate thread
+        if filepath and not streaming:
             cleanup_temp_files(filepath)
+# Original generation function kept for non-streaming use
+def answer_with_generation(index, embeddings, chunks, question):
+    """Answer ``question`` from the most relevant PDF chunks using the causal LM.
+
+    Args:
+        index: FAISS index searched with the question embedding.
+        embeddings: Chunk embeddings; not used here, kept so existing callers
+            can keep passing the same arguments.
+        chunks: Text chunks, looked up by the positions ``index.search`` returns.
+        question: The user's question.
+
+    Returns:
+        The generated answer text, or a fixed fallback message when retrieval
+        or generation raises.
+    """
+    try:
+        logger.info(f"Answering with generation model: '{question}'")
+        global tokenizer, model
+        # Lazily load the generation model the first time it is needed.
+        if tokenizer is None or model is None:
+            logger.info("Generation models not initialized, creating now...")
+            model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            # NOTE(review): float16 on CPU can be slow or unsupported for some
+            # ops depending on the torch build - confirm on the target host.
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map="cpu",
+                low_cpu_mem_usage=True
+            )
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+                model.config.pad_token_id = model.config.eos_token_id
+        # Get embeddings for question
+        q_embedding = embedder.encode([question])
+        # Find relevant chunks. Never request more neighbours than there are
+        # chunks: FAISS pads missing results with -1, and chunks[-1] would
+        # silently duplicate the last chunk in the context.
+        top_k = min(3, len(chunks))
+        _, top_k_indices = index.search(q_embedding, k=top_k)
+        relevant_chunks = [chunks[i] for i in top_k_indices[0] if i >= 0]
+        # Limit context size
+        context = " ".join(relevant_chunks)[:2000]
+        # Create prompt. The trailing assistant header opens the model's turn
+        # so generation starts with the answer instead of continuing the user
+        # message.
+        prompt = f"""<|im_start|>system
+You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
+<|im_end|>
+<|im_start|>user
+**Context**: {context}
+**Question**: {question}
+**Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>
+<|im_start|>assistant
+"""
+        # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        # Move inputs to CPU
+        inputs = {name: tensor.to('cpu') for name, tensor in inputs.items()}
+        # Generate answer
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=2,
+            no_repeat_ngram_size=2
+        )
+        # Decode only the tokens produced after the prompt. Decoding the whole
+        # sequence with skip_special_tokens=True strips the <|im_end|> markers,
+        # so splitting on them never matched and the echoed prompt leaked into
+        # the returned answer.
+        prompt_length = inputs["input_ids"].shape[-1]
+        answer = tokenizer.decode(
+            output[0][prompt_length:], skip_special_tokens=True
+        ).strip()
+        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
+        return answer
+    except Exception as e:
+        # Best-effort boundary: the caller expects a string, never an exception.
+        logger.error(f"Generation error: {str(e)}")
+        return "I couldn't generate a good answer based on the PDF content."
 if __name__ == "__main__":
     try:
         # Initialize models at startup