Update app.py
app.py
CHANGED
@@ -1,90 +1,137 @@
 import os
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-import io
 import fitz  # PyMuPDF
+import pytesseract
+from pdf2image import convert_from_path
 import torch
-
+import faiss
+import numpy as np
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from sentence_transformers import SentenceTransformer
+import gradio as gr

-
-CORS(app)
-
-# Device setup
+# ✅ Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")

-#
-
-
-
+# ✅ OCR fallback
+def ocr_pdf(pdf_path):
+    images = convert_from_path(pdf_path)
+    text = ""
+    for img in images:
+        text += pytesseract.image_to_string(img)
+    return text
+
+# ✅ Text extraction
+def extract_text(pdf_path):
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    if len(text.strip()) < 50:
+        print("⚠️ Not enough text, using OCR fallback...")
+        text = ocr_pdf(pdf_path)
+    print("✅ Text extraction complete")
+    return text
+
+# ✅ Chunking
+def split_into_chunks(text, max_tokens=300, overlap=50):
+    sentences = text.split('.')
+    chunks, current = [], ''
+    for sentence in sentences:
+        sentence = sentence.strip() + '.'
+        if len(current) + len(sentence) < max_tokens:
+            current += sentence
+        else:
+            chunks.append(current.strip())
+            words = current.split()
+            if len(words) > overlap:
+                current = ' '.join(words[-overlap:]) + ' ' + sentence
+            else:
+                current = sentence
+    if current:
+        chunks.append(current.strip())
+    return chunks
+
+# ✅ FAISS setup
+def setup_faiss(chunks):
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = embedder.encode(chunks)
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings)
+    return index, embeddings, chunks
+
+# ✅ QA method
+def answer_with_qa_pipeline(chunks, question):
+    qa_pipeline = pipeline(
         "question-answering",
         model="distilbert-base-cased-distilled-squad",
+        tokenizer="distilbert-base-cased",
         device=0 if device == "cuda" else -1
     )
-
-except Exception as e:
-    print(f"❌ Error loading model: {e}")
-    raise
-
-# Text extraction from PDFs
-def extract_text(pdf_bytes):
+    context = " ".join(chunks[:5])
     try:
-
-        text = ""
-        for page in doc:
-            text += page.get_text()
-        print("✅ Text extraction complete")
-        return text
-    except Exception as e:
-        print(f"❌ Text extraction error: {e}")
-        return ""
-
-# Process PDF and answer question
-def process_pdf_and_answer(pdf_bytes, question):
-    try:
-        # Extract text from PDF
-        text = extract_text(pdf_bytes)
-        if not text:
-            return "Could not extract text from the PDF."
-
-        # Use QA model directly (limiting context size for memory constraints)
-        result = qa_model(question=question, context=text[:5000])
+        result = qa_pipeline(question=question, context=context)
         return result['answer']
-    except
-
-
+    except:
+        return "Could not answer with QA pipeline."
+
+# ✅ Generation method
+def answer_with_generation(index, embeddings, chunks, question):
+    model_name = "distilgpt2"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        model.config.pad_token_id = model.config.eos_token_id

-
-
-
-
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    q_embedding = embedder.encode([question])
+    _, top_k_indices = index.search(q_embedding, k=3)
+    relevant_chunks = [chunks[i] for i in top_k_indices[0]]
+    context = " ".join(relevant_chunks)
+
+    prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

-@app.route("/api/ask", methods=["POST"])
-def ask_question():
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=3,
+            no_repeat_ngram_size=2
+        )
+        answer = tokenizer.decode(output[0], skip_special_tokens=True)
+        if "Detailed answer:" in answer:
+            return answer.split("Detailed answer:")[-1].strip()
+        return answer
+    except:
+        return "Could not generate answer."
+
+# ✅ Main logic
+def process_pdf(file, question):
+    pdf_path = file.name
+    text = extract_text(pdf_path)
+    chunks = split_into_chunks(text)
+    qa_answer = answer_with_qa_pipeline(chunks, question)
+    if len(qa_answer) < 20:
+        index, embeddings, chunks = setup_faiss(chunks)
+        return answer_with_generation(index, embeddings, chunks, question)
+    return qa_answer
+
+# ✅ Gradio UI
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Textbox(label="Ask a question", placeholder="What is this PDF about?")
+    ],
+    outputs="text",
+    title="PDF Chat Assistant",
+    description="Upload a PDF and ask anything about its content, even if it has scanned images!"
+)

-
-    port = int(os.environ.get("PORT", 7860))
-    app.run(host="0.0.0.0", port=port)
+iface.launch()
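Note that the new OCR path depends on system binaries that pip alone does not provide: pytesseract wraps the Tesseract OCR engine and pdf2image wraps Poppler. On a Hugging Face Space these are usually supplied through a packages.txt next to requirements.txt (for example tesseract-ocr and poppler-utils); that file is not part of this diff, so treat those exact package names as an assumption.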
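Because app.py now calls iface.launch() at import time, its functions cannot be imported for a quick check without also starting the Gradio server. The sketch below assumes that launch line is temporarily guarded by if __name__ == "__main__": (or commented out), and that a hypothetical sample.pdf exists locally; with those assumptions it exercises the whole extract → chunk → answer path that the UI wires to process_pdf.

# Hypothetical smoke test; assumes iface.launch() in app.py is guarded so the
# import below does not start Gradio, and that "sample.pdf" (made-up name)
# exists in the working directory.
from types import SimpleNamespace

from app import process_pdf

# process_pdf only reads the .name attribute of its first argument,
# so a SimpleNamespace can stand in for Gradio's upload object.
fake_upload = SimpleNamespace(name="sample.pdf")

print(process_pdf(fake_upload, "What is this PDF about?"))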