Spaces:

priyanshu23456
/

pdfassistant

Sleeping

App Files Files Community

priyanshu23456 committed on Apr 8

Commit

e509c53

verified ·

1 Parent(s): 8e0dc95

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -61

app.py CHANGED Viewed

@@ -1,18 +1,22 @@
 import os
 import fitz  # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
-import torch
-import faiss
-import numpy as np
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
-import gradio as gr
-# ✅ Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ✅ OCR fallback
 def ocr_pdf(pdf_path):
     images = convert_from_path(pdf_path)
     text = ""
@@ -20,19 +24,17 @@ def ocr_pdf(pdf_path):
         text += pytesseract.image_to_string(img)
     return text
-# ✅ Text extraction
 def extract_text(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
         text += page.get_text()
     if len(text.strip()) < 50:
-        print("⚠️ Not enough text, using OCR fallback...")
         text = ocr_pdf(pdf_path)
-    print("✅ Text extraction complete")
     return text
-# ✅ Chunking
 def split_into_chunks(text, max_tokens=300, overlap=50):
     sentences = text.split('.')
     chunks, current = [], ''
@@ -51,16 +53,16 @@ def split_into_chunks(text, max_tokens=300, overlap=50):
         chunks.append(current.strip())
     return chunks
-# ✅ FAISS setup
 def setup_faiss(chunks):
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = embedder.encode(chunks)
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
     index.add(embeddings)
     return index, embeddings, chunks
-# ✅ QA method
 def answer_with_qa_pipeline(chunks, question):
     qa_pipeline = pipeline(
         "question-answering",
@@ -71,16 +73,14 @@ def answer_with_qa_pipeline(chunks, question):
     context = " ".join(chunks[:5])
     try:
         result = qa_pipeline(question=question, context=context)
-        return result['answer']
     except:
-        return "Could not answer with QA pipeline."
-# ✅ Generation method
 def answer_with_generation(index, embeddings, chunks, question):
-    model_name = "distilgpt2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
@@ -94,44 +94,43 @@ def answer_with_generation(index, embeddings, chunks, question):
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
     try:
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=3,
-            no_repeat_ngram_size=2
-        )
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "Detailed answer:" in answer:
-            return answer.split("Detailed answer:")[-1].strip()
-        return answer
-    except:
-        return "Could not generate answer."
-# ✅ Main logic
-def process_pdf(file, question):
-    pdf_path = file.name
-    text = extract_text(pdf_path)
-    chunks = split_into_chunks(text)
-    qa_answer = answer_with_qa_pipeline(chunks, question)
-    if len(qa_answer) < 20:
-        index, embeddings, chunks = setup_faiss(chunks)
-        return answer_with_generation(index, embeddings, chunks, question)
-    return qa_answer
-# ✅ Gradio UI
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=[
-        gr.File(label="Upload PDF"),
-        gr.Textbox(label="Ask a question", placeholder="What is this PDF about?")
-    ],
-    outputs="text",
-    title="📄 PDF Chat Assistant",
-    description="Upload a PDF and ask anything about its content, even if it has scanned images!"
-)
-iface.launch()

+from flask import Flask, request, jsonify
+from werkzeug.utils import secure_filename
 import os
+import torch
 import fitz  # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+app = Flask(__name__)
+UPLOAD_FOLDER = "uploads"
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# ✅ OCR for scanned PDFs
 def ocr_pdf(pdf_path):
     images = convert_from_path(pdf_path)
     text = ""
         text += pytesseract.image_to_string(img)
     return text
+# ✅ Extract text
 def extract_text(pdf_path):
     """Return the text of the PDF at ``pdf_path``.

     The PDF is opened with PyMuPDF and the result of ``get_text()`` for each
     page is concatenated. If the combined text, after stripping surrounding
     whitespace, is shorter than 50 characters, it is replaced by the output
     of ``ocr_pdf(pdf_path)``.
     """
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
         text += page.get_text()
     # Too little extracted text: discard it and use the OCR result instead.
     if len(text.strip()) < 50:
         text = ocr_pdf(pdf_path)
     return text
+# ✅ Split into chunks
 def split_into_chunks(text, max_tokens=300, overlap=50):
     sentences = text.split('.')
     chunks, current = [], ''
         chunks.append(current.strip())
     return chunks
+# ✅ Setup FAISS
 def setup_faiss(chunks):
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = embedder.encode(chunks)
+    dim = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dim)
     index.add(embeddings)
     return index, embeddings, chunks
+# ✅ QA pipeline
 def answer_with_qa_pipeline(chunks, question):
     qa_pipeline = pipeline(
         "question-answering",
     context = " ".join(chunks[:5])
     try:
         result = qa_pipeline(question=question, context=context)
+        return result["answer"]
     except:
+        return ""
+# ✅ Generation fallback
 def answer_with_generation(index, embeddings, chunks, question):
+    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
+    output = model.generate(
+        **inputs,
+        max_new_tokens=300,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+        num_beams=3,
+        no_repeat_ngram_size=2
+    )
+    answer = tokenizer.decode(output[0], skip_special_tokens=True)
+    if "Detailed answer:" in answer:
+        return answer.split("Detailed answer:")[-1].strip()
+    return answer.strip()
+# ✅ API route
+@app.route('/ask', methods=['POST'])
+def ask():
+    file = request.files.get("pdf")
+    question = request.form.get("question", "")
+    if not file or not question:
+        return jsonify({"error": "PDF and question required"}), 400
+    filename = secure_filename(file.filename)
+    filepath = os.path.join(UPLOAD_FOLDER, filename)
+    file.save(filepath)
     try:
+        text = extract_text(filepath)
+        chunks = split_into_chunks(text)
+        answer = answer_with_qa_pipeline(chunks, question)
+        if len(answer.strip()) < 20:
+            index, embeddings, chunks = setup_faiss(chunks)
+            answer = answer_with_generation(index, embeddings, chunks, question)
+        return jsonify({"answer": answer})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860)