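"""Flask API for question answering over uploaded PDFs.

Text is extracted with PyMuPDF (with an OCR fallback for scanned PDFs),
split into overlapping chunks, and answered first by an extractive
DistilBERT QA pipeline; if that answer is too short, the app falls back
to FAISS retrieval over sentence-transformer embeddings plus distilgpt2
generation.
"""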
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from flask_cors import CORS  # Add this line
import os
import torch
import fitz # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Fix caching issue on Hugging Face Spaces
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["HF_HOME"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes
UPLOAD_FOLDER = "/tmp/uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
# OCR for scanned PDFs
def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        text += pytesseract.image_to_string(img)
    return text

# Extract text (fall back to OCR if the PDF has little selectable text)
def extract_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    if len(text.strip()) < 50:
        text = ocr_pdf(pdf_path)
    return text

# Split text into overlapping chunks
# NOTE: despite the parameter name, lengths are measured in characters, not tokens
def split_into_chunks(text, max_tokens=300, overlap=50):
    sentences = text.split('.')
    chunks, current = [], ''
    for sentence in sentences:
        sentence = sentence.strip() + '.'
        if len(current) + len(sentence) < max_tokens:
            current += sentence
        else:
            chunks.append(current.strip())
            words = current.split()
            if len(words) > overlap:
                current = ' '.join(words[-overlap:]) + ' ' + sentence
            else:
                current = sentence
    if current:
        chunks.append(current.strip())
    return chunks

# Set up a FAISS index over the chunk embeddings
def setup_faiss(chunks):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings, chunks

# Extractive QA pipeline
def answer_with_qa_pipeline(chunks, question):
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        tokenizer="distilbert-base-cased",
        device=0 if device == "cuda" else -1,
    )
    context = " ".join(chunks[:5])
    try:
        result = qa_pipeline(question=question, context=context)
        return result["answer"]
    except Exception:
        return ""

# Generation fallback: retrieve relevant chunks and generate an answer
def answer_with_generation(index, embeddings, chunks, question):
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    q_embedding = embedder.encode([question])
    _, top_k_indices = index.search(q_embedding, k=3)
    relevant_chunks = [chunks[i] for i in top_k_indices[0]]
    context = " ".join(relevant_chunks)
    prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    output = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_beams=3,
        no_repeat_ngram_size=2,
    )
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Detailed answer:" in answer:
        return answer.split("Detailed answer:")[-1].strip()
    return answer.strip()

# API routes
@app.route('/')
def home():
    return jsonify({"message": "PDF QA API is running!"})


@app.route('/ask', methods=['POST'])
def ask():
    file = request.files.get("pdf")
    question = request.form.get("question", "")
    if not file or not question:
        return jsonify({"error": "Both PDF file and question are required"}), 400
    filename = secure_filename(file.filename)
    filepath = os.path.join(UPLOAD_FOLDER, filename)
    file.save(filepath)
    try:
        # 🔧 Process PDF and generate answer
        text = extract_text(filepath)
        chunks = split_into_chunks(text)
        answer = answer_with_qa_pipeline(chunks, question)
        if len(answer.strip()) < 20:
            index, embeddings, chunks = setup_faiss(chunks)
            answer = answer_with_generation(index, embeddings, chunks, question)
        return jsonify({"answer": answer})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
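
# Example request (a sketch, assuming the server is running locally on port 7860
# and "document.pdf" is a PDF of your choice):
#
#   curl -X POST http://localhost:7860/ask \
#        -F "pdf=@document.pdf" \
#        -F "question=What is this document about?"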