from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from flask_cors import CORS  # ✅ Enable cross-origin requests from the frontend
import os
import torch
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Redirect model caches to /tmp (the default cache directory is not writable on Hugging Face Spaces)
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["HF_HOME"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"

app = Flask(__name__)
CORS(app)  # ✅ Enable CORS for all routes

UPLOAD_FOLDER = "/tmp/uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"

# ✅ OCR for scanned PDFs
def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        text += pytesseract.image_to_string(img)
    return text

# ✅ Extract text
def extract_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
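    # If almost no selectable text was extracted, the PDF is likely scanned; fall back to OCR.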
    if len(text.strip()) < 50:
        text = ocr_pdf(pdf_path)
    return text

# ✅ Split into chunks
def split_into_chunks(text, max_tokens=300, overlap=50):
    # Note: max_tokens is measured in characters and overlap in words, not model tokens.
    sentences = text.split('.')
    chunks, current = [], ''
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        sentence += '.'
        if len(current) + len(sentence) < max_tokens:
            current += sentence
        else:
            if current:
                chunks.append(current.strip())
            # Carry the last `overlap` words into the next chunk to preserve context.
            words = current.split()
            if len(words) > overlap:
                current = ' '.join(words[-overlap:]) + ' ' + sentence
            else:
                current = sentence
    if current:
        chunks.append(current.strip())
    return chunks

# ✅ Setup FAISS
def setup_faiss(chunks):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    # FAISS expects a float32 matrix of shape (n_chunks, dim)
    embeddings = np.asarray(embedder.encode(chunks), dtype="float32")
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings, chunks

# ✅ QA pipeline
def answer_with_qa_pipeline(chunks, question):
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        tokenizer="distilbert-base-cased-distilled-squad",
        device=0 if device == "cuda" else -1
    )
    # Use the first few chunks as context; the pipeline truncates long contexts itself.
    context = " ".join(chunks[:5])
    try:
        result = qa_pipeline(question=question, context=context)
        return result["answer"]
    except Exception:
        return ""

# ✅ Generation fallback
def answer_with_generation(index, embeddings, chunks, question):
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    q_embedding = np.asarray(embedder.encode([question]), dtype="float32")
    _, top_k_indices = index.search(q_embedding, k=3)
    relevant_chunks = [chunks[i] for i in top_k_indices[0]]
    context = " ".join(relevant_chunks)

    prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
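    # Truncate the prompt so it fits within distilgpt2's context window before generation.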
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

    output = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_beams=3,
        no_repeat_ngram_size=2
    )
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Detailed answer:" in answer:
        return answer.split("Detailed answer:")[-1].strip()
    return answer.strip()

# ✅ API routes
@app.route('/')
def home():
    return jsonify({"message": "PDF QA API is running!"})

@app.route('/ask', methods=['POST'])
def ask():
    file = request.files.get("pdf")
    question = request.form.get("question", "")

    if not file or not question:
        return jsonify({"error": "Both PDF file and question are required"}), 400

    filename = secure_filename(file.filename)
    filepath = os.path.join(UPLOAD_FOLDER, filename)
    file.save(filepath)

    try:
        # 🧠 Process the PDF and generate an answer
        text = extract_text(filepath)
        chunks = split_into_chunks(text)
        answer = answer_with_qa_pipeline(chunks, question)

        # Fall back to retrieval + generation when the extractive answer is too short.
        if len(answer.strip()) < 20:
            index, embeddings, chunks = setup_faiss(chunks)
            answer = answer_with_generation(index, embeddings, chunks, question)

        return jsonify({"answer": answer})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

    finally:
        # Remove the uploaded file so /tmp does not fill up across requests.
        if os.path.exists(filepath):
            os.remove(filepath)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
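
# Example request (a sketch; "document.pdf" is a placeholder file name and the host
# assumes the server above is running locally on port 7860):
#
#   curl -X POST http://localhost:7860/ask \
#        -F "pdf=@document.pdf" \
#        -F "question=What is this document about?"
#
# A successful call returns JSON of the form {"answer": "..."}; missing inputs
# return {"error": "..."} with HTTP 400, and processing failures return HTTP 500.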