priyanshu23456 committed
Commit e509c53 · verified · 1 Parent(s): 8e0dc95

Update app.py

Files changed (1)
  1. app.py +60 -61
app.py CHANGED
@@ -1,18 +1,22 @@
+from flask import Flask, request, jsonify
+from werkzeug.utils import secure_filename
 import os
+import torch
 import fitz # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
-import torch
-import faiss
-import numpy as np
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
-import gradio as gr
+import faiss
+import numpy as np
+
+app = Flask(__name__)
+UPLOAD_FOLDER = "uploads"
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
-# ✅ Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# ✅ OCR fallback
+# ✅ OCR for scanned PDFs
 def ocr_pdf(pdf_path):
     images = convert_from_path(pdf_path)
     text = ""
@@ -20,19 +24,17 @@ def ocr_pdf(pdf_path):
         text += pytesseract.image_to_string(img)
     return text
 
-# ✅ Text extraction
+# ✅ Extract text
 def extract_text(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
         text += page.get_text()
     if len(text.strip()) < 50:
-        print("⚠️ Not enough text, using OCR fallback...")
         text = ocr_pdf(pdf_path)
-    print("✅ Text extraction complete")
     return text
 
-# ✅ Chunking
+# ✅ Split into chunks
 def split_into_chunks(text, max_tokens=300, overlap=50):
     sentences = text.split('.')
     chunks, current = [], ''
@@ -51,16 +53,16 @@ def split_into_chunks(text, max_tokens=300, overlap=50):
         chunks.append(current.strip())
     return chunks
 
-# ✅ FAISS setup
+# ✅ Setup FAISS
 def setup_faiss(chunks):
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = embedder.encode(chunks)
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
+    dim = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dim)
     index.add(embeddings)
     return index, embeddings, chunks
 
-# ✅ QA method
+# ✅ QA pipeline
 def answer_with_qa_pipeline(chunks, question):
     qa_pipeline = pipeline(
         "question-answering",
@@ -71,16 +73,14 @@
     context = " ".join(chunks[:5])
     try:
         result = qa_pipeline(question=question, context=context)
-        return result['answer']
+        return result["answer"]
     except:
-        return "Could not answer with QA pipeline."
+        return ""
 
-# ✅ Generation method
+# ✅ Generation fallback
 def answer_with_generation(index, embeddings, chunks, question):
-    model_name = "distilgpt2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
-
+    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
@@ -94,44 +94,43 @@ def answer_with_generation(index, embeddings, chunks, question):
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
 
+    output = model.generate(
+        **inputs,
+        max_new_tokens=300,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+        num_beams=3,
+        no_repeat_ngram_size=2
+    )
+    answer = tokenizer.decode(output[0], skip_special_tokens=True)
+    if "Detailed answer:" in answer:
+        return answer.split("Detailed answer:")[-1].strip()
+    return answer.strip()
+
+# ✅ API route
+@app.route('/ask', methods=['POST'])
+def ask():
+    file = request.files.get("pdf")
+    question = request.form.get("question", "")
+
+    if not file or not question:
+        return jsonify({"error": "PDF and question required"}), 400
+
+    filename = secure_filename(file.filename)
+    filepath = os.path.join(UPLOAD_FOLDER, filename)
+    file.save(filepath)
+
     try:
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=3,
-            no_repeat_ngram_size=2
-        )
-        answer = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "Detailed answer:" in answer:
-            return answer.split("Detailed answer:")[-1].strip()
-        return answer
-    except:
-        return "Could not generate answer."
-
-# ✅ Main logic
-def process_pdf(file, question):
-    pdf_path = file.name
-    text = extract_text(pdf_path)
-    chunks = split_into_chunks(text)
-    qa_answer = answer_with_qa_pipeline(chunks, question)
-    if len(qa_answer) < 20:
-        index, embeddings, chunks = setup_faiss(chunks)
-        return answer_with_generation(index, embeddings, chunks, question)
-    return qa_answer
-
-# ✅ Gradio UI
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=[
-        gr.File(label="Upload PDF"),
-        gr.Textbox(label="Ask a question", placeholder="What is this PDF about?")
-    ],
-    outputs="text",
-    title="📄 PDF Chat Assistant",
-    description="Upload a PDF and ask anything about its content, even if it has scanned images!"
-)
-
-iface.launch()
+        text = extract_text(filepath)
+        chunks = split_into_chunks(text)
+        answer = answer_with_qa_pipeline(chunks, question)
+        if len(answer.strip()) < 20:
+            index, embeddings, chunks = setup_faiss(chunks)
+            answer = answer_with_generation(index, embeddings, chunks, question)
+        return jsonify({"answer": answer})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860)
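
This commit replaces the Gradio UI with a Flask API exposing a single POST /ask route that accepts a PDF upload and a question. A minimal client sketch for exercising that route, assuming the app is running locally on port 7860; the sample.pdf path and the example question are placeholders, not part of the commit:

# Send a PDF and a question to the /ask endpoint and print the JSON reply.
import requests

with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/ask",
        files={"pdf": f},                               # matches request.files.get("pdf")
        data={"question": "What is this PDF about?"},   # matches request.form.get("question")
    )

print(resp.status_code, resp.json())  # {"answer": "..."} on success, {"error": "..."} otherwise

The field names "pdf" and "question" must match the keys read in ask(); any other client (curl, a frontend form) works the same way as long as it sends multipart/form-data with those two fields.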