priyanshu23456 committed
Commit f800f49 · verified · 1 Parent(s): c9b3650

Update app.py

Files changed (1)
  1. app.py +84 -27
app.py CHANGED
@@ -10,6 +10,8 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
+import tempfile
+from PIL import Image
 
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
@@ -24,22 +26,63 @@ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# ✅ OCR for scanned PDFs
+# Improved OCR function
 def ocr_pdf(pdf_path):
-    images = convert_from_path(pdf_path)
-    text = ""
-    for img in images:
-        text += pytesseract.image_to_string(img)
-    return text
+    try:
+        # Use a higher DPI for better quality
+        images = convert_from_path(
+            pdf_path,
+            dpi=300,              # Higher DPI for better quality
+            grayscale=False,      # Color might help with some PDFs
+            thread_count=2,       # Use multiple threads
+            use_pdftocairo=True   # pdftocairo often gives better results
+        )
+
+        text = ""
+        for img in images:
+            # Preprocess the image for better OCR results
+            preprocessed = preprocess_image_for_ocr(img)
+            # Use tesseract with more options
+            text += pytesseract.image_to_string(
+                preprocessed,
+                config='--psm 1 --oem 3 -l eng'  # Page segmentation mode 1 (auto), OCR Engine mode 3 (default)
+            )
+        return text
+    except Exception as e:
+        print(f"OCR error: {str(e)}")
+        return ""
 
-# ✅ Extract text
+# Image preprocessing function for better OCR
+def preprocess_image_for_ocr(img):
+    # Convert to grayscale
+    gray = img.convert('L')
+
+    # Optional: You could add more preprocessing here like:
+    # - Thresholding
+    # - Noise removal
+    # - Contrast enhancement
+
+    return gray
+
+# Improved extract_text function with better text detection
 def extract_text(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page in doc:
-        text += page.get_text()
-    if len(text.strip()) < 50:
-        text = ocr_pdf(pdf_path)
+        page_text = page.get_text()
+        text += page_text
+
+    # Check if the text is meaningful (more sophisticated check)
+    words = text.split()
+    unique_words = set(word.lower() for word in words if len(word) > 2)
+
+    # If we don't have enough meaningful text, try OCR
+    if len(unique_words) < 20 or len(text.strip()) < 100:
+        ocr_text = ocr_pdf(pdf_path)
+        # If OCR gave us more text, use it
+        if len(ocr_text.strip()) > len(text.strip()):
+            text = ocr_text
+
     return text
 
 # ✅ Split into chunks
@@ -85,10 +128,17 @@ def answer_with_qa_pipeline(chunks, question):
     except:
         return ""
 
-# ✅ Generation fallback
+# Modify your answer_with_generation function like this:
 def answer_with_generation(index, embeddings, chunks, question):
     tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
+
+    # Fix for meta tensor error - load model with device_map="auto"
+    model = AutoModelForCausalLM.from_pretrained(
+        "distilgpt2",
+        device_map="auto",  # This handles device placement automatically
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # Use fp16 if possible
+    )
+
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = model.config.eos_token_id
@@ -100,21 +150,28 @@ def answer_with_generation(index, embeddings, chunks, question):
     context = " ".join(relevant_chunks)
 
     prompt = f"Answer the following question based on this information:\n\nInformation: {context}\n\nQuestion: {question}\n\nDetailed answer:"
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
-
-    output = model.generate(
-        **inputs,
-        max_new_tokens=300,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True,
-        num_beams=3,
-        no_repeat_ngram_size=2
-    )
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    if "Detailed answer:" in answer:
-        return answer.split("Detailed answer:")[-1].strip()
-    return answer.strip()
+
+    # Handle inputs without explicit device placement
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+    # Let the model handle device placement internally
+
+    try:
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=3,
+            no_repeat_ngram_size=2
+        )
+        answer = tokenizer.decode(output[0], skip_special_tokens=True)
+        if "Detailed answer:" in answer:
+            return answer.split("Detailed answer:")[-1].strip()
+        return answer.strip()
+    except Exception as e:
+        print(f"Generation error: {str(e)}")
+        return "I couldn't generate a good answer based on the PDF content."
 
 # ✅ API route
 @app.route('/')
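Note: the "Optional" comment inside the new preprocess_image_for_ocr only names the extra cleanup steps (thresholding, noise removal, contrast enhancement) without showing them. Below is a minimal sketch of what they could look like with PIL, which the commit already imports; it is not part of the commit, and the threshold cutoff of 140 and the median filter size of 3 are arbitrary assumptions.

from PIL import ImageFilter, ImageOps

def preprocess_image_for_ocr(img):
    # Grayscale, as in the committed version
    gray = img.convert('L')
    # Contrast enhancement: stretch the intensity histogram
    gray = ImageOps.autocontrast(gray)
    # Noise removal: small median filter
    gray = gray.filter(ImageFilter.MedianFilter(3))
    # Thresholding: binarize with an assumed cutoff of 140
    return gray.point(lambda p: 255 if p > 140 else 0)

Whether binarization actually helps depends on the scans; pytesseract accepts the plain grayscale image as well, so the two variants are easy to compare on a sample page.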
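One caveat on the model-loading change: passing device_map="auto" to from_pretrained only works when the accelerate package is installed in the Space; otherwise the call fails before the model is loaded. A guarded load could fall back to the pre-commit behaviour. This is a sketch, not part of the commit, and the exact exception type raised varies between transformers versions.

import torch
from transformers import AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    # Automatic device placement; requires `accelerate`
    model = AutoModelForCausalLM.from_pretrained(
        "distilgpt2",
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
except (ImportError, ValueError) as err:
    print(f"device_map='auto' unavailable ({err}); falling back to manual placement")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)

On a CPU-only Space the fallback path is effectively the original behaviour.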