Chaitanya895 committed
Commit e283321 · verified · 1 Parent(s): 08947b3

Update app.py

Files changed (1)
  1. app.py +92 -17
app.py CHANGED
@@ -2,6 +2,10 @@ import os
  import logging
  import subprocess
  import tempfile
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ from urllib.parse import urljoin, urlparse
  from flask import Flask, request, jsonify, render_template
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
  from PIL import Image
@@ -9,7 +13,6 @@ from pdf2image import convert_from_bytes
  import io
  import torch
  import hashlib
- import time
  from concurrent.futures import ThreadPoolExecutor
  import asyncio

@@ -45,6 +48,11 @@ MAX_IMAGE_DIMENSION = 1000 # Reduced for faster OCR
  # Timeout for OCR operation (in seconds)
  OCR_TIMEOUT = 5

+ # Crawling settings
+ CRAWL_DEPTH = 2 # Limit depth to avoid overloading
+ REQUEST_DELAY = 5 # Seconds between requests (politeness policy)
+ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+
  # Global variables to store the model and tokenizer
  model = None
  tokenizer = None
@@ -55,21 +63,17 @@ def allowed_file(filename):

  # Preprocess image for faster OCR
  def preprocess_image(image):
-     # Check image dimensions
      width, height = image.size
      logger.info(f"Original image dimensions: {width}x{height}")
      if width > MAX_IMAGE_DIMENSION or height > MAX_IMAGE_DIMENSION:
-         # Maintain aspect ratio
          scale = min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height)
          new_width = int(width * scale)
          new_height = int(height * scale)
          image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
          logger.info(f"Resized image to: {new_width}x{new_height}")
      else:
-         # Resize to 100 DPI (reduced for speed)
          target_dpi = 100
          image = image.resize((int(width * target_dpi / 72), int(height * target_dpi / 72)), Image.Resampling.LANCZOS)
-     # Convert to grayscale
      image = image.convert("L")
      return image
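Side note (not part of the commit): preprocess_image caps the longer side at MAX_IMAGE_DIMENSION while keeping the aspect ratio, then converts to 8-bit grayscale for Tesseract. A minimal standalone sketch of that same resize logic, using a synthetic Pillow image with made-up dimensions:

# Illustrative sketch only; the 2000x1500 image is a stand-in for a real upload.
from PIL import Image

MAX_IMAGE_DIMENSION = 1000  # same cap as in app.py

img = Image.new("RGB", (2000, 1500), "white")
scale = min(MAX_IMAGE_DIMENSION / img.width, MAX_IMAGE_DIMENSION / img.height)
resized = img.resize((int(img.width * scale), int(img.height * scale)),
                     Image.Resampling.LANCZOS)
gray = resized.convert("L")   # 8-bit grayscale, as preprocess_image does
print(gray.size, gray.mode)   # (1000, 750) L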
 
@@ -83,32 +87,26 @@ def get_file_hash(file):

  # Extract text from image or PDF with caching
  def extract_text(file):
      try:
-         # Compute file hash for caching
          file_hash = get_file_hash(file)
          cache_path = os.path.join(CACHE_DIR, f"{file_hash}.txt")

-         # Check if result is cached
          if os.path.exists(cache_path):
              with open(cache_path, "r", encoding="utf-8") as f:
                  return f.read().strip()

          start_time = time.time()

-         # Prepare the image for OCR
          if file.filename.rsplit('.', 1)[1].lower() == 'pdf':
              file_bytes = file.read()
-             # Process only the first page to reduce time
              images = convert_from_bytes(file_bytes, first_page=1, last_page=1)
              img = preprocess_image(images[0])
          else:
              img = Image.open(file)
              img = preprocess_image(img)

-         # Save the image to a temporary file
          with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img_file:
              img.save(temp_img_file.name)

-         # Use subprocess to run Tesseract with a timeout
          with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as temp_txt_file:
              try:
                  subprocess.run(
@@ -127,15 +125,12 @@ def extract_text(file):
                  os.unlink(temp_txt_file.name)
                  return f"Error extracting text with Tesseract: {e.stderr}"

-         # Read the output text
          with open(temp_txt_file.name, 'r', encoding='utf-8') as f:
              text = f.read().strip()

-         # Clean up temporary files
          os.unlink(temp_img_file.name)
          os.unlink(temp_txt_file.name)

-         # Cache the result
          with open(cache_path, "w", encoding="utf-8") as f:
              f.write(text)
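Side note (not part of the commit): the diff context elides get_file_hash and the subprocess.run arguments, so the snippet below is only an illustrative guess at the pattern the function follows, a content hash of the upload as the cache key plus the tesseract CLI driven through subprocess with a hard timeout; it is not the committed code.

# Hedged sketch of the cache-key + Tesseract-with-timeout pattern; names are hypothetical.
import hashlib
import subprocess
import tempfile

OCR_TIMEOUT = 5  # same constant as in app.py

def sha256_of_stream(f):
    # Hash a file-like object, then rewind it so it can still be read afterwards.
    digest = hashlib.sha256(f.read()).hexdigest()
    f.seek(0)
    return digest

def ocr_png(png_path, lang="ben"):
    # Run the Tesseract CLI on a PNG; Tesseract appends ".txt" to the output base.
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as out:
        out_base = out.name[:-4]
    subprocess.run(
        ["tesseract", png_path, out_base, "-l", lang],
        check=True,
        capture_output=True,
        timeout=OCR_TIMEOUT,   # mirrors the timeout the commit enforces
    )
    with open(out_base + ".txt", encoding="utf-8") as f:
        return f.read().strip()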
 
@@ -144,6 +139,58 @@ def extract_text(file):
      except Exception as e:
          return f"Error extracting text: {str(e)}"

+ # Simple web crawler to extract Bangla text
+ def crawl_website(start_url, max_depth=CRAWL_DEPTH):
+     visited = set()
+     to_visit = [(start_url, 0)] # (url, depth)
+     extracted_texts = []
+
+     headers = {"User-Agent": USER_AGENT}
+
+     while to_visit:
+         url, depth = to_visit.pop(0)
+
+         if url in visited or depth > max_depth:
+             continue
+
+         visited.add(url)
+         logger.info(f"Crawling URL: {url} at depth {depth}")
+
+         try:
+             # Respect politeness policy
+             time.sleep(REQUEST_DELAY)
+
+             # Fetch the page
+             response = requests.get(url, headers=headers, timeout=10)
+             response.raise_for_status()
+
+             # Parse HTML
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Extract Bangla text from relevant tags
+             text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+             bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+             if bangla_text:
+                 extracted_texts.append(bangla_text)
+
+             # Find links to crawl further
+             if depth < max_depth:
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     absolute_url = urljoin(url, href)
+                     parsed_url = urlparse(absolute_url)
+
+                     # Only crawl within the same domain
+                     if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                         to_visit.append((absolute_url, depth + 1))
+
+         except Exception as e:
+             logger.error(f"Error crawling {url}: {str(e)}")
+             continue
+
+     return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
+
  # Load and optimize the model
  def load_model():
      try:
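Before the model-loading hunks, a note on the crawler added above: crawl_website is a breadth-first walk (FIFO to_visit queue) restricted to the start URL's domain, sleeping REQUEST_DELAY seconds before every fetch. A hypothetical invocation (example.com is a placeholder, not from the commit):

# Hypothetical call; example.com stands in for a real Bangla-language site.
text = crawl_website("https://example.com", max_depth=1)
print(text[:200])   # first 200 characters of the concatenated page text

Because each fetched page waits at least REQUEST_DELAY = 5 seconds, a depth-2 crawl of even a small site can take several minutes, which matters when it runs inside a single HTTP request (see the new route at the end of the diff).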
@@ -153,7 +200,6 @@ def load_model():
          tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, cache_dir='/data/models')
          logger.info(f"Model and tokenizer loading took {time.time() - start_time:.2f} seconds")

-         # Warm-up the model with a minimal input
          start_time = time.time()
          dummy_input = tokenizer("আমি", return_tensors="pt", padding=True)
          _ = model.generate(**dummy_input)
@@ -178,7 +224,6 @@ def initialize_model():
  # Translate text with parallel processing
  async def translate_text(sentence, model, tokenizer):
      start_time = time.time()
-     # Split long texts into smaller chunks (max 64 tokens per chunk)
      max_length = 64
      inputs = []
      current_chunk = []
@@ -197,7 +242,6 @@ async def translate_text(sentence, model, tokenizer):
      if current_chunk:
          inputs.append(" ".join(current_chunk))

-     # Translate each chunk in parallel
      def translate_chunk(chunk):
          input_ids = tokenizer(chunk, return_tensors="pt", padding=True)
          output_ids = model.generate(**input_ids, max_length=128)
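Side note (not part of the commit): despite the removed comment's wording, the chunks joined with " ".join(current_chunk) appear to be whitespace-separated words capped at max_length = 64 rather than tokenizer tokens. A standalone sketch of that word-based chunking, assuming the elided middle of the loop follows the usual pattern:

# Assumed to mirror the elided chunking loop in translate_text; not the committed code.
def chunk_words(sentence, max_length=64):
    chunks, current = [], []
    for word in sentence.split():
        current.append(word)
        if len(current) >= max_length:
            chunks.append(" ".join(current))
            current = []
    if current:                      # matches `if current_chunk:` in the diff
        chunks.append(" ".join(current))
    return chunks

print(len(chunk_words("ক খ গ " * 50)[0].split()))   # 64 words in the first chunk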
@@ -268,5 +312,36 @@ async def translate():
      except Exception as e:
          return jsonify({"error": str(e)}), 500

+ @app.route("/crawl_and_translate", methods=["POST"])
+ async def crawl_and_translate():
+     try:
+         start_time = time.time()
+         url = request.form.get("url")
+
+         if not url:
+             return render_template("index.html", error="Please enter a website URL.")
+
+         # Validate URL
+         parsed_url = urlparse(url)
+         if not parsed_url.scheme or not parsed_url.netloc:
+             return render_template("index.html", error="Invalid URL format.", url=url)
+
+         # Crawl the website
+         logger.info(f"Starting crawl for URL: {url}")
+         extracted_text = crawl_website(url)
+
+         if not extracted_text or extracted_text.startswith("No Bangla text"):
+             return render_template("index.html", error=extracted_text or "No text found to translate.", url=url)
+
+         # Translate the extracted text
+         translated = await translate_text(extracted_text, model, tokenizer)
+         if translated.startswith("Error"):
+             return render_template("index.html", error=translated, url=url, extracted_text=extracted_text)
+
+         logger.info(f"Total crawl and translate request took {time.time() - start_time:.2f} seconds")
+         return render_template("index.html", extracted_text=extracted_text, translated_text=translated, url=url)
+     except Exception as e:
+         return render_template("index.html", error=str(e), url=url)
+
  if __name__ == "__main__":
      app.run(host="0.0.0.0", port=7860, debug=False)
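A quick way to exercise the new endpoint once the Space is running locally; the host, port 7860, and the form field name url come from the code above, while the target URL is a placeholder:

# Hypothetical smoke test for the new /crawl_and_translate route.
import requests

resp = requests.post(
    "http://localhost:7860/crawl_and_translate",
    data={"url": "https://example.com"},   # read server-side via request.form.get("url")
    timeout=600,                            # crawling with a 5 s per-page delay is slow
)
print(resp.status_code)
print(resp.text[:500])   # the route renders index.html, so this is HTML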
 