Update app.py
app.py
CHANGED
@@ -2,6 +2,10 @@ import os
 import logging
 import subprocess
 import tempfile
+import requests
+from bs4 import BeautifulSoup
+import time
+from urllib.parse import urljoin, urlparse
 from flask import Flask, request, jsonify, render_template
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from PIL import Image
@@ -9,7 +13,6 @@ from pdf2image import convert_from_bytes
 import io
 import torch
 import hashlib
-import time
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
 
@@ -45,6 +48,11 @@ MAX_IMAGE_DIMENSION = 1000 # Reduced for faster OCR
 # Timeout for OCR operation (in seconds)
 OCR_TIMEOUT = 5
 
+# Crawling settings
+CRAWL_DEPTH = 2  # Limit depth to avoid overloading
+REQUEST_DELAY = 5  # Seconds between requests (politeness policy)
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+
 # Global variables to store the model and tokenizer
 model = None
 tokenizer = None
@@ -55,21 +63,17 @@ def allowed_file(filename):
 
 # Preprocess image for faster OCR
 def preprocess_image(image):
-    # Check image dimensions
     width, height = image.size
     logger.info(f"Original image dimensions: {width}x{height}")
     if width > MAX_IMAGE_DIMENSION or height > MAX_IMAGE_DIMENSION:
-        # Maintain aspect ratio
         scale = min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height)
         new_width = int(width * scale)
         new_height = int(height * scale)
         image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         logger.info(f"Resized image to: {new_width}x{new_height}")
     else:
-        # Resize to 100 DPI (reduced for speed)
         target_dpi = 100
         image = image.resize((int(width * target_dpi / 72), int(height * target_dpi / 72)), Image.Resampling.LANCZOS)
-    # Convert to grayscale
     image = image.convert("L")
     return image
 
@@ -83,32 +87,26 @@ def get_file_hash(file):
 # Extract text from image or PDF with caching
 def extract_text(file):
     try:
-        # Compute file hash for caching
         file_hash = get_file_hash(file)
         cache_path = os.path.join(CACHE_DIR, f"{file_hash}.txt")
 
-        # Check if result is cached
         if os.path.exists(cache_path):
             with open(cache_path, "r", encoding="utf-8") as f:
                 return f.read().strip()
 
         start_time = time.time()
 
-        # Prepare the image for OCR
         if file.filename.rsplit('.', 1)[1].lower() == 'pdf':
             file_bytes = file.read()
-            # Process only the first page to reduce time
             images = convert_from_bytes(file_bytes, first_page=1, last_page=1)
             img = preprocess_image(images[0])
         else:
            img = Image.open(file)
            img = preprocess_image(img)
 
-        # Save the image to a temporary file
         with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img_file:
             img.save(temp_img_file.name)
 
-        # Use subprocess to run Tesseract with a timeout
         with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as temp_txt_file:
             try:
                 subprocess.run(
@@ -127,15 +125,12 @@ def extract_text(file):
                 os.unlink(temp_txt_file.name)
                 return f"Error extracting text with Tesseract: {e.stderr}"
 
-        # Read the output text
         with open(temp_txt_file.name, 'r', encoding='utf-8') as f:
             text = f.read().strip()
 
-        # Clean up temporary files
         os.unlink(temp_img_file.name)
         os.unlink(temp_txt_file.name)
 
-        # Cache the result
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(text)
 
@@ -144,6 +139,58 @@ def extract_text(file):
     except Exception as e:
         return f"Error extracting text: {str(e)}"
 
+# Simple web crawler to extract Bangla text
+def crawl_website(start_url, max_depth=CRAWL_DEPTH):
+    visited = set()
+    to_visit = [(start_url, 0)]  # (url, depth)
+    extracted_texts = []
+
+    headers = {"User-Agent": USER_AGENT}
+
+    while to_visit:
+        url, depth = to_visit.pop(0)
+
+        if url in visited or depth > max_depth:
+            continue
+
+        visited.add(url)
+        logger.info(f"Crawling URL: {url} at depth {depth}")
+
+        try:
+            # Respect politeness policy
+            time.sleep(REQUEST_DELAY)
+
+            # Fetch the page
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse HTML
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract Bangla text from relevant tags
+            text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+            bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+            if bangla_text:
+                extracted_texts.append(bangla_text)
+
+            # Find links to crawl further
+            if depth < max_depth:
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    absolute_url = urljoin(url, href)
+                    parsed_url = urlparse(absolute_url)
+
+                    # Only crawl within the same domain
+                    if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                        to_visit.append((absolute_url, depth + 1))
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            continue
+
+    return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
+
 # Load and optimize the model
 def load_model():
     try:
@@ -153,7 +200,6 @@ def load_model():
         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, cache_dir='/data/models')
         logger.info(f"Model and tokenizer loading took {time.time() - start_time:.2f} seconds")
 
-        # Warm-up the model with a minimal input
         start_time = time.time()
         dummy_input = tokenizer("আমি", return_tensors="pt", padding=True)
         _ = model.generate(**dummy_input)
@@ -178,7 +224,6 @@ def initialize_model():
 # Translate text with parallel processing
 async def translate_text(sentence, model, tokenizer):
     start_time = time.time()
-    # Split long texts into smaller chunks (max 64 tokens per chunk)
     max_length = 64
     inputs = []
     current_chunk = []
@@ -197,7 +242,6 @@ async def translate_text(sentence, model, tokenizer):
     if current_chunk:
         inputs.append(" ".join(current_chunk))
 
-    # Translate each chunk in parallel
     def translate_chunk(chunk):
         input_ids = tokenizer(chunk, return_tensors="pt", padding=True)
         output_ids = model.generate(**input_ids, max_length=128)
@@ -268,5 +312,36 @@ async def translate():
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 
+@app.route("/crawl_and_translate", methods=["POST"])
+async def crawl_and_translate():
+    try:
+        start_time = time.time()
+        url = request.form.get("url")
+
+        if not url:
+            return render_template("index.html", error="Please enter a website URL.")
+
+        # Validate URL
+        parsed_url = urlparse(url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            return render_template("index.html", error="Invalid URL format.", url=url)
+
+        # Crawl the website
+        logger.info(f"Starting crawl for URL: {url}")
+        extracted_text = crawl_website(url)
+
+        if not extracted_text or extracted_text.startswith("No Bangla text"):
+            return render_template("index.html", error=extracted_text or "No text found to translate.", url=url)
+
+        # Translate the extracted text
+        translated = await translate_text(extracted_text, model, tokenizer)
+        if translated.startswith("Error"):
+            return render_template("index.html", error=translated, url=url, extracted_text=extracted_text)
+
+        logger.info(f"Total crawl and translate request took {time.time() - start_time:.2f} seconds")
+        return render_template("index.html", extracted_text=extracted_text, translated_text=translated, url=url)
+    except Exception as e:
+        return render_template("index.html", error=str(e), url=url)
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=False)
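
For reference, a minimal client-side sketch of the new /crawl_and_translate route added in this commit. It is an assumption-laden illustration, not part of the commit: the port (7860) and the form field name ("url") come from app.run and request.form.get("url") above, while the host and target site are hypothetical placeholders; note the route returns rendered index.html, not JSON.

# Hedged usage sketch for the new endpoint (assumes the app is reachable
# at http://localhost:7860; https://example.com is a placeholder target).
import requests

resp = requests.post(
    "http://localhost:7860/crawl_and_translate",
    data={"url": "https://example.com"},  # form-encoded, matches request.form.get("url")
    timeout=300,  # crawling with REQUEST_DELAY = 5 between requests can be slow
)
print(resp.status_code)
print(resp.text[:500])  # rendered index.html containing extracted_text / translated_text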