Update app.py
app.py
CHANGED
@@ -2,6 +2,10 @@ import os
 import logging
 import subprocess
 import tempfile
+import requests
+from bs4 import BeautifulSoup
+import time
+from urllib.parse import urljoin, urlparse
 from flask import Flask, request, jsonify, render_template
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from PIL import Image
@@ -9,7 +13,6 @@ from pdf2image import convert_from_bytes
 import io
 import torch
 import hashlib
-import time
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
 
@@ -45,6 +48,11 @@ MAX_IMAGE_DIMENSION = 1000 # Reduced for faster OCR
 # Timeout for OCR operation (in seconds)
 OCR_TIMEOUT = 5
 
+# Crawling settings
+CRAWL_DEPTH = 2  # Limit depth to avoid overloading
+REQUEST_DELAY = 5  # Seconds between requests (politeness policy)
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+
 # Global variables to store the model and tokenizer
 model = None
 tokenizer = None
@@ -55,21 +63,17 @@ def allowed_file(filename):
 
 # Preprocess image for faster OCR
 def preprocess_image(image):
-    # Check image dimensions
     width, height = image.size
     logger.info(f"Original image dimensions: {width}x{height}")
     if width > MAX_IMAGE_DIMENSION or height > MAX_IMAGE_DIMENSION:
-        # Maintain aspect ratio
         scale = min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height)
         new_width = int(width * scale)
         new_height = int(height * scale)
         image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         logger.info(f"Resized image to: {new_width}x{new_height}")
     else:
-        # Resize to 100 DPI (reduced for speed)
         target_dpi = 100
         image = image.resize((int(width * target_dpi / 72), int(height * target_dpi / 72)), Image.Resampling.LANCZOS)
-    # Convert to grayscale
     image = image.convert("L")
     return image
 
@@ -83,32 +87,26 @@ def get_file_hash(file):
 # Extract text from image or PDF with caching
 def extract_text(file):
     try:
-        # Compute file hash for caching
         file_hash = get_file_hash(file)
         cache_path = os.path.join(CACHE_DIR, f"{file_hash}.txt")
 
-        # Check if result is cached
         if os.path.exists(cache_path):
             with open(cache_path, "r", encoding="utf-8") as f:
                 return f.read().strip()
 
         start_time = time.time()
 
-        # Prepare the image for OCR
         if file.filename.rsplit('.', 1)[1].lower() == 'pdf':
             file_bytes = file.read()
-            # Process only the first page to reduce time
             images = convert_from_bytes(file_bytes, first_page=1, last_page=1)
             img = preprocess_image(images[0])
         else:
            img = Image.open(file)
            img = preprocess_image(img)
 
-        # Save the image to a temporary file
         with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img_file:
             img.save(temp_img_file.name)
 
-        # Use subprocess to run Tesseract with a timeout
         with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as temp_txt_file:
             try:
                 subprocess.run(
@@ -127,15 +125,12 @@ def extract_text(file):
                 os.unlink(temp_txt_file.name)
                 return f"Error extracting text with Tesseract: {e.stderr}"
 
-        # Read the output text
         with open(temp_txt_file.name, 'r', encoding='utf-8') as f:
             text = f.read().strip()
 
-        # Clean up temporary files
         os.unlink(temp_img_file.name)
         os.unlink(temp_txt_file.name)
 
-        # Cache the result
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(text)
 
@@ -144,6 +139,58 @@ def extract_text(file):
     except Exception as e:
         return f"Error extracting text: {str(e)}"
 
+# Simple web crawler to extract Bangla text
+def crawl_website(start_url, max_depth=CRAWL_DEPTH):
+    visited = set()
+    to_visit = [(start_url, 0)]  # (url, depth)
+    extracted_texts = []
+
+    headers = {"User-Agent": USER_AGENT}
+
+    while to_visit:
+        url, depth = to_visit.pop(0)
+
+        if url in visited or depth > max_depth:
+            continue
+
+        visited.add(url)
+        logger.info(f"Crawling URL: {url} at depth {depth}")
+
+        try:
+            # Respect politeness policy
+            time.sleep(REQUEST_DELAY)
+
+            # Fetch the page
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse HTML
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract Bangla text from relevant tags
+            text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+            bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+            if bangla_text:
+                extracted_texts.append(bangla_text)
+
+            # Find links to crawl further
+            if depth < max_depth:
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    absolute_url = urljoin(url, href)
+                    parsed_url = urlparse(absolute_url)
+
+                    # Only crawl within the same domain
+                    if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                        to_visit.append((absolute_url, depth + 1))
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            continue
+
+    return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
+
 # Load and optimize the model
 def load_model():
     try:
@@ -153,7 +200,6 @@ def load_model():
         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, cache_dir='/data/models')
         logger.info(f"Model and tokenizer loading took {time.time() - start_time:.2f} seconds")
 
-        # Warm-up the model with a minimal input
         start_time = time.time()
         dummy_input = tokenizer("আমি", return_tensors="pt", padding=True)
         _ = model.generate(**dummy_input)
@@ -178,7 +224,6 @@ def initialize_model():
 # Translate text with parallel processing
 async def translate_text(sentence, model, tokenizer):
     start_time = time.time()
-    # Split long texts into smaller chunks (max 64 tokens per chunk)
     max_length = 64
     inputs = []
     current_chunk = []
@@ -197,7 +242,6 @@ async def translate_text(sentence, model, tokenizer):
     if current_chunk:
         inputs.append(" ".join(current_chunk))
 
-    # Translate each chunk in parallel
     def translate_chunk(chunk):
         input_ids = tokenizer(chunk, return_tensors="pt", padding=True)
         output_ids = model.generate(**input_ids, max_length=128)
@@ -268,5 +312,36 @@ async def translate():
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 
+@app.route("/crawl_and_translate", methods=["POST"])
+async def crawl_and_translate():
+    try:
+        start_time = time.time()
+        url = request.form.get("url")
+
+        if not url:
+            return render_template("index.html", error="Please enter a website URL.")
+
+        # Validate URL
+        parsed_url = urlparse(url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            return render_template("index.html", error="Invalid URL format.", url=url)
+
+        # Crawl the website
+        logger.info(f"Starting crawl for URL: {url}")
+        extracted_text = crawl_website(url)
+
+        if not extracted_text or extracted_text.startswith("No Bangla text"):
+            return render_template("index.html", error=extracted_text or "No text found to translate.", url=url)
+
+        # Translate the extracted text
+        translated = await translate_text(extracted_text, model, tokenizer)
+        if translated.startswith("Error"):
+            return render_template("index.html", error=translated, url=url, extracted_text=extracted_text)
+
+        logger.info(f"Total crawl and translate request took {time.time() - start_time:.2f} seconds")
+        return render_template("index.html", extracted_text=extracted_text, translated_text=translated, url=url)
+    except Exception as e:
+        return render_template("index.html", error=str(e), url=url)
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=False)
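
For reference, a minimal client-side sketch of the new /crawl_and_translate route added in this commit. It is an assumption-laden illustration, not part of the commit: the port (7860) and the form field name ("url") come from app.run and request.form.get("url") above, while the host and target site are hypothetical placeholders; note the route returns rendered index.html, not JSON.

# Hedged usage sketch for the new endpoint (assumes the app is reachable
# at http://localhost:7860; https://example.com is a placeholder target).
import requests

resp = requests.post(
    "http://localhost:7860/crawl_and_translate",
    data={"url": "https://example.com"},  # form-encoded, matches request.form.get("url")
    timeout=300,  # crawling with REQUEST_DELAY = 5 between requests can be slow
)
print(resp.status_code)
print(resp.text[:500])  # rendered index.html containing extracted_text / translated_text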