Chaitanya895 committed
Commit ecdd0de · verified · 1 Parent(s): 625a396

Update app.py

Files changed (1)
  1. app.py +91 -43
app.py CHANGED
@@ -15,6 +15,7 @@ import torch
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
+import gc
 
 # Configure logging
 logging.basicConfig(
@@ -49,9 +50,9 @@ MAX_IMAGE_DIMENSION = 500
 OCR_TIMEOUT = 20
 
 # Crawling settings
-CRAWL_DEPTH = 1
-MAX_PAGES = 5
-REQUEST_DELAY = 5
+CRAWL_DEPTH = 2  # Increased to 2 to explore subpages
+MAX_PAGES = 15  # Increased to 15 to handle larger inputs
+REQUEST_DELAY = 3
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 
 # Global variables to store the model and tokenizer
@@ -152,7 +153,30 @@ def extract_text(file):
         logger.error(f"Error in extract_text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 
-# Simple web crawler to extract Bangla text
+# Crawl a single URL and extract text
+def crawl_single_url(url, headers):
+    try:
+        time.sleep(REQUEST_DELAY)
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+        bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+        # Extract links for further crawling
+        links = []
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            absolute_url = urljoin(url, href)
+            links.append(absolute_url)
+
+        return bangla_text, links
+    except Exception as e:
+        logger.error(f"Error crawling {url}: {str(e)}")
+        return "", []
+
+# Simple web crawler to extract Bangla text (parallelized)
 def crawl_website(start_url, max_depth=CRAWL_DEPTH):
     visited = set()
     to_visit = [(start_url, 0)]  # (url, depth)
@@ -160,39 +184,58 @@ def crawl_website(start_url, max_depth=CRAWL_DEPTH):
 
     headers = {"User-Agent": USER_AGENT}
 
-    while to_visit:
-        url, depth = to_visit.pop(0)
-
-        if url in visited or depth > max_depth or len(visited) >= MAX_PAGES:
-            continue
-
-        visited.add(url)
-        logger.info(f"Crawling URL: {url} at depth {depth}")
-
-        try:
-            time.sleep(REQUEST_DELAY)
-            response = requests.get(url, headers=headers, timeout=10)
-            response.raise_for_status()
-
-            soup = BeautifulSoup(response.text, 'html.parser')
-            text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
-            bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+    # Keywords to filter out irrelevant URLs
+    exclude_keywords = ['api', 'auth', 'login', 'signup', 'account', 'oauth']
+    # Keywords to prioritize news articles
+    include_keywords = ['news', 'article', 'bangladesh', 'politics', 'sports', 'entertainment']
+
+    with ThreadPoolExecutor(max_workers=5) as executor:  # Parallelize with 5 threads
+        while to_visit:
+            # Batch URLs to crawl in parallel
+            batch = []
+            while to_visit and len(batch) < 5 and len(visited) < MAX_PAGES:
+                url, depth = to_visit.pop(0)
+
+                if url in visited or depth > max_depth:
+                    continue
+
+                # Skip URLs with excluded keywords
+                if any(keyword in url.lower() for keyword in exclude_keywords):
+                    logger.info(f"Skipping URL due to excluded keyword: {url}")
+                    continue
+
+                batch.append((url, depth))
+                visited.add(url)
+
+            if not batch:
+                break
+
+            # Crawl batch of URLs in parallel
+            futures = []
+            for url, depth in batch:
+                logger.info(f"Crawling URL: {url} at depth {depth}")
+                futures.append(executor.submit(crawl_single_url, url, headers))
+
+            # Collect results
+            for (url, depth), future in zip(batch, futures):
+                bangla_text, links = future.result()
+                if bangla_text:
+                    extracted_texts.append(bangla_text)
+
+                # Add new links to crawl
+                if depth < max_depth:
+                    for absolute_url in links:
+                        parsed_url = urlparse(absolute_url)
+                        if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                            # Prioritize URLs with included keywords
+                            if any(keyword in absolute_url.lower() for keyword in include_keywords):
+                                to_visit.insert(0, (absolute_url, depth + 1))
+                            else:
+                                to_visit.append((absolute_url, depth + 1))
 
-            if bangla_text:
-                extracted_texts.append(bangla_text)
-
-            if depth < max_depth:
-                for link in soup.find_all('a', href=True):
-                    href = link['href']
-                    absolute_url = urljoin(url, href)
-                    parsed_url = urlparse(absolute_url)
-                    if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
-                        to_visit.append((absolute_url, depth + 1))
-
-        except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            continue
-
+            # Free memory after processing batch
+            gc.collect()
+
     return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
 
 # Load and optimize the model
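Note: the batch-and-submit crawling pattern introduced in this hunk can be exercised on its own. The sketch below is illustrative only; `fetch()` is a hypothetical stand-in for `crawl_single_url`, and the batch size and page cap simply mirror the new settings.

```python
# Illustrative sketch only: a standalone version of the batch-and-submit
# pattern the new crawl_website uses. fetch() is a hypothetical stand-in
# for crawl_single_url; it is not part of app.py.
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Placeholder: a real crawler would download and parse the page here.
    return f"text from {url}", []

def crawl_in_batches(seed_urls, batch_size=5, max_pages=15):
    to_visit = list(seed_urls)
    visited, texts = set(), []
    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        while to_visit and len(visited) < max_pages:
            # Fill a batch with unseen URLs, marking them visited up front.
            batch = []
            while to_visit and len(batch) < batch_size and len(visited) < max_pages:
                url = to_visit.pop(0)
                if url not in visited:
                    visited.add(url)
                    batch.append(url)
            if not batch:
                break
            # Submit the whole batch, then pair each URL with its future.
            futures = [executor.submit(fetch, url) for url in batch]
            for url, future in zip(batch, futures):
                text, links = future.result()
                if text:
                    texts.append(text)
                to_visit.extend(link for link in links if link not in visited)
    return texts

print(crawl_in_batches(["https://example.com"]))  # -> ['text from https://example.com']
```

Marking URLs as visited when they enter a batch, rather than after the fetch completes, is what keeps the same page from being submitted twice within one round.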
@@ -228,7 +271,7 @@ def initialize_model():
 # Translate text with parallel processing
 async def translate_text(sentence, model, tokenizer):
     start_time = time.time()
-    max_length = 16
+    max_length = 32  # Increased chunk size to handle larger inputs
     inputs = []
     current_chunk = []
     current_length = 0
@@ -258,6 +301,11 @@ async def translate_text(sentence, model, tokenizer):
 
     translated = " ".join(translated_chunks)
     logger.info(f"Translation took {time.time() - start_time:.2f} seconds")
+
+    # Free memory after translation
+    del inputs, translated_chunks
+    gc.collect()
+
     return translated
 
 # Initialize model
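For context on `max_length`: these hunks show only the chunking variables (`inputs`, `current_chunk`, `current_length`), not the loop that fills them, so the snippet below is just one plausible shape for that word-count chunking; it is not copied from app.py.

```python
# Illustrative only: one plausible way to split input into word chunks
# of at most max_length words, matching the variables shown in the hunk.
def split_into_chunks(sentence, max_length=32):
    chunks, current_chunk, current_length = [], [], 0
    for word in sentence.split():
        if current_length + 1 > max_length and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(word)
        current_length += 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

print(split_into_chunks("one two three four five", max_length=2))
# ['one two', 'three four', 'five']
```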
@@ -305,9 +353,9 @@ async def process_web_translate():
 
 @app.route("/web_translate", methods=["POST"])
 async def web_translate():
-    text = None  # Initialize text to avoid UnboundLocalError
+    text = None
     try:
-        return await asyncio.wait_for(process_web_translate(), timeout=60)  # Timeout after 60 seconds
+        return await asyncio.wait_for(process_web_translate(), timeout=60)
     except asyncio.TimeoutError:
         logger.error("Request timed out after 60 seconds")
         return render_template("index.html", error="Request timed out. Please try again with a smaller input.", text=text)
@@ -341,12 +389,12 @@ async def process_crawl_and_translate():
 
 @app.route("/crawl_and_translate", methods=["POST"])
 async def crawl_and_translate():
-    url = None  # Initialize url to avoid UnboundLocalError
+    url = None
     try:
-        return await asyncio.wait_for(process_crawl_and_translate(), timeout=60)  # Timeout after 60 seconds
+        return await asyncio.wait_for(process_crawl_and_translate(), timeout=180)  # Increased to 180 seconds
     except asyncio.TimeoutError:
-        logger.error("Crawl and translate request timed out after 60 seconds")
-        return render_template("index.html", error="Request timed out. Please try again with a smaller website.", url=url)
+        logger.error("Crawl and translate request timed out after 180 seconds")
+        return render_template("index.html", error="Request timed out after 180 seconds. Please try a smaller website or upgrade to a paid plan for better performance.", url=url)
     except Exception as e:
         logger.error(f"Error in crawl_and_translate: {str(e)}")
         return render_template("index.html", error=f"Error processing request: {str(e)}", url=url)
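The changed route handlers bound request time with `asyncio.wait_for`. A self-contained sketch of that pattern, with a stand-in coroutine instead of `process_crawl_and_translate`, is:

```python
# Minimal sketch of the asyncio.wait_for timeout pattern used by the
# routes above; slow_handler() stands in for the real process_* coroutine.
import asyncio

async def slow_handler():
    await asyncio.sleep(5)
    return "done"

async def handle_with_timeout(timeout=3):
    try:
        return await asyncio.wait_for(slow_handler(), timeout=timeout)
    except asyncio.TimeoutError:
        return f"timed out after {timeout} seconds"

print(asyncio.run(handle_with_timeout()))  # -> timed out after 3 seconds
```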
 