Update app.py
app.py (CHANGED)
@@ -15,6 +15,7 @@ import torch
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
+import gc
 
 # Configure logging
 logging.basicConfig(
@@ -49,9 +50,9 @@ MAX_IMAGE_DIMENSION = 500
 OCR_TIMEOUT = 20
 
 # Crawling settings
-CRAWL_DEPTH =
-MAX_PAGES =
-REQUEST_DELAY =
+CRAWL_DEPTH = 2  # Increased to 2 to explore subpages
+MAX_PAGES = 15  # Increased to 15 to handle larger inputs
+REQUEST_DELAY = 3
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 
 # Global variables to store the model and tokenizer
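
The new limits also bound how long a single crawl can run. A rough back-of-envelope check (not part of app.py; BATCH_SIZE and HTTP_TIMEOUT simply mirror the batch size of 5 and the 10-second requests timeout used further down in this diff):

import math

MAX_PAGES = 15
REQUEST_DELAY = 3
BATCH_SIZE = 5     # mirrors max_workers and the batch cap in crawl_website
HTTP_TIMEOUT = 10  # mirrors the timeout passed to requests.get in crawl_single_url

worst_case = math.ceil(MAX_PAGES / BATCH_SIZE) * (REQUEST_DELAY + HTTP_TIMEOUT)
print(f"worst-case network time per crawl ~ {worst_case} s")  # ~ 39 s

That leaves comfortable headroom under the 180-second route timeout introduced at the end of this diff, ignoring parsing and translation time.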
@@ -152,7 +153,30 @@ def extract_text(file):
         logger.error(f"Error in extract_text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 
-#
+# Crawl a single URL and extract text
+def crawl_single_url(url, headers):
+    try:
+        time.sleep(REQUEST_DELAY)
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+        bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+        # Extract links for further crawling
+        links = []
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            absolute_url = urljoin(url, href)
+            links.append(absolute_url)
+
+        return bangla_text, links
+    except Exception as e:
+        logger.error(f"Error crawling {url}: {str(e)}")
+        return "", []
+
+# Simple web crawler to extract Bangla text (parallelized)
 def crawl_website(start_url, max_depth=CRAWL_DEPTH):
     visited = set()
     to_visit = [(start_url, 0)]  # (url, depth)
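
For reference, a standalone sketch (not part of app.py) of what crawl_single_url produces, exercising the same BeautifulSoup extraction on a static HTML snippet instead of a live request; the example.com URL and the markup are made up:

from bs4 import BeautifulSoup
from urllib.parse import urljoin

html = """
<html><body>
  <h1>শিরোনাম</h1>
  <p>প্রথম অনুচ্ছেদ</p>
  <a href="/bangladesh/politics/story-1">আরও পড়ুন</a>
</body></html>
"""
base_url = "https://example.com/news"  # hypothetical page URL

soup = BeautifulSoup(html, "html.parser")
text_elements = soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "article"])
text = " ".join(el.get_text(strip=True) for el in text_elements if el.get_text(strip=True))
links = [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True)]

print(text)   # শিরোনাম প্রথম অনুচ্ছেদ
print(links)  # ['https://example.com/bangladesh/politics/story-1']

Each href is resolved against the page URL with urljoin, which is what lets crawl_website later compare netlocs and stay on the same site.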
@@ -160,39 +184,58 @@ def crawl_website(start_url, max_depth=CRAWL_DEPTH):
 
     headers = {"User-Agent": USER_AGENT}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Keywords to filter out irrelevant URLs
+    exclude_keywords = ['api', 'auth', 'login', 'signup', 'account', 'oauth']
+    # Keywords to prioritize news articles
+    include_keywords = ['news', 'article', 'bangladesh', 'politics', 'sports', 'entertainment']
+
+    with ThreadPoolExecutor(max_workers=5) as executor:  # Parallelize with 5 threads
+        while to_visit:
+            # Batch URLs to crawl in parallel
+            batch = []
+            while to_visit and len(batch) < 5 and len(visited) < MAX_PAGES:
+                url, depth = to_visit.pop(0)
+
+                if url in visited or depth > max_depth:
+                    continue
+
+                # Skip URLs with excluded keywords
+                if any(keyword in url.lower() for keyword in exclude_keywords):
+                    logger.info(f"Skipping URL due to excluded keyword: {url}")
+                    continue
+
+                batch.append((url, depth))
+                visited.add(url)
+
+            if not batch:
+                break
+
+            # Crawl batch of URLs in parallel
+            futures = []
+            for url, depth in batch:
+                logger.info(f"Crawling URL: {url} at depth {depth}")
+                futures.append(executor.submit(crawl_single_url, url, headers))
+
+            # Collect results
+            for (url, depth), future in zip(batch, futures):
+                bangla_text, links = future.result()
+                if bangla_text:
+                    extracted_texts.append(bangla_text)
+
+                # Add new links to crawl
+                if depth < max_depth:
+                    for absolute_url in links:
+                        parsed_url = urlparse(absolute_url)
+                        if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                            # Prioritize URLs with included keywords
+                            if any(keyword in absolute_url.lower() for keyword in include_keywords):
+                                to_visit.insert(0, (absolute_url, depth + 1))
+                            else:
+                                to_visit.append((absolute_url, depth + 1))
 
-
-
-
-            if depth < max_depth:
-                for link in soup.find_all('a', href=True):
-                    href = link['href']
-                    absolute_url = urljoin(url, href)
-                    parsed_url = urlparse(absolute_url)
-                    if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
-                        to_visit.append((absolute_url, depth + 1))
-
-        except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            continue
-
+            # Free memory after processing batch
+            gc.collect()
+
     return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
 
 # Load and optimize the model
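
The batching above pairs each result with the URL that produced it by walking the futures in submission order. A minimal self-contained sketch of that pattern (not app.py code; fetch is a hypothetical stand-in for crawl_single_url):

from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Stand-in for crawl_single_url(url, headers): returns (text, links).
    return f"text from {url}", [f"{url}/next"]

batch = [("https://example.com/a", 0), ("https://example.com/b", 0)]  # (url, depth)

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(fetch, url) for url, _depth in batch]
    for (url, depth), future in zip(batch, futures):
        text, links = future.result()  # blocks until this URL's job finishes
        print(url, depth, text, links)

Calling future.result() in submission order means a slow first URL delays harvesting the rest, but every request in the batch still runs concurrently.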
@@ -228,7 +271,7 @@ def initialize_model():
 # Translate text with parallel processing
 async def translate_text(sentence, model, tokenizer):
     start_time = time.time()
-    max_length =
+    max_length = 32  # Increased chunk size to handle larger inputs
     inputs = []
     current_chunk = []
     current_length = 0
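
The chunking loop itself sits in the unchanged part of translate_text, so the following is only a hypothetical sketch of the kind of word-count chunking that inputs, current_chunk, and current_length suggest; it illustrates why raising max_length to 32 yields fewer, larger chunks rather than reproducing the real code:

def chunk_words(sentence, max_length=32):
    inputs, current_chunk, current_length = [], [], 0
    for word in sentence.split():
        if current_length + 1 > max_length and current_chunk:
            inputs.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(word)
        current_length += 1
    if current_chunk:
        inputs.append(" ".join(current_chunk))
    return inputs

print(len(chunk_words("শব্দ " * 100)))  # 100 words -> 4 chunks of at most 32 words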
@@ -258,6 +301,11 @@ async def translate_text(sentence, model, tokenizer):
 
     translated = " ".join(translated_chunks)
     logger.info(f"Translation took {time.time() - start_time:.2f} seconds")
+
+    # Free memory after translation
+    del inputs, translated_chunks
+    gc.collect()
+
     return translated
 
 # Initialize model
|
@@ -305,9 +353,9 @@ async def process_web_translate():
|
|
| 305 |
|
| 306 |
@app.route("/web_translate", methods=["POST"])
|
| 307 |
async def web_translate():
|
| 308 |
-
text = None
|
| 309 |
try:
|
| 310 |
-
return await asyncio.wait_for(process_web_translate(), timeout=60)
|
| 311 |
except asyncio.TimeoutError:
|
| 312 |
logger.error("Request timed out after 60 seconds")
|
| 313 |
return render_template("index.html", error="Request timed out. Please try again with a smaller input.", text=text)
|
|
@@ -341,12 +389,12 @@ async def process_crawl_and_translate():
 
 @app.route("/crawl_and_translate", methods=["POST"])
 async def crawl_and_translate():
-    url = None
+    url = None
     try:
-        return await asyncio.wait_for(process_crawl_and_translate(), timeout=
+        return await asyncio.wait_for(process_crawl_and_translate(), timeout=180)  # Increased to 180 seconds
     except asyncio.TimeoutError:
-        logger.error("Crawl and translate request timed out after
-        return render_template("index.html", error="Request timed out. Please try
+        logger.error("Crawl and translate request timed out after 180 seconds")
+        return render_template("index.html", error="Request timed out after 180 seconds. Please try a smaller website or upgrade to a paid plan for better performance.", url=url)
     except Exception as e:
         logger.error(f"Error in crawl_and_translate: {str(e)}")
         return render_template("index.html", error=f"Error processing request: {str(e)}", url=url)
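
Both routes rely on the same asyncio.wait_for pattern: when the wrapped handler exceeds the timeout it is cancelled and asyncio.TimeoutError is raised, which the route turns into a friendly error page instead of a hung request. A self-contained illustration (not app.py code; slow_handler is a hypothetical stand-in):

import asyncio

async def slow_handler():
    # Stand-in for process_crawl_and_translate(): takes longer than the timeout.
    await asyncio.sleep(5)
    return "rendered page"

async def main():
    try:
        return await asyncio.wait_for(slow_handler(), timeout=1)
    except asyncio.TimeoutError:
        return "Request timed out. Please try a smaller website."

print(asyncio.run(main()))  # prints the timeout message after roughly 1 second

One caveat: wait_for can only interrupt the handler at an await point, so blocking work already running in a thread pool keeps running in the background after the timeout fires.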