Chaitanya895 committed
Commit ecdd0de · verified · 1 Parent(s): 625a396

Update app.py

Files changed (1)
  1. app.py +91 -43
app.py CHANGED
@@ -15,6 +15,7 @@ import torch
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
+import gc
 
 # Configure logging
 logging.basicConfig(
@@ -49,9 +50,9 @@ MAX_IMAGE_DIMENSION = 500
 OCR_TIMEOUT = 20
 
 # Crawling settings
-CRAWL_DEPTH = 1
-MAX_PAGES = 5
-REQUEST_DELAY = 5
+CRAWL_DEPTH = 2  # Increased to 2 to explore subpages
+MAX_PAGES = 15  # Increased to 15 to handle larger inputs
+REQUEST_DELAY = 3
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 
 # Global variables to store the model and tokenizer
@@ -152,7 +153,30 @@ def extract_text(file):
         logger.error(f"Error in extract_text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 
-# Simple web crawler to extract Bangla text
+# Crawl a single URL and extract text
+def crawl_single_url(url, headers):
+    try:
+        time.sleep(REQUEST_DELAY)
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
+        bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+
+        # Extract links for further crawling
+        links = []
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            absolute_url = urljoin(url, href)
+            links.append(absolute_url)
+
+        return bangla_text, links
+    except Exception as e:
+        logger.error(f"Error crawling {url}: {str(e)}")
+        return "", []
+
+# Simple web crawler to extract Bangla text (parallelized)
 def crawl_website(start_url, max_depth=CRAWL_DEPTH):
     visited = set()
     to_visit = [(start_url, 0)]  # (url, depth)
@@ -160,39 +184,58 @@ def crawl_website(start_url, max_depth=CRAWL_DEPTH):
 
     headers = {"User-Agent": USER_AGENT}
 
-    while to_visit:
-        url, depth = to_visit.pop(0)
-
-        if url in visited or depth > max_depth or len(visited) >= MAX_PAGES:
-            continue
-
-        visited.add(url)
-        logger.info(f"Crawling URL: {url} at depth {depth}")
-
-        try:
-            time.sleep(REQUEST_DELAY)
-            response = requests.get(url, headers=headers, timeout=10)
-            response.raise_for_status()
-
-            soup = BeautifulSoup(response.text, 'html.parser')
-            text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article'])
-            bangla_text = " ".join(element.get_text(strip=True) for element in text_elements if element.get_text(strip=True))
+    # Keywords to filter out irrelevant URLs
+    exclude_keywords = ['api', 'auth', 'login', 'signup', 'account', 'oauth']
+    # Keywords to prioritize news articles
+    include_keywords = ['news', 'article', 'bangladesh', 'politics', 'sports', 'entertainment']
+
+    with ThreadPoolExecutor(max_workers=5) as executor:  # Parallelize with 5 threads
+        while to_visit:
+            # Batch URLs to crawl in parallel
+            batch = []
+            while to_visit and len(batch) < 5 and len(visited) < MAX_PAGES:
+                url, depth = to_visit.pop(0)
+
+                if url in visited or depth > max_depth:
+                    continue
+
+                # Skip URLs with excluded keywords
+                if any(keyword in url.lower() for keyword in exclude_keywords):
+                    logger.info(f"Skipping URL due to excluded keyword: {url}")
+                    continue
+
+                batch.append((url, depth))
+                visited.add(url)
+
+            if not batch:
+                break
+
+            # Crawl batch of URLs in parallel
+            futures = []
+            for url, depth in batch:
+                logger.info(f"Crawling URL: {url} at depth {depth}")
+                futures.append(executor.submit(crawl_single_url, url, headers))
+
+            # Collect results
+            for (url, depth), future in zip(batch, futures):
+                bangla_text, links = future.result()
+                if bangla_text:
+                    extracted_texts.append(bangla_text)
+
+                # Add new links to crawl
+                if depth < max_depth:
+                    for absolute_url in links:
+                        parsed_url = urlparse(absolute_url)
+                        if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
+                            # Prioritize URLs with included keywords
+                            if any(keyword in absolute_url.lower() for keyword in include_keywords):
+                                to_visit.insert(0, (absolute_url, depth + 1))
+                            else:
+                                to_visit.append((absolute_url, depth + 1))
 
-            if bangla_text:
-                extracted_texts.append(bangla_text)
-
-            if depth < max_depth:
-                for link in soup.find_all('a', href=True):
-                    href = link['href']
-                    absolute_url = urljoin(url, href)
-                    parsed_url = urlparse(absolute_url)
-                    if parsed_url.netloc == urlparse(start_url).netloc and absolute_url not in visited:
-                        to_visit.append((absolute_url, depth + 1))
-
-        except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            continue
-
+            # Free memory after processing batch
+            gc.collect()
+
     return " ".join(extracted_texts) if extracted_texts else "No Bangla text found on the website."
 
 # Load and optimize the model
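Note: the batch-and-submit crawling pattern introduced in this hunk can be exercised on its own. The sketch below is illustrative only; `fetch()` is a hypothetical stand-in for `crawl_single_url`, and the batch size and page cap simply mirror the new settings.

```python
# Illustrative sketch only: a standalone version of the batch-and-submit
# pattern the new crawl_website uses. fetch() is a hypothetical stand-in
# for crawl_single_url; it is not part of app.py.
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Placeholder: a real crawler would download and parse the page here.
    return f"text from {url}", []

def crawl_in_batches(seed_urls, batch_size=5, max_pages=15):
    to_visit = list(seed_urls)
    visited, texts = set(), []
    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        while to_visit and len(visited) < max_pages:
            # Fill a batch with unseen URLs, marking them visited up front.
            batch = []
            while to_visit and len(batch) < batch_size and len(visited) < max_pages:
                url = to_visit.pop(0)
                if url not in visited:
                    visited.add(url)
                    batch.append(url)
            if not batch:
                break
            # Submit the whole batch, then pair each URL with its future.
            futures = [executor.submit(fetch, url) for url in batch]
            for url, future in zip(batch, futures):
                text, links = future.result()
                if text:
                    texts.append(text)
                to_visit.extend(link for link in links if link not in visited)
    return texts

print(crawl_in_batches(["https://example.com"]))  # -> ['text from https://example.com']
```

Marking URLs as visited when they enter a batch, rather than after the fetch completes, is what keeps the same page from being submitted twice within one round.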
@@ -228,7 +271,7 @@ def initialize_model():
 # Translate text with parallel processing
 async def translate_text(sentence, model, tokenizer):
     start_time = time.time()
-    max_length = 16
+    max_length = 32  # Increased chunk size to handle larger inputs
     inputs = []
     current_chunk = []
     current_length = 0
@@ -258,6 +301,11 @@ async def translate_text(sentence, model, tokenizer):
 
     translated = " ".join(translated_chunks)
     logger.info(f"Translation took {time.time() - start_time:.2f} seconds")
+
+    # Free memory after translation
+    del inputs, translated_chunks
+    gc.collect()
+
     return translated
 
 # Initialize model
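For context on `max_length`: these hunks show only the chunking variables (`inputs`, `current_chunk`, `current_length`), not the loop that fills them, so the snippet below is just one plausible shape for that word-count chunking; it is not copied from app.py.

```python
# Illustrative only: one plausible way to split input into word chunks
# of at most max_length words, matching the variables shown in the hunk.
def split_into_chunks(sentence, max_length=32):
    chunks, current_chunk, current_length = [], [], 0
    for word in sentence.split():
        if current_length + 1 > max_length and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(word)
        current_length += 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

print(split_into_chunks("one two three four five", max_length=2))
# ['one two', 'three four', 'five']
```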
@@ -305,9 +353,9 @@ async def process_web_translate():
 
 @app.route("/web_translate", methods=["POST"])
 async def web_translate():
-    text = None  # Initialize text to avoid UnboundLocalError
+    text = None
     try:
-        return await asyncio.wait_for(process_web_translate(), timeout=60)  # Timeout after 60 seconds
+        return await asyncio.wait_for(process_web_translate(), timeout=60)
     except asyncio.TimeoutError:
         logger.error("Request timed out after 60 seconds")
         return render_template("index.html", error="Request timed out. Please try again with a smaller input.", text=text)
@@ -341,12 +389,12 @@ async def process_crawl_and_translate():
 
 @app.route("/crawl_and_translate", methods=["POST"])
 async def crawl_and_translate():
-    url = None  # Initialize url to avoid UnboundLocalError
+    url = None
     try:
-        return await asyncio.wait_for(process_crawl_and_translate(), timeout=60)  # Timeout after 60 seconds
+        return await asyncio.wait_for(process_crawl_and_translate(), timeout=180)  # Increased to 180 seconds
     except asyncio.TimeoutError:
-        logger.error("Crawl and translate request timed out after 60 seconds")
-        return render_template("index.html", error="Request timed out. Please try again with a smaller website.", url=url)
+        logger.error("Crawl and translate request timed out after 180 seconds")
+        return render_template("index.html", error="Request timed out after 180 seconds. Please try a smaller website or upgrade to a paid plan for better performance.", url=url)
     except Exception as e:
         logger.error(f"Error in crawl_and_translate: {str(e)}")
         return render_template("index.html", error=f"Error processing request: {str(e)}", url=url)
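The changed route handlers bound request time with `asyncio.wait_for`. A self-contained sketch of that pattern, with a stand-in coroutine instead of `process_crawl_and_translate`, is:

```python
# Minimal sketch of the asyncio.wait_for timeout pattern used by the
# routes above; slow_handler() stands in for the real process_* coroutine.
import asyncio

async def slow_handler():
    await asyncio.sleep(5)
    return "done"

async def handle_with_timeout(timeout=3):
    try:
        return await asyncio.wait_for(slow_handler(), timeout=timeout)
    except asyncio.TimeoutError:
        return f"timed out after {timeout} seconds"

print(asyncio.run(handle_with_timeout()))  # -> timed out after 3 seconds
```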
 