acecalisto3 committed
Commit e08a69a (verified) · 1 Parent(s): 9727a6b

Update app.py

Files changed (1):
  1. app.py  +187 -619

app.py CHANGED
@@ -1,23 +1,26 @@
  import json
- import sys
  import os
  import re
  import time
  import logging
  import mimetypes
  import tempfile
  from datetime import datetime
  from pathlib import Path
  from urllib.parse import urlparse
- from typing import List, Dict, Tuple, Union, Optional
  import requests
  import validators
  import gradio as gr
  from bs4 import BeautifulSoup
  from fake_useragent import UserAgent
  from cleantext import clean
- import qrcode
- import zipfile

  # Setup logging with detailed configuration
  logging.basicConfig(
@@ -26,49 +29,15 @@ logging.basicConfig(
      handlers=[
          logging.StreamHandler(),
          logging.FileHandler('app.log', encoding='utf-8')
-     ])
  logger = logging.getLogger(__name__)

-
- # Add these imports at the top
- from config import Config
- from robots_handler import RobotsHandler
- import asyncio
- import aiohttp
- from tqdm import tqdm
-
- class Config:
-     def __init__(self):
-         self.settings = {
-             'TIMEOUT': int(os.getenv('URLD_TIMEOUT', 10)),
-             'MAX_FILE_SIZE': int(os.getenv('URLD_MAX_FILE_SIZE', 2 * 1024 * 1024 * 1024)),
-             'RESPECT_ROBOTS': os.getenv('URLD_RESPECT_ROBOTS', 'True').lower() == 'true',
-             'USE_PROXY': os.getenv('URLD_USE_PROXY', 'False').lower() == 'true',
-             'PROXY_URL': os.getenv('URLD_PROXY_URL', ''),
-             'REQUEST_DELAY': float(os.getenv('URLD_REQUEST_DELAY', 1.0)),
-             'MAX_RETRIES': int(os.getenv('URLD_MAX_RETRIES', 3)),
-             'OUTPUT_FORMAT': os.getenv('URLD_OUTPUT_FORMAT', 'json'),
-             'CHROME_DRIVER_PATH': os.getenv('URLD_CHROME_DRIVER_PATH', '/usr/local/bin/chromedriver'),
-         }
-
-     def get(self, key: str) -> Any:
-         return self.settings.get(key)
-
-     def update(self, settings: Dict[str, Any]) -> None:
-         self.settings.update(settings)
-
  class URLProcessor:
      def __init__(self):
-         self.config = Config()
-         self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
-         self.robots_handler = RobotsHandler()
-         self.session = self._create_session()
-
-     def _create_session(self):
-         session = requests.Session()
-         if self.config.get('USE_PROXY'):
-             session.proxies = self.proxy_handler.get_proxy_config()
-         session.headers.update({
              'User-Agent': UserAgent().random,
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
@@ -76,417 +45,119 @@ class URLProcessor:
              'Connection': 'keep-alive',
              'Upgrade-Insecure-Requests': '1'
          })
-         return session

-     def _fetch_with_selenium(self, url: str) -> Optional[str]:
          try:
-             chrome_options = Options()
-             from selenium import webdriver
-             from selenium.webdriver.chrome.options import Options
-             from selenium.webdriver.common.by import By
-             from selenium.webdriver.support.ui import WebDriverWait
-             from selenium.webdriver.support import expected_conditions as EC
-             from selenium.common.exceptions import TimeoutException
-             import time
-
-             logger.info(f"Attempting to fetch {url} with Selenium")
-
-             # Set up Chrome options
-             chrome_options = Options()
-             chrome_options.add_argument("--headless")
-             chrome_options.add_argument("--no-sandbox")
-             chrome_options.add_argument("--disable-dev-shm-usage")
-             chrome_options.add_argument("--disable-gpu")
-             chrome_options.add_argument("--window-size=1920,1080")
-             chrome_options.add_argument(
-                 "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
-             # Initialize the driver
-             driver = webdriver.Chrome(options=chrome_options)
-
-             try:
-                 # Navigate to the URL
-                 driver.get(url)
-
-                 # Wait for the page to load
-                 WebDriverWait(driver, 10).until(
-                     EC.presence_of_element_located((By.TAG_NAME, "body"))
-                 )
-
-                 # Simulate pressing ESC key to dismiss overlays
-                 from selenium.webdriver.common.keys import Keys
-                 action_chains = webdriver.ActionChains(driver)
-                 action_chains.send_keys(Keys.ESCAPE).perform()
-                 time.sleep(1)  # give it a moment to take effect
-                 action_chains.reset_actions()  # Clear actions
-
-                 # try again
-                 action_chains.send_keys(Keys.ESCAPE).perform()
-                 time.sleep(1)  # give it a moment to take effect
-                 action_chains.reset_actions()
-
-                 # Get the page source
-                 page_source = driver.page_source
-
-                 # Save the Selenium HTML for debugging
-                 debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
-                 with open(debug_path, "w", encoding="utf-8") as f:
-                     f.write(page_source)
-                 logger.info(f"Saved Selenium HTML to {debug_path}")
-
-                 return page_source
-             finally:
-                 driver.quit()
-
-         except ImportError:
-             logger.error("Selenium is not installed. Cannot use browser automation.")
-             return None
          except Exception as e:
-             logger.error(f"Selenium processing failed for {url}: {e}")
-             return None
-
-     async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
-         async with aiohttp.ClientSession() as session:
-             tasks = []
-             for url in urls:
-                 if self.config.get('RESPECT_ROBOTS'):
-                     if not self.robots_handler.can_fetch(url, self.session.headers['User-Agent']):
-                         logger.warning(f"Skipping {url} due to robots.txt restrictions")
-                         continue
-                 tasks.append(self.fetch_content_async(session, url))
-             return await asyncio.gather(*tasks)

-     def _fetch_html_content(self, url: str) -> Optional[Dict]:
-         """Standard HTML content processing"""
          try:
-             # Try with a different user agent if it's a social media site
-             if any(domain in url for domain in
-                    ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
-                 # Use a more realistic browser user agent instead of random one
-                 self.session.headers.update({
-                     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-                     # Add cookie consent headers to bypass some login walls
-                     'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
-                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-                     'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
-                     'sec-ch-ua-mobile': '?0',
-                     'sec-ch-ua-platform': '"macOS"',
-                     'Sec-Fetch-Dest': 'document',
-                     'Sec-Fetch-Mode': 'navigate',
-                     'Sec-Fetch-Site': 'none',
-                     'Sec-Fetch-User': '?1',
-                     'Upgrade-Insecure-Requests': '1'
-                 })
-                 # For Facebook, try to access the mobile version which often has fewer restrictions
-                 if 'facebook.com' in url and 'm.facebook.com' not in url:
-                     url = url.replace('www.facebook.com', 'm.facebook.com')
-                     logger.info(f"Switched to mobile Facebook URL: {url}")
-
-             # Add a delay to simulate human browsing
-             time.sleep(1)
-
-             # Try to get the page with multiple attempts
-             max_attempts = 3
-             for attempt in range(max_attempts):
-                 try:
-                     response = self.session.get(url, timeout=self.timeout)
-                     response.raise_for_status()
-                     break
-                 except (requests.exceptions.RequestException, Exception) as e:
-                     if attempt < max_attempts - 1:
-                         logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
-                         time.sleep(2)  # Wait longer between retries
-                     else:
-                         raise
-             logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
-
-             # Save the raw HTML for debugging if needed
-             debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
-             with open(debug_path, "w", encoding="utf-8") as f:
-                 f.write(response.text)
-             logger.info(f"Saved raw HTML to {debug_path}")
-
-             # Check if we got a valid response with content
-             if not response.text or len(response.text) < 100:
-                 logger.error(f"Empty or very short response from {url}")
-                 return None
-
-             soup = BeautifulSoup(response.text, 'html.parser')
-             # Remove unwanted elements
-             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                 element.decompose()
-
-             # Simulate "ESC key" by removing login walls and overlays common on social media sites
-             login_wall_selectors = [
-                 '.login-wall', '.signup-wall', '.overlay', '.modal',
-                 '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
-                 '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
-                 '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
-                 '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
-                 '.login-wrapper', '.login-container', '.signup-container',
-                 '.login-modal', '.signup-modal', '.auth-modal', '.auth-wall'
-             ]
-             for selector in login_wall_selectors:
-                 for element in soup.select(selector):
-                     logger.info(f"Removing login wall element: {selector}")
-                     element.decompose()
-
-             # Enhanced removal for social media sites
-             if 'facebook.com' in url:
-                 # Facebook specific elements - simulating ESC key
-                 fb_selectors = [
-                     '[data-testid="cookie-policy-manage-dialog"]',
-                     '[role="banner"]', '[role="complementary"]',
-                     '.login_form_container', '.login_form', '#login_form',
-                     '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
-                     '._5hn6', '._67m7', '.nonLoggedInSignUp',
-                     '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
-                 ]
-                 for selector in fb_selectors:
-                     for element in soup.select(selector):
-                         element.decompose()
-
-                 # Look for the main content in mobile version
-                 main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
-                     '#mobile_injected_video_feed_pagelet')
-                 if main_content:
-                     logger.info("Found Facebook mobile main content")
-
-             elif 'instagram.com' in url:
-                 # Instagram specific elements - simulating ESC key
-                 ig_selectors = [
-                     '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
-                     '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
-                     '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
-                     '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
-                     '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
-                 ]
-                 for selector in ig_selectors:
-                     for element in soup.select(selector):
-                         element.decompose()
-
-                 # Try to find the main content
-                 insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
-                     '._ab1y')
-                 if insta_content:
-                     logger.info("Found Instagram main content")
-
-             elif 'twitter.com' in url or 'x.com' in url:
-                 # X/Twitter already works well for public content, but clean up any remaining overlays
-                 x_selectors = [
-                     '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
-                     '[data-testid="sheetDialog"]', '[data-testid="mask"]',
-                     '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
-                     '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
-                     '.r-kemksi', '[data-testid="BottomBar"]'
-                 ]
-                 for selector in x_selectors:
-                     for element in soup.select(selector):
-                         element.decompose()
-
-             elif 'huggingface.co' in url:
-                 # Special handling for Hugging Face
-                 logger.info("Applying special handling for Hugging Face")
-                 # Try to find the main content
-                 hf_selectors = ['.prose', '.space-content', '.model-description',
-                                 '.dataset-description', 'article', '.markdown']
-                 for selector in hf_selectors:
-                     elements = soup.select(selector)
-                     if elements:
-                         logger.info(f"Found Hugging Face content with selector: {selector}")
-                         break
-             # Extract content using a general approach - try multiple strategies
-             # Strategy 1: Look for semantic HTML5 elements
-             main_content = None
-             for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
-                              '.page']:
-                 elements = soup.select(selector)
-                 if elements:
-                     main_content = elements[0]
-                     logger.info(f"Found content with selector: {selector}")
-                     break
-
-             # Strategy 2: If no semantic elements, try common class names
-             if not main_content or not main_content.get_text(strip=True):
-                 for div in soup.find_all('div'):
-                     class_name = div.get('class', [])
-                     id_name = div.get('id', '')
-                     if any(term in ' '.join(class_name).lower() for term in
-                            ['content', 'main', 'body', 'article', 'post']):
-                         main_content = div
-                         logger.info(f"Found content with div class: {class_name}")
-                         break
-                     if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
-                         main_content = div
-                         logger.info(f"Found content with div id: {id_name}")
-                         break
-
-             # Strategy 3: Fall back to body
-             if not main_content or not main_content.get_text(strip=True):
-                 logger.info(f"No main content container found for {url}, using body")
-                 main_content = soup.body if soup.body else soup
-
-             # Extract text with proper spacing
-             text_content = main_content.get_text(separator='\n', strip=True)
-
-             # Strategy 4: If content is too short, extract all visible text
-             if len(text_content) < 100:
-                 logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
-                 visible_text = []
-                 for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
-                     if element.get_text(strip=True):
-                         visible_text.append(element.get_text(strip=True))
-                 text_content = '\n'.join(visible_text)
-
-             # Strategy 5: Last resort - get all text from the page
-             if len(text_content) < 50:
-                 logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
-                 text_content = soup.get_text(separator='\n', strip=True)
-             # Clean and structure content
-             cleaned_content = self.advanced_text_cleaning(text_content)
-
-             logger.info(f"Final content length: {len(cleaned_content)} chars")
-
-             # If we still have no content, this is a failure
-             if len(cleaned_content) < 20:
-                 logger.error(f"Failed to extract meaningful content from {url}")
-                 return None

-             return {
-                 'content': cleaned_content,
-                 'content_type': response.headers.get('Content-Type', ''),
-                 'timestamp': datetime.now().isoformat(),
-                 'url': url  # Add the URL to the returned data for reference
-             }
          except Exception as e:
-             logger.error(f"HTML processing failed for {url}: {e}")
              return None

      def _handle_google_drive(self, url: str) -> Optional[Dict]:
-         """Handle Google Drive document URLs"""
          try:
-             # Construct direct download URL
-             file_id = url.split("/d/")[1].split("/")[0]
-             download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-             response = self.session.get(download_url, stream=True, timeout=self.timeout)
              response.raise_for_status()
-
-             # Read content (limit to the first 1MB)
-             content = b""
-             for chunk in response.iter_content(chunk_size=8192):  # 8KB chunks
-                 content += chunk
-                 if len(content) > 1024 * 1024:  # 1MB limit
-                     content = content[:1024 * 1024]
-                     logger.warning(f"Truncated Google Drive file after 1MB")
-                     break
-             text_content = content.decode('utf-8', errors='ignore')
-             cleaned_text = self.advanced_text_cleaning(text_content)
-
              return {
-                 'content': cleaned_text,
-                 'content_type': 'text/plain',  # Assume plain text for simplicity
-                 'timestamp': datetime.now().isoformat(),
-                 'url': url,
-                 'source': 'google_drive'
              }
          except Exception as e:
-             logger.error(f"Error handling Google Drive URL {url}: {e}")
              return None

      def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-         """Handle Google Calendar ICS URLs"""
          try:
              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()
-             text_content = response.text
-             cleaned_text = self.advanced_text_cleaning(text_content)
              return {
-                 'content': cleaned_text,
-                 'content_type': 'text/calendar',  # Correct MIME type
-                 'timestamp': datetime.now().isoformat(),
-                 'url': url,
-                 'source': 'google_calendar'
              }
          except Exception as e:
-             logger.error(f"Error handling Google Calendar URL {url}: {e}")
              return None

-     def _fetch_with_selenium(self, url: str) -> Optional[str]:
-         """Use Selenium as a fallback for difficult sites"""
          try:
-             from selenium import webdriver
-             from selenium.webdriver.chrome.options import Options
-             from selenium.webdriver.common.by import By
-             from selenium.webdriver.support.ui import WebDriverWait
-             from selenium.webdriver.support import expected_conditions as EC
-             from selenium.common.exceptions import TimeoutException
-             import time
-
-             logger.info(f"Attempting to fetch {url} with Selenium")
-
-             # Set up Chrome options
-             chrome_options = Options()
-             chrome_options.add_argument("--headless")
-             chrome_options.add_argument("--no-sandbox")
-             chrome_options.add_argument("--disable-dev-shm-usage")
-             chrome_options.add_argument("--disable-gpu")
-             chrome_options.add_argument("--window-size=1920,1080")
-             chrome_options.add_argument(
-                 "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
-             # Initialize the driver
-             driver = webdriver.Chrome(options=chrome_options)
-
-             try:
-                 # Navigate to the URL
-                 driver.get(url)
-
-                 # Wait for the page to load
-                 WebDriverWait(driver, 10).until(
-                     EC.presence_of_element_located((By.TAG_NAME, "body"))
-                 )
-
-                 # Simulate pressing ESC key to dismiss overlays
-                 from selenium.webdriver.common.keys import Keys
-                 action_chains = webdriver.ActionChains(driver)
-                 action_chains.send_keys(Keys.ESCAPE).perform()
-                 time.sleep(1)  # give it a moment to take effect
-                 action_chains.reset_actions()  # Clear actions
-
-                 # try again
-                 action_chains.send_keys(Keys.ESCAPE).perform()
-                 time.sleep(1)  # give it a moment to take effect
-                 action_chains.reset_actions()
-
-                 # Get the page source
-                 page_source = driver.page_source
-
-                 # Save the Selenium HTML for debugging
-                 debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
-                 with open(debug_path, "w", encoding="utf-8") as f:
-                     f.write(page_source)
-                 logger.info(f"Saved Selenium HTML to {debug_path}")
-
-                 return page_source
-             finally:
-                 driver.quit()
-
-         except ImportError:
-             logger.error("Selenium is not installed. Cannot use browser automation.")
-             return None
          except Exception as e:
-             logger.error(f"Selenium processing failed for {url}: {e}")
              return None

  class FileProcessor:
-     """Class to handle file processing"""
-
-     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
          self.max_file_size = max_file_size
          self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}

      def is_text_file(self, filepath: str) -> bool:
-         """Check if file is a text file"""
          try:
              mime_type, _ = mimetypes.guess_type(filepath)
              return (mime_type and mime_type.startswith('text/')) or \
@@ -495,60 +166,65 @@ class FileProcessor:
              return False

      def process_file(self, file) -> List[Dict]:
-         """Process uploaded file with enhanced error handling"""
          if not file:
              return []
          dataset = []
          try:
              file_size = os.path.getsize(file.name)
              if file_size > self.max_file_size:
                  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                  return []
              with tempfile.TemporaryDirectory() as temp_dir:
                  if zipfile.is_zipfile(file.name):
                      dataset.extend(self._process_zip_file(file.name, temp_dir))
                  else:
                      dataset.extend(self._process_single_file(file))
          except Exception as e:
              logger.error(f"Error processing file: {str(e)}")
              return []
          return dataset

-     def _process_zip_file(self, zip_path, temp_dir):
-         """Extract and process files within a ZIP archive."""
-         result = []
          with zipfile.ZipFile(zip_path, 'r') as zip_ref:
              zip_ref.extractall(temp_dir)
-             for extracted_file in os.listdir(temp_dir):
-                 extracted_file_path = os.path.join(temp_dir, extracted_file)
-                 if os.path.isfile(extracted_file_path):
-                     with open(extracted_file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                         result.append({
-                             'source': 'file_from_zip',
-                             'filename': extracted_file,
-                             'content': f.read(),
-                             'timestamp': datetime.now().isoformat()
-                         })
-         return result

      def _process_single_file(self, file) -> List[Dict]:
          try:
              file_stat = os.stat(file.name)
-             # For very large files, read in chunks and summarize
-             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
-                 # Read first and last 1MB for extremely large files
                  content = ""
                  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                     content = f.read(1 * 1024 * 1024)  # First 1MB
                      content += "\n...[Content truncated due to large file size]...\n"
-                     # Seek to the last 1MB
                      f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
-                     content += f.read()  # Last 1MB
              else:
-                 # Regular file processing
                  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                      content = f.read()
              return [{
                  'source': 'file',
                  'filename': os.path.basename(file.name),
@@ -563,118 +239,27 @@ class FileProcessor:
              logger.error(f"File processing error: {e}")
              return []

-
- # Move process_all_inputs outside of the FileProcessor class
- def process_all_inputs(urls, file, text, notes):
-     """Process all input types with progress tracking"""
-     try:
-         processor = URLProcessor()
-         file_processor = FileProcessor()
-         results = []
-
-         # Process URLs
-         if urls:
-             url_list = re.split(r'[,\n]', urls)
-             url_list = [url.strip() for url in url_list if url.strip()]
-
-             for url in url_list:
-                 validation = processor.validate_url(url)
-                 if validation.get('is_valid'):
-                     content = processor.fetch_content(url)
-                     if content:
-                         results.append({
-                             'source': 'url',
-                             'url': url,
-                             'content': content,
-                             'timestamp': datetime.now().isoformat()
-                         })
-         # Process files
-         if file:
-             results.extend(file_processor.process_file(file))
-         # Process text input
-         if text:
-             cleaned_text = processor.advanced_text_cleaning(text)
-             results.append({
-                 'source': 'direct_input',
-                 'content': cleaned_text,
-                 'timestamp': datetime.now().isoformat()
-             })
-         # Generate output
-         if results:
-             output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-             output_dir.mkdir(parents=True, exist_ok=True)
-             output_path = output_dir / f'processed_{int(time.time())}.json'
-
-             with open(output_path, 'w', encoding='utf-8') as f:
-                 json.dump(results, f, ensure_ascii=False, indent=2)
-             summary = f"Processed {len(results)} items successfully!"
-             json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
-             return str(output_path), summary, json_data  # Return JSON for editor
-         else:
-             return None, "No valid content to process.", ""
-     except Exception as e:
-         logger.error(f"Processing error: {e}")
-         return None, f"Error: {str(e)}", ""
-
-
- # Also move generate_qr_code outside of the FileProcessor class
- def generate_qr_code(json_data):
-     """Generate QR code from JSON data and return the file path."""
-     if json_data:
-         return generate_qr(json_data)
-
-
- # Move generate_qr outside of the FileProcessor class as well
  def generate_qr(json_data):
-     """Generate QR code from JSON data and return the file path."""
-     try:
-         # Try first with automatic version selection
-         qr = qrcode.QRCode(
-             error_correction=qrcode.constants.ERROR_CORRECT_L,
-             box_size=10,
-             border=4,
-         )
-         qr.add_data(json_data)
-         qr.make(fit=True)
-
-         img = qrcode.make_image(fill_color="black", back_color="white")
-         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
-         img.save(temp_file.name)
-         return temp_file.name
-     except Exception as e:
-         # If the data is too large for a QR code
-         logger.error(f"QR generation error: {e}")
-
-         # Create a simple QR with error message
-         qr = qrcode.QRCode(
-             version=1,
-             error_correction=qrcode.constants.ERROR_CORRECT_L,
-             box_size=10,
-             border=4,
-         )
-         qr.add_data("Error: Data too large for QR code")
-         qr.make(fit=True)
-
-         img = qrcode.make_image(fill_color="black", back_color="white")
-         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
-         img.save(temp_file.name)
-         return temp_file.name
-

  def create_interface():
-     """Create a comprehensive Gradio interface with advanced features"""
      css = """
      .container { max-width: 1200px; margin: auto; }
      .warning { background-color: #fff3cd; color: #856404; }
      .error { background-color: #f8d7da; color: #721c24; }
      """

-     with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
          gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

          with gr.Tab("URL Processing"):
              url_input = gr.Textbox(
-                 label="Enter URLs (comma or newline separated)",
                  lines=5,
                  placeholder="https://example1.com\nhttps://example2.com"
              )
@@ -687,99 +272,84 @@ def create_interface():

          with gr.Tab("Text Input"):
              text_input = gr.Textbox(
-                 label="Raw Text Input",
                  lines=5,
                  placeholder="Paste your text here..."
              )

-         with gr.Tab("JSON Editor"):
-             json_editor = gr.Textbox(
-                 label="JSON Editor",
-                 lines=20,
-                 placeholder="View and edit your JSON data here...",
-                 interactive=True,
-                 elem_id="json-editor"  # Optional: for custom styling
-             )
-
-         with gr.Tab("Scratchpad"):
-             scratchpad = gr.Textbox(
-                 label="Scratchpad",
-                 lines=10,
-                 placeholder="Quick notes or text collections...",
-                 interactive=True
-             )
-
          process_btn = gr.Button("Process Input", variant="primary")
-         qr_btn = gr.Button("Generate QR Code", variant="secondary")
-
          output_text = gr.Textbox(label="Processing Results", interactive=False)
          output_file = gr.File(label="Processed Output")
-         qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
-
-         process_btn.click(
-             process_all_inputs,
-             inputs=[url_input, file_input, text_input, scratchpad],
-             outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-         )
-         qr_btn.click(
-             generate_qr_code,
-             inputs=json_editor,
-             outputs=qr_output
-         )
-         gr.Markdown("""
-         ### Usage Guidelines
-         - **URL Processing**: Enter valid HTTP/HTTPS URLs
-         - **File Input**: Upload text files or ZIP archives
-         - **Text Input**: Direct text processing
-         - **JSON Editor**: View and edit your JSON data
-         - **Scratchpad**: Quick notes or text collections
-         - Advanced cleaning and validation included
-         """)
-     return interface


- def check_network_connectivity():
-     """Check if the network is working properly by testing connection to common sites"""
-     test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
-     results = []
-
-     for site in test_sites:
-         try:
-             response = requests.get(site, timeout=5)
-             results.append({
-                 "site": site,
-                 "status": "OK" if response.status_code == 200 else f"Error: {response.status_code}",
-                 "response_time": response.elapsed.total_seconds()
-             })
-         except Exception as e:
-             results.append({
-                 "site": site,
-                 "status": f"Error: {str(e)}",
-                 "response_time": None
-             })
-     # If all sites failed, there might be a network issue
-     if all(result["status"].startswith("Error") for result in results):
-         logger.error("Network connectivity issue detected. All test sites failed.")
-         return False, results
-
-     return True, results


- # Add this to the main function
  def main():
-     # Configure system settings
      mimetypes.init()
-
-     # Check network connectivity
-     network_ok, network_results = check_network_connectivity()
-     if not network_ok:
-         logger.warning("Network connectivity issues detected. Some features may not work properly.")
-         for result in network_results:
-             logger.warning(f"Test site {result['site']}: {result['status']}")
-     # Create and launch interface
      interface = create_interface()
-
-     # Launch with proper configuration
      interface.launch(
          server_name="0.0.0.0",
          server_port=7860,
@@ -789,7 +359,5 @@ def main():
          debug=True
      )

-
  if __name__ == "__main__":
      main()
-
@@ -1,23 +1,26 @@
  import json
  import os
  import re
  import time
  import logging
  import mimetypes
+ import concurrent.futures
+ import string
+ import zipfile
  import tempfile
  from datetime import datetime
+ from typing import List, Dict, Optional, Union
  from pathlib import Path
  from urllib.parse import urlparse
+
  import requests
  import validators
  import gradio as gr
+ from diskcache import Cache
  from bs4 import BeautifulSoup
  from fake_useragent import UserAgent
+ from ratelimit import limits, sleep_and_retry
  from cleantext import clean

  # Setup logging with detailed configuration
  logging.basicConfig(
@@ -26,49 +29,15 @@ logging.basicConfig(
      handlers=[
          logging.StreamHandler(),
          logging.FileHandler('app.log', encoding='utf-8')
+     ]
+ )
  logger = logging.getLogger(__name__)
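
The new import block pulls in diskcache.Cache and ratelimit's limits / sleep_and_retry, but neither name appears in the hunks shown on this page. A minimal sketch of how the two are commonly combined for cached, rate-limited fetching; the cache directory, call budget, and helper name below are illustrative assumptions, not code from this commit:

    from diskcache import Cache
    from ratelimit import limits, sleep_and_retry
    import requests

    cache = Cache(".urld_cache")           # hypothetical on-disk cache directory

    @sleep_and_retry                       # wait until a slot in the rate window frees up
    @limits(calls=10, period=60)           # illustrative budget: 10 requests per minute
    def rate_limited_get(url: str) -> str:
        cached = cache.get(url)            # reuse a previously fetched page if present
        if cached is not None:
            return cached
        text = requests.get(url, timeout=10).text
        cache.set(url, text, expire=3600)  # keep the page for an hour
        return text
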
  class URLProcessor:
      def __init__(self):
+         self.session = requests.Session()
+         self.timeout = 10  # seconds
+         self.session.headers.update({
              'User-Agent': UserAgent().random,
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
@@ -76,417 +45,119 @@ class URLProcessor:
              'Connection': 'keep-alive',
              'Upgrade-Insecure-Requests': '1'
          })

+     def advanced_text_cleaning(self, text: str) -> str:
          try:
+             cleaned_text = clean(
+                 text,
+                 fix_unicode=True,
+                 to_ascii=True,
+                 lower=True,
+                 no_line_breaks=True,
+                 no_urls=True,
+                 no_emails=True,
+                 no_phone_numbers=True,
+                 no_numbers=False,
+                 no_digits=False,
+                 no_currency_symbols=True,
+                 no_punct=False
+             ).strip()
+             return cleaned_text
          except Exception as e:
+             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
+             text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+             text = text.encode('ascii', 'ignore').decode('ascii')
+             text = re.sub(r'\s+', ' ', text)
+             return text.strip()

+     def validate_url(self, url: str) -> Dict:
          try:
+             if not validators.url(url):
+                 return {'is_valid': False, 'message': 'Invalid URL format'}
+
+             response = self.session.head(url, timeout=self.timeout)
+             response.raise_for_status()
+             return {'is_valid': True, 'message': 'URL is valid and accessible'}
+         except Exception as e:
+             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

+     def fetch_content(self, url: str) -> Optional[Dict]:
+         try:
+             if 'drive.google.com' in url:
+                 return self._handle_google_drive(url)
+             if 'calendar.google.com' in url and 'ical' in url:
+                 return self._handle_google_calendar(url)
+             return self._fetch_html_content(url)
          except Exception as e:
+             logger.error(f"Content fetch failed: {e}")
              return None

      def _handle_google_drive(self, url: str) -> Optional[Dict]:
          try:
+             file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+             if not file_id:
+                 logger.error(f"Invalid Google Drive URL: {url}")
+                 return None
+
+             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+             response = self.session.get(direct_url, timeout=self.timeout)
              response.raise_for_status()
+
              return {
+                 'content': response.text,
+                 'content_type': response.headers.get('Content-Type', ''),
+                 'timestamp': datetime.now().isoformat()
              }
          except Exception as e:
+             logger.error(f"Google Drive processing failed: {e}")
              return None

      def _handle_google_calendar(self, url: str) -> Optional[Dict]:
          try:
              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()
              return {
+                 'content': response.text,
+                 'content_type': 'text/calendar',
+                 'timestamp': datetime.now().isoformat()
              }
          except Exception as e:
+             logger.error(f"Calendar fetch failed: {e}")
              return None

+     def _fetch_html_content(self, url: str) -> Optional[Dict]:
          try:
+             response = self.session.get(url, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                 element.decompose()
+             main_content = soup.find('main') or soup.find('article') or soup.body
+             if main_content is None:
+                 logger.warning(f"No main content found for URL: {url}")
+                 return {
+                     'content': '',
+                     'content_type': response.headers.get('Content-Type', ''),
+                     'timestamp': datetime.now().isoformat()
+                 }
+             text_content = main_content.get_text(separator='\n', strip=True)
+             cleaned_content = self.advanced_text_cleaning(text_content)
+             return {
+                 'content': cleaned_content,
+                 'content_type': response.headers.get('Content-Type', ''),
+                 'timestamp': datetime.now().isoformat()
+             }
          except Exception as e:
+             logger.error(f"HTML processing failed: {e}")
              return None

156
+ def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
 
 
157
  self.max_file_size = max_file_size
158
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
159
 
160
  def is_text_file(self, filepath: str) -> bool:
 
161
  try:
162
  mime_type, _ = mimetypes.guess_type(filepath)
163
  return (mime_type and mime_type.startswith('text/')) or \
 
166
  return False
167
 
168
  def process_file(self, file) -> List[Dict]:
 
169
  if not file:
170
  return []
171
+
172
  dataset = []
173
  try:
174
  file_size = os.path.getsize(file.name)
175
  if file_size > self.max_file_size:
176
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
177
  return []
178
+
179
  with tempfile.TemporaryDirectory() as temp_dir:
180
  if zipfile.is_zipfile(file.name):
181
  dataset.extend(self._process_zip_file(file.name, temp_dir))
182
  else:
183
  dataset.extend(self._process_single_file(file))
184
+
185
  except Exception as e:
186
  logger.error(f"Error processing file: {str(e)}")
187
  return []
188
+
189
  return dataset
190
 
191
+ def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
192
+ results = []
 
193
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
194
  zip_ref.extractall(temp_dir)
195
+ for root, _, files in os.walk(temp_dir):
196
+ for filename in files:
197
+ filepath = os.path.join(root, filename)
198
+ if self.is_text_file(filepath):
199
+ try:
200
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
201
+ content = f.read()
202
+ if content.strip():
203
+ results.append({
204
+ "source": "file",
205
+ "filename": filename,
206
+ "content": content,
207
+ "timestamp": datetime.now().isoformat()
208
+ })
209
+ except Exception as e:
210
+ logger.error(f"Error reading file {filename}: {str(e)}")
211
+ return results
212
 
213
  def _process_single_file(self, file) -> List[Dict]:
214
  try:
215
  file_stat = os.stat(file.name)
216
+ if file_stat.st_size > 100 * 1024 * 1024:
 
217
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
218
  content = ""
219
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
220
+ content = f.read(1 * 1024 * 1024)
221
  content += "\n...[Content truncated due to large file size]...\n"
 
222
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
223
+ content += f.read()
224
  else:
 
225
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
226
  content = f.read()
227
+
228
  return [{
229
  'source': 'file',
230
  'filename': os.path.basename(file.name),
 
239
  logger.error(f"File processing error: {e}")
240
  return []
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
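The large-file branch of _process_single_file keeps only the first and last megabyte of files above 100 MB. A standalone sketch of that head/tail read, using binary mode to avoid text-mode seek quirks (the 1 MB window and 100 MB threshold mirror the values in the diff; the function name is illustrative):

    import os

    def head_tail_read(path: str, window: int = 1 * 1024 * 1024,
                       threshold: int = 100 * 1024 * 1024) -> str:
        # Return the whole file if it is small, else its first and last `window` bytes.
        size = os.stat(path).st_size
        with open(path, "rb") as f:                # binary mode: byte-exact seeks
            if size <= threshold:
                return f.read().decode("utf-8", errors="ignore")
            head = f.read(window)                  # first 1 MB
            f.seek(max(0, size - window))          # jump to the start of the last 1 MB
            tail = f.read()
        return (head.decode("utf-8", errors="ignore")
                + "\n...[Content truncated due to large file size]...\n"
                + tail.decode("utf-8", errors="ignore"))
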
  def generate_qr(json_data):
+     if json_data:
+         qr = qrcode.make(json_data)
+         qr_path = f"output/qr_code_{int(time.time())}.png"
+         qr.save(qr_path)
+         return qr_path
+     return None
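The slimmed-down generate_qr above still calls qrcode.make and writes into output/, yet import qrcode no longer appears in the new import block shown on this page, and output/ only exists once process_all_inputs has created it. A self-contained sketch of the same idea with those two points handled explicitly (payload and paths are illustrative):

    import os
    import time
    import qrcode  # assumed available; not in the import block shown above

    def generate_qr(json_data: str):
        if not json_data:
            return None
        os.makedirs("output", exist_ok=True)                # ensure the target directory exists
        qr_path = f"output/qr_code_{int(time.time())}.png"
        qrcode.make(json_data).save(qr_path)                # qrcode.make returns a PIL image
        return qr_path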
 
  def create_interface():
      css = """
      .container { max-width: 1200px; margin: auto; }
      .warning { background-color: #fff3cd; color: #856404; }
      .error { background-color: #f8d7da; color: #721c24; }
      """

+     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
          gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

          with gr.Tab("URL Processing"):
              url_input = gr.Textbox(
+                 label="Enter URLs (comma or newline separated)",
                  lines=5,
                  placeholder="https://example1.com\nhttps://example2.com"
              )
@@ -687,99 +272,84 @@ def create_interface():

          with gr.Tab("Text Input"):
              text_input = gr.Textbox(
+                 label="Raw Text Input",
                  lines=5,
                  placeholder="Paste your text here..."
              )

          process_btn = gr.Button("Process Input", variant="primary")
          output_text = gr.Textbox(label="Processing Results", interactive=False)
          output_file = gr.File(label="Processed Output")

+         def process_all_inputs(urls, file, text):
+             try:
+                 processor = URLProcessor()
+                 file_processor = FileProcessor()
+                 results = []
+
+                 if urls:
+                     url_list = re.split(r'[\,\n]', urls)
+                     url_list = [url.strip() for url in url_list if url.strip()]
+
+                     for url in url_list:
+                         validation = processor.validate_url(url)
+                         if validation.get('is_valid'):
+                             content = processor.fetch_content(url)
+                             if content:
+                                 results.append({
+                                     'source': 'url',
+                                     'url': url,
+                                     'content': content,
+                                     'timestamp': datetime.now().isoformat()
+                                 })
+
+                 if file:
+                     results.extend(file_processor.process_file(file))
+
+                 if text:
+                     cleaned_text = processor.advanced_text_cleaning(text)
+                     results.append({
+                         'source': 'direct_input',
+                         'content': cleaned_text,
+                         'timestamp': datetime.now().isoformat()
+                     })
+
+                 if results:
+                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+                     output_dir.mkdir(parents=True, exist_ok=True)
+                     output_path = output_dir / f'processed_{int(time.time())}.json'
+
+                     with open(output_path, 'w', encoding='utf-8') as f:
+                         json.dump(results, f, ensure_ascii=False, indent=2)
+
+                     summary = f"Processed {len(results)} items successfully!"
+                     return str(output_path), summary
+                 else:
+                     return None, "No valid content to process."

+             except Exception as e:
+                 logger.error(f"Processing error: {e}")
+                 return None, f"Error: {str(e)}"

+         process_btn.click(
+             process_all_inputs,
+             inputs=[url_input, file_input, text_input],
+             outputs=[output_file, output_text]
+         )

+         gr.Markdown("""
+         ### Usage Guidelines
+         - **URL Processing**: Enter valid HTTP/HTTPS URLs
+         - **File Input**: Upload text files or ZIP archives
+         - **Text Input**: Direct text processing
+         - Advanced cleaning and validation included
+         """)

+     return interface

  def main():
      mimetypes.init()
      interface = create_interface()
      interface.launch(
          server_name="0.0.0.0",
          server_port=7860,
@@ -789,7 +359,5 @@ def main():
          debug=True
      )

  if __name__ == "__main__":
      main()