acecalisto3 committed
Commit ab01192 · verified · 1 Parent(s): 5bbbe92

Update app2.py

Files changed (1)
  1. app2.py +7 -45
app2.py CHANGED
@@ -1,11 +1,10 @@
 import json
 import os
 import re
-import time
 import logging
 import mimetypes
 from selenium import webdriver
-from chromedriver_py import binary_path  # this will get you the path variable
+from chromedriver_py import binary_path
 import concurrent.futures
 import string
 import zipfile
@@ -17,7 +16,6 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
@@ -27,9 +25,6 @@ import nest_asyncio
 nest_asyncio.apply()
 import aiohttp
 
-svc = webdriver.ChromeService(executable_path=binary_path)
-driver = webdriver.Chrome(service=svc)
-
 # Setup logging
 logging.basicConfig(
     level=logging.INFO,
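The two lines removed above instantiated Chrome at import time, so merely importing app2 would launch a browser. A minimal sketch of the deferred alternative, reusing the same Selenium 4 `webdriver.ChromeService` call the removed lines used; the context-manager wrapper is my addition, not part of the commit:

```python
# Sketch only: build the driver on demand instead of at import time.
from contextlib import contextmanager

from selenium import webdriver
from chromedriver_py import binary_path

@contextmanager
def chrome_driver():
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc)
    try:
        yield driver
    finally:
        driver.quit()  # always release the browser process

# Usage: with chrome_driver() as d: d.get("https://example.com")
```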
@@ -49,7 +44,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
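For context, a sketch of how the headers above combine with the `fake_useragent` and `ratelimit` imports the file keeps. The 10-calls-per-minute budget is an illustrative assumption, not taken from the diff; the 10-second timeout mirrors `self.timeout` above:

```python
# Sketch: a rate-limited session with a randomized User-Agent.
import requests
from fake_useragent import UserAgent
from ratelimit import limits, sleep_and_retry

session = requests.Session()
session.headers.update({'User-Agent': UserAgent().random})

@sleep_and_retry              # block until a call slot frees up
@limits(calls=10, period=60)  # assumed budget: 10 requests per minute
def fetch(url: str) -> requests.Response:
    return session.get(url, timeout=10)
```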
@@ -77,7 +72,7 @@ class URLProcessor:
             return cleaned_text
         except Exception as e:
             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
             text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
             text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
             return text.strip()
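The fallback path above, shown standalone with a worked example; the sample string is illustrative:

```python
# Sketch: the fallback cleaning steps from the except branch above.
import re

def fallback_clean(text: str) -> str:
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)       # strip control characters
    text = text.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII
    return re.sub(r'\s+', ' ', text).strip()               # collapse whitespace

print(fallback_clean("caf\u00e9\x07  menu"))  # -> "caf menu"
```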
@@ -97,15 +92,10 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
-            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
@@ -153,12 +143,9 @@ class URLProcessor:
             response.raise_for_status()
 
             soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
 
-            # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
 
             if main_content is None:
@@ -169,7 +156,6 @@ class URLProcessor:
                     'timestamp': datetime.now().isoformat()
                 }
 
-            # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
 
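The two hunks above implement a decompose-then-extract strategy; a self-contained sketch of the same steps:

```python
# Sketch: strip chrome elements, then pull text from the main region.
from bs4 import BeautifulSoup

def extract_main_text(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup(['script', 'style', 'nav', 'footer',
                         'header', 'meta', 'link']):
        element.decompose()                    # drop non-content nodes
    main = soup.find('main') or soup.find('article') or soup.body
    return main.get_text(separator='\n', strip=True) if main else ''
```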
@@ -206,7 +192,7 @@ class FileProcessor:
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
-            if file_size > self.max_file_size:
+            if file_size > self.max_file_size:
                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
 
@@ -250,21 +236,17 @@ class FileProcessor:
         try:
             file_stat = os.stat(file.name)
 
-            # For very large files, read in chunks and summarize
             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                 logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
-                # Read first and last 1MB for extremely large files
                 content = ""
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read(1 * 1024 * 1024)  # First 1MB
                     content += "\n...[Content truncated due to large file size]...\n"
 
-                    # Seek to the last 1MB
                     f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
                     content += f.read()  # Last 1MB
             else:
-                # Regular file processing
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read()
 
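A standalone sketch of the head-and-tail read used above for files over 100 MB. Note the byte-offset `seek()` on a text-mode handle is tolerated here only because `errors='ignore'` absorbs any mid-character landing:

```python
# Sketch: read the first and last 1 MB of a very large text file.
import os

CHUNK = 1 * 1024 * 1024  # 1 MB, as in the hunk above

def head_and_tail(path: str) -> str:
    size = os.stat(path).st_size
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        head = f.read(CHUNK)
        f.seek(max(0, size - CHUNK))  # jump to the final 1 MB
        tail = f.read()
    return head + "\n...[Content truncated due to large file size]...\n" + tail
```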
@@ -285,13 +267,10 @@ class FileProcessor:
     def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
         """Clean and validate JSON data"""
         try:
-            # If it's a string, try to parse it
             if isinstance(data, str):
-                # Remove any existing content and extra whitespace
                 data = data.strip()
                 data = json.loads(data)
 
-            # Convert to string and back to ensure proper JSON format
             cleaned = json.loads(json.dumps(data))
             return cleaned
         except json.JSONDecodeError as e:
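What the round-trip in `clean_json` buys, as a worked example:

```python
# Sketch: parse-then-round-trip normalizes a JSON string to plain dicts.
import json

raw = '   {"name": "example", "count": 3}   '
data = json.loads(raw.strip())           # str -> dict
cleaned = json.loads(json.dumps(data))   # dict -> canonical JSON -> dict
assert cleaned == {"name": "example", "count": 3}
```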
@@ -308,7 +287,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
     output_dir.mkdir(parents=True, exist_ok=True)
 
     if combined:
-        # Generate single QR code for all data
         cleaned_data = clean_json(data)
         if cleaned_data:
             qr = qrcode.QRCode(
@@ -326,7 +304,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
             img.save(str(output_path))
             return [str(output_path)]
     else:
-        # Generate separate QR codes for each item
         if isinstance(data, list):
            paths = []
            for idx, item in enumerate(data):
@@ -339,7 +316,8 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                         border=4,
                     )
                     json_str = json.dumps(cleaned_item, ensure_ascii=False)
-                    qr.add_data(json_str)
+                    qr.add_data(json_str)
+
                     qr.make(fit=True)
 
                     img = qr.make_image(fill_color="black", back_color="white")
@@ -348,7 +326,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                    paths.append(str(output_path))
            return paths
        else:
-            # Single item, not combined
            cleaned_item = clean_json(data)
            if cleaned_item:
                qr = qrcode.QRCode(
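The `qrcode` calls these hunks rely on, assembled into one runnable sketch. `border=4` appears in the diff; `version`, `error_correction`, `box_size`, and the payload and output path are illustrative assumptions:

```python
# Sketch: one QR code from one JSON payload, as generate_qr_code does per item.
import json
from pathlib import Path

import qrcode

Path('output/qr_codes').mkdir(parents=True, exist_ok=True)  # mirrors main()

qr = qrcode.QRCode(
    version=None,  # assumed: let the library pick the smallest version
    error_correction=qrcode.constants.ERROR_CORRECT_L,  # assumed level
    box_size=10,   # assumed size
    border=4,      # as in the hunk above
)
qr.add_data(json.dumps({"id": 1}, ensure_ascii=False))
qr.make(fit=True)
img = qr.make_image(fill_color="black", back_color="white")
img.save('output/qr_codes/example.png')
```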
@@ -459,10 +436,8 @@ def create_interface():
        try:
            results = []
 
-            # Process text input first (since it's direct JSON)
            if text and text.strip():
                try:
-                    # Try to parse as JSON
                    json_data = json.loads(text)
                    if isinstance(json_data, list):
                        results.extend(json_data)
@@ -471,7 +446,6 @@ def create_interface():
                except json.JSONDecodeError as e:
                    return None, [], f"❌ Invalid JSON format: {str(e)}"
 
-            # Process URLs if provided
            if urls and urls.strip():
                processor = URLProcessor()
                url_list = re.split(r'[,\n]', urls)
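The split above accepts comma- or newline-separated URLs. A sketch of the parse step paired with the `validators` import the file keeps; whether app2.py actually filters with `validators.url` is not shown in this diff, so the check here is illustrative:

```python
# Sketch: normalize a comma/newline-separated URL field.
import re
import validators

raw = "https://example.com, https://example.org\nnot-a-url"
candidates = [u.strip() for u in re.split(r'[,\n]', raw) if u.strip()]
valid = [u for u in candidates if validators.url(u)]
print(valid)  # ['https://example.com', 'https://example.org']
```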
@@ -489,14 +463,12 @@ def create_interface():
                        'timestamp': datetime.now().isoformat()
                    })
 
-            # Process files if provided
            if file:
                file_processor = FileProcessor()
                file_results = file_processor.process_file(file)
                if file_results:
                    results.extend(file_results)
 
-            # Generate QR codes
            if results:
                qr_paths = generate_qr_code(results, combined=combine)
                if qr_paths:
@@ -514,7 +486,6 @@ def create_interface():
            logger.error(f"Processing error: {e}")
            return None, [], f"❌ Error: {str(e)}"
 
-    # Set up event handlers
     example_btn.click(load_example, outputs=[text_input])
     clear_btn.click(clear_input, outputs=[text_input])
     process_btn.click(
@@ -542,16 +513,9 @@ def create_interface():
     return interface
 
 def main():
-    # Configure system settings
     mimetypes.init()
-
-    # Create output directories
     Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
-
-    # Create and launch interface
     interface = create_interface()
-
-    # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",
         server_port=8000,
@@ -562,6 +526,4 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
-app.interface
-
+    main()
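The final hunk also drops the stray `app.interface` expression that dangled after `main()`. A sketch of the resulting entry point, using only the two `launch()` arguments the diff shows (the remaining arguments are elided in the hunk and left out here; the placeholder `create_interface` body is an assumption):

```python
# Sketch: the entry point after this commit, reduced to what the diff shows.
import gradio as gr

def create_interface() -> gr.Blocks:      # stand-in for the real builder
    with gr.Blocks() as interface:
        gr.Markdown("app2 placeholder")
    return interface

def main():
    interface = create_interface()
    interface.launch(server_name="0.0.0.0", server_port=8000)

if __name__ == "__main__":
    main()
```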
 