acecalisto3 committed on
Commit
53a70fe
·
verified ·
1 Parent(s): 8ba5dfb

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +242 -107
app2.py CHANGED
@@ -21,8 +21,9 @@ from bs4 import BeautifulSoup
21
  from fake_useragent import UserAgent
22
  from ratelimit import limits, sleep_and_retry
23
  from cleantext import clean
 
24
 
25
- # Setup logging with detailed configuration
26
  logging.basicConfig(
27
  level=logging.INFO,
28
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
@@ -33,6 +34,9 @@ logging.basicConfig(
33
  )
34
  logger = logging.getLogger(__name__)
35
 
 
 
 
36
  class URLProcessor:
37
  def __init__(self):
38
  self.session = requests.Session()
@@ -170,7 +174,7 @@ class URLProcessor:
170
  except Exception as e:
171
  logger.error(f"HTML processing failed: {e}")
172
  return None
173
-
174
  class FileProcessor:
175
  """Class to handle file processing"""
176
 
@@ -234,70 +238,151 @@ class FileProcessor:
234
  logger.error(f"Error reading file {filename}: {str(e)}")
235
  return results
236
 
237
- def _process_single_file(self, file) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  try:
239
- file_stat = os.stat(file.name)
 
 
 
 
240
 
241
- # For very large files, read in chunks and summarize
242
- if file_stat.st_size > 100 * 1024 * 1024: # 100MB
243
- logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
244
-
245
- # Read first and last 1MB for extremely large files
246
- content = ""
247
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
248
- content = f.read(1 * 1024 * 1024) # First 1MB
249
- content += "\n...[Content truncated due to large file size]...\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- # Seek to the last 1MB
252
- f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
253
- content += f.read() # Last 1MB
 
254
  else:
255
- # Regular file processing
256
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
257
- content = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- return [{
260
- 'source': 'file',
261
- 'filename': os.path.basename(file.name),
262
- 'file_size': file_stat.st_size,
263
- 'mime_type': mimetypes.guess_type(file.name)[0],
264
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
265
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
266
- 'content': content,
267
- 'timestamp': datetime.now().isoformat()
268
- }]
269
  except Exception as e:
270
- logger.error(f"File processing error: {e}")
271
  return []
272
 
273
- import qrcode # Import the qrcode library
274
-
275
- def generate_qr(json_data):
276
- """Generate QR code from JSON data and return the file path."""
277
- if json_data:
278
- qr = qrcode.make(json_data)
279
- qr_path = f"output/qr_code_{int(time.time())}.png"
280
- qr.save(qr_path)
281
- return qr_path
282
- return None
283
-
284
  def create_interface():
285
  """Create a comprehensive Gradio interface with advanced features"""
286
 
287
  css = """
288
  .container { max-width: 1200px; margin: auto; }
289
- .warning { background-color: #fff3cd; color: #856404; }
290
- .error { background-color: #f8d7da; color: #721c24; }
 
291
  """
292
 
293
- with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
294
- gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
295
 
296
  with gr.Tab("URL Processing"):
297
  url_input = gr.Textbox(
298
  label="Enter URLs (comma or newline separated)",
299
  lines=5,
300
- placeholder="https://example1.com\nhttps://example2.com"
 
301
  )
302
 
303
  with gr.Tab("File Input"):
@@ -306,27 +391,82 @@ def create_interface():
306
  file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
307
  )
308
 
309
- with gr.Tab("Text Input"):
310
- text_input = gr.Textbox(
311
- label="Raw Text Input",
312
- lines=5,
313
- placeholder="Paste your text here..."
 
 
 
 
 
 
 
 
 
 
 
 
314
  )
 
315
 
316
- process_btn = gr.Button("Process Input", variant="primary")
 
 
317
 
318
- output_text = gr.Textbox(label="Processing Results", interactive=False)
319
- output_file = gr.File(label="Processed Output")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- def process_all_inputs(urls, file, text):
322
- """Process all input types with progress tracking"""
323
  try:
324
- processor = URLProcessor()
325
- file_processor = FileProcessor()
326
  results = []
327
 
328
- # Process URLs
329
- if urls:
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  url_list = re.split(r'[,\n]', urls)
331
  url_list = [url.strip() for url in url_list if url.strip()]
332
 
@@ -342,82 +482,77 @@ def create_interface():
342
  'timestamp': datetime.now().isoformat()
343
  })
344
 
345
- # Process files
346
  if file:
347
- results.extend(file_processor.process_file(file))
348
-
349
- # Process text input
350
- if text:
351
- cleaned_text = processor.advanced_text_cleaning(text)
352
- results.append({
353
- 'source': 'direct_input',
354
- 'content': cleaned_text,
355
- 'timestamp': datetime.now().isoformat()
356
- })
357
 
358
- # Generate output
359
  if results:
360
- output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
361
- output_dir.mkdir(parents=True, exist_ok=True)
362
- output_path = output_dir / f'processed_{int(time.time())}.json'
363
-
364
- with open(output_path, 'w', encoding='utf-8') as f:
365
- json.dump(results, f, ensure_ascii=False, indent=2)
366
-
367
- summary = f"Processed {len(results)} items successfully!"
368
- # Convert Path object to string here
369
- return str(output_path), summary
370
  else:
371
- return None, "No valid content to process."
372
 
373
  except Exception as e:
374
  logger.error(f"Processing error: {e}")
375
- return None, f"Error: {str(e)}"
376
 
 
 
 
377
  process_btn.click(
378
  process_all_inputs,
379
- inputs=[url_input, file_input, text_input],
380
- outputs=[output_file, output_text]
381
  )
382
 
383
  gr.Markdown("""
384
- ### Usage Guidelines
385
- - **URL Processing**: Enter valid HTTP/HTTPS URLs
386
- - **File Input**: Upload text files or ZIP archives
387
- - **Text Input**: Direct text processing
388
- - Advanced cleaning and validation included
 
 
 
 
 
 
 
 
389
  """)
390
 
391
  return interface
392
 
393
- import qrcode # Import the qrcode library
394
-
395
- def generate_qr(json_data):
396
- """Generate QR code from JSON data and return the file path."""
397
- if json_data:
398
- qr = qrcode.make(json_data)
399
- qr_path = f"output/qr_code_{int(time.time())}.png"
400
- qr.save(qr_path)
401
- return qr_path
402
- return None
403
-
404
  def main():
405
  # Configure system settings
406
  mimetypes.init()
 
 
 
 
407
  # Create and launch interface
408
  interface = create_interface()
409
-
410
  # Launch with proper configuration
411
  interface.launch(
412
  server_name="0.0.0.0",
413
- server_port=7860,
414
  show_error=True,
415
  share=False,
416
  inbrowser=True,
417
  debug=True
418
  )
419
 
420
-
421
  if __name__ == "__main__":
422
- main()
423
-
 
21
  from fake_useragent import UserAgent
22
  from ratelimit import limits, sleep_and_retry
23
  from cleantext import clean
24
+ import qrcode
25
 
26
+ # Setup logging
27
  logging.basicConfig(
28
  level=logging.INFO,
29
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
 
34
  )
35
  logger = logging.getLogger(__name__)
36
 
37
+ # Ensure output directories exist
38
+ Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
39
+
40
  class URLProcessor:
41
  def __init__(self):
42
  self.session = requests.Session()
 
174
  except Exception as e:
175
  logger.error(f"HTML processing failed: {e}")
176
  return None
177
+
178
  class FileProcessor:
179
  """Class to handle file processing"""
180
 
 
238
  logger.error(f"Error reading file {filename}: {str(e)}")
239
  return results
240
 
241
def _process_single_file(self, file) -> List[Dict]:
    """Read one file from disk and return a single metadata/content record.

    Files larger than 100 MB are not loaded whole: only the first and the
    last 1 MB are kept, joined by a truncation marker. Smaller files are
    read completely. Undecodable bytes are dropped in both cases.

    Args:
        file: Object exposing the on-disk path as ``file.name``.

    Returns:
        A one-element list holding a dict with the keys ``source``,
        ``filename``, ``file_size``, ``mime_type``, ``created``,
        ``modified``, ``content`` and ``timestamp``. An empty list is
        returned (and the error logged) if anything fails.
    """
    large_file_threshold = 100 * 1024 * 1024  # 100MB
    excerpt_bytes = 1 * 1024 * 1024  # 1MB kept from each end of a large file

    def decode_excerpt(raw: bytes) -> str:
        # Mirror what the text-mode reader does for small files: UTF-8 with
        # bad bytes ignored, and universal-newline translation.
        text = raw.decode('utf-8', errors='ignore')
        return text.replace('\r\n', '\n').replace('\r', '\n')

    try:
        file_stat = os.stat(file.name)

        if file_stat.st_size > large_file_threshold:
            logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")

            # Binary mode on purpose: st_size is a byte count, and seeking a
            # text-mode stream to an arbitrary byte offset is not supported
            # (it can land inside a multi-byte character). In text mode
            # read(n) would also count characters instead of bytes.
            with open(file.name, 'rb') as f:
                head = f.read(excerpt_bytes)
                f.seek(max(0, file_stat.st_size - excerpt_bytes))
                tail = f.read(excerpt_bytes)

            content = (
                decode_excerpt(head)
                + "\n...[Content truncated due to large file size]...\n"
                + decode_excerpt(tail)
            )
        else:
            # Regular file processing
            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

        return [{
            'source': 'file',
            'filename': os.path.basename(file.name),
            'file_size': file_stat.st_size,
            'mime_type': mimetypes.guess_type(file.name)[0],
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata change on Unix.
            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            'content': content,
            'timestamp': datetime.now().isoformat()
        }]
    except Exception as e:
        # Best effort: a file that cannot be read contributes no record.
        logger.error(f"File processing error: {e}")
        return []
277
+
278
def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
    """Normalise *data* into plain JSON-compatible Python values.

    A string is stripped and parsed as JSON; any other value is used as
    given. The value is then serialised and parsed again, so the result
    contains only what survives a JSON round trip.

    Returns:
        The round-tripped value, or ``None`` (with the error logged) if
        parsing or serialisation fails.
    """
    try:
        parsed = json.loads(data.strip()) if isinstance(data, str) else data
        # Dump and reload so only JSON-representable structures come back.
        return json.loads(json.dumps(parsed))
    except json.JSONDecodeError as err:
        logger.error(f"JSON cleaning error: {err}")
    except Exception as err:
        logger.error(f"Unexpected error while cleaning JSON: {err}")
    return None
296
+
297
def _save_qr_image(payload, output_path) -> str:
    """Encode *payload* as a JSON string in a QR code, save it as PNG, return the path."""
    qr = qrcode.QRCode(
        version=None,  # with make(fit=True) the size is chosen to fit the data
        error_correction=qrcode.constants.ERROR_CORRECT_L,
        box_size=10,
        border=4,
    )
    qr.add_data(json.dumps(payload, ensure_ascii=False))
    qr.make(fit=True)

    img = qr.make_image(fill_color="black", back_color="white")
    img.save(str(output_path))
    return str(output_path)


def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
    """Generate QR code image(s) from JSON-like data.

    Args:
        data: A JSON string, a dict, or a list of items. Each payload is
            passed through ``clean_json`` before encoding.
        combined: If true, encode all of *data* into one QR code. If false
            and *data* is a list, write one QR code per item; a non-list
            value still produces a single QR code.

    Returns:
        Paths of the PNG files written under ``output/qr_codes``. The list
        is empty when the data cleans to nothing or generation fails. In
        per-item mode an item that cannot be encoded is logged and skipped,
        and the paths of the other items are still returned.
    """
    try:
        output_dir = Path('output/qr_codes')
        output_dir.mkdir(parents=True, exist_ok=True)
        # One timestamp per call; per-item files are told apart by index.
        stamp = int(time.time())

        if combined or not isinstance(data, list):
            prefix = 'combined_qr' if combined else 'single_qr'
            cleaned_data = clean_json(data)
            # Falsy covers both a cleaning failure (None) and empty payloads.
            if not cleaned_data:
                return []
            return [_save_qr_image(cleaned_data, output_dir / f'{prefix}_{stamp}.png')]

        # Separate QR codes for each item of the list.
        paths = []
        for idx, item in enumerate(data):
            cleaned_item = clean_json(item)
            if not cleaned_item:
                continue
            try:
                paths.append(
                    _save_qr_image(cleaned_item, output_dir / f'item_{idx}_qr_{stamp}.png')
                )
            except Exception as e:
                # Keep going: one bad item (e.g. too much data for a QR
                # code) must not discard the codes already written.
                logger.error(f"QR generation error for item {idx}: {e}")
        return paths
    except Exception as e:
        logger.error(f"QR generation error: {e}")
        return []
366
 
 
 
 
 
 
 
 
 
 
 
 
367
  def create_interface():
368
  """Create a comprehensive Gradio interface with advanced features"""
369
 
370
  css = """
371
  .container { max-width: 1200px; margin: auto; }
372
+ .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
373
+ .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
374
+ .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
375
  """
376
 
377
+ with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
378
+ gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
379
 
380
  with gr.Tab("URL Processing"):
381
  url_input = gr.Textbox(
382
  label="Enter URLs (comma or newline separated)",
383
  lines=5,
384
+ placeholder="https://example1.com\nhttps://example2.com",
385
+ value=""
386
  )
387
 
388
  with gr.Tab("File Input"):
 
391
  file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
392
  )
393
 
394
+ with gr.Tab("Notepad"):
395
+ text_input = gr.TextArea(
396
+ label="JSON Data Input",
397
+ lines=15,
398
+ placeholder="Paste your JSON data here...",
399
+ value=""
400
+ )
401
+
402
+ with gr.Row():
403
+ example_btn = gr.Button("πŸ“ Load Example JSON", variant="secondary")
404
+ clear_btn = gr.Button("πŸ—‘οΈ Clear Input", variant="secondary")
405
+
406
+ with gr.Row():
407
+ combine_data = gr.Checkbox(
408
+ label="Combine all data into single QR code",
409
+ value=True,
410
+ info="Generate one QR code for all data, or separate QR codes for each item"
411
  )
412
+ process_btn = gr.Button("πŸ”„ Process & Generate QR", variant="primary", scale=2)
413
 
414
+ output_json = gr.JSON(label="Processed JSON Data")
415
+ output_gallery = gr.Gallery(label="Generated QR Codes", columns=2, height=400)
416
+ output_text = gr.Textbox(label="Processing Status", interactive=False)
417
 
418
+ def load_example():
419
+ example_json = {
420
+ "type": "product_catalog",
421
+ "items": [
422
+ {
423
+ "id": "123",
424
+ "name": "Test Product",
425
+ "description": "This is a test product description",
426
+ "price": 29.99,
427
+ "category": "electronics",
428
+ "tags": ["test", "sample", "demo"]
429
+ },
430
+ {
431
+ "id": "456",
432
+ "name": "Another Product",
433
+ "description": "Another test product description",
434
+ "price": 49.99,
435
+ "category": "accessories",
436
+ "tags": ["sample", "test"]
437
+ }
438
+ ],
439
+ "metadata": {
440
+ "timestamp": datetime.now().isoformat(),
441
+ "version": "1.0",
442
+ "source": "example"
443
+ }
444
+ }
445
+ return json.dumps(example_json, indent=2)
446
+
447
+ def clear_input():
448
+ return ""
449
 
450
+ def process_all_inputs(urls, file, text, combine):
451
+ """Process all input types and generate QR codes"""
452
  try:
 
 
453
  results = []
454
 
455
+ # Process text input first (since it's direct JSON)
456
+ if text and text.strip():
457
+ try:
458
+ # Try to parse as JSON
459
+ json_data = json.loads(text)
460
+ if isinstance(json_data, list):
461
+ results.extend(json_data)
462
+ else:
463
+ results.append(json_data)
464
+ except json.JSONDecodeError as e:
465
+ return None, [], f"❌ Invalid JSON format: {str(e)}"
466
+
467
+ # Process URLs if provided
468
+ if urls and urls.strip():
469
+ processor = URLProcessor()
470
  url_list = re.split(r'[,\n]', urls)
471
  url_list = [url.strip() for url in url_list if url.strip()]
472
 
 
482
  'timestamp': datetime.now().isoformat()
483
  })
484
 
485
+ # Process files if provided
486
  if file:
487
+ file_processor = FileProcessor()
488
+ file_results = file_processor.process_file(file)
489
+ if file_results:
490
+ results.extend(file_results)
 
 
 
 
 
 
491
 
492
+ # Generate QR codes
493
  if results:
494
+ qr_paths = generate_qr_code(results, combined=combine)
495
+ if qr_paths:
496
+ return (
497
+ results,
498
+ [str(path) for path in qr_paths],
499
+ f"βœ… Successfully processed {len(results)} items and generated {len(qr_paths)} QR code(s)!"
500
+ )
501
+ else:
502
+ return None, [], "❌ Failed to generate QR codes. Please check the input data."
 
503
  else:
504
+ return None, [], "⚠️ No valid content to process. Please provide some input data."
505
 
506
  except Exception as e:
507
  logger.error(f"Processing error: {e}")
508
+ return None, [], f"❌ Error: {str(e)}"
509
 
510
+ # Set up event handlers
511
+ example_btn.click(load_example, outputs=[text_input])
512
+ clear_btn.click(clear_input, outputs=[text_input])
513
  process_btn.click(
514
  process_all_inputs,
515
+ inputs=[url_input, file_input, text_input, combine_data],
516
+ outputs=[output_json, output_gallery, output_text]
517
  )
518
 
519
  gr.Markdown("""
520
+ ### Features
521
+ - **URL Processing**: Extract content from websites
522
+ - **File Processing**: Handle text files and archives
523
+ - **Notepad**: Direct JSON data input/manipulation
524
+ - **JSON Cleaning**: Automatic JSON validation and formatting
525
+ - **QR Generation**: Generate QR codes with embedded JSON data
526
+ - **Flexible Output**: Choose between combined or separate QR codes
527
+
528
+ ### Usage Tips
529
+ 1. Use the **Notepad** tab for direct JSON input
530
+ 2. Click "Load Example JSON" to see a sample format
531
+ 3. Choose whether to combine all data into a single QR code
532
+ 4. The generated QR codes will contain the complete JSON data
533
  """)
534
 
535
  return interface
536
 
 
 
 
 
 
 
 
 
 
 
 
537
  def main():
538
  # Configure system settings
539
  mimetypes.init()
540
+
541
+ # Create output directories
542
+ Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
543
+
544
  # Create and launch interface
545
  interface = create_interface()
546
+
547
  # Launch with proper configuration
548
  interface.launch(
549
  server_name="0.0.0.0",
550
+ server_port=8000,
551
  show_error=True,
552
  share=False,
553
  inbrowser=True,
554
  debug=True
555
  )
556
 
 
557
  if __name__ == "__main__":
558
+ main()