acecalisto3 committed on
Commit 8cbfc35 · verified · 1 Parent(s): 97adf15

Update app.py

Files changed (1):
  1. app.py +285 -322

app.py CHANGED
@@ -4,28 +4,26 @@ import re
 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
+import zipfile
 import tempfile
 from datetime import datetime
+from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
-from typing import List, Dict, Tuple, Union, Optional
+
 import requests
 import validators
 import gradio as gr
 from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 import qrcode
-import PyPDF2
-from PIL import Image
-import pytesseract
-import cv2
-import numpy as np
-import fitz  # PyMuPDF
-import zipfile

-# Setup logging with detailed configuration
+# Setup logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
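Note: this hunk adds `from ratelimit import limits, sleep_and_retry`, but the decorators are never applied anywhere in the new code. A minimal sketch of how they could be wired up follows; the 10-requests-per-minute budget and the `rate_limited_get` name are assumptions, not values from this commit.

```python
# Hypothetical sketch, not part of this commit: throttling outbound requests
# with the newly imported ratelimit decorators.
import requests
from ratelimit import limits, sleep_and_retry

ONE_MINUTE = 60  # assumed rate-limit window

@sleep_and_retry                      # sleep until a slot frees up instead of raising
@limits(calls=10, period=ONE_MINUTE)  # assumed budget: 10 calls per minute
def rate_limited_get(session: requests.Session, url: str, timeout: int = 10) -> requests.Response:
    """GET wrapper capped at 10 requests per minute."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response
```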
@@ -36,6 +34,9 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

+# Ensure output directories exist
+Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
+
 class URLProcessor:
     def __init__(self):
         self.session = requests.Session()
@@ -48,13 +49,6 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
-        self.supported_content_types = {
-            'text/html': self._fetch_html_content,
-            'application/pdf': self._fetch_pdf_content,
-            'image': self._fetch_image_content,
-            'application/json': self._fetch_json_content,
-            'text/plain': self._fetch_text_content
-        }

     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
@@ -86,7 +80,7 @@
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}
-
+
             response = self.session.head(url, timeout=self.timeout)
             response.raise_for_status()
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
@@ -94,31 +88,18 @@
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

     def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with enhanced content type handling"""
+        """Universal content fetcher with special case handling"""
         try:
-            # Special case handling
+            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
+
+            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)

-            # Get content type
-            response = self.session.head(url, timeout=self.timeout)
-            content_type = response.headers.get('Content-Type', '').split(';')[0].lower()
-
-            # Find appropriate handler
-            handler = None
-            for supported_type, type_handler in self.supported_content_types.items():
-                if content_type.startswith(supported_type):
-                    handler = type_handler
-                    break
-
-            if handler:
-                return handler(url)
-            else:
-                logger.warning(f"Unsupported content type: {content_type}")
-                return self._fetch_text_content(url)
-
+            # Standard HTML processing
+            return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
             return None
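Note: with the `supported_content_types` dispatch table removed above, the PDF, image, JSON, and plain-text handlers become unreachable and are deleted further down; every URL that is not a Google Drive or Calendar link now goes through `_fetch_html_content`, whatever its actual Content-Type. A quick sanity check of the new behaviour (the URL is illustrative):

```python
# Usage sketch: a JSON endpoint is now parsed as HTML rather than pretty-printed.
processor = URLProcessor()
result = processor.fetch_content("https://example.com/data.json")  # illustrative URL
if result is not None:
    print(result['content_type'])   # whatever the server reported
    print(result['content'][:200])  # text extracted by BeautifulSoup
```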
@@ -130,11 +111,11 @@
             if not file_id:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
-
+
             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
-
+
             return {
                 'content': response.text,
                 'content_type': response.headers.get('Content-Type', ''),
@@ -159,180 +140,48 @@
             return None

     def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        """Enhanced HTML content processing with metadata extraction"""
+        """Standard HTML content processing"""
         try:
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
             soup = BeautifulSoup(response.text, 'html.parser')
-
+
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
+
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
-
-            # Extract metadata
-            metadata = {
-                'title': soup.title.string if soup.title else None,
-                'description': soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else None,
-                'keywords': soup.find('meta', {'name': 'keywords'})['content'] if soup.find('meta', {'name': 'keywords'}) else None,
-                'author': soup.find('meta', {'name': 'author'})['content'] if soup.find('meta', {'name': 'author'}) else None
-            }
-
-            # Clean and structure content
-            text_content = main_content.get_text(separator='\n', strip=True)
-            cleaned_content = self.advanced_text_cleaning(text_content)
-
-            return {
-                'content': cleaned_content,
-                'metadata': metadata,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
-            return None

-    def _fetch_pdf_content(self, url: str) -> Optional[Dict]:
-        """Process PDF content with enhanced metadata extraction"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            with tempfile.NamedTemporaryFile(suffix='.pdf') as temp_file:
-                temp_file.write(response.content)
-                temp_file.flush()
-
-                # Extract text and metadata using PyMuPDF
-                doc = fitz.open(temp_file.name)
-
-                # Extract text with formatting preservation
-                text = ""
-                metadata = {
-                    'title': doc.metadata.get('title'),
-                    'author': doc.metadata.get('author'),
-                    'subject': doc.metadata.get('subject'),
-                    'keywords': doc.metadata.get('keywords'),
-                    'creator': doc.metadata.get('creator'),
-                    'producer': doc.metadata.get('producer'),
-                    'page_count': len(doc),
-                    'file_size': os.path.getsize(temp_file.name),
-                    'version': doc.version
-                }
-
-                # Extract text with layout preservation
-                for page in doc:
-                    blocks = page.get_text("blocks")
-                    for block in blocks:
-                        if block[6] == 0:  # Text block
-                            text += block[4] + "\n"
-
-                doc.close()
-                cleaned_content = self.advanced_text_cleaning(text)
-
+            if main_content is None:
+                logger.warning(f"No main content found for URL: {url}")
                 return {
-                    'content': cleaned_content,
-                    'metadata': metadata,
-                    'content_type': 'application/pdf',
-                    'timestamp': datetime.now().isoformat()
-                }
-        except Exception as e:
-            logger.error(f"PDF processing failed: {e}")
-            return None
-
-    def _fetch_image_content(self, url: str) -> Optional[Dict]:
-        """Process image content with OCR and advanced image processing"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_file:
-                temp_file.write(response.content)
-                temp_file.flush()
-
-                # Load image with OpenCV
-                img = cv2.imread(temp_file.name)
-                if img is None:
-                    raise ValueError("Failed to load image")
-
-                # Image preprocessing for better OCR
-                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-                denoised = cv2.fastNlMeansDenoising(gray)
-                thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-
-                # Extract text using Tesseract
-                text = pytesseract.image_to_string(thresh)
-                cleaned_text = self.advanced_text_cleaning(text) if text else None
-
-                # Extract metadata and additional image features
-                with Image.open(temp_file.name) as pil_img:
-                    exif = pil_img._getexif() if hasattr(pil_img, '_getexif') else None
-                    metadata = {
-                        'format': pil_img.format,
-                        'mode': pil_img.mode,
-                        'size': pil_img.size,
-                        'exif': exif,
-                        'image_features': {
-                            'resolution': img.shape,
-                            'channels': img.shape[2] if len(img.shape) > 2 else 1,
-                            'mean_brightness': np.mean(gray),
-                            'has_text': bool(cleaned_text and cleaned_text.strip())
-                        }
-                    }
-
-                return {
-                    'content': cleaned_text,
-                    'metadata': metadata,
+                    'content': '',
                     'content_type': response.headers.get('Content-Type', ''),
                     'timestamp': datetime.now().isoformat()
                 }
-        except Exception as e:
-            logger.error(f"Image processing failed: {e}")
-            return None

-    def _fetch_json_content(self, url: str) -> Optional[Dict]:
-        """Process JSON content"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            content = response.json()
-
-            return {
-                'content': json.dumps(content, indent=2),
-                'content_type': 'application/json',
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"JSON processing failed: {e}")
-            return None
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)

-    def _fetch_text_content(self, url: str) -> Optional[Dict]:
-        """Process plain text content"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            cleaned_content = self.advanced_text_cleaning(response.text)
-
             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
                 'timestamp': datetime.now().isoformat()
             }
         except Exception as e:
-            logger.error(f"Text processing failed: {e}")
+            logger.error(f"HTML processing failed: {e}")
             return None
-
+
 class FileProcessor:
     """Class to handle file processing"""
-
+
     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-
+
     def is_text_file(self, filepath: str) -> bool:
         """Check if file is a text file"""
         try:
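Note: the new `main_content is None` guard matters because the stdlib `html.parser` does not synthesize `<html>`/`<body>` wrappers, so non-HTML payloads (which now reach this method, per the note above) can leave `soup.body` as `None`. A minimal check of that assumption:

```python
from bs4 import BeautifulSoup

# With the stdlib parser, bare text produces no <body>, so the old code would
# have crashed on main_content.get_text(); the new guard returns {'content': ''}.
soup = BeautifulSoup("just plain text, no markup", "html.parser")
main = soup.find("main") or soup.find("article") or soup.body
print(main)  # None
```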
@@ -389,117 +238,238 @@ class FileProcessor:
             logger.error(f"Error reading file {filename}: {str(e)}")
         return results

-    def _process_single_file(self, file) -> List[Dict]:
-        try:
-            file_stat = os.stat(file.name)
-
-            # For very large files, read in chunks and summarize
-            if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
-                logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
-
-                # Read first and last 1MB for extremely large files
-                content = ""
-                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read(1 * 1024 * 1024)  # First 1MB
-                    content += "\n...[Content truncated due to large file size]...\n"
-
-                    # Seek to the last 1MB
-                    f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
-                    content += f.read()  # Last 1MB
-            else:
-                # Regular file processing
-                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read()
-
-            return [{
-                'source': 'file',
-                'filename': os.path.basename(file.name),
-                'file_size': file_stat.st_size,
-                'mime_type': mimetypes.guess_type(file.name)[0],
-                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"File processing error: {e}")
-            return []
+    def _process_single_file(self, file) -> List[Dict]:
+        """Process a single file"""
+        try:
+            file_stat = os.stat(file.name)
+
+            # For very large files, read in chunks and summarize
+            if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
+                logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
+
+                # Read first and last 1MB for extremely large files
+                content = ""
+                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read(1 * 1024 * 1024)  # First 1MB
+                    content += "\n...[Content truncated due to large file size]...\n"
+
+                    # Seek to the last 1MB
+                    f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
+                    content += f.read()  # Last 1MB
+            else:
+                # Regular file processing
+                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
+
+def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
+    """Clean and validate JSON data"""
+    try:
+        # If it's a string, try to parse it
+        if isinstance(data, str):
+            # Remove any existing content and extra whitespace
+            data = data.strip()
+            data = json.loads(data)
+
+        # Convert to string and back to ensure proper JSON format
+        cleaned = json.loads(json.dumps(data))
+        return cleaned
+    except json.JSONDecodeError as e:
+        logger.error(f"JSON cleaning error: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error while cleaning JSON: {e}")
+        return None
+
+def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
+    """Generate QR code(s) from data"""
+    try:
+        output_dir = Path('output/qr_codes')
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        if combined:
+            # Generate single QR code for all data
+            cleaned_data = clean_json(data)
+            if cleaned_data:
+                qr = qrcode.QRCode(
+                    version=None,
+                    error_correction=qrcode.constants.ERROR_CORRECT_L,
+                    box_size=10,
+                    border=4,
+                )
+                json_str = json.dumps(cleaned_data, ensure_ascii=False)
+                qr.add_data(json_str)
+                qr.make(fit=True)
+
+                img = qr.make_image(fill_color="black", back_color="white")
+                output_path = output_dir / f'combined_qr_{int(time.time())}.png'
+                img.save(str(output_path))
+                return [str(output_path)]
+        else:
+            # Generate separate QR codes for each item
+            if isinstance(data, list):
+                paths = []
+                for idx, item in enumerate(data):
+                    cleaned_item = clean_json(item)
+                    if cleaned_item:
+                        qr = qrcode.QRCode(
+                            version=None,
+                            error_correction=qrcode.constants.ERROR_CORRECT_L,
+                            box_size=10,
+                            border=4,
+                        )
+                        json_str = json.dumps(cleaned_item, ensure_ascii=False)
+                        qr.add_data(json_str)
+                        qr.make(fit=True)
+
+                        img = qr.make_image(fill_color="black", back_color="white")
+                        output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
+                        img.save(str(output_path))
+                        paths.append(str(output_path))
+                return paths
+            else:
+                # Single item, not combined
+                cleaned_item = clean_json(data)
+                if cleaned_item:
+                    qr = qrcode.QRCode(
+                        version=None,
+                        error_correction=qrcode.constants.ERROR_CORRECT_L,
+                        box_size=10,
+                        border=4,
+                    )
+                    json_str = json.dumps(cleaned_item, ensure_ascii=False)
+                    qr.add_data(json_str)
+                    qr.make(fit=True)
+
+                    img = qr.make_image(fill_color="black", back_color="white")
+                    output_path = output_dir / f'single_qr_{int(time.time())}.png'
+                    img.save(str(output_path))
+                    return [str(output_path)]
+
+        return []
+    except Exception as e:
+        logger.error(f"QR generation error: {e}")
+        return []

-def generate_qr_code(json_data):
-    """Generate a QR code from JSON data."""
-    qr = qrcode.make(json_data)
-    qr_path = "output/qr_code.png"
-    qr.save(qr_path)
-    return qr_path
-
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
-
+
     css = """
     .container { max-width: 1200px; margin: auto; }
-    .warning { background-color: #fff3cd; color: #856404; }
-    .error { background-color: #f8d7da; color: #721c24; }
+    .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
+    .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
+    .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
     """
-
-    with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
-        gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
+
+    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
+        gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
+
        with gr.Tab("URL Processing"):
            url_input = gr.Textbox(
                label="Enter URLs (comma or newline separated)",
                lines=5,
-                placeholder="https://example1.com\nhttps://example2.com"
+                placeholder="https://example1.com\nhttps://example2.com",
+                value=""
            )
-
+
        with gr.Tab("File Input"):
            file_input = gr.File(
                label="Upload text file or ZIP archive",
                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
            )

-        with gr.Tab("Text Input"):
-            text_input = gr.Textbox(
-                label="Raw Text Input",
-                lines=5,
-                placeholder="Paste your text here..."
-            )
-
-        with gr.Tab("JSON Editor"):
-            json_editor = gr.Textbox(
-                label="JSON Editor",
-                lines=20,
-                placeholder="View and edit your JSON data here...",
-                interactive=True,
-                elem_id="json-editor"  # Optional: for custom styling
+        with gr.Tab("Notepad"):
+            text_input = gr.TextArea(
+                label="JSON Data Input",
+                lines=15,
+                placeholder="Paste your JSON data here...",
+                value=""
            )
-
-        with gr.Tab("Scratchpad"):
-            scratchpad = gr.Textbox(
-                label="Scratchpad",
-                lines=10,
-                placeholder="Quick notes or text collections...",
-                interactive=True
+
+        with gr.Row():
+            example_btn = gr.Button("📝 Load Example JSON", variant="secondary")
+            clear_btn = gr.Button("🗑️ Clear Input", variant="secondary")
+
+        with gr.Row():
+            combine_data = gr.Checkbox(
+                label="Combine all data into single QR code",
+                value=True,
+                info="Generate one QR code for all data, or separate QR codes for each item"
            )
-
-        process_btn = gr.Button("Process Input", variant="primary")
-        qr_btn = gr.Button("Generate QR Code", variant="secondary")
-
-        output_text = gr.Textbox(label="Processing Results", interactive=False)
-        output_file = gr.File(label="Processed Output")
-        qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code
-
-        def process_all_inputs(urls, file, text, notes):
-            """Process all input types with progress tracking"""
+            process_btn = gr.Button("🔄 Process & Generate QR", variant="primary", scale=2)
+
+        output_json = gr.JSON(label="Processed JSON Data")
+        output_gallery = gr.Gallery(label="Generated QR Codes", columns=2, height=400)
+        output_text = gr.Textbox(label="Processing Status", interactive=False)
+
+        def load_example():
+            example_json = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Test Product",
+                        "description": "This is a test product description",
+                        "price": 29.99,
+                        "category": "electronics",
+                        "tags": ["test", "sample", "demo"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Another Product",
+                        "description": "Another test product description",
+                        "price": 49.99,
+                        "category": "accessories",
+                        "tags": ["sample", "test"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "1.0",
+                    "source": "example"
+                }
+            }
+            return json.dumps(example_json, indent=2)
+
+        def clear_input():
+            return ""
+
+        def process_all_inputs(urls, file, text, combine):
+            """Process all input types and generate QR codes"""
            try:
-                processor = URLProcessor()
-                file_processor = FileProcessor()
                results = []
-
-                # Process URLs
-                if urls:
+
+                # Process text input first (since it's direct JSON)
+                if text and text.strip():
+                    try:
+                        # Try to parse as JSON
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
+                        else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+
+                # Process URLs if provided
+                if urls and urls.strip():
+                    processor = URLProcessor()
                    url_list = re.split(r'[,\n]', urls)
                    url_list = [url.strip() for url in url_list if url.strip()]
-
+
                    for url in url_list:
                        validation = processor.validate_url(url)
                        if validation.get('is_valid'):
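Note: `clean_json` and `generate_qr_code` are the core of the new QR pipeline. One caveat worth flagging in review: a QR code tops out at roughly 3 KB of binary payload (version 40 at low error correction), so a large combined result list will make `qr.make(fit=True)` raise `qrcode.exceptions.DataOverflowError`, which the broad `except` turns into an empty path list. A usage sketch, with illustrative input and output values:

```python
# Usage sketch for the new helpers; input string and output path are illustrative.
sample = '{"id": "123", "name": "Test Product", "price": 29.99}'

cleaned = clean_json(sample)                     # dict on success, None on bad JSON
if cleaned is not None:
    paths = generate_qr_code(cleaned, combined=True)
    print(paths)  # e.g. ['output/qr_codes/combined_qr_1700000000.png']
```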
@@ -511,80 +481,73 @@
                                'content': content,
                                'timestamp': datetime.now().isoformat()
                            })
-
-                # Process files
+
+                # Process files if provided
                if file:
-                    results.extend(file_processor.process_file(file))
-
-                # Process text input
-                if text:
-                    cleaned_text = processor.advanced_text_cleaning(text)
-                    results.append({
-                        'source': 'direct_input',
-                        'content': cleaned_text,
-                        'timestamp': datetime.now().isoformat()
-                    })
-
-                # Generate output
+                    file_processor = FileProcessor()
+                    file_results = file_processor.process_file(file)
+                    if file_results:
+                        results.extend(file_results)
+
+                # Generate QR codes
                if results:
-                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-                    output_dir.mkdir(parents=True, exist_ok=True)
-                    output_path = output_dir / f'processed_{int(time.time())}.json'
-
-                    with open(output_path, 'w', encoding='utf-8') as f:
-                        json.dump(results, f, ensure_ascii=False, indent=2)
-
-                    summary = f"Processed {len(results)} items successfully!"
-                    json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
-                    return str(output_path), summary, json_data  # Return JSON for editor
+                    qr_paths = generate_qr_code(results, combined=combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR code(s)!"
+                        )
+                    else:
+                        return None, [], "❌ Failed to generate QR codes. Please check the input data."
                else:
-                    return None, "No valid content to process.", ""
-
+                    return None, [], "⚠️ No valid content to process. Please provide some input data."
+
            except Exception as e:
                logger.error(f"Processing error: {e}")
-                return None, f"Error: {str(e)}", ""
-
-        def generate_qr(json_data):
-            """Generate QR code from JSON data and return the file path."""
-            if json_data:
-                return generate_qr_code(json_data)
-            return None
-
+                return None, [], f"❌ Error: {str(e)}"
+
+        # Set up event handlers
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+
        process_btn.click(
            process_all_inputs,
-            inputs=[url_input, file_input, text_input, scratchpad],
-            outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-        )
-
-        qr_btn.click(
-            generate_qr,
-            inputs=json_editor,
-            outputs=qr_output
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
        )
-
+
        gr.Markdown("""
-        ### Usage Guidelines
-        - **URL Processing**: Enter valid HTTP/HTTPS URLs
-        - **File Input**: Upload text files or ZIP archives
-        - **Text Input**: Direct text processing
-        - **JSON Editor**: View and edit your JSON data
-        - **Scratchpad**: Quick notes or text collections
-        - Advanced cleaning and validation included
+        ### Features
+        - **URL Processing**: Extract content from websites
+        - **File Processing**: Handle text files and archives
+        - **Notepad**: Direct JSON data input/manipulation
+        - **JSON Cleaning**: Automatic JSON validation and formatting
+        - **QR Generation**: Generate QR codes with embedded JSON data
+        - **Flexible Output**: Choose between combined or separate QR codes
+
+        ### Usage Tips
+        1. Use the **Notepad** tab for direct JSON input
+        2. Click "Load Example JSON" to see a sample format
+        3. Choose whether to combine all data into a single QR code
+        4. The generated QR codes will contain the complete JSON data
        """)
-
+
    return interface

def main():
    # Configure system settings
    mimetypes.init()
-
+
+    # Create output directories
+    Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
+
    # Create and launch interface
    interface = create_interface()
-
+
    # Launch with proper configuration
    interface.launch(
        server_name="0.0.0.0",
-        server_port=7860,
+        server_port=8000,
        show_error=True,
        share=False,
        inbrowser=True,
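Two final review notes. First, the launch port moves from 7860 to 8000; if this app is deployed as a Gradio Space on Hugging Face, the platform expects the default port 7860 unless the port is explicitly overridden in the Space configuration, so this change may leave the Space unreachable. Second, `generate_qr_code` builds an identical `qrcode.QRCode` three times; a follow-up could factor that into one shared helper, sketched below (the `_save_qr` name is hypothetical, not from this commit):

```python
# Refactor sketch, not part of this commit: shared helper for the three
# duplicated QRCode blocks in generate_qr_code.
import json
import time
from pathlib import Path

import qrcode

def _save_qr(payload, output_path: Path) -> str:  # hypothetical helper
    qr = qrcode.QRCode(
        version=None,
        error_correction=qrcode.constants.ERROR_CORRECT_L,
        box_size=10,
        border=4,
    )
    qr.add_data(json.dumps(payload, ensure_ascii=False))
    qr.make(fit=True)
    qr.make_image(fill_color="black", back_color="white").save(str(output_path))
    return str(output_path)

# e.g. _save_qr(cleaned_data, Path('output/qr_codes') / f'combined_qr_{int(time.time())}.png')
```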
 