acecalisto3 committed on
Commit bc33f9a · verified · 1 Parent(s): fe6a8fc

Update app2.py

Files changed (1)
  1. app2.py +775 -335
app2.py CHANGED
@@ -1,28 +1,30 @@
1
  import json
2
  import os
3
  import re
4
- import time
5
  import logging
6
  import mimetypes
7
- import concurrent.futures
8
- import string
 
 
9
  import zipfile
10
  import tempfile
11
  from datetime import datetime
12
- from typing import List, Dict, Optional, Union, Tuple
13
  from pathlib import Path
14
- from urllib.parse import urlparse
15
  import requests
16
  import validators
17
  import gradio as gr
18
  from bs4 import BeautifulSoup
19
  from fake_useragent import UserAgent
20
- from ratelimit import limits, sleep_and_retry
21
  from cleantext import clean
22
  import qrcode
23
- import zxing
24
-
25
  # Setup logging
 
 
26
  logging.basicConfig(
27
  level=logging.INFO,
28
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
@@ -39,15 +41,276 @@ Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
39
  class URLProcessor:
40
  def __init__(self):
41
  self.session = requests.Session()
42
- self.timeout = 10 # seconds
43
- self.session.headers.update({
44
- 'User-Agent': UserAgent().random,
45
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
46
- 'Accept-Language': 'en-US,en;q=0.5',
47
- 'Accept-Encoding': 'gzip, deflate, br',
48
- 'Connection': 'keep-alive',
49
- 'Upgrade-Insecure-Requests': '1'
50
- })
 
 
51
 
52
  def advanced_text_cleaning(self, text: str) -> str:
53
  """Robust text cleaning with version compatibility"""
@@ -69,9 +332,9 @@ class URLProcessor:
69
  return cleaned_text
70
  except Exception as e:
71
  logger.warning(f"Text cleaning error: {e}. Using fallback method.")
72
- text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text) # Remove control characters
73
- text = text.encode('ascii', 'ignore').decode('ascii') # Remove non-ASCII characters
74
- text = re.sub(r'\s+', ' ', text) # Normalize whitespace
75
  return text.strip()
76
 
77
  def validate_url(self, url: str) -> Dict:
@@ -79,7 +342,7 @@ class URLProcessor:
79
  try:
80
  if not validators.url(url):
81
  return {'is_valid': False, 'message': 'Invalid URL format'}
82
-
83
  response = self.session.head(url, timeout=self.timeout)
84
  response.raise_for_status()
85
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
@@ -89,15 +352,10 @@ class URLProcessor:
89
  def fetch_content(self, url: str) -> Optional[Dict]:
90
  """Universal content fetcher with special case handling"""
91
  try:
92
- # Google Drive document handling
93
  if 'drive.google.com' in url:
94
  return self._handle_google_drive(url)
95
-
96
- # Google Calendar ICS handling
97
  if 'calendar.google.com' in url and 'ical' in url:
98
  return self._handle_google_calendar(url)
99
-
100
- # Standard HTML processing
101
  return self._fetch_html_content(url)
102
  except Exception as e:
103
  logger.error(f"Content fetch failed: {e}")
@@ -110,11 +368,11 @@ class URLProcessor:
110
  if not file_id:
111
  logger.error(f"Invalid Google Drive URL: {url}")
112
  return None
113
-
114
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
115
  response = self.session.get(direct_url, timeout=self.timeout)
116
  response.raise_for_status()
117
-
118
  return {
119
  'content': response.text,
120
  'content_type': response.headers.get('Content-Type', ''),
@@ -143,16 +401,13 @@ class URLProcessor:
143
  try:
144
  response = self.session.get(url, timeout=self.timeout)
145
  response.raise_for_status()
146
-
147
  soup = BeautifulSoup(response.text, 'html.parser')
148
-
149
- # Remove unwanted elements
150
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
151
  element.decompose()
152
-
153
- # Extract main content
154
  main_content = soup.find('main') or soup.find('article') or soup.body
155
-
156
  if main_content is None:
157
  logger.warning(f"No main content found for URL: {url}")
158
  return {
@@ -160,11 +415,10 @@ class URLProcessor:
160
  'content_type': response.headers.get('Content-Type', ''),
161
  'timestamp': datetime.now().isoformat()
162
  }
163
-
164
- # Clean and structure content
165
  text_content = main_content.get_text(separator='\n', strip=True)
166
  cleaned_content = self.advanced_text_cleaning(text_content)
167
-
168
  return {
169
  'content': cleaned_content,
170
  'content_type': response.headers.get('Content-Type', ''),
@@ -175,91 +429,269 @@ class URLProcessor:
175
  return None
176
 
177
  class FileProcessor:
178
- """Class to handle file processing"""
179
-
180
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
181
  self.max_file_size = max_file_size
182
- self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
183
-
 
184
  def is_text_file(self, filepath: str) -> bool:
185
  """Check if file is a text file"""
186
  try:
187
  mime_type, _ = mimetypes.guess_type(filepath)
188
- return (mime_type and mime_type.startswith('text/')) or \
189
- (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
190
- except Exception:
 
 
191
  return False
192
 
193
  def process_file(self, file) -> List[Dict]:
194
- """Process uploaded file with enhanced error handling"""
195
  if not file:
196
- return []
197
 
198
  dataset = []
199
  try:
200
  file_size = os.path.getsize(file.name)
201
  if file_size > self.max_file_size:
202
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
203
- return []
204
 
205
  with tempfile.TemporaryDirectory() as temp_dir:
 
206
  if zipfile.is_zipfile(file.name):
207
  dataset.extend(self._process_zip_file(file.name, temp_dir))
 
 
208
  else:
209
  dataset.extend(self._process_single_file(file))
210
 
 
 
 
211
  except Exception as e:
212
  logger.error(f"Error processing file: {str(e)}")
213
- return []
214
 
215
  return dataset
216
 
217
  def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
218
- """Process ZIP file contents"""
219
  results = []
220
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
221
- zip_ref.extractall(temp_dir)
222
- for root, _, files in os.walk(temp_dir):
223
- for filename in files:
224
- filepath = os.path.join(root, filename)
225
- if self.is_text_file(filepath):
 
 
226
  try:
227
- with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
228
- content = f.read()
229
- if content.strip():
 
 
230
  results.append({
231
- "source": "file",
 
232
  "filename": filename,
 
 
233
  "content": content,
234
  "timestamp": datetime.now().isoformat()
235
  })
236
- except Exception as e:
237
- logger.error(f"Error reading file {filename}: {str(e)}")
 
 
238
  return results
239
 
240
  def _process_single_file(self, file) -> List[Dict]:
241
- """Process a single file"""
242
  try:
243
  file_stat = os.stat(file.name)
244
-
245
- # For very large files, read in chunks and summarize
246
- if file_stat.st_size > 100 * 1024 * 1024: # 100MB
247
- logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
248
-
249
- # Read first and last 1MB for extremely large files
250
- content = ""
251
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
252
- content = f.read(1 * 1024 * 1024) # First 1MB
253
- content += "\n...[Content truncated due to large file size]...\n"
254
-
255
- # Seek to the last 1MB
256
- f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
257
- content += f.read() # Last 1MB
 
 
258
  else:
259
- # Regular file processing
260
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
261
- content = f.read()
262
-
263
  return [{
264
  'source': 'file',
265
  'filename': os.path.basename(file.name),
@@ -274,284 +706,292 @@ class FileProcessor:
274
  logger.error(f"File processing error: {e}")
275
  return []
276
 
277
- def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
278
- """Clean and validate JSON data"""
279
- try:
280
- # If it's a string, try to parse it
281
- if isinstance(data, str):
282
- # Remove any existing content and extra whitespace
283
- data = data.strip()
284
- data = json.loads(data)
285
-
286
- # Convert to string and back to ensure proper JSON format
287
- cleaned = json.loads(json.dumps(data))
288
- return cleaned
289
- except json.JSONDecodeError as e:
290
- logger.error(f"JSON cleaning error: {e}")
291
- return None
292
- except Exception as e:
293
- logger.error(f"Unexpected error while cleaning JSON: {e}")
294
- return None
295
-
296
- def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
297
- """Generate QR code(s) from data"""
298
- try:
299
- output_dir = Path('output/qr_codes')
300
- output_dir.mkdir(parents=True, exist_ok=True)
301
-
302
- if combined:
303
- # Generate single QR code for all data
304
- cleaned_data = clean_json(data)
305
- if cleaned_data:
306
- qr = qrcode.QRCode(
307
- version=None,
308
- error_correction=qrcode.constants.ERROR_CORRECT_L,
309
- box_size=10,
310
- border=4,
311
- )
312
- json_str = json.dumps(cleaned_data, ensure_ascii=False)
313
- qr.add_data(json_str)
314
- qr.make(fit=True)
315
-
316
- img = qr.make_image(fill_color="black", back_color="white")
317
- output_path = output_dir / f'combined_qr_{int(time.time())}.png'
318
- img.save(str(output_path))
319
- return [str(output_path)]
320
- else:
321
- # Generate separate QR codes for each item
322
- if isinstance(data, list):
323
- paths = []
324
- for idx, item in enumerate(data):
325
- cleaned_item = clean_json(item)
326
- if cleaned_item:
327
- qr = qrcode.QRCode(
328
- version=None,
329
- error_correction=qrcode.constants.ERROR_CORRECT_L,
330
- box_size=10,
331
- border=4,
332
- )
333
- json_str = json.dumps(cleaned_item, ensure_ascii=False)
334
- qr.add_data(json_str)
335
- qr.make(fit=True)
336
-
337
- img = qr.make_image(fill_color="black", back_color="white")
338
- output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
339
- img.save(str(output_path))
340
- paths.append(str(output_path))
341
- return paths
342
- else:
343
- # Single item, not combined
344
- cleaned_item = clean_json(data)
345
- if cleaned_item:
346
  qr = qrcode.QRCode(
347
  version=None,
348
  error_correction=qrcode.constants.ERROR_CORRECT_L,
349
  box_size=10,
350
  border=4,
351
  )
352
- json_str = json.dumps(cleaned_item, ensure_ascii=False)
353
  qr.add_data(json_str)
354
  qr.make(fit=True)
355
-
356
  img = qr.make_image(fill_color="black", back_color="white")
357
- output_path = output_dir / f'single_qr_{int(time.time())}.png'
358
  img.save(str(output_path))
359
  return [str(output_path)]
360
-
361
- return []
 
 
362
  except Exception as e:
363
- logger.error(f"QR generation error: {e}")
364
- return []
365
 
 
 
366
  def create_interface():
367
  """Create a comprehensive Gradio interface with advanced features"""
368
-
369
  css = """
370
  .container { max-width: 1200px; margin: auto; }
371
  .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
372
  .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
373
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
374
  """
375
-
376
- with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
377
- gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
378
-
379
- with gr.Tab("URL Processing"):
380
- url_input = gr.Textbox(
381
- label="Enter URLs (comma or newline separated)",
382
- lines=5,
383
- placeholder="https://example1.com\nhttps://example2.com",
384
- value=""
385
- )
386
-
387
- with gr.Tab("File Input"):
388
- file_input = gr.File(
389
- label="Upload text file or ZIP archive",
390
- file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
391
- )
392
-
393
- with gr.Tab("Notepad"):
394
- text_input = gr.TextArea(
395
- label="JSON Data Input",
396
- lines=15,
397
- placeholder="Paste your JSON data here...",
398
- value=""
399
- )
400
-
401
- with gr.Row():
402
- example_btn = gr.Button("📝 Load Example JSON", variant="secondary")
403
- clear_btn = gr.Button("🗑️ Clear Input", variant="secondary")
404
-
405
- with gr.Row():
406
- combine_data = gr.Checkbox(
407
- label="Combine all data into single QR code",
408
- value=True,
409
- info="Generate one QR code for all data, or separate QR codes for each item"
410
- )
411
- process_btn = gr.Button("🔄 Process & Generate QR", variant="primary", scale=2)
412
-
413
- output_json = gr.JSON(label="Processed JSON Data")
414
- output_gallery = gr.Gallery(label="Generated QR Codes", columns=2, height=400)
415
- output_text = gr.Textbox(label="Processing Status", interactive=False)
416
-
417
- def load_example():
418
- example_json = {
419
- "type": "product_catalog",
420
- "items": [
421
- {
422
- "id": "123",
423
- "name": "Test Product",
424
- "description": "This is a test product description",
425
- "price": 29.99,
426
- "category": "electronics",
427
- "tags": ["test", "sample", "demo"]
428
- },
429
- {
430
- "id": "456",
431
- "name": "Another Product",
432
- "description": "Another test product description",
433
- "price": 49.99,
434
- "category": "accessories",
435
- "tags": ["sample", "test"]
436
- }
437
- ],
438
- "metadata": {
439
- "timestamp": datetime.now().isoformat(),
440
- "version": "1.0",
441
- "source": "example"
442
- }
443
- }
444
- return json.dumps(example_json, indent=2)
445
-
446
- def clear_input():
447
- return ""
448
-
449
- def process_all_inputs(urls, file, text, combine):
450
- """Process all input types and generate QR codes"""
451
- try:
452
- results = []
453
-
454
- # Process text input first (since it's direct JSON)
455
- if text and text.strip():
456
- try:
457
- # Try to parse as JSON
458
- json_data = json.loads(text)
459
- if isinstance(json_data, list):
460
- results.extend(json_data)
461
- else:
462
- results.append(json_data)
463
- except json.JSONDecodeError as e:
464
- return None, [], f"❌ Invalid JSON format: {str(e)}"
465
-
466
- # Process URLs if provided
467
- if urls and urls.strip():
468
- processor = URLProcessor()
469
- url_list = re.split(r'[,\n]', urls)
470
- url_list = [url.strip() for url in url_list if url.strip()]
471
-
472
- for url in url_list:
473
- validation = processor.validate_url(url)
474
- if validation.get('is_valid'):
475
- content = processor.fetch_content(url)
476
- if content:
477
- results.append({
478
- 'source': 'url',
479
- 'url': url,
480
- 'content': content,
481
- 'timestamp': datetime.now().isoformat()
482
- })
483
-
484
- # Process files if provided
485
- if file:
486
- file_processor = FileProcessor()
487
- file_results = file_processor.process_file(file)
488
- if file_results:
489
- results.extend(file_results)
490
-
491
- # Generate QR codes
492
- if results:
493
- qr_paths = generate_qr_code(results, combined=combine)
494
- if qr_paths:
495
- return (
496
- results,
497
- [str(path) for path in qr_paths],
498
- f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR code(s)!"
499
- )
500
- else:
501
- return None, [], "❌ Failed to generate QR codes. Please check the input data."
502
- else:
503
- return None, [], "⚠️ No valid content to process. Please provide some input data."
504
-
505
- except Exception as e:
506
- logger.error(f"Processing error: {e}")
507
- return None, [], f"❌ Error: {str(e)}"
508
-
509
- # Set up event handlers
510
- example_btn.click(load_example, outputs=[text_input])
511
- clear_btn.click(clear_input, outputs=[text_input])
512
- process_btn.click(
513
- process_all_inputs,
514
- inputs=[url_input, file_input, text_input, combine_data],
515
- outputs=[output_json, output_gallery, output_text]
516
- )
517
-
518
- gr.Markdown("""
519
- ### Features
520
- - **URL Processing**: Extract content from websites
521
- - **File Processing**: Handle text files and archives
522
- - **Notepad**: Direct JSON data input/manipulation
523
- - **JSON Cleaning**: Automatic JSON validation and formatting
524
- - **QR Generation**: Generate QR codes with embedded JSON data
525
- - **Flexible Output**: Choose between combined or separate QR codes
526
-
527
- ### Usage Tips
528
- 1. Use the **Notepad** tab for direct JSON input
529
- 2. Click "Load Example JSON" to see a sample format
530
- 3. Choose whether to combine all data into a single QR code
531
- 4. The generated QR codes will contain the complete JSON data
532
- """)
533
-
534
  return interface
535
 
536
  def main():
537
- # Configure system settings
538
- mimetypes.init()
539
-
540
- # Create output directories
541
- Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
542
-
543
- # Create and launch interface
544
- interface = create_interface()
545
-
546
- # Launch with proper configuration
547
- interface.launch(
548
- server_name="0.0.0.0",
549
- server_port=8000,
550
- show_error=True,
551
- share=False,
552
- inbrowser=True,
553
- debug=True
554
- )
 
 
 
 
555
 
556
  if __name__ == "__main__":
557
- main()
 
1
  import json
2
  import os
3
  import re
 
4
  import logging
5
  import mimetypes
6
+ import time
7
+ from PIL import Image
8
+ import zxing
9
+ import io
10
  import zipfile
11
  import tempfile
12
  from datetime import datetime
13
+ from typing import List, Dict, Optional, Union, Any
14
  from pathlib import Path
 
15
  import requests
16
  import validators
17
  import gradio as gr
18
  from bs4 import BeautifulSoup
19
  from fake_useragent import UserAgent
 
20
  from cleantext import clean
21
  import qrcode
22
+ import cv2  # used by decode_qr_code to read QR images from disk
 
23
  # Setup logging
24
+ import sys
25
+ import argparse
26
+ import base64
27
+ import io
+ import random  # used by fetch_content for occasional user-agent rotation
28
  logging.basicConfig(
29
  level=logging.INFO,
30
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
 
41
  class URLProcessor:
42
  def __init__(self):
43
  self.session = requests.Session()
44
+ self.timeout = 10
45
+ self.max_retries = 3
46
+ self.request_delay = 1.0
47
+ self.respect_robots = True
48
+ self.use_proxy = False
49
+ self.proxy_url = None
50
+ self.rate_limits = {} # Track rate limits per domain
51
+ self.selenium_driver = None
52
+
53
+ # Update session headers with rotating user agents
54
+ self.update_user_agent()
55
+
56
+ if self.use_proxy and self.proxy_url:
57
+ self.session.proxies = {
58
+ 'http': self.proxy_url,
59
+ 'https': self.proxy_url
60
+ }
61
+
62
+ def update_user_agent(self):
63
+ """Rotate user agents to avoid detection"""
64
+ try:
65
+ self.session.headers.update({
66
+ 'User-Agent': UserAgent().random,
67
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
68
+ 'Accept-Language': 'en-US,en;q=0.5',
69
+ 'Accept-Encoding': 'gzip, deflate, br',
70
+ 'Connection': 'keep-alive',
71
+ 'Upgrade-Insecure-Requests': '1',
72
+ 'Cache-Control': 'max-age=0'
73
+ })
74
+ except Exception as e:
75
+ logger.warning(f"Failed to update user agent: {e}")
76
+ # Fallback to a common user agent
77
+ self.session.headers.update({
78
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
79
+ })
80
+
81
+ def get_selenium_driver(self):
82
+ """Initialize Selenium WebDriver for interactive sites"""
83
+ if self.selenium_driver is not None:
84
+ return self.selenium_driver
85
+
86
+ try:
87
+ from selenium import webdriver
88
+ from selenium.webdriver.chrome.service import Service
89
+ from selenium.webdriver.chrome.options import Options
90
+ from webdriver_manager.chrome import ChromeDriverManager
91
+
92
+ options = Options()
93
+ options.add_argument("--headless")
94
+ options.add_argument("--no-sandbox")
95
+ options.add_argument("--disable-dev-shm-usage")
96
+ options.add_argument(f"user-agent={self.session.headers['User-Agent']}")
97
+ options.add_argument("--disable-notifications")
98
+ options.add_argument("--disable-popup-blocking")
99
+ options.add_argument("--disable-extensions")
100
+
101
+ service = Service(ChromeDriverManager().install())
102
+ self.selenium_driver = webdriver.Chrome(service=service, options=options)
103
+ return self.selenium_driver
104
+ except Exception as e:
105
+ logger.error(f"Failed to initialize Selenium: {e}")
106
+ return None
107
+
108
+ def handle_rate_limits(self, domain):
109
+ """Smart rate limiting based on domain"""
110
+ from urllib.parse import urlparse
111
+ import time
112
+
113
+ # Extract domain from URL
114
+ parsed_domain = urlparse(domain).netloc
115
+
116
+ # Check if we've accessed this domain recently
117
+ current_time = time.time()
118
+ if parsed_domain in self.rate_limits:
119
+ last_access, count = self.rate_limits[parsed_domain]
120
+
121
+ # Different delay strategies for different domains
122
+ if "facebook" in parsed_domain or "instagram" in parsed_domain:
123
+ min_delay = 5.0 # Longer delay for social media sites
124
+ elif "gov" in parsed_domain:
125
+ min_delay = 2.0 # Be respectful with government sites
126
+ else:
127
+ min_delay = self.request_delay
128
+
129
+ # Exponential backoff if we're making many requests
130
+ if count > 10:
131
+ min_delay *= 2
132
+
133
+ # Wait if needed
134
+ elapsed = current_time - last_access
135
+ if elapsed < min_delay:
136
+ time.sleep(min_delay - elapsed)
137
+
138
+ # Update count
139
+ self.rate_limits[parsed_domain] = (time.time(), count + 1)
140
+ else:
141
+ # First time accessing this domain
142
+ self.rate_limits[parsed_domain] = (current_time, 1)
143
+
144
+ def handle_interactive_site(self, url):
145
+ """Handle sites that require interaction to bypass blocks"""
146
+ driver = self.get_selenium_driver()
147
+ if not driver:
148
+ return None
149
+
150
+ try:
151
+ driver.get(url)
152
+
153
+ # Wait for page to load
154
+ import time
155
+ time.sleep(3)
156
+
157
+ # Handle different types of sites
158
+ if "facebook.com" in url or "instagram.com" in url:
159
+ self._handle_social_media_site(driver)
160
+ elif "google.com" in url:
161
+ self._handle_google_site(driver)
162
+
163
+ # Get the page source after interaction
164
+ page_source = driver.page_source
165
+
166
+ return {
167
+ 'content': page_source,
168
+ 'content_type': 'text/html',
169
+ 'url': url,
170
+ 'title': driver.title
171
+ }
172
+ except Exception as e:
173
+ logger.error(f"Error handling interactive site {url}: {e}")
174
+ return None
175
+
176
+ def _handle_social_media_site(self, driver):
177
+ """Handle Facebook/Instagram login walls"""
178
+ from selenium.webdriver.common.by import By
179
+ from selenium.webdriver.common.keys import Keys
180
+ from selenium.webdriver.support.ui import WebDriverWait
181
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.action_chains import ActionChains
182
+
183
+ try:
184
+ # Try to find and close login popups
185
+ close_buttons = driver.find_elements(By.XPATH, "//button[contains(@aria-label, 'Close')]")
186
+ if close_buttons:
187
+ close_buttons[0].click()
188
+ time.sleep(1)
189
+
190
+ # Press ESC key to dismiss popups
191
+ ActionChains(driver).send_keys(Keys.ESCAPE).perform()
192
+ time.sleep(1)
193
+
194
+ # Scroll down to load more content
195
+ driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
196
+ time.sleep(2)
197
+ except Exception as e:
198
+ logger.warning(f"Error handling social media site: {e}")
199
+
200
+ def _handle_google_site(self, driver):
201
+ """Handle Google authentication and consent pages"""
202
+ from selenium.webdriver.common.by import By
203
+
204
+ try:
205
+ # Look for consent buttons
206
+ consent_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Accept all')]")
207
+ if consent_buttons:
208
+ consent_buttons[0].click()
209
+ time.sleep(1)
210
+
211
+ # Look for "I agree" buttons
212
+ agree_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'I agree')]")
213
+ if agree_buttons:
214
+ agree_buttons[0].click()
215
+ time.sleep(1)
216
+ except Exception as e:
217
+ logger.warning(f"Error handling Google site: {e}")
218
+
219
+ def fetch_content(self, url: str) -> Optional[Dict]:
220
+ """Fetch content with smart handling for different sites"""
221
+ # Check if URL is allowed by robots.txt
222
+ if self.respect_robots and not self.check_robots_txt(url):
223
+ logger.warning(f"URL {url} is disallowed by robots.txt")
224
+ return None
225
+
226
+ # Apply rate limiting
227
+ self.handle_rate_limits(url)
228
+
229
+ # Rotate user agent occasionally
230
+ if random.random() < 0.3: # 30% chance to rotate
231
+ self.update_user_agent()
232
+
233
+ # Determine if site needs special handling
234
+ needs_selenium = any(domain in url.lower() for domain in [
235
+ 'facebook.com', 'instagram.com', 'linkedin.com',
236
+ 'google.com/search', 'twitter.com', 'x.com'
237
+ ])
238
+
239
+ for attempt in range(self.max_retries):
240
+ try:
241
+ if needs_selenium:
242
+ return self.handle_interactive_site(url)
243
+
244
+ # Try with cloudscraper first for sites with anti-bot measures
245
+ if any(domain in url.lower() for domain in ['cloudflare', '.gov']):
246
+ import cloudscraper
247
+ scraper = cloudscraper.create_scraper(
248
+ browser={'browser': 'chrome', 'platform': 'darwin', 'mobile': False}
249
+ )
250
+ response = scraper.get(url, timeout=self.timeout)
251
+ else:
252
+ # Standard request for most sites
253
+ response = self.session.get(url, timeout=self.timeout)
254
+
255
+ response.raise_for_status()
256
+
257
+ return {
258
+ 'content': response.text,
259
+ 'content_type': response.headers.get('Content-Type', ''),
260
+ 'url': url,
261
+ 'status_code': response.status_code
262
+ }
263
+ except Exception as e:
264
+ logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
265
+ if attempt < self.max_retries - 1:
266
+ # Exponential backoff
267
+ time.sleep(self.request_delay * (2 ** attempt))
268
+
269
+ logger.error(f"All attempts failed for {url}")
270
+ return None
271
+
272
+ def check_robots_txt(self, url: str) -> bool:
273
+ """Check if URL is allowed by robots.txt"""
274
+ if not self.respect_robots:
275
+ return True
276
+
277
+ try:
278
+ from urllib.parse import urlparse
279
+ from urllib.robotparser import RobotFileParser
280
+
281
+ parsed_url = urlparse(url)
282
+ robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
283
+
284
+ rp = RobotFileParser()
285
+ rp.set_url(robots_url)
286
+ rp.read()
287
+
288
+ return rp.can_fetch(self.session.headers['User-Agent'], url)
289
+ except Exception as e:
290
+ logger.warning(f"Error checking robots.txt: {e}")
291
+ return True
292
+
293
+ def fetch_content(self, url: str) -> Optional[Dict]:
294
+ """Fetch content with built-in rate limiting and robots.txt checking"""
295
+ if not self.check_robots_txt(url):
296
+ logger.warning(f"URL {url} is disallowed by robots.txt")
297
+ return None
298
+
299
+ time.sleep(self.request_delay) # Basic rate limiting
300
+
301
+ for attempt in range(self.max_retries):
302
+ try:
303
+ if 'drive.google.com' in url:
304
+ return self._handle_google_drive(url)
305
+ if 'calendar.google.com' in url:
306
+ return self._handle_google_calendar(url)
307
+ return self._fetch_html_content(url)
308
+ except Exception as e:
309
+ logger.error(f"Attempt {attempt + 1} failed: {e}")
310
+ if attempt < self.max_retries - 1:
311
+ time.sleep(self.request_delay * (attempt + 1))
312
+
313
+ return None
314
 
315
  def advanced_text_cleaning(self, text: str) -> str:
316
  """Robust text cleaning with version compatibility"""
 
332
  return cleaned_text
333
  except Exception as e:
334
  logger.warning(f"Text cleaning error: {e}. Using fallback method.")
335
+ text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
336
+ text = text.encode('ascii', 'ignore').decode('ascii')
337
+ text = re.sub(r'\s+', ' ', text)
338
  return text.strip()
339
 
340
  def validate_url(self, url: str) -> Dict:
 
342
  try:
343
  if not validators.url(url):
344
  return {'is_valid': False, 'message': 'Invalid URL format'}
345
+
346
  response = self.session.head(url, timeout=self.timeout)
347
  response.raise_for_status()
348
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
 
352
  def fetch_content(self, url: str) -> Optional[Dict]:
353
  """Universal content fetcher with special case handling"""
354
  try:
 
355
  if 'drive.google.com' in url:
356
  return self._handle_google_drive(url)
 
 
357
  if 'calendar.google.com' in url and 'ical' in url:
358
  return self._handle_google_calendar(url)
 
 
359
  return self._fetch_html_content(url)
360
  except Exception as e:
361
  logger.error(f"Content fetch failed: {e}")
 
368
  if not file_id:
369
  logger.error(f"Invalid Google Drive URL: {url}")
370
  return None
371
+
372
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
373
  response = self.session.get(direct_url, timeout=self.timeout)
374
  response.raise_for_status()
375
+
376
  return {
377
  'content': response.text,
378
  'content_type': response.headers.get('Content-Type', ''),
 
401
  try:
402
  response = self.session.get(url, timeout=self.timeout)
403
  response.raise_for_status()
404
+
405
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
406
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
407
  element.decompose()
408
+
 
409
  main_content = soup.find('main') or soup.find('article') or soup.body
410
+
411
  if main_content is None:
412
  logger.warning(f"No main content found for URL: {url}")
413
  return {
 
415
  'content_type': response.headers.get('Content-Type', ''),
416
  'timestamp': datetime.now().isoformat()
417
  }
418
+
 
419
  text_content = main_content.get_text(separator='\n', strip=True)
420
  cleaned_content = self.advanced_text_cleaning(text_content)
421
+
422
  return {
423
  'content': cleaned_content,
424
  'content_type': response.headers.get('Content-Type', ''),
 
429
  return None
430
 
431
  class FileProcessor:
432
+ """Class to handle file processing with enhanced capabilities"""
433
+
434
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
435
  self.max_file_size = max_file_size
436
+ self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.js', '.css', '.py', '.java', '.c', '.cpp', '.h', '.rb', '.php', '.sql', '.yaml', '.yml', '.ini', '.cfg', '.conf', '.log', '.sh', '.bat', '.ps1'}
437
+ self.supported_binary_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip', '.tar', '.gz', '.rar', '.7z', '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.wav', '.ogg'}
438
+
439
  def is_text_file(self, filepath: str) -> bool:
440
  """Check if file is a text file"""
441
  try:
442
  mime_type, _ = mimetypes.guess_type(filepath)
443
+ ext = os.path.splitext(filepath)[1].lower()
444
+
445
+ # Check by extension first
446
+ if ext in self.supported_text_extensions:
447
+ return True
448
+
449
+ # Then check by mime type
450
+ if mime_type and mime_type.startswith('text/'):
451
+ return True
452
+
453
+ # Try to read the file as text
454
+ if os.path.exists(filepath) and os.path.getsize(filepath) < 1024 * 1024: # Only try for files < 1MB
455
+ try:
456
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
457
+ sample = f.read(1024) # Read first 1KB
458
+ # Check if it's mostly printable ASCII
459
+ printable_ratio = sum(c.isprintable() for c in sample) / len(sample) if sample else 0
460
+ return printable_ratio > 0.8
461
+ except Exception:
462
+ pass
463
+
464
+ return False
465
+ except Exception as e:
466
+ logger.error(f"Error checking if file is text: {e}")
467
  return False
468
 
469
  def process_file(self, file) -> List[Dict]:
470
+ """Process uploaded file with enhanced error handling and binary support"""
471
  if not file:
472
+ return [{"error": "No file provided"}]
473
 
474
  dataset = []
475
  try:
476
  file_size = os.path.getsize(file.name)
477
  if file_size > self.max_file_size:
478
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
479
+ return [{"error": f"File size ({file_size} bytes) exceeds maximum allowed size of {self.max_file_size} bytes"}]
480
 
481
  with tempfile.TemporaryDirectory() as temp_dir:
482
+ # Check if it's an archive file
483
  if zipfile.is_zipfile(file.name):
484
  dataset.extend(self._process_zip_file(file.name, temp_dir))
485
+ elif file.name.endswith('.tar.gz') or file.name.endswith('.tgz'):
486
+ dataset.extend(self._process_tar_file(file.name, temp_dir))
487
+ elif file.name.endswith('.rar'):
488
+ dataset.extend(self._process_rar_file(file.name, temp_dir))
489
+ elif file.name.endswith('.7z'):
490
+ dataset.extend(self._process_7z_file(file.name, temp_dir))
491
+ # Check if it's a document file
492
+ elif file.name.endswith(('.doc', '.docx')):
493
+ dataset.extend(self._process_word_file(file.name))
494
+ elif file.name.endswith(('.xls', '.xlsx')):
495
+ dataset.extend(self._process_excel_file(file.name))
496
+ elif file.name.endswith(('.ppt', '.pptx')):
497
+ dataset.extend(self._process_powerpoint_file(file.name))
498
+ elif file.name.endswith('.pdf'):
499
+ dataset.extend(self._process_pdf_file(file.name))
500
+ # Check if it's an image file
501
+ elif file.name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')):
502
+ dataset.extend(self._process_image_file(file.name))
503
+ # Check if it's an audio/video file
504
+ elif file.name.endswith(('.mp3', '.wav', '.ogg', '.mp4', '.avi', '.mov', '.wmv', '.flv')):
505
+ dataset.extend(self._process_media_file(file.name))
506
+ # Default to text file processing
507
  else:
508
  dataset.extend(self._process_single_file(file))
509
 
510
+ if not dataset:
511
+ return [{"warning": "No extractable content found in the file"}]
512
+
513
  except Exception as e:
514
  logger.error(f"Error processing file: {str(e)}")
515
+ return [{"error": f"Error processing file: {str(e)}"}]
516
 
517
  return dataset
518
 
519
  def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
520
+ """Process ZIP file contents with enhanced extraction"""
521
  results = []
522
+ try:
523
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
524
+ # Get file list first
525
+ file_list = zip_ref.namelist()
526
+ total_files = len(file_list)
527
+
528
+ # Extract all files
529
+ zip_ref.extractall(temp_dir)
530
+
531
+ # Process each file
532
+ processed_count = 0
533
+ for root, dirs, files in os.walk(temp_dir):
534
+ for filename in files:
535
+ filepath = os.path.join(root, filename)
536
+ rel_path = os.path.relpath(filepath, temp_dir)
537
+
538
+ # Get file info from zip
539
  try:
540
+ zip_info = zip_ref.getinfo(rel_path.replace('\\', '/'))
541
+ file_size = zip_info.file_size
542
+ compressed_size = zip_info.compress_size
543
+ compression_ratio = (1 - compressed_size / file_size) * 100 if file_size > 0 else 0
544
+ except Exception:
545
+ file_size = os.path.getsize(filepath)
546
+ compressed_size = None
547
+ compression_ratio = None
548
+
549
+ # Process based on file type
550
+ if self.is_text_file(filepath):
551
+ try:
552
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
553
+ content = f.read()
554
+
555
+ results.append({
556
+ "source": "zip",
557
+ "archive": os.path.basename(zip_path),
558
+ "filename": filename,
559
+ "path": rel_path,
560
+ "size": file_size,
561
+ "compressed_size": compressed_size,
562
+ "compression_ratio": f"{compression_ratio:.2f}%" if compression_ratio is not None else None,
563
+ "content": content,
564
+ "timestamp": datetime.now().isoformat()
565
+ })
566
+ processed_count += 1
567
+ except Exception as e:
568
+ logger.error(f"Error reading file {filename}: {str(e)}")
569
+ else:
570
+ # For binary files, just record metadata
571
+ mime_type, _ = mimetypes.guess_type(filepath)
572
+ results.append({
573
+ "source": "zip",
574
+ "archive": os.path.basename(zip_path),
575
+ "filename": filename,
576
+ "path": rel_path,
577
+ "size": file_size,
578
+ "compressed_size": compressed_size,
579
+ "compression_ratio": f"{compression_ratio:.2f}%" if compression_ratio is not None else None,
580
+ "mime_type": mime_type,
581
+ "content": f"[Binary file: {mime_type or 'unknown type'}]",
582
+ "timestamp": datetime.now().isoformat()
583
+ })
584
+ processed_count += 1
585
+
586
+ # Add summary
587
+ results.append({
588
+ "source": "zip_summary",
589
+ "archive": os.path.basename(zip_path),
590
+ "total_files": total_files,
591
+ "processed_files": processed_count,
592
+ "timestamp": datetime.now().isoformat()
593
+ })
594
+
595
+ except Exception as e:
596
+ logger.error(f"Error processing ZIP file: {str(e)}")
597
+ results.append({"error": f"Error processing ZIP file: {str(e)}"})
598
+
599
+ return results
600
+
601
+ def _process_tar_file(self, tar_path: str, temp_dir: str) -> List[Dict]:
602
+ """Process TAR/GZ file contents"""
603
+ results = []
604
+ try:
605
+ import tarfile
606
+ with tarfile.open(tar_path, 'r:*') as tar:
607
+ # Get file list
608
+ file_list = tar.getnames()
609
+ total_files = len(file_list)
610
+
611
+ # Extract all files
612
+ tar.extractall(temp_dir)
613
+
614
+ # Process each file
615
+ processed_count = 0
616
+ for root, dirs, files in os.walk(temp_dir):
617
+ for filename in files:
618
+ filepath = os.path.join(root, filename)
619
+ rel_path = os.path.relpath(filepath, temp_dir)
620
+
621
+ # Process based on file type
622
+ if self.is_text_file(filepath):
623
+ try:
624
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
625
+ content = f.read()
626
+
627
  results.append({
628
+ "source": "tar",
629
+ "archive": os.path.basename(tar_path),
630
  "filename": filename,
631
+ "path": rel_path,
632
+ "size": os.path.getsize(filepath),
633
  "content": content,
634
  "timestamp": datetime.now().isoformat()
635
  })
636
+ processed_count += 1
637
+ except Exception as e:
638
+ logger.error(f"Error reading file {filename}: {str(e)}")
639
+ else:
640
+ # For binary files, just record metadata
641
+ mime_type, _ = mimetypes.guess_type(filepath)
642
+ results.append({
643
+ "source": "tar",
644
+ "archive": os.path.basename(tar_path),
645
+ "filename": filename,
646
+ "path": rel_path,
647
+ "size": os.path.getsize(filepath),
648
+ "mime_type": mime_type,
649
+ "content": f"[Binary file: {mime_type or 'unknown type'}]",
650
+ "timestamp": datetime.now().isoformat()
651
+ })
652
+ processed_count += 1
653
+
654
+ # Add summary
655
+ results.append({
656
+ "source": "tar_summary",
657
+ "archive": os.path.basename(tar_path),
658
+ "total_files": total_files,
659
+ "processed_files": processed_count,
660
+ "timestamp": datetime.now().isoformat()
661
+ })
662
+
663
+ except Exception as e:
664
+ logger.error(f"Error processing TAR file: {str(e)}")
665
+ results.append({"error": f"Error processing TAR file: {str(e)}"})
666
+
667
  return results
668
 
669
  def _process_single_file(self, file) -> List[Dict]:
670
+ """Process a single file with enhanced metadata extraction"""
671
  try:
672
  file_stat = os.stat(file.name)
673
+ file_path = file.name
674
+ filename = os.path.basename(file_path)
675
+ mime_type, _ = mimetypes.guess_type(file_path)
676
+
677
+ # For text files
678
+ if self.is_text_file(file_path):
679
+ if file_stat.st_size > 100 * 1024 * 1024: # 100MB
680
+ logger.info(f"Processing large file: {file_path} ({file_stat.st_size} bytes)")
681
+
682
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
683
+ content = f.read(1 * 1024 * 1024) # First 1MB
684
+ content += "\n...[Content truncated due to large file size]...\n"
685
+
686
+ f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
687
+ content += f.read() # Last 1MB
688
+ else:
689
+ with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
690
+ content = f.read()
691
  else:
692
+ # For binary files, just record metadata
693
+ content = f"[Binary file: {mime_type or 'unknown type'}]"
694
+
 
695
  return [{
696
  'source': 'file',
697
  'filename': os.path.basename(file.name),
 
706
  logger.error(f"File processing error: {e}")
707
  return []
708
 
709
+ def clean_json(self, data: Union[str, Dict]) -> Optional[Dict]:
710
+ """Clean and validate JSON data"""
711
+ try:
712
+ if isinstance(data, str):
713
+ data = data.strip()
714
+ data = json.loads(data)
715
+
716
+ cleaned = json.loads(json.dumps(data))
717
+ return cleaned
718
+ except json.JSONDecodeError as e:
719
+ logger.error(f"JSON cleaning error: {e}")
720
+ return None
721
+ except Exception as e:
722
+ logger.error(f"Unexpected error while cleaning JSON: {e}")
723
+ return None
724
+
725
+ def generate_qr_code(self, data: Union[str, Dict], combined: bool = True) -> List[str]:
726
+ """Generate QR code(s) from data"""
727
+ try:
728
+ output_dir = Path('output/qr_codes')
729
+ output_dir.mkdir(parents=True, exist_ok=True)
730
+
731
+ if combined:
732
+ cleaned_data = self.clean_json(data)
733
+ if cleaned_data:
 
 
734
  qr = qrcode.QRCode(
735
  version=None,
736
  error_correction=qrcode.constants.ERROR_CORRECT_L,
737
  box_size=10,
738
  border=4,
739
  )
740
+ json_str = json.dumps(cleaned_data, ensure_ascii=False)
741
  qr.add_data(json_str)
742
  qr.make(fit=True)
743
+
744
  img = qr.make_image(fill_color="black", back_color="white")
745
+ output_path = output_dir / f'combined_qr_{int(time.time())}.png'
746
  img.save(str(output_path))
747
  return [str(output_path)]
748
+ else:
749
+ if isinstance(data, list):
750
+ paths = []
751
+ for idx, item in enumerate(data):
752
+ cleaned_item = self.clean_json(item)
753
+ if cleaned_item:
754
+ qr = qrcode.QRCode(
755
+ version=None,
756
+ error_correction=qrcode.constants.ERROR_CORRECT_L,
757
+ box_size=10,
758
+ border=4,
759
+ )
760
+ json_str = json.dumps(cleaned_item, ensure_ascii=False)
761
+ qr.add_data(json_str)
762
+ qr.make(fit=True)
763
+
764
+ img = qr.make_image(fill_color="black", back_color="white")
765
+ output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
766
+ img.save(str(output_path))
767
+ paths.append(str(output_path))
768
+ return paths
769
+ else:
770
+ cleaned_item = self.clean_json(data)
771
+ if cleaned_item:
772
+ qr = qrcode.QRCode(
773
+ version=None,
774
+ error_correction=qrcode.constants.ERROR_CORRECT_L,
775
+ box_size=10,
776
+ border=4,
777
+ )
778
+ json_str = json.dumps(cleaned_item, ensure_ascii=False)
779
+ qr.add_data(json_str)
780
+ qr.make(fit=True)
781
+
782
+ img = qr.make_image(fill_color="black", back_color="white")
783
+ output_path = output_dir / f'single_qr_{int(time.time())}.png'
784
+ img.save(str(output_path))
785
+ return [str(output_path)]
786
+
787
+ return []
788
+ except Exception as e:
789
+ logger.error(f"QR generation error: {e}")
790
+ return []
791
+ def decode_qr_code(image_path: str) -> Optional[str]:
792
+ """Decode QR code from an image file using OpenCV with improved binary handling"""
793
+ try:
794
+ # Read image using OpenCV
795
+ img = cv2.imread(image_path)
796
+ if img is None:
797
+ logger.error(f"Failed to read image: {image_path}")
798
+ return None
799
+
800
+ # Convert to grayscale
801
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
802
+
803
+ # Initialize QRCode detector
804
+ detector = cv2.QRCodeDetector()
805
+
806
+ # Detect and decode
807
+ data, vertices, _ = detector.detectAndDecode(gray)
808
+
809
+ if vertices is not None and data:
810
+ # Check if this might be binary data (like a PDF)
811
+ if data.startswith("%PDF") or not all(ord(c) < 128 for c in data):
812
+ # This is likely binary data, encode as base64
813
+ try:
814
+ # If it's already a string representation, convert to bytes first
815
+ if isinstance(data, str):
816
+ data_bytes = data.encode('latin-1') # Use latin-1 to preserve byte values
817
+ else:
818
+ data_bytes = data
819
+
820
+ # Encode as base64
821
+ base64_data = base64.b64encode(data_bytes).decode('ascii')
822
+ return f"base64:{base64_data}"
823
+ except Exception as e:
824
+ logger.error(f"Error encoding binary data: {e}")
825
+
826
+ return data
827
+
828
+ logger.warning("No QR code found in image")
829
+ return None
830
  except Exception as e:
831
+ logger.error(f"QR decoding error: {e}")
832
+ return None
833
 
834
+ # Also update the datachat_interface function to handle base64 data
835
+ def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
836
+ """Interface for DataChat functionality with binary data support"""
837
+ data = None
838
+ if data_source == "JSON Input":
839
+ data = json_input
840
+ elif data_source == "QR Code":
841
+ try:
842
+ decoded_data = decode_qr_code(qr_image)
843
+
844
+ # Handle base64 encoded data
845
+ if decoded_data and decoded_data.startswith("base64:"):
846
+ base64_part = decoded_data[7:] # Remove the "base64:" prefix
847
+ try:
848
+ # For PDFs and other binary data, provide info about the content
849
+ binary_data = base64.b64decode(base64_part)
850
+ if binary_data.startswith(b"%PDF"):
851
+ data = "The QR code contains a PDF document. Binary data cannot be processed directly."
852
+ else:
853
+ # Try to decode as text as a fallback
854
+ data = binary_data.decode('utf-8', errors='replace')
855
+ except Exception as e:
856
+ logger.error(f"Error processing base64 data: {e}")
857
+ data = "The QR code contains binary data that cannot be processed directly."
858
+ else:
859
+ data = decoded_data
860
+
861
+ if not data:
862
+ return "No QR code found in the provided image."
863
+ except Exception as e:
864
+ return f"Invalid QR code data provided: {e}"
865
+ else:
866
+ return "No valid data source selected."
867
+
868
+ if mode == "Trained with Data":
869
+ return datachat_trained(data, query)
870
+ elif mode == "Chat about Data":
871
+ return datachat_simple(data, query)
872
+ else:
873
+ return "Invalid mode selected."
874
+
875
+ # Replace the create_interface function with this version
876
  def create_interface():
877
  """Create a comprehensive Gradio interface with advanced features"""
 
878
  css = """
879
  .container { max-width: 1200px; margin: auto; }
880
  .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
881
  .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
882
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
883
  """
884
+
885
+ # Use Interface instead of Blocks
886
+ interface = gr.Interface(
887
+ fn=datachat_interface,
888
+ inputs=[
889
+ gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
890
+ gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
891
+ gr.Textbox(lines=8, label="JSON Data"),
892
+ gr.Image(label="QR Code Image", type="filepath"),
893
+ gr.Textbox(label="Query")
894
+ ],
895
+ outputs=gr.Textbox(label="Response"),
896
+ title="Advanced Data Processor & QR Code Generator",
897
+ description="# 🌐 Advanced Data Processing & QR Code Generator",
898
+ css=css
899
+ )
900
+
 
 
901
  return interface
902
 
903
  def main():
904
+ """Main entry point for the application"""
905
+ parser = argparse.ArgumentParser(description='URL and File Processor')
906
+ parser.add_argument('--mode', choices=['web', 'cli'], default='web', help='Run mode (web interface or CLI)')
907
+ parser.add_argument('--url', help='URL to process (CLI mode)')
908
+ parser.add_argument('--file', help='File to process (CLI mode)')
909
+ parser.add_argument('--output', help='Output directory for results (CLI mode)')
910
+ parser.add_argument('--share', action='store_true', help='Share the web interface publicly (web mode)')
911
+ parser.add_argument('--check-deps', action='store_true', help='Check dependencies and install missing ones')
912
+
913
+ args = parser.parse_args()
914
+
915
+ # Check dependencies if requested
916
+ if args.check_deps:
917
+ from utils import check_dependencies, install_missing_dependencies
918
+
919
+ logger.info("Checking dependencies...")
920
+ deps = check_dependencies()
921
+
922
+ missing = [pkg for pkg, installed in deps.items() if not installed]
923
+ if missing:
924
+ logger.info(f"Missing dependencies: {', '.join(missing)}")
925
+ if input("Install missing dependencies? (y/n): ").lower() == 'y':
926
+ install_missing_dependencies(missing)
927
+ else:
928
+ logger.warning("Some features may not work without required dependencies.")
929
+ else:
930
+ logger.info("All dependencies are installed.")
931
+
932
+ # Run in web mode
933
+ if args.mode == 'web':
934
+ try:
935
+ import gradio
936
+ except ImportError:
937
+ logger.error("Gradio is required for web mode. Install with 'pip install gradio'")
938
+ sys.exit(1)
939
+
940
+ from interface import Interface
941
+
942
+ logger.info("Starting web interface...")
943
+ interface = Interface()
944
+ interface.launch(share=args.share)
945
+
946
+ # Run in CLI mode
947
+ elif args.mode == 'cli':
948
+ if not args.url and not args.file:
949
+ logger.error("In CLI mode, you must provide either --url or --file")
950
+ sys.exit(1)
951
+
952
+ results = []
953
+
954
+ # Process URL if provided
955
+ if args.url:
956
+ from url_processor import URLProcessor
957
+
958
+ logger.info(f"Processing URL: {args.url}")
959
+ url_processor = URLProcessor()
960
+ url_results = url_processor.process_urls([args.url])
961
+ results.extend(url_results)
962
+
963
+ # Process file if provided
964
+ if args.file:
965
+ from file_processor import FileProcessor
966
+
967
+ if not os.path.exists(args.file):
968
+ logger.error(f"File not found: {args.file}")
969
+ sys.exit(1)
970
+
971
+ logger.info(f"Processing file: {args.file}")
972
+ file_processor = FileProcessor()
973
+
974
+ # Create a file-like object with a name attribute
975
+ class FileObj:
976
+ def __init__(self, path):
977
+ self.name = path
978
+
979
+ file_results = file_processor.process_file(FileObj(args.file))
980
+ results.extend(file_results)
981
+
982
+ # Save results
983
+ if results:
984
+ from utils import save_results
985
+
986
+ output_dir = args.output or os.getcwd()
987
+ filepath = save_results(results, output_dir)
988
+
989
+ if filepath:
990
+ logger.info(f"Results saved to: {filepath}")
991
+ else:
992
+ logger.error("Failed to save results")
993
+ else:
994
+ logger.warning("No results to save")
995
 
996
  if __name__ == "__main__":
997
+ main()
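
For reference, the new pieces are meant to round-trip: FileProcessor.generate_qr_code writes JSON into a QR image under output/qr_codes/, and decode_qr_code reads it back (returning a "base64:"-prefixed string for binary payloads). A minimal usage sketch follows, assuming the qrcode and opencv-python packages are installed and that decode_qr_code resolves as the module-level helper that datachat_interface calls; it is illustrative only and not part of this commit.

# Illustrative sketch only - not part of the committed app2.py.
fp = FileProcessor()
payload = {"id": "123", "name": "Test Product", "price": 29.99}

# Encode the payload into a single QR image under output/qr_codes/.
qr_paths = fp.generate_qr_code(payload, combined=True)

# Read it back; binary payloads come back with a "base64:" prefix.
if qr_paths:
    print(decode_qr_code(qr_paths[0]))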
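
Note that datachat_interface calls datachat_trained and datachat_simple, neither of which is defined in this diff. A minimal sketch of what such helpers might look like, assuming a simple keyword lookup over the decoded payload rather than any particular model; the names match the calls above but the bodies are hypothetical.

# Hypothetical sketch - the real implementations are not part of this commit.
import json

def _as_text(data: str) -> str:
    """Render the payload as readable text, pretty-printing JSON when possible."""
    try:
        return json.dumps(json.loads(data), indent=2)
    except (json.JSONDecodeError, TypeError):
        return str(data)

def datachat_simple(data: str, query: str) -> str:
    """Answer by returning the lines of the payload that mention the query terms."""
    text = _as_text(data)
    terms = [t.lower() for t in query.split() if t.strip()]
    hits = [line for line in text.splitlines() if any(t in line.lower() for t in terms)]
    return "\n".join(hits) if hits else "No matching content found in the data."

def datachat_trained(data: str, query: str) -> str:
    """Same lookup, phrased as if the data had been loaded ahead of time."""
    return "Based on the provided data:\n" + datachat_simple(data, query)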