Update app2.py

app2.py (CHANGED)
@@ -7,8 +7,11 @@ import mimetypes
  import zipfile
  import tempfile
  import chardet
  from datetime import datetime
- from typing import List, Dict, Optional, Union, Tuple
  from pathlib import Path
  from urllib.parse import urlparse, urljoin
  import requests
@@ -25,6 +28,38 @@ import tarfile
  import gzip
  import math

  # Setup enhanced logging with more detailed formatting
  logging.basicConfig(
      level=logging.INFO,
@@ -43,7 +78,7 @@ for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
      directory.mkdir(parents=True, exist_ok=True)

  class EnhancedURLProcessor:
-     """Advanced URL processing with
      def __init__(self):
          self.session = requests.Session()
          self.timeout = 15  # Extended timeout for larger content
@@ -53,11 +88,11 @@ class EnhancedURLProcessor:
          # Enhanced headers for better site compatibility
          self.session.headers.update({
              'User-Agent': self.user_agent.random,
-             'Accept': '*/*',
              'Accept-Language': 'en-US,en;q=0.9',
              'Accept-Encoding': 'gzip, deflate, br',
              'Connection': 'keep-alive',
-             'Upgrade-Insecure-Requests': '1',
              'Sec-Fetch-Dest': 'document',
              'Sec-Fetch-Mode': 'navigate',
              'Sec-Fetch-Site': 'none',
@@ -77,15 +112,18 @@ class EnhancedURLProcessor:
          try:
              head_response = self.session.head(url, timeout=5)
              head_response.raise_for_status()
          except requests.exceptions.RequestException:
-
              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()

          return {
              'is_valid': True,
              'message': 'URL is valid and accessible',
              'details': {
                  'content_type': head_response.headers.get('Content-Type', 'unknown'),
                  'server': head_response.headers.get('Server', 'unknown'),
                  'size': head_response.headers.get('Content-Length', 'unknown')
@@ -104,23 +142,38 @@ class EnhancedURLProcessor:

              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()

              # Detect encoding
-             if response.encoding is None:
-
              else:
                  encoding = response.encoding
              # Decode content with fallback
              try:
                  raw_content = response.content.decode(encoding, errors='replace')
              except (UnicodeDecodeError, LookupError):
-

              # Extract metadata
              metadata = {
-                 '
                  'timestamp': datetime.now().isoformat(),
-                 '
                  'content_type': response.headers.get('Content-Type', ''),
                  'content_length': len(response.content),
                  'headers': dict(response.headers),
@@ -128,271 +181,636 @@ class EnhancedURLProcessor:
              }

              # Process based on content type
-
-
-                 processed_content = self._process_html_content(raw_content, url)
-             else:
-                 processed_content = raw_content
              return {
-                 '
                  'raw_content': raw_content,
-                 'metadata': metadata
              }
          except requests.exceptions.RequestException as e:
              if retry_count < self.max_retries - 1:
                  logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
                  time.sleep(2 ** retry_count)  # Exponential backoff
                  return self.fetch_content(url, retry_count + 1)
-             logger.error(f"Failed to fetch content after {self.max_retries} attempts: {e}")
-             return
          except Exception as e:
-             logger.error(f"Unexpected error
-

-
-
          try:
              soup = BeautifulSoup(content, 'html.parser')

-             #
-
-
-
-
-
-
-
-
              text_parts = []
-
-
-
          except Exception as e:
-             logger.error(f"HTML processing error: {e}")
-

  class EnhancedFileProcessor:
-     """Advanced file processing with
      def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
          self.max_file_size = max_file_size
          self.supported_extensions = {
              '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
              '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
              '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
-             '.pdf', '.doc', '.docx', '.rtf', '.odt'
          }

      def process_file(self, file) -> List[Dict]:
          """Process uploaded file with enhanced error handling and complete extraction"""
-         if not file:
              return []

          dataset = []
          try:
-             file_size =
              if file_size > self.max_file_size:
-                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                 return [

              with tempfile.TemporaryDirectory() as temp_dir:
                  temp_dir_path = Path(temp_dir)

-                 #
-                 if
-                     dataset.extend(self._process_archive(
-                 elif
-
                  else:
-                     logger.warning(f"Unsupported file type: {

          except Exception as e:
-             logger.error(f"Error processing file: {str(e)}")
-
          return dataset

-     def _is_archive(self, filepath: str) -> bool:
          """Check if file is an archive"""
-
-
-

-     def _process_single_file(self, file) -> List[Dict]:
-         """Process a single file with enhanced character extraction and JSON handling"""
          try:
-
-
-
-
-
-
-             # Process file in chunks for large files
-             chunk_size = 10 * 1024 * 1024  # 10MB chunks
-             with open(file.name, 'rb') as f:
-                 while True:
-                     chunk = f.read(chunk_size)
-                     if not chunk:
-                         break
-
-                     # Detect encoding for each chunk
-                     encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
-                     try:
-                         decoded_chunk = chunk.decode(encoding, errors='replace')
-                         content_parts.append(decoded_chunk)
-                     except (UnicodeDecodeError, LookupError):
-                         decoded_chunk = chunk.decode('utf-8', errors='replace')
-                         content_parts.append(decoded_chunk)

-             #
-

-             # Check if the content is valid JSON regardless of file extension
-             try:
-                 if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
-                     # It's a JSON file by type or extension
-                     json_data = json.loads(complete_content)
-                     return [{
-                         'source': 'json_file',
-                         'filename': os.path.basename(file.name),
-                         'file_size': file_size,
-                         'mime_type': 'application/json',
-                         'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                         'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                         'content': json_data,  # Store the parsed JSON object
-                         'raw_content': complete_content,  # Store the original JSON string
-                         'timestamp': datetime.now().isoformat()
-                     }]
-                 else:
-                     # Try to parse as JSON anyway
-                     try:
-                         json_data = json.loads(complete_content)
-                         # If we get here, it's valid JSON despite the extension
-                         return [{
-                             'source': 'json_content',
-                             'filename': os.path.basename(file.name),
-                             'file_size': file_size,
-                             'mime_type': 'application/json',
-                             'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                             'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                             'content': json_data,  # Store the parsed JSON object
-                             'raw_content': complete_content,  # Store the original JSON string
-                             'timestamp': datetime.now().isoformat()
-                         }]
-                     except json.JSONDecodeError:
-                         logger.warning(f"File {file.name} is not valid JSON.")
-             except Exception as e:
-                 logger.error(f"Error during JSON processing: {e}")

-             return [{
-                 'source': 'file',
-                 'filename': os.path.basename(file.name),
-                 'file_size': file_size,
-                 'mime_type': mimetypes.guess_type(file.name)[0],
-                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                 'content': complete_content,
-                 'timestamp': datetime.now().isoformat()
-             }]
          except Exception as e:
-
-

-
          """Process an archive file with enhanced extraction"""
          dataset = []
          try:
-
-
-
-
-
-
-
-
-
-
-
-
              try:
-
                  for member in tar_ref.getmembers():
                      if member.isfile():
-
-
-
-
-
              except tarfile.TarError as e:
-                 logger.error(f"Error processing TAR archive: {e}")
-
-             elif
-
-
-
-
-
-
-
-
-
-
-

          except Exception as e:
-             logger.error(f"
          return dataset

      def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
          """Enhanced data chunking with sequence metadata"""
          try:
              # Convert data to JSON string
-
              total_length = len(json_str)

              # Calculate overhead for metadata
              metadata_template = {
-                 "
-                 "
-                 "
-                 "
-                 "data": ""
              }
-             overhead

              # Calculate effective chunk size
-             effective_chunk_size = max_size -

              if total_length <= effective_chunk_size:
                  # Data fits in one chunk
                  chunk = {
-                     "
-                     "
-                     "
-                     "
-                     "data":
                  }
                  return [chunk]

              # Calculate number of chunks needed
              num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
-             chunk_size

              chunks = []
              for i in range(num_chunks):
-
-
-

                  chunk = {
-                     "
-                     "
-                     "
-                     "
-                     "data":
                  }
                  chunks.append(chunk)

              return chunks
          except Exception as e:
              logger.error(f"Error chunking data: {e}")
              return []
@@ -407,38 +825,51 @@ def generate_stylish_qr(data: Union[str, Dict],
      try:
          qr = qrcode.QRCode(
              version=None,
-             error_correction=qrcode.constants.ERROR_CORRECT_M,
              box_size=size,
              border=border
          )

          # Add data to QR code
          if isinstance(data, dict):
-
          else:
-             qr.add_data(data)

          qr.make(fit=True)

          # Create QR code image with custom colors
          qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)

-         # Convert to RGBA for transparency support
          qr_image = qr_image.convert('RGBA')

-         # Add
-
-
-
-
-

-         # Combine images
-         final_image = Image.alpha_composite(qr_image, gradient)

          # Save the image
          output_path = QR_CODES_DIR / filename
-         final_image.save(output_path, quality=

          return str(output_path)
      except Exception as e:
@@ -447,55 +878,68 @@ def generate_stylish_qr(data: Union[str, Dict],

  def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
      """Generate QR codes with enhanced visual appeal and metadata"""
      try:
-         file_processor = EnhancedFileProcessor()
          paths = []

          if combined:
              # Process combined data
-             chunks = file_processor.chunk_data(data)
              for i, chunk in enumerate(chunks):
                  filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
                  qr_path = generate_stylish_qr(
-                     data=chunk,
                      filename=filename,
                      fill_color="#1a365d",  # Deep blue
                      back_color="#ffffff"
                  )
                  if qr_path:
                      paths.append(qr_path)
          else:
-             # Process individual items
-             if
                  for idx, item in enumerate(data):
-                     chunks = file_processor.chunk_data(item)
                      for chunk_idx, chunk in enumerate(chunks):
                          filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
                          qr_path = generate_stylish_qr(
-                             data=chunk,
                              filename=filename,
                              fill_color="#1a365d",  # Deep blue
                              back_color="#ffffff"
                          )
                          if qr_path:
                              paths.append(qr_path)
              else:
-
-
-
-
-
-                 filename=filename,
-                 fill_color="#1a365d",  # Deep blue
-                 back_color="#ffffff"
-             )
-             if qr_path:
-                 paths.append(qr_path)
-         return paths
      except Exception as e:
          logger.error(f"QR code generation error: {e}")
          return []

  def create_modern_interface():
      """Create a modern and visually appealing Gradio interface"""
@@ -599,7 +1043,6 @@ def create_modern_interface():
      interface.head += """
      <script>
      let enabledStates = [];
-
      function updateEnabledStates(checkbox) {
          const index = parseInt(checkbox.dataset.index);
          if (checkbox.checked) {
@@ -623,7 +1066,6 @@ def create_modern_interface():
      qr_code_paths = gr.State([])
      gr.Markdown("""
      # π Advanced Data Processing & QR Code Generator
-
      Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
      """)
      with gr.Tab("π URL Processing"):
@@ -707,24 +1149,30 @@ def create_modern_interface():
              return json.dumps(example, indent=2)

          def clear_input():
-             return ""

          def update_viewport(paths, enabled_states):
              if not paths:
                  return "<p>No QR codes generated yet.</p>"

              num_qr_codes = len(paths)
-             cols = math.ceil(math.sqrt(num_qr_codes))
              rows = math.ceil(num_qr_codes / cols)

-             viewport_html = '<div class="viewport-container" style="grid-template-columns: repeat({}, 1fr);">'.format(cols)

              for i, path in enumerate(paths):
                  is_enabled = i in enabled_states
                  border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
                  viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
-                 viewport_html += f'<img src="{path}" style="{border}" alt="QR Code {i+1}">'
-                 viewport_html += f'<input type="checkbox"
                  viewport_html += '</div>'
              viewport_html += '</div>'
@@ -732,21 +1180,30 @@ def create_modern_interface():

          def process_inputs(urls, files, text, combine):
              """Process all inputs and generate QR codes"""
-
-
-
-

              # Process JSON input
              if text and text.strip():
                  try:
                      json_data = json.loads(text)
-
-
-
-
                  except json.JSONDecodeError as e:
-

              # Process URLs
              if urls and urls.strip():
|
@@ -755,79 +1212,122 @@ def create_modern_interface():
|
|
755 |
for url in url_list:
|
756 |
validation = url_processor.validate_url(url)
|
757 |
if validation['is_valid']:
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
|
|
|
|
|
|
766 |
|
767 |
# Process files
|
768 |
if files:
|
769 |
for file in files:
|
|
|
770 |
file_results = file_processor.process_file(file)
|
771 |
if file_results:
|
772 |
-
|
|
|
|
|
|
|
773 |
|
774 |
# Generate QR codes
|
|
|
|
|
|
|
775 |
if results:
|
|
|
776 |
qr_paths = generate_qr_codes(results, combine)
|
|
|
|
|
777 |
if qr_paths:
|
778 |
-
|
779 |
-
results,
|
780 |
-
[str(path) for path in qr_paths],
|
781 |
-
f"β
Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
|
782 |
-
)
|
783 |
else:
|
784 |
-
|
|
|
785 |
else:
|
786 |
-
|
|
|
|
|
787 |
except Exception as e:
|
788 |
-
logger.error(f"
|
789 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
790 |
|
791 |
-
def on_qr_generation(
|
792 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
793 |
|
794 |
process_btn.click(
|
795 |
process_inputs,
|
796 |
inputs=[url_input, file_input, text_input, combine_data],
|
797 |
outputs=[output_json, output_gallery, output_text]
|
798 |
-
).then(
|
|
|
|
|
|
|
|
|
799 |
|
|
|
800 |
viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
|
801 |
|
802 |
# Add helpful documentation
|
803 |
gr.Markdown("""
|
804 |
### π Features
|
805 |
-
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
|
827 |
-
|
828 |
-
|
829 |
-
|
830 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
831 |
return interface
|
832 |
|
833 |
def main():
|
@@ -842,13 +1342,15 @@ def main():
      # Launch with configuration
      interface.launch(
          share=False,
-         debug=False,
          show_error=True,
          show_api=False
      )
  except Exception as e:
      logger.error(f"Application startup error: {e}")
-

  if __name__ == "__main__":
      main()
  import zipfile
  import tempfile
  import chardet
+ import io  # Needed for processing CSV from string
+ import csv  # Needed for CSV
+ import xml.etree.ElementTree as ET  # Needed for XML
  from datetime import datetime
+ from typing import List, Dict, Optional, Union, Tuple, Any  # Added Any for extracted_data
  from pathlib import Path
  from urllib.parse import urlparse, urljoin
  import requests
  import gzip
  import math

+ # Conditional imports for document processing
+ try:
+     from PyPDF2 import PdfReader
+     PDF_SUPPORT = True
+ except ImportError:
+     PDF_SUPPORT = False
+     logger.warning("PyPDF2 not installed. PDF file processing will be limited.")
+
+ try:
+     from docx import Document
+     DOCX_SUPPORT = True
+ except ImportError:
+     DOCX_SUPPORT = False
+     logger.warning("python-docx not installed. DOCX file processing will be limited.")
+
+ try:
+     from pyth.plugins.rtf15.reader import Rtf15Reader
+     from pyth.plugins.plaintext.writer import PlaintextWriter
+     RTF_SUPPORT = True
+ except ImportError:
+     RTF_SUPPORT = False
+     logger.warning("pyth not installed. RTF file processing will be limited.")
+
+ try:
+     from odf.opendocument import OpenDocumentText
+     from odf import text as odftext
+     ODT_SUPPORT = True
+ except ImportError:
+     ODT_SUPPORT = False
+     logger.warning("odfpy not installed. ODT file processing will be limited.")
+
+
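The conditional imports above let the app degrade gracefully when an optional parser is missing; note, though, that they appear to call logger before the logging setup that follows, so a missing dependency at import time could raise a NameError unless the block is moved below the logging configuration or a module-level logging.getLogger(__name__) is created first. A minimal sketch of the optional packages those imports assume (names taken from the warnings above; version pins are an open choice):

# requirements-optional.txt (sketch; only needed for the corresponding formats)
PyPDF2        # PDF text extraction (PdfReader)
python-docx   # DOCX text extraction (docx.Document)
pyth          # RTF text extraction (Rtf15Reader, PlaintextWriter)
odfpy         # ODT text extraction (odf.opendocument, odf.text)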
  # Setup enhanced logging with more detailed formatting
  logging.basicConfig(
      level=logging.INFO,
      directory.mkdir(parents=True, exist_ok=True)

  class EnhancedURLProcessor:
+     """Advanced URL processing with enhanced content extraction"""
      def __init__(self):
          self.session = requests.Session()
          self.timeout = 15  # Extended timeout for larger content

          # Enhanced headers for better site compatibility
          self.session.headers.update({
              'User-Agent': self.user_agent.random,
+             'Accept': 'text/html, application/json, application/xml, text/plain, */*',  # Request common types
              'Accept-Language': 'en-US,en;q=0.9',
              'Accept-Encoding': 'gzip, deflate, br',
              'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1',  # May be ignored for non-HTML
              'Sec-Fetch-Dest': 'document',
              'Sec-Fetch-Mode': 'navigate',
              'Sec-Fetch-Site': 'none',
          try:
              head_response = self.session.head(url, timeout=5)
              head_response.raise_for_status()
+             final_url = head_response.url  # Capture potential redirects
          except requests.exceptions.RequestException:
+             # If HEAD fails, try GET as some servers don't support HEAD
              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()
+             final_url = response.url  # Capture potential redirects

          return {
              'is_valid': True,
              'message': 'URL is valid and accessible',
              'details': {
+                 'final_url': final_url,
                  'content_type': head_response.headers.get('Content-Type', 'unknown'),
                  'server': head_response.headers.get('Server', 'unknown'),
                  'size': head_response.headers.get('Content-Length', 'unknown')
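A quick sketch of how the validation result above is consumed elsewhere in the app (the URL is hypothetical; only the success keys shown in this hunk are used):

# Sketch: checking a URL before fetching it
processor = EnhancedURLProcessor()
check = processor.validate_url("https://example.com/report.csv")  # hypothetical URL
if check['is_valid']:
    details = check['details']
    print(details['final_url'], details['content_type'], details['size'])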
              response = self.session.get(url, timeout=self.timeout)
              response.raise_for_status()
+             final_url = response.url  # Capture potential redirects

              # Detect encoding
+             if response.encoding is None or response.encoding == 'ISO-8859-1':  # chardet often better than default response.encoding for text
+                 encoding_detection = chardet.detect(response.content)
+                 encoding = encoding_detection['encoding'] or 'utf-8'
+                 logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
              else:
                  encoding = response.encoding
+                 logger.debug(f"Using response.encoding '{encoding}' for {url}")
+
              # Decode content with fallback
              try:
                  raw_content = response.content.decode(encoding, errors='replace')
              except (UnicodeDecodeError, LookupError):
+                 # Fallback to a more common encoding if the first attempt fails
+                 try:
+                     raw_content = response.content.decode('utf-8', errors='replace')
+                     encoding = 'utf-8 (fallback)'
+                     logger.warning(f"Decoding with {encoding} fallback for {url}")
+                 except Exception:
+                     raw_content = response.content.decode('latin-1', errors='replace')  # Another common fallback
+                     encoding = 'latin-1 (fallback)'
+                     logger.warning(f"Decoding with {encoding} fallback for {url}")
+

              # Extract metadata
              metadata = {
+                 'original_url': url,
+                 'final_url': final_url,
                  'timestamp': datetime.now().isoformat(),
+                 'detected_encoding': encoding,
                  'content_type': response.headers.get('Content-Type', ''),
                  'content_length': len(response.content),
                  'headers': dict(response.headers),
              }

              # Process based on content type
+             processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
+
              return {
+                 'source': 'url',
+                 'url': url,  # Keep original URL as identifier
                  'raw_content': raw_content,
+                 'metadata': metadata,
+                 'extracted_data': processed_extraction['data'],
+                 'processing_notes': processed_extraction['notes']
              }
          except requests.exceptions.RequestException as e:
              if retry_count < self.max_retries - 1:
                  logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
                  time.sleep(2 ** retry_count)  # Exponential backoff
                  return self.fetch_content(url, retry_count + 1)
+             logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
+             return {
+                 'source': 'url',
+                 'url': url,
+                 'raw_content': None,
+                 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                 'extracted_data': None,
+                 'processing_notes': f"Failed to fetch content: {str(e)}"
+             }
+         except Exception as e:
+             logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
+             return {
+                 'source': 'url',
+                 'url': url,
+                 'raw_content': raw_content if 'raw_content' in locals() else None,
+                 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+                 'extracted_data': None,
+                 'processing_notes': f"Unexpected processing error: {str(e)}"
+             }
+
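As a quick orientation to the return shape above, a minimal usage sketch (the URL is made up; error cases return the same keys with extracted_data set to None and a string in processing_notes):

# Sketch: fetching one URL and inspecting the structured result
processor = EnhancedURLProcessor()
result = processor.fetch_content("https://example.com/data.json")  # hypothetical URL
if result and result['extracted_data'] is not None:
    print(result['metadata']['content_type'])   # e.g. 'application/json'
    print(result['processing_notes'])           # e.g. ['Parsed as JSON']
else:
    print("Fetch failed:", result['processing_notes'] if result else "no result")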
+     def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
+         """Process content based on detected content type"""
+         lower_content_type = content_type.lower()
+         notes = []
+         extracted_data: Any = None  # Use Any to allow different types
+
+         try:
+             if 'text/html' in lower_content_type:
+                 logger.debug(f"Processing HTML content from {base_url}")
+                 extracted_data = self._process_html_content_enhanced(content, base_url)
+                 notes.append("Processed as HTML")
+             elif 'application/json' in lower_content_type or 'text/json' in lower_content_type:
+                 logger.debug(f"Processing JSON content from {base_url}")
+                 try:
+                     extracted_data = json.loads(content)
+                     notes.append("Parsed as JSON")
+                 except json.JSONDecodeError as e:
+                     extracted_data = content  # Keep raw text if invalid JSON
+                     notes.append(f"Failed to parse as JSON: {e}")
+                     logger.warning(f"Failed to parse JSON from {base_url}: {e}")
+                 except Exception as e:
+                     extracted_data = content
+                     notes.append(f"Error processing JSON: {e}")
+                     logger.error(f"Error processing JSON from {base_url}: {e}")
+             elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
+                 logger.debug(f"Processing XML content from {base_url}")
+                 try:
+                     # Try parsing XML. Convert to a string or a dict representation if needed.
+                     # For simplicity, we'll convert to a readable string representation of the tree.
+                     root = ET.fromstring(content)
+                     # A simple way to represent XML as text
+                     xml_text = ET.tostring(root, encoding='unicode', method='xml')
+                     extracted_data = xml_text  # Store as string for now
+                     notes.append("Parsed as XML (text representation)")
+                 except ET.ParseError as e:
+                     extracted_data = content
+                     notes.append(f"Failed to parse as XML: {e}")
+                     logger.warning(f"Failed to parse XML from {base_url}: {e}")
+                 except Exception as e:
+                     extracted_data = content
+                     notes.append(f"Error processing XML: {e}")
+                     logger.error(f"Error processing XML from {base_url}: {e}")
+             elif 'text/plain' in lower_content_type or 'text/' in lower_content_type:  # Catch other text types
+                 logger.debug(f"Processing Plain Text content from {base_url}")
+                 extracted_data = content
+                 notes.append("Processed as Plain Text")
+             else:
+                 logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
+                 extracted_data = content  # Store raw content for unknown types
+                 notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
+
+         except Exception as e:
+             logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
+             extracted_data = content  # Fallback to raw content on error
+             notes.append(f"Unexpected processing error: {e}. Stored raw text.")
+
+         return {'data': extracted_data, 'notes': notes}

+
+     def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
+         """Process HTML content, preserving text, and extracting metadata."""
+         extracted: Dict[str, Any] = {
+             'title': None,
+             'meta_description': None,  # Add extraction for meta description
+             'full_text': "",
+             'links': []  # Add extraction for links
+         }
          try:
              soup = BeautifulSoup(content, 'html.parser')

+             # Extract Title
+             if soup.title and soup.title.string:
+                 extracted['title'] = soup.title.string.strip()
+
+             # Extract Meta Description
+             meta_desc = soup.find('meta', attrs={'name': 'description'})
+             if meta_desc and meta_desc.get('content'):
+                 extracted['meta_description'] = meta_desc['content'].strip()
+
+             # Extract and process links (convert relative to absolute)
+             for a_tag in soup.find_all('a', href=True):
+                 href = a_tag['href']
+                 text = a_tag.get_text().strip()
+                 try:
+                     absolute_url = urljoin(base_url, href)
+                     extracted['links'].append({'text': text, 'url': absolute_url})
+                 except Exception:
+                     extracted['links'].append({'text': text, 'url': href})  # Keep relative if join fails
+
+             # Extract all text content (similar to stripped_strings but ensures order)
              text_parts = []
+             # Use a more robust way to get visible text, including handling script/style tags
+             for script_or_style in soup(["script", "style"]):
+                 script_or_style.extract()  # Remove script and style tags
+             text = soup.get_text(separator='\n')  # Get text with newlines
+
+             # Clean up whitespace and empty lines
+             lines = text.splitlines()
+             cleaned_lines = [line.strip() for line in lines if line.strip()]
+             extracted['full_text'] = '\n'.join(cleaned_lines)
+
          except Exception as e:
+             logger.error(f"Enhanced HTML processing error for {base_url}: {e}")
+             extracted['full_text'] = content  # Fallback to raw content
+             extracted['processing_error'] = f"Enhanced HTML processing failed: {e}"
+
+         return extracted
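A small sketch of what the extractor above returns for a trivial page, assuming the class can be instantiated as in the rest of the file (the HTML snippet is made up):

html = "<html><head><title>Demo</title></head><body><a href='/a'>A</a><p>Hello</p></body></html>"
processor = EnhancedURLProcessor()
page = processor._process_html_content_enhanced(html, "https://example.com")
print(page['title'])      # 'Demo'
print(page['links'][0])   # {'text': 'A', 'url': 'https://example.com/a'}
print(page['full_text'])  # title and body text, one cleaned line each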
  class EnhancedFileProcessor:
+     """Advanced file processing with enhanced content extraction"""
      def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024):  # 5GB default
          self.max_file_size = max_file_size
+         # Expanded supported extensions to include common docs and structured formats
          self.supported_extensions = {
              '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
              '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
+             '.pdf', '.doc', '.docx', '.rtf', '.odt',
+             # Archives are handled separately but listed for context
              '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
          }
+         self.archive_extensions = {'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'}
+

      def process_file(self, file) -> List[Dict]:
          """Process uploaded file with enhanced error handling and complete extraction"""
+         if not file or not hasattr(file, 'name'):
+             logger.warning("Received invalid file object.")
              return []

          dataset = []
+         file_path = Path(file.name)  # Use Path object for easier handling
+
          try:
+             file_size = file_path.stat().st_size
              if file_size > self.max_file_size:
+                 logger.warning(f"File '{file_path.name}' size ({file_size} bytes) exceeds maximum allowed size ({self.max_file_size} bytes).")
+                 return [{
+                     'source': 'file',
+                     'filename': file_path.name,
+                     'file_size': file_size,
+                     'extracted_data': None,
+                     'processing_notes': 'File size exceeds limit.'
+                 }]

              with tempfile.TemporaryDirectory() as temp_dir:
                  temp_dir_path = Path(temp_dir)

+                 # Decide processing strategy
+                 if file_path.suffix.lower() in self.archive_extensions:
+                     dataset.extend(self._process_archive(file_path, temp_dir_path))
+                 elif file_path.suffix.lower() in self.supported_extensions:
+                     # Pass the path to the single file processor
+                     dataset.extend(self._process_single_file(file_path))
                  else:
+                     logger.warning(f"Unsupported file type for processing: '{file_path.name}'")
+                     # Optionally process as raw text even if extension is unsupported
+                     try:
+                         # Read as text with error replacement
+                         content_bytes = file_path.read_bytes()
+                         encoding_detection = chardet.detect(content_bytes)
+                         encoding = encoding_detection['encoding'] or 'utf-8'
+                         raw_content = content_bytes.decode(encoding, errors='replace')
+                         dataset.append({
+                             'source': 'file',
+                             'filename': file_path.name,
+                             'file_size': file_size,
+                             'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
+                             'extracted_data': {'plain_text': raw_content},  # Store raw text under a key
+                             'processing_notes': 'Processed as plain text (unsupported extension).'
+                         })
+                     except Exception as e:
+                         logger.error(f"Error reading or processing unsupported file '{file_path.name}' as text: {e}")
+                         dataset.append({
+                             'source': 'file',
+                             'filename': file_path.name,
+                             'file_size': file_size,
+                             'mime_type': mimetypes.guess_type(file_path.name)[0] or 'unknown/unknown',
+                             'extracted_data': None,
+                             'processing_notes': f'Unsupported file type and failed to read as text: {e}'
+                         })
+

          except Exception as e:
+             logger.error(f"Error processing file '{file_path.name}': {str(e)}")
+             dataset.append({
+                 'source': 'file',
+                 'filename': file_path.name,
+                 'file_size': file_size if 'file_size' in locals() else None,
+                 'extracted_data': None,
+                 'processing_notes': f'Overall file processing error: {str(e)}'
+             })
          return dataset

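For context, a minimal sketch of how process_file is expected to be driven; the SimpleNamespace stand-in mimics an upload object, which only needs a .name attribute pointing at a real path (the path here is hypothetical):

from types import SimpleNamespace

processor = EnhancedFileProcessor()
upload = SimpleNamespace(name="/tmp/example_upload.json")  # hypothetical path
for entry in processor.process_file(upload):
    # Each entry carries filename, mime_type, extracted_data and processing_notes
    print(entry['filename'], entry.get('mime_type'), entry['processing_notes'])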
+     def _is_archive(self, filepath: Union[str, Path]) -> bool:
          """Check if file is an archive"""
+         p = Path(filepath) if isinstance(filepath, str) else filepath
+         return p.suffix.lower() in self.archive_extensions
+
+     def _process_single_file(self, file_path: Path) -> List[Dict]:
+         """Process a single file with enhanced character extraction and format-specific handling"""
+         dataset_entries = []
+         filename = file_path.name
+         file_size = file_path.stat().st_size
+         mime_type, _ = mimetypes.guess_type(file_path)
+         mime_type = mime_type or 'unknown/unknown'
+         file_extension = file_path.suffix.lower()
+
+         logger.info(f"Processing single file: '{filename}' ({mime_type}, {file_size} bytes)")
+
+         raw_content: Optional[str] = None
+         extracted_data: Any = None
+         processing_notes = []

          try:
+             # Read content efficiently
+             content_bytes = file_path.read_bytes()
+             encoding_detection = chardet.detect(content_bytes)
+             encoding = encoding_detection['encoding'] or 'utf-8'
+             raw_content = content_bytes.decode(encoding, errors='replace')

+             # --- Attempt format-specific parsing ---
+
+             # 1. Attempt JSON parsing (explicit .json or application/json, OR if content looks like JSON)
+             is_explicit_json = mime_type == 'application/json' or file_extension == '.json'
+             looks_like_json = raw_content.strip().startswith('{') or raw_content.strip().startswith('[')
+
+             if is_explicit_json or looks_like_json:
+                 try:
+                     extracted_data = json.loads(raw_content)
+                     processing_notes.append("Parsed as JSON.")
+                     if not is_explicit_json:
+                         processing_notes.append("Note: Content looked like JSON despite extension/mime.")
+                         logger.warning(f"File '{filename}' identified as JSON content despite extension/mime.")
+                     mime_type = 'application/json'  # Update mime_type if successfully parsed as JSON
+                 except json.JSONDecodeError as e:
+                     processing_notes.append(f"Failed to parse as JSON: {e}.")
+                     if is_explicit_json:
+                         logger.error(f"Explicit JSON file '{filename}' has invalid format: {e}")
+                     else:
+                         logger.warning(f"Content of '{filename}' looks like JSON but failed to parse: {e}")
+                 except Exception as e:
+                     processing_notes.append(f"Error processing JSON: {e}.")
+                     logger.error(f"Error processing JSON in '{filename}': {e}")
+
+             # 2. Attempt XML parsing (if not already parsed as JSON, and looks like XML)
+             # Add check if extracted_data is still None (meaning JSON parsing failed or wasn't attempted/relevant)
+             looks_like_xml = extracted_data is None and raw_content.strip().startswith('<') and raw_content.strip().endswith('>')  # Simple heuristic
+             is_explicit_xml = extracted_data is None and (mime_type in ('application/xml', 'text/xml') or mime_type.endswith('+xml') or file_extension in ('.xml', '.xsd'))
+
+             if extracted_data is None and (is_explicit_xml or looks_like_xml):
+                 try:
+                     root = ET.fromstring(raw_content)
+                     # Convert XML element tree to a structured dictionary or string
+                     # Simple string representation for QR code suitability
+                     extracted_data = ET.tostring(root, encoding='unicode', method='xml')
+                     processing_notes.append("Parsed as XML (text representation).")
+                     if not is_explicit_xml:
+                         processing_notes.append("Note: Content looked like XML despite extension/mime.")
+                     # Update mime_type if successfully parsed as XML
+                     if 'xml' not in mime_type: mime_type = 'application/xml'
+                 except ET.ParseError as e:
+                     processing_notes.append(f"Failed to parse as XML: {e}.")
+                     if is_explicit_xml:
+                         logger.error(f"Explicit XML file '{filename}' has invalid format: {e}")
+                     else:
+                         logger.warning(f"Content of '{filename}' looks like XML but failed to parse: {e}")
+                 except Exception as e:
+                     processing_notes.append(f"Error processing XML: {e}.")
+                     logger.error(f"Error processing XML in '{filename}': {e}")
+
+             # 3. Attempt CSV parsing (if not already parsed, and looks like CSV or is explicit CSV)
+             is_explicit_csv = extracted_data is None and (mime_type == 'text/csv' or file_extension == '.csv')
+             # Heuristic: check for commas/semicolons and multiple lines
+             looks_like_csv = extracted_data is None and (',' in raw_content or ';' in raw_content) and ('\n' in raw_content or len(raw_content.splitlines()) > 1)
+
+             if extracted_data is None and (is_explicit_csv or looks_like_csv):
+                 try:
+                     # Use Sniffer to guess dialect for better compatibility
+                     dialect = 'excel'  # Default dialect
+                     try:
+                         # Look at first few lines to guess dialect
+                         sample = '\n'.join(raw_content.splitlines()[:10])
+                         if sample:
+                             dialect = csv.Sniffer().sniff(sample).name
+                             logger.debug(f"Sniffer detected CSV dialect: {dialect} for '{filename}'")
+                     except csv.Error:
+                         logger.debug(f"Sniffer failed to detect dialect for '{filename}', using 'excel'.")
+                         dialect = 'excel'  # Fallback
+
+                     # Read using the guessed or default dialect
+                     csv_reader = csv.reader(io.StringIO(raw_content), dialect=dialect)
+                     rows = list(csv_reader)
+
+                     if rows:
+                         # Limit the number of rows included for potentially huge CSVs
+                         max_rows_preview = 100
+                         extracted_data = {
+                             'headers': rows[0] if rows[0] else None,  # Assume first row is header
+                             'rows': rows[1:max_rows_preview+1]  # Get up to max_rows_preview data rows
+                         }
+                         if len(rows) > max_rows_preview + 1:
+                             processing_notes.append(f"CSV truncated to {max_rows_preview} data rows.")
+                         processing_notes.append("Parsed as CSV.")
+                         if not is_explicit_csv:
+                             processing_notes.append("Note: Content looked like CSV despite extension/mime.")
+                         mime_type = 'text/csv'  # Update mime_type
+
+                     else:
+                         extracted_data = "Empty CSV"
+                         processing_notes.append("Parsed as empty CSV.")
+                         if not is_explicit_csv:
+                             processing_notes.append("Note: Content looked like CSV but was empty.")
+
+                 except Exception as e:
+                     processing_notes.append(f"Failed to parse as CSV: {e}.")
+                     logger.warning(f"Failed to parse CSV from '{filename}': {e}")
+
+             # 4. Attempt Document Text Extraction (if not already parsed)
+             if extracted_data is None:
+                 try:
+                     extracted_text = None
+                     if file_extension == '.pdf' and PDF_SUPPORT:
+                         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                             tmp_file.write(content_bytes)  # Write bytes to temp file
+                             temp_path = Path(tmp_file.name)
+                             try:
+                                 reader = PdfReader(temp_path)
+                                 text_content = "".join(page.extract_text() or "" for page in reader.pages)
+                                 extracted_text = text_content
+                                 processing_notes.append("Extracted text from PDF.")
+                             finally:
+                                 temp_path.unlink()  # Clean up temp file
+                     elif file_extension == '.docx' and DOCX_SUPPORT:
+                         with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
+                             tmp_file.write(content_bytes)  # Write bytes to temp file
+                             temp_path = Path(tmp_file.name)
+                             try:
+                                 document = Document(temp_path)
+                                 text_content = "\n".join(paragraph.text for paragraph in document.paragraphs)
+                                 extracted_text = text_content
+                                 processing_notes.append("Extracted text from DOCX.")
+                             finally:
+                                 temp_path.unlink()  # Clean up temp file
+                     elif file_extension == '.rtf' and RTF_SUPPORT:
+                         # pyth can read directly from file-like object or string
+                         try:
+                             doc = Rtf15Reader.read(io.StringIO(raw_content))
+                             text_content = PlaintextWriter.write(doc).getvalue()
+                             extracted_text = text_content
+                             processing_notes.append("Extracted text from RTF.")
+                         except Exception as e:
+                             processing_notes.append(f"RTF extraction error: {e}")
+                             logger.warning(f"Failed to extract RTF text from '{filename}': {e}")
+                     elif file_extension == '.odt' and ODT_SUPPORT:
+                         with tempfile.NamedTemporaryFile(delete=False, suffix='.odt') as tmp_file:
+                             tmp_file.write(content_bytes)  # Write bytes to temp file
+                             temp_path = Path(tmp_file.name)
+                             try:
+                                 text_doc = OpenDocumentText(temp_path)
+                                 paragraphs = text_doc.getElementsByType(odftext.P)
+                                 text_content = "\n".join("".join(node.text for node in p.childNodes) for p in paragraphs)
+                                 extracted_text = text_content
+                                 processing_notes.append("Extracted text from ODT.")
+                             finally:
+                                 temp_path.unlink()  # Clean up temp file
+                     elif file_extension in ['.doc', '.ppt', '.pptx', '.xls', '.xlsx']:
+                         # These require more complex or platform-specific libraries (e.g. antiword, pandoc, COM objects on Windows)
+                         processing_notes.append(f"Automatic text extraction for {file_extension.upper()} not fully implemented.")
+                         logger.warning(f"Automatic text extraction for {file_extension.upper()} not fully implemented for '{filename}'.")
+
+                     if extracted_text is not None:
+                         # Limit extracted text size
+                         max_extracted_text_size = 10000  # Limit text preview
+                         extracted_data = {'text': extracted_text[:max_extracted_text_size]}
+                         if len(extracted_text) > max_extracted_text_size:
+                             extracted_data['text'] += "..."
+                             processing_notes.append("Extracted text truncated.")
+
+                 except ImportError as e:
+                     processing_notes.append(f"Missing dependency for document type ({e}). Cannot extract text.")
+                 except Exception as e:
+                     processing_notes.append(f"Error during document text extraction: {e}")
+                     logger.warning(f"Error during document text extraction for '{filename}': {e}")
+
+             # 5. Fallback to Plain Text (if no specific extraction succeeded)
+             if extracted_data is None:
+                 extracted_data = {'plain_text': raw_content}
+                 processing_notes.append("Stored as plain text.")
+                 # Re-guess mime type if it was something specific like application/octet-stream and we just got text
+                 if mime_type in ['unknown/unknown', 'application/octet-stream']:
+                     guessed_text_mime, _ = mimetypes.guess_type('dummy.txt')  # Use a dummy file name to guess plain text
+                     if guessed_text_mime: mime_type = guessed_text_mime

          except Exception as e:
+             # Catch errors during initial read or other unexpected issues
+             logger.error(f"Fatal error processing single file '{filename}': {e}")
+             processing_notes.append(f"Fatal processing error: {e}")
+             raw_content = None  # Ensure raw_content is None if reading failed
+             extracted_data = None
+
+
+         # Add file info to the entry
+         entry = {
+             'source': 'file',
+             'filename': filename,
+             'file_size': file_size,
+             'mime_type': mime_type,
+             'created': datetime.fromtimestamp(file_path.stat().st_ctime).isoformat() if file_path.exists() else None,
+             'modified': datetime.fromtimestamp(file_path.stat().st_mtime).isoformat() if file_path.exists() else None,
+             'raw_content': raw_content,  # Always include raw content if readable
+             'extracted_data': extracted_data,  # Include the structured/extracted data
+             'processing_notes': processing_notes  # Include any notes/errors encountered
+         }
+
+         dataset_entries.append(entry)
+         return dataset_entries

+
+     def _process_archive(self, archive_path: Path, extract_to: Path) -> List[Dict]:
          """Process an archive file with enhanced extraction"""
          dataset = []
+         archive_extension = archive_path.suffix.lower()
+         logger.info(f"Processing archive: '{archive_path.name}'")
+
          try:
+             if archive_extension == '.zip':
+                 if zipfile.is_zipfile(archive_path):
+                     with zipfile.ZipFile(archive_path, 'r') as zip_ref:
+                         for file_info in zip_ref.infolist():
+                             if file_info.file_size > 0 and not file_info.filename.endswith('/'):
+                                 try:
+                                     zip_ref.extract(file_info, path=extract_to)
+                                     extracted_file_path = extract_to / file_info.filename
+                                     # Recursively process the extracted file if it's supported and not an archive itself
+                                     if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
+                                         dataset.extend(self._process_single_file(extracted_file_path))
+                                     elif extracted_file_path.suffix.lower() in self.archive_extensions:
+                                         # Recursively process nested archives (careful with depth!)
+                                         logger.info(f"Found nested archive '{file_info.filename}', processing recursively.")
+                                         dataset.extend(self._process_archive(extracted_file_path, extract_to))
+                                     else:
+                                         logger.debug(f"Skipping unsupported file in archive: '{file_info.filename}'")
+                                 except Exception as e:
+                                     logger.warning(f"Error extracting/processing file '{file_info.filename}' from zip '{archive_path.name}': {e}")
+                 else:
+                     logger.error(f"'{archive_path.name}' is not a valid zip file.")
+
+             elif archive_extension in ('.tar', '.gz', '.tgz'):
                  try:
+                     # Determine mode: 'r' for tar, 'r:gz' for tar.gz, 'r:bz2' for tar.bz2 (bz2 not fully supported yet)
+                     mode = 'r'
+                     if archive_extension in ('.tar.gz', '.tgz'): mode = 'r:gz'
+                     # elif archive_extension == '.tar.bz2': mode = 'r:bz2'  # Needs bz2 support
+                     # Note: 'r:*' attempts to guess compression, safer to be explicit
+
+                     with tarfile.open(archive_path, mode) as tar_ref:
                          for member in tar_ref.getmembers():
                              if member.isfile():
+                                 try:
+                                     tar_ref.extract(member, path=extract_to)
+                                     extracted_file_path = extract_to / member.name
+                                     # Recursively process extracted file
+                                     if extracted_file_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_file_path):
+                                         dataset.extend(self._process_single_file(extracted_file_path))
+                                     elif extracted_file_path.suffix.lower() in self.archive_extensions:
+                                         logger.info(f"Found nested archive '{member.name}', processing recursively.")
+                                         dataset.extend(self._process_archive(extracted_file_path, extract_to))
+                                     else:
+                                         logger.debug(f"Skipping unsupported file in archive: '{member.name}'")
+                                 except Exception as e:
+                                     logger.warning(f"Error extracting/processing file '{member.name}' from tar '{archive_path.name}': {e}")
                  except tarfile.TarError as e:
+                     logger.error(f"Error processing TAR archive '{archive_path.name}': {e}")
+
+             elif archive_extension == '.gz':
+                 # GZIP archives typically contain a single file. Extract it and process.
+                 extracted_name = archive_path.stem  # Get name without .gz
+                 extracted_path = extract_to / extracted_name
+                 try:
+                     with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
+                         outfile.write(gz_file.read())
+                     # Process the extracted file if supported
+                     if extracted_path.suffix.lower() in self.supported_extensions and not self._is_archive(extracted_path):
+                         dataset.extend(self._process_single_file(extracted_path))
+                     elif extracted_path.suffix.lower() in self.archive_extensions:
+                         logger.info(f"Found nested archive '{extracted_name}', processing recursively.")
+                         dataset.extend(self._process_archive(extracted_path, extract_to))
+                     else:
+                         logger.debug(f"Skipping unsupported file (from gz): '{extracted_name}'")
+
+                 except gzip.GzipFile as e:
+                     logger.error(f"Error processing GZIP file '{archive_path.name}': {e}")
+                 except Exception as e:
+                     logger.error(f"Error extracting/processing from GZIP '{archive_path.name}': {e}")
+                 finally:
+                     if extracted_path.exists(): extracted_path.unlink()  # Clean up extracted file
+
+             # TODO: Add support for other archive types (.bz2, .7z, .rar)
+             elif archive_extension in ('.bz2', '.7z', '.rar'):
+                 logger.warning(f"Support for {archive_extension} archives is not yet fully implemented and requires external tools/libraries.")

          except Exception as e:
+             logger.error(f"Overall archive processing error for '{archive_path.name}': {e}")
+
+         # Clean up extracted files in temp_dir after processing
+         # Handled by context manager 'with tempfile.TemporaryDirectory()'
+
          return dataset
    def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
        """Enhanced data chunking with sequence metadata"""
        try:
            # Convert data to a JSON string.
            # separators=(',', ':') removes unnecessary whitespace for maximum data density in the QR code.
            json_str = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
            total_length = len(json_str)

            # Calculate overhead for metadata.
            # Metadata structure: {"idx":0,"tc":1,"tl":XXX,"hash":"YYYY","data":"..."} with shortened keys.
            metadata_template = {
                "idx": 0,            # chunk_index
                "tc": 1,             # total_chunks
                "tl": total_length,  # total_length
                "hash": "",          # chunk_hash
                "data": ""           # chunk_data
            }
            # Estimate the overhead by dumping a sample metadata structure and adding
            # a safety margin. Shortened keys keep the overhead small.
            overhead_estimate = len(json.dumps(metadata_template, separators=(',', ':'))) + 50  # Extra padding

            # Calculate effective chunk size
            effective_chunk_size = max_size - overhead_estimate

            if effective_chunk_size <= 0:
                logger.error(f"Max QR size ({max_size}) is too small for metadata overhead ({overhead_estimate}). Cannot chunk.")
                return []

            if total_length <= effective_chunk_size:
                # Data fits in one chunk
                chunk_data = json_str  # Use the full string
                chunk = {
                    "idx": 0,
                    "tc": 1,
                    "tl": total_length,
                    "hash": hash(chunk_data) & 0xFFFFFFFF,  # 32-bit hash
                    "data": chunk_data
                }
                return [chunk]

            # Calculate the number of chunks needed
            num_chunks = -(-total_length // effective_chunk_size)  # Ceiling division
            # chunk_size = -(-total_length // num_chunks)  # Use this if a perfectly even distribution is needed

            chunks = []
            current_pos = 0
            for i in range(num_chunks):
                # Slice the JSON string directly. Since we slice a decoded string (not
                # bytes), there is no risk of splitting a UTF-8 character; splitting in
                # the middle of the JSON structure is fine because the chunks are only
                # reassembled as text before parsing.
                end_pos = min(current_pos + effective_chunk_size, total_length)
                chunk_data_str = json_str[current_pos:end_pos]

                chunk = {
                    "idx": i,
                    "tc": num_chunks,
                    "tl": total_length,
                    "hash": hash(chunk_data_str) & 0xFFFFFFFF,
                    "data": chunk_data_str
                }
                chunks.append(chunk)
                current_pos = end_pos

            # Final check: ensure all data was chunked
            if current_pos < total_length:
                # This shouldn't happen with correct ceiling division and min()
                logger.error(f"Chunking logic error: Only processed {current_pos} of {total_length} characters.")
                return []  # Indicate failure

            logger.info(f"Chunked data into {num_chunks} chunks for QR codes.")
            return chunks

        except Exception as e:
            logger.error(f"Error chunking data: {e}")
            return []
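
    # Illustrative sketch: a hypothetical inverse of chunk_data(), not used
    # elsewhere in this file. Only the chunk keys ("idx", "tc", "tl", "hash",
    # "data") are taken from chunk_data() above; the method name is assumed.
    # Note: Python's built-in hash() is salted per interpreter process, so the
    # "hash" field can only be re-verified inside the process that produced it.
    @staticmethod
    def reassemble_chunks(chunks: List[Dict]) -> Optional[Union[Dict, List]]:
        """Rebuild the original JSON payload from a complete set of chunks."""
        if not chunks:
            return None
        ordered = sorted(chunks, key=lambda c: c["idx"])
        expected = ordered[0]["tc"]
        if len(ordered) != expected:
            logger.error(f"Expected {expected} chunks, received {len(ordered)}.")
            return None
        json_str = "".join(c["data"] for c in ordered)
        if len(json_str) != ordered[0]["tl"]:
            logger.error("Reassembled length does not match the 'tl' metadata field.")
            return None
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.error(f"Reassembled data is not valid JSON: {e}")
            return None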

    try:
        qr = qrcode.QRCode(
            version=None,
            error_correction=qrcode.constants.ERROR_CORRECT_M,  # Increased error correction
            box_size=size,
            border=border
        )

        # Add data to QR code
        if isinstance(data, dict):
            # Use a compact JSON representation
            qr.add_data(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
        else:
            qr.add_data(str(data))  # Ensure it's a string

        qr.make(fit=True)

        # Create QR code image with custom colors
        qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)

        # Convert to RGBA for transparency support and potential overlays
        qr_image = qr_image.convert('RGBA')

        # Optional: add a small logo or icon in the center (requires a design asset)
        # logo = Image.open("logo.png").convert("RGBA")
        # logo = logo.resize((logo.width // 4, logo.height // 4))  # Resize logo
        # logo_pos = ((qr_image.width - logo.width) // 2, (qr_image.height - logo.height) // 2)
        # qr_image.paste(logo, logo_pos, logo)

        # Add a subtle gradient overlay (optional visual enhancement)
        try:
            gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(gradient)
            # Horizontal gradient: fades from left (alpha = 0) to right (max alpha)
            for i in range(qr_image.width):
                alpha = int(255 * (i / qr_image.width) * 0.05)  # ~5% maximum opacity
                draw.line([(i, 0), (i, qr_image.height)], fill=(0, 0, 0, alpha))
            # Combine images
            final_image = Image.alpha_composite(qr_image, gradient)
        except Exception as e:
            logger.warning(f"Failed to add gradient overlay to QR code: {e}. Using plain QR.")
            final_image = qr_image

        # Save the image as PNG (the 'quality' option only applies to JPEG, so it is omitted)
        output_path = QR_CODES_DIR / filename
        final_image.save(output_path)

        return str(output_path)
    except Exception as e:
def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
    """Generate QR codes with enhanced visual appeal and metadata"""
    # 'data' is expected to be the list of dictionaries produced by process_inputs
    if not isinstance(data, list):
        logger.error("generate_qr_codes received data that is not a list.")
        return []

    try:
        file_processor = EnhancedFileProcessor()  # Use the enhanced processor for chunking
        paths = []

        if combined:
            # Process combined data
            chunks = file_processor.chunk_data(data)  # chunk_data works on the list of dicts
            if not chunks:
                logger.warning("No chunks generated for combined data.")
                return []
            for i, chunk in enumerate(chunks):
                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
                qr_path = generate_stylish_qr(
                    data=chunk,  # Pass the chunk dictionary
                    filename=filename,
                    fill_color="#1a365d",  # Deep blue
                    back_color="#ffffff"
                )
                if qr_path:
                    paths.append(qr_path)
                else:
                    logger.warning(f"Failed to generate QR for chunk {i+1}/{len(chunks)}.")
        else:
            # Process individual items (each dictionary in the list)
            if data:  # Ensure data is not empty
                for idx, item in enumerate(data):
                    chunks = file_processor.chunk_data(item)  # chunk_data works on an individual dict
                    if not chunks:
                        logger.warning(f"No chunks generated for item {idx+1}.")
                        continue
                    for chunk_idx, chunk in enumerate(chunks):
                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
                        qr_path = generate_stylish_qr(
                            data=chunk,  # Pass the chunk dictionary
                            filename=filename,
                            fill_color="#1a365d",  # Deep blue
                            back_color="#ffffff"
                        )
                        if qr_path:
                            paths.append(qr_path)
                        else:
                            logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(chunks)}.")
            else:
                logger.warning("No items in data list to process individually.")

        logger.info(f"Generated {len(paths)} QR codes.")
        return paths

    except Exception as e:
        logger.error(f"QR code generation error: {e}")
        return []
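
# Illustrative sketch: a round-trip check for an image written by
# generate_stylish_qr() above. It assumes the optional 'pyzbar' package (plus
# the system zbar library) is installed; the helper name is hypothetical and
# it is not called elsewhere in this file.
def verify_qr_roundtrip(qr_path: str) -> bool:
    """Decode a generated QR image and confirm its payload parses as JSON."""
    try:
        from pyzbar.pyzbar import decode as zbar_decode  # optional dependency
    except ImportError:
        logger.warning("pyzbar is not installed; skipping QR round-trip check.")
        return False
    decoded = zbar_decode(Image.open(qr_path))
    if not decoded:
        return False
    try:
        json.loads(decoded[0].data.decode('utf-8'))
        return True
    except (json.JSONDecodeError, UnicodeDecodeError):
        return False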

# Keep the Gradio UI definition and main function as they are, since the changes
# are internal to the processing classes and process_inputs already handles
# calling them and collecting the combined list of results.

def create_modern_interface():
    """Create a modern and visually appealing Gradio interface"""

    interface.head += """
    <script>
    let enabledStates = [];

    function updateEnabledStates(checkbox) {
        const index = parseInt(checkbox.dataset.index);
        if (checkbox.checked) {

        qr_code_paths = gr.State([])
        gr.Markdown("""
        # 🚀 Advanced Data Processing & QR Code Generator
        Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
        """)
        with gr.Tab("🌐 URL Processing"):
            return json.dumps(example, indent=2)

        def clear_input():
            return "", None, ""  # Clear url, files, text

        def update_viewport(paths, enabled_states):
            if not paths:
                return "<p>No QR codes generated yet.</p>"

            num_qr_codes = len(paths)
            cols = math.ceil(math.sqrt(num_qr_codes))  # Columns for a roughly square grid
            cols = max(1, min(cols, 6))  # Limit max columns for small screens
            rows = math.ceil(num_qr_codes / cols)
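            # For example, 10 generated codes give cols = ceil(sqrt(10)) = 4 (within
            # the 1..6 cap) and rows = ceil(10 / 4) = 3, i.e. a 4-column, 3-row grid.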

            viewport_html = f'<div class="viewport-container" style="grid-template-columns: repeat({cols}, 1fr);">'

            # Initialize enabled_states if it's empty (first load)
            if not enabled_states and paths:
                enabled_states = list(range(num_qr_codes))  # Enable all by default on first view

            for i, path in enumerate(paths):
                is_enabled = i in enabled_states
                border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
                opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;"
                viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
                viewport_html += f'<img src="/file={path}" style="{border} {opacity}" alt="QR Code {i+1}">'  # /file= lets Gradio serve the static file
                viewport_html += f'<label><input type="checkbox" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable</label>'
                viewport_html += '</div>'
            viewport_html += '</div>'

        def process_inputs(urls, files, text, combine):
            """Process all inputs and generate QR codes"""
            results = []
            processing_status_messages = []
            # Initialized up front so the final return below works even if an
            # exception is raised before the QR-generation step.
            qr_paths = []
            final_json_output = None

            url_processor = EnhancedURLProcessor()
            file_processor = EnhancedFileProcessor()

            try:
                # Process direct JSON input
                if text and text.strip():
                    try:
                        json_data = json.loads(text)
                        # Wrap direct JSON input in a dictionary for consistency with the file/URL output structure
                        results.append({
                            'source': 'json_input',
                            'extracted_data': json_data,
                            'timestamp': datetime.now().isoformat(),
                            'processing_notes': ['Parsed from direct JSON input.']
                        })
                        processing_status_messages.append("✅ Successfully parsed direct JSON input.")
                    except json.JSONDecodeError as e:
                        processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}")
                    except Exception as e:
                        processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}")

                # Process URLs
                if urls and urls.strip():

                    for url in url_list:
                        validation = url_processor.validate_url(url)
                        if validation['is_valid']:
                            processing_status_messages.append(f"🔄 Fetching URL: {url}...")
                            content_result = url_processor.fetch_content(url)
                            if content_result:
                                results.append(content_result)
                                processing_status_messages.append(f"✅ Fetched and processed URL: {url}")
                            else:
                                processing_status_messages.append(f"❌ Failed to fetch/process URL: {url}")
                                if validation['details'].get('final_url'):
                                    processing_status_messages[-1] += f" (Redirected to {validation['details']['final_url']})"
                        else:
                            processing_status_messages.append(f"⚠️ Skipping invalid URL: {url} ({validation['message']})")

                # Process files
                if files:
                    for file in files:
                        processing_status_messages.append(f"📄 Processing file: {file.name}...")
                        file_results = file_processor.process_file(file)
                        if file_results:
                            results.extend(file_results)
                            processing_status_messages.append(f"✅ Processed file: {file.name}")
                        else:
                            processing_status_messages.append(f"❌ Failed to process file: {file.name}")

                # Generate QR codes
                if results:
                    # Use the collected results (list of dicts) for QR code generation
                    qr_paths = generate_qr_codes(results, combine)
                    final_json_output = results  # Show the structured data in the JSON output box

                    if qr_paths:
                        processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.")
                    else:
                        processing_status_messages.append("❌ Failed to generate QR codes.")
                else:
                    processing_status_messages.append("⚠️ No valid content collected from inputs.")

            except Exception as e:
                logger.error(f"Overall processing error in process_inputs: {e}")
                processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}")

            return (
                final_json_output,
                [str(path) for path in qr_paths],  # The Gradio Gallery expects a list of path strings
                "\n".join(processing_status_messages)  # Join status messages
            )

        def on_qr_generation(qr_paths_list):
            # When QR codes are generated, store the list of paths in state and
            # initialize the enabled_qr_codes state with all indices enabled.
            num_qrs = len(qr_paths_list)
            initial_enabled_states = list(range(num_qrs))
            return qr_paths_list, initial_enabled_states

        # Link events
        example_btn.click(load_example, inputs=[], outputs=text_input)
        clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input])  # Clear all inputs

        process_btn.click(
            process_inputs,
            inputs=[url_input, file_input, text_input, combine_data],
            outputs=[output_json, output_gallery, output_text]
        ).then(  # Chain .then() to update the QR paths state and trigger the viewport update
            on_qr_generation,
            inputs=[output_gallery],  # Get the list of paths from the gallery output
            outputs=[qr_code_paths, enabled_qr_codes]  # Update the state variables
        )

        # The viewport tab's select event triggers update_viewport to render the grid
        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])

        # Add helpful documentation
        gr.Markdown("""
        ### 🌟 Features

        - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts to parse JSON/XML from URLs based on content type.
        - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log, etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*).
        - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs.
        - **Archive Support**: Extracts and processes supported files from .zip, .tar, and .gz archives.
        - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification.
        - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item.
        - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data.
        - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing.
        - **Modern Design**: Clean, responsive interface with visual feedback.

        ### 💡 Tips

        1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type.
        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats.
        3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure.
        4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing.
        5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item.
        6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps.
        7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images.

        ### 🎨 Output Details

        - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file).
        - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes` (see the illustrative example below).
        - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML).
        - `processing_notes` will list any issues encountered during extraction.
        - Generated QR codes are saved in the `output/qr_codes` directory.
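
        An illustrative item (hypothetical values, trimmed for brevity) might look like:

        ```json
        {
          "source": "url",
          "url": "https://example.com/data.json",
          "mime_type": "application/json",
          "extracted_data": {"name": "example", "value": 42},
          "processing_notes": []
        }
        ```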

        ### ⚙️ QR Code Viewport Instructions

        1. Navigate to the **QR Code Viewport** tab after generating QR codes.
        2. The generated QR codes will be displayed in a grid based on their total count.
        3. Use the checkboxes below each QR code to enable or disable it for visual selection. Enabled codes have a green border and full opacity.
        4. This viewport is currently for visualization and selection *within the UI*; it doesn't change the generated files themselves. You would manually select which physical QR codes to scan based on this view.
        """)
    return interface

def main():

        # Launch with configuration
        interface.launch(
            share=False,
            debug=False,  # Set to True for more verbose Gradio logging
            show_error=True,
            show_api=False
        )
    except Exception as e:
        logger.error(f"Application startup error: {e}")
        # Print a user-friendly message before exiting
        print(f"\nFatal Error: {e}\nCheck the logs for details.")
        raise  # Re-raise the exception so the process exits if launch fails

if __name__ == "__main__":
    main()
|