acecalisto3 committed on
Commit
c7e50ec
·
verified ·
1 Parent(s): abeedee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -93
app.py CHANGED
@@ -19,7 +19,6 @@ from fake_useragent import UserAgent
19
  from cleantext import clean
20
  import qrcode
21
  import zipfile
22
- import zipfile36 as zipfile
23
 
24
  # Setup logging with detailed configuration
25
  logging.basicConfig(
@@ -28,10 +27,10 @@ logging.basicConfig(
28
  handlers=[
29
  logging.StreamHandler(),
30
  logging.FileHandler('app.log', encoding='utf-8')
31
- ]
32
- )
33
  logger = logging.getLogger(__name__)
34
 
 
35
  class URLProcessor:
36
  def __init__(self):
37
  self.session = requests.Session()
@@ -44,7 +43,7 @@ class URLProcessor:
44
  'Connection': 'keep-alive',
45
  'Upgrade-Insecure-Requests': '1'
46
  })
47
-
48
  def advanced_text_cleaning(self, text: str) -> str:
49
  """Robust text cleaning with version compatibility"""
50
  try:
@@ -74,7 +73,7 @@ class URLProcessor:
74
  try:
75
  if not validators.url(url):
76
  return {'is_valid': False, 'message': 'Invalid URL format'}
77
-
78
  response = self.session.head(url, timeout=self.timeout)
79
  response.raise_for_status()
80
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
@@ -87,11 +86,11 @@ class URLProcessor:
87
  # Google Drive document handling
88
  if 'drive.google.com' in url:
89
  return self._handle_google_drive(url)
90
-
91
  # Google Calendar ICS handling
92
  if 'calendar.google.com' in url and 'ical' in url:
93
  return self._handle_google_calendar(url)
94
-
95
  # Standard HTML processing
96
  return self._fetch_html_content(url)
97
  except Exception as e:
@@ -105,11 +104,11 @@ class URLProcessor:
105
  if not file_id:
106
  logger.error(f"Invalid Google Drive URL: {url}")
107
  return None
108
-
109
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
110
  response = self.session.get(direct_url, timeout=self.timeout)
111
  response.raise_for_status()
112
-
113
  return {
114
  'content': response.text,
115
  'content_type': response.headers.get('Content-Type', ''),
@@ -138,20 +137,20 @@ class URLProcessor:
138
  try:
139
  response = self.session.get(url, timeout=self.timeout)
140
  response.raise_for_status()
141
-
142
  soup = BeautifulSoup(response.text, 'html.parser')
143
-
144
  # Remove unwanted elements
145
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
146
  element.decompose()
147
-
148
  # Extract main content
149
  main_content = soup.find('main') or soup.find('article') or soup.body
150
-
151
  # Clean and structure content
152
  text_content = main_content.get_text(separator='\n', strip=True)
153
  cleaned_content = self.advanced_text_cleaning(text_content)
154
-
155
  return {
156
  'content': cleaned_content,
157
  'content_type': response.headers.get('Content-Type', ''),
@@ -160,14 +159,15 @@ class URLProcessor:
160
  except Exception as e:
161
  logger.error(f"HTML processing failed: {e}")
162
  return None
163
-
 
164
  class FileProcessor:
165
  """Class to handle file processing"""
166
-
167
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
168
  self.max_file_size = max_file_size
169
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
170
-
171
  def is_text_file(self, filepath: str) -> bool:
172
  """Check if file is a text file"""
173
  try:
@@ -181,24 +181,20 @@ class FileProcessor:
181
  """Process uploaded file with enhanced error handling"""
182
  if not file:
183
  return []
184
-
185
  dataset = []
186
  try:
187
  file_size = os.path.getsize(file.name)
188
  if file_size > self.max_file_size:
189
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
190
  return []
191
-
192
  with tempfile.TemporaryDirectory() as temp_dir:
193
  if zipfile.is_zipfile(file.name):
194
  dataset.extend(self._process_zip_file(file.name, temp_dir))
195
  else:
196
  dataset.extend(self._process_single_file(file))
197
-
198
  except Exception as e:
199
  logger.error(f"Error processing file: {str(e)}")
200
  return []
201
-
202
  return dataset
203
 
204
  def _process_zip_file(self, zip_path, temp_dir):
@@ -217,29 +213,29 @@ class FileProcessor:
217
  'timestamp': datetime.now().isoformat()
218
  })
219
  return result
220
-
221
  def _process_single_file(self, file) -> List[Dict]:
222
  try:
223
  file_stat = os.stat(file.name)
224
-
225
  # For very large files, read in chunks and summarize
226
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
227
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
228
-
229
  # Read first and last 1MB for extremely large files
230
  content = ""
231
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
232
  content = f.read(1 * 1024 * 1024) # First 1MB
233
  content += "\n...[Content truncated due to large file size]...\n"
234
-
235
  # Seek to the last 1MB
236
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
237
  content += f.read() # Last 1MB
238
  else:
239
  # Regular file processing
240
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
241
- content =f.read()
242
-
243
  return [{
244
  'source': 'file',
245
  'filename': os.path.basename(file.name),
@@ -253,38 +249,41 @@ class FileProcessor:
253
  except Exception as e:
254
  logger.error(f"File processing error: {e}")
255
  return []
256
-
257
- def generate_qr(json_data):
258
- """Generate QR code from JSON data and return the file path."""
259
- qr = qrcode.QRCode(
260
- version=40, # Force maximum version
261
- error_correction=qrcode.constants.ERROR_CORRECT_L, # Use lower error correction
262
- box_size=10,
263
- border=4,
264
- )
265
- qr.add_data(json_data)
266
- qr.make(fit=True)
267
- return qr.make_image(fill_color="black", back_color="white")
 
 
 
268
 
269
  def create_interface():
270
  """Create a comprehensive Gradio interface with advanced features"""
271
-
272
  css = """
273
  .container { max-width: 1200px; margin: auto; }
274
  .warning { background-color: #fff3cd; color: #856404; }
275
  .error { background-color: #f8d7da; color: #721c24; }
276
  """
277
-
278
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
279
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
280
-
281
  with gr.Tab("URL Processing"):
282
  url_input = gr.Textbox(
283
- label="Enter URLs (comma or newline separated)",
284
  lines=5,
285
  placeholder="https://example1.com\nhttps://example2.com"
286
  )
287
-
288
  with gr.Tab("File Input"):
289
  file_input = gr.File(
290
  label="Upload text file or ZIP archive",
@@ -293,11 +292,11 @@ def create_interface():
293
 
294
  with gr.Tab("Text Input"):
295
  text_input = gr.Textbox(
296
- label="Raw Text Input",
297
  lines=5,
298
  placeholder="Paste your text here..."
299
  )
300
-
301
  with gr.Tab("JSON Editor"):
302
  json_editor = gr.Textbox(
303
  label="JSON Editor",
@@ -306,7 +305,7 @@ def create_interface():
306
  interactive=True,
307
  elem_id="json-editor" # Optional: for custom styling
308
  )
309
-
310
  with gr.Tab("Scratchpad"):
311
  scratchpad = gr.Textbox(
312
  label="Scratchpad",
@@ -314,26 +313,26 @@ def create_interface():
314
  placeholder="Quick notes or text collections...",
315
  interactive=True
316
  )
317
-
318
  process_btn = gr.Button("Process Input", variant="primary")
319
  qr_btn = gr.Button("Generate QR Code", variant="secondary")
320
-
321
  output_text = gr.Textbox(label="Processing Results", interactive=False)
322
  output_file = gr.File(label="Processed Output")
323
  qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
324
-
325
- def process_all_inputs(urls, file, text, notes):
326
  """Process all input types with progress tracking"""
327
  try:
328
  processor = URLProcessor()
329
  file_processor = FileProcessor()
330
  results = []
331
-
332
  # Process URLs
333
  if urls:
334
  url_list = re.split(r'[,\n]', urls)
335
  url_list = [url.strip() for url in url_list if url.strip()]
336
-
337
  for url in url_list:
338
  validation = processor.validate_url(url)
339
  if validation.get('is_valid'):
@@ -345,11 +344,11 @@ def process_all_inputs(urls, file, text, notes):
345
  'content': content,
346
  'timestamp': datetime.now().isoformat()
347
  })
348
-
349
  # Process files
350
  if file:
351
  results.extend(file_processor.process_file(file))
352
-
353
  # Process text input
354
  if text:
355
  cleaned_text = processor.advanced_text_cleaning(text)
@@ -358,56 +357,45 @@ def process_all_inputs(urls, file, text, notes):
358
  'content': cleaned_text,
359
  'timestamp': datetime.now().isoformat()
360
  })
361
-
362
  # Generate output
363
  if results:
364
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
365
  output_dir.mkdir(parents=True, exist_ok=True)
366
  output_path = output_dir / f'processed_{int(time.time())}.json'
367
-
368
  with open(output_path, 'w', encoding='utf-8') as f:
369
  json.dump(results, f, ensure_ascii=False, indent=2)
370
-
371
  summary = f"Processed {len(results)} items successfully!"
372
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
373
  return str(output_path), summary, json_data # Return JSON for editor
374
  else:
375
  return None, "No valid content to process.", ""
376
-
377
  except Exception as e:
378
  logger.error(f"Processing error: {e}")
379
  return None, f"Error: {str(e)}", ""
380
-
381
- def generate_qr(json_data):
382
- """Generate QR code from JSON data and return the file path."""
383
- qr = qrcode.QRCode(
384
- version=40, # Force maximum version
385
- error_correction=qrcode.constants.ERROR_CORRECT_L, # Use lower error correction
386
- box_size=10,
387
- border=4,
388
- )
389
- qr.add_data(json_data)
390
- qr.make(fit=True)
391
- return qr.make_image(fill_color="black", back_color="white")
392
-
393
  if json_data:
394
- return generate_qr_code(json_data)
395
  return None
396
-
397
- process_btn.click(
398
- process_all_inputs,
399
- inputs=[url_input, file_input, text_input, scratchpad],
400
- outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
401
- )
402
-
403
- qr_btn.click(
404
- generate_qr,
405
- inputs=json_editor,
406
- outputs=qr_output
407
- )
408
-
409
-
410
- gr.Markdown("""
411
  ### Usage Guidelines
412
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
413
  - **File Input**: Upload text files or ZIP archives
@@ -416,16 +404,15 @@ def generate_qr(json_data):
416
  - **Scratchpad**: Quick notes or text collections
417
  - Advanced cleaning and validation included
418
  """)
419
-
420
- return interface
421
 
422
  def main():
423
  # Configure system settings
424
  mimetypes.init()
425
-
426
  # Create and launch interface
427
  interface = create_interface()
428
-
429
  # Launch with proper configuration
430
  interface.launch(
431
  server_name="0.0.0.0",
@@ -437,4 +424,4 @@ def main():
437
  )
438
 
439
  if __name__ == "__main__":
440
- main()
 
19
  from cleantext import clean
20
  import qrcode
21
  import zipfile
 
22
 
23
  # Setup logging with detailed configuration
24
  logging.basicConfig(
 
27
  handlers=[
28
  logging.StreamHandler(),
29
  logging.FileHandler('app.log', encoding='utf-8')
30
+ ])
 
31
  logger = logging.getLogger(__name__)
32
 
33
+
34
  class URLProcessor:
35
  def __init__(self):
36
  self.session = requests.Session()
 
43
  'Connection': 'keep-alive',
44
  'Upgrade-Insecure-Requests': '1'
45
  })
46
+
47
  def advanced_text_cleaning(self, text: str) -> str:
48
  """Robust text cleaning with version compatibility"""
49
  try:
 
73
  try:
74
  if not validators.url(url):
75
  return {'is_valid': False, 'message': 'Invalid URL format'}
76
+
77
  response = self.session.head(url, timeout=self.timeout)
78
  response.raise_for_status()
79
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
 
86
  # Google Drive document handling
87
  if 'drive.google.com' in url:
88
  return self._handle_google_drive(url)
89
+
90
  # Google Calendar ICS handling
91
  if 'calendar.google.com' in url and 'ical' in url:
92
  return self._handle_google_calendar(url)
93
+
94
  # Standard HTML processing
95
  return self._fetch_html_content(url)
96
  except Exception as e:
 
104
  if not file_id:
105
  logger.error(f"Invalid Google Drive URL: {url}")
106
  return None
107
+
108
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
109
  response = self.session.get(direct_url, timeout=self.timeout)
110
  response.raise_for_status()
111
+
112
  return {
113
  'content': response.text,
114
  'content_type': response.headers.get('Content-Type', ''),
 
137
  try:
138
  response = self.session.get(url, timeout=self.timeout)
139
  response.raise_for_status()
140
+
141
  soup = BeautifulSoup(response.text, 'html.parser')
142
+
143
  # Remove unwanted elements
144
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
145
  element.decompose()
146
+
147
  # Extract main content
148
  main_content = soup.find('main') or soup.find('article') or soup.body
149
+
150
  # Clean and structure content
151
  text_content = main_content.get_text(separator='\n', strip=True)
152
  cleaned_content = self.advanced_text_cleaning(text_content)
153
+
154
  return {
155
  'content': cleaned_content,
156
  'content_type': response.headers.get('Content-Type', ''),
 
159
  except Exception as e:
160
  logger.error(f"HTML processing failed: {e}")
161
  return None
162
+
163
+
164
  class FileProcessor:
165
  """Class to handle file processing"""
166
+
167
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
168
  self.max_file_size = max_file_size
169
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
170
+
171
  def is_text_file(self, filepath: str) -> bool:
172
  """Check if file is a text file"""
173
  try:
 
181
  """Process uploaded file with enhanced error handling"""
182
  if not file:
183
  return []
 
184
  dataset = []
185
  try:
186
  file_size = os.path.getsize(file.name)
187
  if file_size > self.max_file_size:
188
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
189
  return []
 
190
  with tempfile.TemporaryDirectory() as temp_dir:
191
  if zipfile.is_zipfile(file.name):
192
  dataset.extend(self._process_zip_file(file.name, temp_dir))
193
  else:
194
  dataset.extend(self._process_single_file(file))
 
195
  except Exception as e:
196
  logger.error(f"Error processing file: {str(e)}")
197
  return []
 
198
  return dataset
199
 
200
  def _process_zip_file(self, zip_path, temp_dir):
 
213
  'timestamp': datetime.now().isoformat()
214
  })
215
  return result
216
+
217
  def _process_single_file(self, file) -> List[Dict]:
218
  try:
219
  file_stat = os.stat(file.name)
220
+
221
  # For very large files, read in chunks and summarize
222
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
223
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
224
+
225
  # Read first and last 1MB for extremely large files
226
  content = ""
227
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
228
  content = f.read(1 * 1024 * 1024) # First 1MB
229
  content += "\n...[Content truncated due to large file size]...\n"
230
+
231
  # Seek to the last 1MB
232
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
233
  content += f.read() # Last 1MB
234
  else:
235
  # Regular file processing
236
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
237
+ content = f.read()
238
+
239
  return [{
240
  'source': 'file',
241
  'filename': os.path.basename(file.name),
 
249
  except Exception as e:
250
  logger.error(f"File processing error: {e}")
251
  return []
252
+
253
+ def generate_qr(json_data):
254
+ """Generate QR code from JSON data and return the file path."""
255
+ qr = qrcode.QRCode(
256
+ version=40, # Force maximum version
257
+ error_correction=qrcode.constants.ERROR_CORRECT_L, # Use lower error correction
258
+ box_size=10,
259
+ border=4,
260
+ )
261
+ qr.add_data(json_data)
262
+ qr.make(fit=True)
263
+ img = qr.make_image(fill_color="black", back_color="white")
264
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
265
+ img.save(temp_file.name)
266
+ return temp_file.name
267
 
268
  def create_interface():
269
  """Create a comprehensive Gradio interface with advanced features"""
270
+
271
  css = """
272
  .container { max-width: 1200px; margin: auto; }
273
  .warning { background-color: #fff3cd; color: #856404; }
274
  .error { background-color: #f8d7da; color: #721c24; }
275
  """
276
+
277
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
278
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
279
+
280
  with gr.Tab("URL Processing"):
281
  url_input = gr.Textbox(
282
+ label="Enter URLs (comma or newline separated)",
283
  lines=5,
284
  placeholder="https://example1.com\nhttps://example2.com"
285
  )
286
+
287
  with gr.Tab("File Input"):
288
  file_input = gr.File(
289
  label="Upload text file or ZIP archive",
 
292
 
293
  with gr.Tab("Text Input"):
294
  text_input = gr.Textbox(
295
+ label="Raw Text Input",
296
  lines=5,
297
  placeholder="Paste your text here..."
298
  )
299
+
300
  with gr.Tab("JSON Editor"):
301
  json_editor = gr.Textbox(
302
  label="JSON Editor",
 
305
  interactive=True,
306
  elem_id="json-editor" # Optional: for custom styling
307
  )
308
+
309
  with gr.Tab("Scratchpad"):
310
  scratchpad = gr.Textbox(
311
  label="Scratchpad",
 
313
  placeholder="Quick notes or text collections...",
314
  interactive=True
315
  )
316
+
317
  process_btn = gr.Button("Process Input", variant="primary")
318
  qr_btn = gr.Button("Generate QR Code", variant="secondary")
319
+
320
  output_text = gr.Textbox(label="Processing Results", interactive=False)
321
  output_file = gr.File(label="Processed Output")
322
  qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
323
+
324
+ def process_all_inputs(urls, file, text, notes):
325
  """Process all input types with progress tracking"""
326
  try:
327
  processor = URLProcessor()
328
  file_processor = FileProcessor()
329
  results = []
330
+
331
  # Process URLs
332
  if urls:
333
  url_list = re.split(r'[,\n]', urls)
334
  url_list = [url.strip() for url in url_list if url.strip()]
335
+
336
  for url in url_list:
337
  validation = processor.validate_url(url)
338
  if validation.get('is_valid'):
 
344
  'content': content,
345
  'timestamp': datetime.now().isoformat()
346
  })
347
+
348
  # Process files
349
  if file:
350
  results.extend(file_processor.process_file(file))
351
+
352
  # Process text input
353
  if text:
354
  cleaned_text = processor.advanced_text_cleaning(text)
 
357
  'content': cleaned_text,
358
  'timestamp': datetime.now().isoformat()
359
  })
360
+
361
  # Generate output
362
  if results:
363
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
364
  output_dir.mkdir(parents=True, exist_ok=True)
365
  output_path = output_dir / f'processed_{int(time.time())}.json'
366
+
367
  with open(output_path, 'w', encoding='utf-8') as f:
368
  json.dump(results, f, ensure_ascii=False, indent=2)
369
+
370
  summary = f"Processed {len(results)} items successfully!"
371
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
372
  return str(output_path), summary, json_data # Return JSON for editor
373
  else:
374
  return None, "No valid content to process.", ""
375
+
376
  except Exception as e:
377
  logger.error(f"Processing error: {e}")
378
  return None, f"Error: {str(e)}", ""
379
+
380
+ def generate_qr_code(json_data):
381
+ """Generate QR code from JSON data and return the file path."""
 
 
 
 
 
 
 
 
 
 
382
  if json_data:
383
+ return generate_qr(json_data)
384
  return None
385
+
386
+ process_btn.click(
387
+ process_all_inputs,
388
+ inputs=[url_input, file_input, text_input, scratchpad],
389
+ outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
390
+ )
391
+
392
+ qr_btn.click(
393
+ generate_qr_code,
394
+ inputs=json_editor,
395
+ outputs=qr_output
396
+ )
397
+
398
+ gr.Markdown("""
 
399
  ### Usage Guidelines
400
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
401
  - **File Input**: Upload text files or ZIP archives
 
404
  - **Scratchpad**: Quick notes or text collections
405
  - Advanced cleaning and validation included
406
  """)
407
+ return interface
 
408
 
409
  def main():
410
  # Configure system settings
411
  mimetypes.init()
412
+
413
  # Create and launch interface
414
  interface = create_interface()
415
+
416
  # Launch with proper configuration
417
  interface.launch(
418
  server_name="0.0.0.0",
 
424
  )
425
 
426
  if __name__ == "__main__":
427
+ main()