acecalisto3 committed
Commit fe14e10 · verified · 1 Parent(s): b3a8443

Update app2.py

Files changed (1):
  1. app2.py  +55  -63

app2.py CHANGED
@@ -4,23 +4,23 @@ import re
 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
+import zipfile
 import tempfile
 from datetime import datetime
+from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
-from typing import List, Dict, Tuple, Union, Optional
+
 import requests
 import validators
 import gradio as gr
 from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
-import qrcode
-if sys.version_info >= (3, 6):
-    import zipfile
-else:
-    import zipfile36 as zipfile

 # Setup logging with detailed configuration
 logging.basicConfig(
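
The new `from ratelimit import limits, sleep_and_retry` import suggests request throttling elsewhere in app2.py; this hunk does not show where the decorators are applied. A minimal sketch of how they are typically used, with the decorated method and its limits assumed for illustration:

    from ratelimit import limits, sleep_and_retry
    import requests

    class URLProcessor:
        @sleep_and_retry                 # sleep until a slot is free instead of raising
        @limits(calls=10, period=60)     # at most 10 calls per 60-second window
        def fetch_content(self, url: str) -> str:
            # hypothetical method name; the commit does not show the decorated call site
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
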
@@ -45,12 +45,13 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
-
+
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
         try:
             cleaned_text = clean(
                 text,
+                fix_unicode=True,
                 to_ascii=True,
                 lower=True,
                 no_line_breaks=True,
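
For reference, a minimal sketch of what the clean-text call configured above does to a sample string (input and expected output are illustrative, not taken from app2.py):

    from cleantext import clean

    raw = "Visit  Café\nMÜNCHEN"
    print(clean(raw, fix_unicode=True, to_ascii=True, lower=True, no_line_breaks=True))
    # expected roughly: "visit cafe munchen"
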
@@ -149,6 +150,14 @@ class URLProcessor:
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body

+            if main_content is None:
+                logger.warning(f"No main content found for URL: {url}")
+                return {
+                    'content': '',
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'timestamp': datetime.now().isoformat()
+                }
+
             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
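
The `or` chain works because BeautifulSoup's find() returns None for a missing tag, so the lookup falls through to soup.body; the new None check covers responses with no body at all. A small self-contained sketch (sample HTML is illustrative):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html><body><p>hello</p></body></html>", "html.parser")
    main_content = soup.find("main") or soup.find("article") or soup.body
    print(main_content.get_text(separator="\n", strip=True))  # -> "hello"
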
@@ -202,15 +211,29 @@ class FileProcessor:

         return dataset

-    def process_zip_file(zip_path):
-        """Extract and process files within a ZIP archive."""
-        extraction_directory = tempfile.mkdtemp()
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(extraction_directory)
-        for extracted_file in os.listdir(extraction_directory):
-            extracted_file_path = os.path.join(extraction_directory, extracted_file)
-            process_file(extracted_file_path)
-
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+
     def _process_single_file(self, file) -> List[Dict]:
         try:
             file_stat = os.stat(file.name)
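
A standalone sketch of the extract-and-walk pattern used by the new _process_zip_file, with a temporary directory and a simple .txt filter standing in for FileProcessor.is_text_file (both are assumptions for illustration):

    import os
    import tempfile
    import zipfile
    from typing import Dict, List

    def read_zip_texts(zip_path: str) -> List[Dict]:
        results = []
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(temp_dir)
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    if filename.endswith(".txt"):  # stand-in for is_text_file()
                        with open(os.path.join(root, filename), encoding="utf-8", errors="ignore") as f:
                            results.append({"filename": filename, "content": f.read()})
        return results
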
@@ -247,14 +270,17 @@ def _process_single_file(self, file) -> List[Dict]:
         logger.error(f"File processing error: {e}")
         return []

-def generate_qr_code(json_data):
-    """Generate a QR code from JSON data."""
-    qr = qrcode.make(json_data)
-    qr_path = "output/qr_code.png"
-    qr.save(qr_path)
-    return qr_path
+import qrcode  # Import the qrcode library
+
+def generate_qr(json_data):
+    """Generate QR code from JSON data and return the file path."""
+    if json_data:
+        qr = qrcode.make(json_data)
+        qr_path = f"output/qr_code_{int(time.time())}.png"
+        qr.save(qr_path)
+        return qr_path
+    return None

-def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""

     css = """
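
The new generate_qr saves into output/, which the image's save() call will not create on its own; a sketch with an explicit directory guard (the makedirs call is an addition for illustration, not part of the committed code):

    import os
    import time
    import qrcode

    def generate_qr(json_data):
        if json_data:
            os.makedirs("output", exist_ok=True)  # guard not present in the commit
            qr = qrcode.make(json_data)
            qr_path = f"output/qr_code_{int(time.time())}.png"
            qr.save(qr_path)
            return qr_path
        return None
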
@@ -286,31 +312,12 @@ def create_interface():
                    placeholder="Paste your text here..."
                )

-            with gr.Tab("JSON Editor"):
-                json_editor = gr.Textbox(
-                    label="JSON Editor",
-                    lines=20,
-                    placeholder="View and edit your JSON data here...",
-                    interactive=True,
-                    elem_id="json-editor"  # Optional: for custom styling
-                )
-
-            with gr.Tab("Scratchpad"):
-                scratchpad = gr.Textbox(
-                    label="Scratchpad",
-                    lines=10,
-                    placeholder="Quick notes or text collections...",
-                    interactive=True
-                )
-
             process_btn = gr.Button("Process Input", variant="primary")
-            qr_btn = gr.Button("Generate QR Code", variant="secondary")

             output_text = gr.Textbox(label="Processing Results", interactive=False)
             output_file = gr.File(label="Processed Output")
-            qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code

-            def process_all_inputs(urls, file, text, notes):
+            def process_all_inputs(urls, file, text):
                 """Process all input types with progress tracking"""
                 try:
                     processor = URLProcessor()
@@ -357,31 +364,19 @@ def create_interface():
                             json.dump(results, f, ensure_ascii=False, indent=2)

                         summary = f"Processed {len(results)} items successfully!"
-                        json_data = json.dumps(results, indent=2)  # Prepare JSON for QR code
-                        return str(output_path), summary, json_data  # Return JSON for editor
+                        # Convert Path object to string here
+                        return str(output_path), summary
                     else:
-                        return None, "No valid content to process.", ""
+                        return None, "No valid content to process."

                 except Exception as e:
                     logger.error(f"Processing error: {e}")
-                    return None, f"Error: {str(e)}", ""
-
-            def generate_qr(json_data):
-                """Generate QR code from JSON data and return the file path."""
-                if json_data:
-                    return generate_qr_code(json_data)
-                return None
+                    return None, f"Error: {str(e)}"

             process_btn.click(
                 process_all_inputs,
-                inputs=[url_input, file_input, text_input, scratchpad],
-                outputs=[output_file, output_text, json_editor]  # Update outputs to include JSON editor
-            )
-
-            qr_btn.click(
-                generate_qr,
-                inputs=json_editor,
-                outputs=qr_output
+                inputs=[url_input, file_input, text_input],
+                outputs=[output_file, output_text]
             )

             gr.Markdown("""
  gr.Markdown("""
@@ -389,8 +384,6 @@ def create_interface():
389
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
390
  - **File Input**: Upload text files or ZIP archives
391
  - **Text Input**: Direct text processing
392
- - **JSON Editor**: View and edit your JSON data
393
- - **Scratchpad**: Quick notes or text collections
394
  - Advanced cleaning and validation included
395
  """)
396
 
@@ -412,6 +405,5 @@ def main():
         inbrowser=True,
         debug=True
     )
-
 if __name__ == "__main__":
     main()
 