acecalisto3 committed on
Commit 4d3af6e · verified · 1 Parent(s): 9791aa9

Update app.py

Files changed (1):
  1. app.py (+233 -293)

app.py CHANGED
@@ -6,43 +6,37 @@ import logging
  import mimetypes
  import concurrent.futures
  import string
- import random
- from typing import List, Dict, Optional, Union, Any
  from pathlib import Path
  from urllib.parse import urlparse

  import requests
  import validators
  import gradio as gr
- import torch
  import cachetools
  from bs4 import BeautifulSoup
  from ratelimit import limits, sleep_and_retry

- # Advanced Logging Configuration
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
-         logging.FileHandler('app_advanced.log', encoding='utf-8')
      ]
  )

- class AdvancedURLProcessor:
-     """Enhanced URL processing with advanced features"""

-     def __init__(
-         self,
-         timeout: int = 15,
-         max_retries: int = 3,
-         concurrent_requests: int = 5,
-         cache_size: int = 100
-     ):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
-         self.ua = UserAgent()

          # Implement multilevel caching
          self.url_cache = cachetools.LRUCache(maxsize=cache_size)
@@ -50,7 +44,7 @@ class AdvancedURLProcessor:

          self.session = requests.Session()
          self.session.headers.update({
-             'User-Agent': self.ua.random,
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
              'Connection': 'keep-alive'
@@ -59,13 +53,12 @@ class AdvancedURLProcessor:
      @sleep_and_retry
      @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
      def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
-         """Enhanced URL validation with comprehensive checks"""
          try:
              # Check cache first
              if url in self.url_cache:
                  return self.url_cache[url]

-             # Comprehensive URL validation
              result = urlparse(url)
              validation_result = {
                  'is_valid': False,
@@ -78,23 +71,16 @@ class AdvancedURLProcessor:
                  validation_result['message'] = 'Missing scheme or network location'
                  return validation_result

-             # Use validators for additional checks
              if not validators.url(url):
                  validation_result['message'] = 'URL format validation failed'
                  return validation_result

              # Perform HEAD request for accessibility
              try:
-                 response = self.session.head(
-                     url,
-                     timeout=self.timeout,
-                     allow_redirects=True
-                 )
-
                  validation_result['is_valid'] = response.status_code in [200, 301, 302]
                  validation_result['status_code'] = response.status_code
                  validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
-
              except requests.RequestException as e:
                  validation_result['message'] = f"Connection error: {str(e)}"

@@ -110,306 +96,260 @@ class AdvancedURLProcessor:
          }

      @sleep_and_retry
-     @limits(calls=10, period=60)
-     async def fetch_content(self, url: str) -> Optional[str]:
-         """Fetch content from URL with retry mechanism and caching"""
-         try:
-             # Check content cache first
-             if url in self.content_cache:
-                 logger.info(f"Cache hit for URL: {url}")
-                 return self.content_cache[url]
-
-             for attempt in range(self.max_retries):
-                 try:
-                     response = self.session.get(url, timeout=self.timeout)
-                     response.raise_for_status()
-                     content = response.text
-
-                     # Cache the content
-                     self.content_cache[url] = content
-                     return content
-
-                 except requests.RequestException as e:
-                     logger.warning(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
-                     if attempt == self.max_retries - 1:
-                         raise
-                     time.sleep(1)  # Delay between retries
-
-         except Exception as e:
-             logger.error(f"Error fetching content from {url}: {e}")
-             return None
-
- class ContentExtractor:
-     """Advanced content extraction and processing"""
-
-     def __init__(self):
-         self.cleaners = [
-             self._remove_scripts,
-             self._remove_styles,
-             self._remove_special_chars,
-             self._normalize_whitespace
-         ]

-     def extract_text(self, html: str, url: str = "") -> Dict[str, Union[str, Dict]]:
-         """Extract and clean text content with metadata"""
-         try:
-             if not html:
-                 return {
-                     "success": False,
-                     "content": "",
-                     "metadata": {"error": "Empty HTML content"}
-                 }

-             soup = BeautifulSoup(html, 'html.parser')
-
-             # Extract metadata
-             metadata = self._extract_metadata(soup, url)
-
-             # Clean content
-             content = self._process_content(soup)
-
-             return {
-                 "success": True,
-                 "content": content,
-                 "metadata": metadata
-             }
-
-         except Exception as e:
-             logger.error(f"Content extraction error for {url}: {e}")
-             return {
-                 "success": False,
-                 "content": "",
-                 "metadata": {"error": str(e)}
-             }
-
-     def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
-         """Extract page metadata"""
-         metadata = {
-             "title": self._get_title(soup),
-             "description": self._get_meta_description(soup),
-             "keywords": self._get_meta_keywords(soup),
-             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-             "url": url
-         }
-         return metadata
-
-     def _process_content(self, soup: BeautifulSoup) -> str:
-         """Process and clean content through multiple passes"""
-         for cleaner in self.cleaners:
-             soup = cleaner(soup)

-         # Extract text with preserved structure
-         lines = []
-         for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-             text = element.get_text(strip=True)
-             if text:
-                 lines.append(text)

-         return "\n".join(lines)
-
-     @staticmethod
-     def _remove_scripts(soup: BeautifulSoup) -> BeautifulSoup:
-         for script in soup(["script", "style", "iframe", "noscript"]):
-             script.decompose()
-         return soup
-
-     @staticmethod
-     def _remove_styles(soup: BeautifulSoup) -> BeautifulSoup:
-         for element in soup.find_all(style=True):
-             del element['style']
-         return soup
-
-     @staticmethod
-     def _remove_special_chars(soup: BeautifulSoup) -> BeautifulSoup:
-         text = soup.get_text()
-         text = re.sub(r'[^\w\s\.\,\!\?\-]', '', text)
-         new_soup = BeautifulSoup(f"<div>{text}</div>", 'html.parser')
-         return new_soup
-
-     @staticmethod
-     def _normalize_whitespace(soup: BeautifulSoup) -> BeautifulSoup:
-         text = soup.get_text()
          text = re.sub(r'\s+', ' ', text)
-         new_soup = BeautifulSoup(f"<div>{text}</div>", 'html.parser')
-         return new_soup
-
-     @staticmethod
-     def _get_title(soup: BeautifulSoup) -> str:
-         title = soup.find('title')
-         return title.get_text(strip=True) if title else ""
-
-     @staticmethod
-     def _get_meta_description(soup: BeautifulSoup) -> str:
-         meta = soup.find('meta', attrs={'name': 'description'})
-         return meta.get('content', '') if meta else ""
-
-     @staticmethod
-     def _get_meta_keywords(soup: BeautifulSoup) -> str:
-         meta = soup.find('meta', attrs={'name': 'keywords'})
-         return meta.get('content', '') if meta else ""

- class ContentProcessor:
-     """Main content processing orchestrator"""
-
-     def __init__(self):
-         self.url_processor = AdvancedURLProcessor()
-         self.content_extractor = ContentExtractor()
-         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)

-     async def process_urls(self, urls: List[str]) -> Dict[str, Any]:
-         """Process multiple URLs concurrently with advanced error handling"""
-         results = {
-             "successful": [],
-             "failed": [],
-             "metadata": {
-                 "total_urls": len(urls),
-                 "start_time": time.strftime("%Y-%m-%d %H:%M:%S")
-             }
-         }
-
          try:
-             # Validate URLs first
-             valid_urls = []
-             for url in urls:
-                 validation_result = self.url_processor.validate_url(url)
-                 if validation_result['is_valid']:
-                     valid_urls.append(url)
                  else:
-                     results['failed'].append({
-                         "url": url,
-                         "error": validation_result['message']
-                     })
-
-             # Process valid URLs concurrently
-             futures = []
-             for url in valid_urls:
-                 future = self.executor.submit(self._process_single_url, url)
-                 futures.append((url, future))
-
-             # Collect results
-             for url, future in futures:
-                 try:
-                     result = future.result(timeout=30)  # 30-second timeout
-                     if result["success"]:
-                         results["successful"].append(result)
-                     else:
-                         results["failed"].append({
-                             "url": url,
-                             "error": "Processing failed"
-                         })
-                 except Exception as e:
-                     logger.error(f"Error processing {url}: {e}")
-                     results["failed"].append({
-                         "url": url,
-                         "error": str(e)
-                     })
-
-             # Update metadata
-             results["metadata"].update({
-                 "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
-                 "successful_count": len(results["successful"]),
-                 "failed_count": len(results["failed"])
-             })
-
-             return results
-
          except Exception as e:
-             logger.error(f"Batch processing error: {e}")
-             raise

-     def _process_single_url(self, url: str) -> Dict:
-         """Process a single URL with comprehensive error handling"""
          try:
-             response = self.url_processor.session.get(
-                 url,
-                 timeout=self.url_processor.timeout
-             )
-             response.raise_for_status()
-
-             result = self.content_extractor.extract_text(
-                 response.text,
-                 url
-             )
-
-             result["url"] = url
-             result["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
-
-             return result
-
          except Exception as e:
-             logger.error(f"Error processing {url}: {e}")
-             return {
-                 "success": False,
-                 "url": url,
-                 "error": str(e)
-             }

  def create_interface():
-     """Create Gradio interface with advanced features"""
-     processor = ContentProcessor()

-     with gr.Blocks(title="Advanced URL Content Processor") as interface:
-         gr.Markdown("# Advanced URL Content Processor")

-         with gr.Row():
-             with gr.Column():
-                 url_input = gr.Textbox(
-                     label="Enter URLs (one per line)",
-                     placeholder="https://example.com\nhttps://example.org",
-                     lines=5
-                 )
-
-                 with gr.Row():
-                     process_btn = gr.Button("Process URLs", variant="primary")
-                     clear_btn = gr.Button("Clear")
-
-             with gr.Column():
-                 status_output = gr.JSON(
-                     label="Processing Results",
-                     show_label=True
-                 )

-         gr.Markdown("## Processing Status")
-         with gr.Row():
-             progress_output = gr.Textbox(
-                 label="Progress",
-                 show_label=True
              )

-         async def process_urls(urls):
-             if not urls.strip():
-                 return {"error": "No URLs provided"}
-
-             url_list = [url.strip() for url in urls.splitlines() if url.strip()]
-             results = await processor.process_urls(url_list)
-             return results

-         def clear_inputs():
-             return None, None

          process_btn.click(
-             fn=process_urls,
-             inputs=[url_input],
-             outputs=[status_output]
          )

-         clear_btn.click(
-             fn=clear_inputs,
-             inputs=[],
-             outputs=[url_input, status_output]
-         )

      return interface

- if __name__ == "__main__":
-     # Initialize mimetypes
      mimetypes.init()

      # Create and launch interface
      interface = create_interface()
      interface.launch(
          server_name="0.0.0.0",
          server_port=7860,
-         share=False,
          debug=True
-     )

  import mimetypes
  import concurrent.futures
  import string
+ from typing import List, Dict, Optional, Union
  from pathlib import Path
  from urllib.parse import urlparse

  import requests
  import validators
  import gradio as gr
  import cachetools
  from bs4 import BeautifulSoup
+ from fake_useragent import UserAgent
  from ratelimit import limits, sleep_and_retry

+ # Setup logging with detailed configuration
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
+         logging.FileHandler('app.log', encoding='utf-8')
      ]
  )
+ logger = logging.getLogger(__name__)

+ class URLProcessor:
+     """Class to handle URL processing with advanced features"""

+     def __init__(self, timeout: int = 15, max_retries: int = 3, concurrent_requests: int = 5, cache_size: int = 100):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
+         self.ua = UserAgent()  # Initialize UserAgent

          # Implement multilevel caching
          self.url_cache = cachetools.LRUCache(maxsize=cache_size)

          self.session = requests.Session()
          self.session.headers.update({
+             'User-Agent': self.ua.random,  # Use a random User-Agent
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
              'Connection': 'keep-alive'

      @sleep_and_retry
      @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
      def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
+         """Validate URL format and accessibility"""
          try:
              # Check cache first
              if url in self.url_cache:
                  return self.url_cache[url]

              result = urlparse(url)
              validation_result = {
                  'is_valid': False,

                  validation_result['message'] = 'Missing scheme or network location'
                  return validation_result

              if not validators.url(url):
                  validation_result['message'] = 'URL format validation failed'
                  return validation_result

              # Perform HEAD request for accessibility
              try:
+                 response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
                  validation_result['is_valid'] = response.status_code in [200, 301, 302]
                  validation_result['status_code'] = response.status_code
                  validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
              except requests.RequestException as e:
                  validation_result['message'] = f"Connection error: {str(e)}"

          }

      @sleep_and_retry
+     @limits(calls=20, period=60)  # Refined rate limiting
+     def fetch_content(self, url: str) -> Optional[str]:
+         """Fetch content from URL with retry mechanism"""
+         # Check content cache first
+         if url in self.content_cache:
+             return self.content_cache[url]

+         for attempt in range(self.max_retries):
+             try:
+                 response = self.session.get(url, timeout=self.timeout)
+                 response.raise_for_status()

+                 # Use BeautifulSoup for more robust parsing
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 # Remove scripts, styles, comments
+                 for script in soup(["script", "style"]):
+                     script.decompose()
+
+                 # Extract clean text
+                 text = soup.get_text(separator=' ')
+                 cleaned_text = self.advanced_text_cleaning(text)
+
+                 # Cache the result
+                 self.content_cache[url] = cleaned_text
+                 return cleaned_text
+
+             except requests.RequestException as e:
+                 logger.warning(f"Fetch attempt {attempt + 1} failed for {url}: {e}")
+                 time.sleep(2 ** attempt)  # Exponential backoff

+         return None
+
+     def advanced_text_cleaning(self, text: str) -> str:
+         """Sophisticated text cleaning and normalization"""
+         if not text:
+             return ""

+         # Remove control characters
+         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+
+         # Normalize Unicode characters
+         text = text.encode('ascii', 'ignore').decode('ascii')
+
+         # Replace multiple whitespaces
          text = re.sub(r'\s+', ' ', text)
+
+         # Remove HTML entities
+         text = re.sub(r'&[a-zA-Z]+;', '', text)
+
+         # Normalize quotation marks
+         text = text.replace('“', '"').replace('”', '"')
+         text = text.replace('‘', "'").replace('’', "'")
+
+         # Remove excessive punctuation
+         text = re.sub(r'([.,!?]){2,}', r'\1', text)
+
+         return text.strip()

+ class FileProcessor:
+     """Class to handle file processing"""

+     def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+         self.max_file_size = max_file_size
+         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+
+     def is_text_file(self, filepath: str) -> bool:
+         """Check if file is a text file"""
+         try:
+             mime_type, _ = mimetypes.guess_type(filepath)
+             return mime_type and mime_type.startswith('text/')
+         except Exception:
+             return False
+
+     def process_file(self, file) -> List[Dict]:
+         """Process uploaded file with enhanced error handling"""
+         if not file:
+             return []
+
+         dataset = []
          try:
+             file_size = os.path.getsize(file.name)
+             if file_size > self.max_file_size:
+                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                 return []
+
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 if zipfile.is_zipfile(file.name):
+                     dataset.extend(self._process_zip_file(file.name, temp_dir))
                  else:
+                     dataset.extend(self._process_single_file(file))
+
          except Exception as e:
+             logger.error(f"Error processing file: {str(e)}")
+             return []
+
+         return dataset
+
+     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+         """Process ZIP file contents"""
+         results = []
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             zip_ref.extractall(temp_dir)
+             for root, _, files in os.walk(temp_dir):
+                 for filename in files:
+                     filepath = os.path.join(root, filename)
+                     if self.is_text_file(filepath):
+                         try:
+                             with open(filepath, 'r', errors='ignore') as f:
+                                 content = f.read()
+                                 if content.strip():
+                                     results.append({
+                                         "source": "file",
+                                         "filename": filename,
+                                         "content": content,
+                                         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                                     })
+                         except Exception as e:
+                             logger.error(f"Error reading file {filename}: {str(e)}")
+         return results

+     def _process_single_file(self, file) -> List[Dict]:
+         """Process single file"""
+         results = []
          try:
+             content = file.read().decode('utf-8', errors='ignore')
+             if content.strip():
+                 results.append({
+                     "source": "file",
+                     "filename": os.path.basename(file.name),
+                     "content": content,
+                     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                 })
          except Exception as e:
+             logger.error(f"Error processing single file: {str(e)}")
+         return results

  def create_interface():
+     """Create a comprehensive Gradio interface with advanced features"""

+     css = """
+     .container { max-width: 1200px; margin: auto; }
+     .warning { background-color: #fff3cd; color: #856404; }
+     .error { background-color: #f8d7da; color: #721c24; }
+     """
+
+     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
+         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

+         with gr.Tab("URL Processing"):
+             url_input = gr.Textbox(
+                 label="Enter URLs (comma or newline separated)",
+                 lines=5,
+                 placeholder="https://example1.com\nhttps://example2.com"
+             )

+         with gr.Tab("File Input"):
+             file_input = gr.File(
+                 label="Upload text file or ZIP archive",
+                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+             )
+
+         with gr.Tab("Text Input"):
+             text_input = gr.Textbox(
+                 label="Raw Text Input",
+                 lines=5,
+                 placeholder="Paste your text here..."
              )

+         process_btn = gr.Button("Process Input", variant="primary")
+
+         output_text = gr.Textbox(label="Processing Results", interactive=False)
+         output_file = gr.File(label="Processed Output")

+         def process_all_inputs(urls, file, text):
+             """Process all input types with progress tracking"""
+             try:
+                 processor = URLProcessor()
+                 file_processor = FileProcessor()
+                 results = []
+
+                 # Process URLs
+                 if urls:
+                     url_list = re.split(r'[,\n]', urls)
+                     url_list = [url.strip() for url in url_list if url.strip()]
+
+                     for url in url_list:
+                         validation = processor.validate_url(url)
+                         if validation.get('is_valid'):
+                             content = processor.fetch_content(url)
+                             if content:
+                                 results.append({
+                                     'source': 'url',
+                                     'url': url,
+                                     'content': content,
+                                     'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                                 })
+
+                 # Process files
+                 if file:
+                     results.extend(file_processor.process_file(file))
+
+                 # Process text input
+                 if text:
+                     cleaned_text = processor.advanced_text_cleaning(text)
+                     results.append({
+                         'source': 'direct_input',
+                         'content': cleaned_text,
+                         'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                     })
+
+                 # Generate output
+                 if results:
+                     output_path = 'processed_data.json'
+                     with open(output_path, 'w', encoding='utf-8') as f:
+                         json.dump(results, f, ensure_ascii=False, indent=2)
+
+                     summary = f"Processed {len(results)} items successfully!"
+                     return output_path, summary
+                 else:
+                     return None, "No valid content to process."
+
+             except Exception as e:
+                 logger.error(f"Processing error: {e}")
+                 return None, f"Error: {str(e)}"

          process_btn.click(
+             process_all_inputs,
+             inputs=[url_input, file_input, text_input],
+             outputs=[output_file, output_text]
          )

+         gr.Markdown("""
+         ### Usage Guidelines
+         - **URL Processing**: Enter valid HTTP/HTTPS URLs
+         - **File Input**: Upload text files or ZIP archives
+         - **Text Input**: Direct text processing
+         - Advanced cleaning and validation included
+         """)

      return interface

+ def main():
+     # Configure system settings
      mimetypes.init()

      # Create and launch interface
      interface = create_interface()
      interface.launch(
+         share=True,
          server_name="0.0.0.0",
          server_port=7860,
          debug=True
+     )
+
+ if __name__ == "__main__":
+     main()
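
Note on the fetching pattern introduced above: the new fetch_content stacks an in-memory LRU cache (cachetools.LRUCache), decorator-based rate limiting (ratelimit), and exponential backoff between retries. The following is a minimal, self-contained sketch of that combination, not part of the commit; the names fetch_text and content_cache are illustrative only.

# Sketch of the cache + rate-limit + backoff pattern used by URLProcessor.fetch_content.
# fetch_text and content_cache are illustrative names, not identifiers from app.py.
import time
from typing import Optional

import requests
import cachetools
from ratelimit import limits, sleep_and_retry

content_cache = cachetools.LRUCache(maxsize=100)  # bounded in-memory cache

@sleep_and_retry                  # block until a call slot is free
@limits(calls=20, period=60)      # at most 20 calls per 60 seconds
def fetch_text(url: str, timeout: int = 15, max_retries: int = 3) -> Optional[str]:
    if url in content_cache:      # repeat requests are served from the cache
        return content_cache[url]
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            content_cache[url] = response.text
            return response.text
        except requests.RequestException:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
    return None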
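
The Gradio wiring in create_interface() follows the standard Blocks pattern: components are declared inside a gr.Blocks() context and Button.click maps the input components to the handler's return values. A stripped-down sketch, using a placeholder handle function rather than the commit's process_all_inputs:

# Minimal Gradio Blocks sketch mirroring the click wiring in create_interface().
# handle() is a placeholder, not the commit's process_all_inputs.
import gradio as gr

def handle(text: str) -> str:
    return text.strip().lower()  # stand-in for the real cleaning pipeline

with gr.Blocks(title="Demo") as demo:
    text_in = gr.Textbox(label="Raw Text Input", lines=3)
    run_btn = gr.Button("Process Input", variant="primary")
    text_out = gr.Textbox(label="Processing Results", interactive=False)
    run_btn.click(handle, inputs=[text_in], outputs=[text_out])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)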