acecalisto3 committed on
Commit
e8a15e5
·
verified ·
1 Parent(s): b2eab13

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +307 -292
app2.py CHANGED
@@ -1,26 +1,26 @@
1
  import json
2
- import sys
3
- sys.path.append('./config')
4
- import config
5
  import os
6
  import re
7
  import time
8
  import logging
9
  import mimetypes
 
 
 
10
  import tempfile
11
  from datetime import datetime
 
12
  from pathlib import Path
13
  from urllib.parse import urlparse
14
- from typing import List, Dict, Tuple, Union, Optional
15
  import requests
16
  import validators
17
  import gradio as gr
 
18
  from bs4 import BeautifulSoup
19
  from fake_useragent import UserAgent
 
20
  from cleantext import clean
21
- import qrcode
22
- import zipfile
23
-
24
 
25
  # Setup logging with detailed configuration
26
  logging.basicConfig(
@@ -29,159 +29,242 @@ logging.basicConfig(
29
  handlers=[
30
  logging.StreamHandler(),
31
  logging.FileHandler('app.log', encoding='utf-8')
32
- ])
 
33
  logger = logging.getLogger(__name__)
34
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Add these imports at the top
37
- from config import Config
38
- from proxy_handler import ProxyHandler
39
- from robots_handler import RobotsHandler
40
- import asyncio
41
- import aiohttp
42
- from tqdm import tqdm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Add new imports for rate limiting and testing
45
- from ratelimit import limits, sleep_and_retry
46
- from typing import Dict, Any, Optional, List
47
- import pytest
48
- from urllib.robotparser import RobotFileParser
49
- import concurrent.futures
 
 
 
 
 
50
 
51
- class URLProcessor:
52
- def __init__(self):
53
- self.config = Config()
54
- self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
55
- self.robots_handler = RobotsHandler()
56
- self.session = self._create_session()
57
- self.rate_limit = self.config.get('RATE_LIMIT', 60) # requests per minute
58
- self.timeout = self.config.get('TIMEOUT', 10)
59
-
60
- @sleep_and_retry
61
- @limits(calls=60, period=60) # Rate limiting decorator
62
  def fetch_content(self, url: str) -> Optional[Dict]:
63
- """Fetch content with rate limiting"""
64
- if self.config.get('RESPECT_ROBOTS', True):
65
- if not self.robots_handler.can_fetch(url):
66
- logger.warning(f"Skipping {url} - robots.txt disallowed")
67
- return None
68
-
69
- def _create_session(self):
70
- session = requests.Session()
71
- if self.config.get('USE_PROXY'):
72
- session.proxies = self.proxy_handler.get_proxy_config()
73
- session.headers.update({
74
- 'User-Agent': UserAgent().random,
75
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
76
- 'Accept-Language': 'en-US,en;q=0.5',
77
- 'Accept-Encoding': 'gzip, deflate, br',
78
- 'Connection': 'keep-alive',
79
- 'Upgrade-Insecure-Requests': '1'
80
- })
81
- return session
82
-
83
- def _fetch_with_selenium(self, url: str) -> Optional[str]:
84
- try:
85
- chrome_options = Options()
86
- from selenium import webdriver
87
- from selenium.webdriver.chrome.options import Options
88
- from selenium.webdriver.common.by import By
89
- from selenium.webdriver.support.ui import WebDriverWait
90
- from selenium.webdriver.support import expected_conditions as EC
91
- from selenium.common.exceptions import TimeoutException
92
- import time
93
-
94
- logger.info(f"Attempting to fetch {url} with Selenium")
95
-
96
- # Set up Chrome options
97
- chrome_options = Options()
98
- chrome_options.add_argument("--headless")
99
- chrome_options.add_argument("--no-sandbox")
100
- chrome_options.add_argument("--disable-dev-shm-usage")
101
- chrome_options.add_argument("--disable-gpu")
102
- chrome_options.add_argument("--window-size=1920,1080")
103
- chrome_options.add_argument(
104
- "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
105
-
106
- # Initialize the driver
107
- driver = webdriver.Chrome(options=chrome_options)
108
-
109
- try:
110
- # Navigate to the URL
111
- driver.get(url)
112
-
113
- # Wait for the page to load
114
- WebDriverWait(driver, 10).until(
115
- EC.presence_of_element_located((By.TAG_NAME, "body"))
116
- )
117
-
118
- # Simulate pressing ESC key to dismiss overlays
119
- from selenium.webdriver.common.keys import Keys
120
- action_chains = webdriver.ActionChains(driver)
121
- action_chains.send_keys(Keys.ESCAPE).perform()
122
- time.sleep(1) # give it a moment to take effect
123
- action_chains.reset_actions() # Clear actions
124
-
125
- # try again
126
- action_chains.send_keys(Keys.ESCAPE).perform()
127
- time.sleep(1) # give it a moment to take effect
128
- action_chains.reset_actions()
129
-
130
- # Get the page source
131
- page_source = driver.page_source
132
-
133
- # Save the Selenium HTML for debugging
134
- debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
135
- with open(debug_path, "w", encoding="utf-8") as f:
136
- f.write(page_source)
137
- logger.info(f"Saved Selenium HTML to {debug_path}")
138
-
139
- return page_source
140
- finally:
141
- driver.quit()
142
-
143
- except ImportError:
144
- logger.error("Selenium is not installed. Cannot use browser automation.")
145
- return None
146
- except Exception as e:
147
- logger.error(f"Selenium processing failed for {url}: {e}")
148
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
151
- """Asynchronous URL fetching with rate limiting"""
152
- async with aiohttp.ClientSession() as session:
153
- tasks = []
154
- for url in urls:
155
- if len(tasks) >= self.rate_limit:
156
- await asyncio.sleep(60) # Rate limiting
157
- tasks = []
158
- tasks.append(self.fetch_content_async(session, url))
159
- return await asyncio.gather(*tasks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  def create_interface():
 
 
162
  css = """
163
  .container { max-width: 1200px; margin: auto; }
164
  .warning { background-color: #fff3cd; color: #856404; }
165
  .error { background-color: #f8d7da; color: #721c24; }
166
  """
167
-
168
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
169
- with gr.Tab("Settings"):
170
- respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
171
- use_proxy = gr.Checkbox(label="Use Proxy", value=False)
172
- proxy_url = gr.Textbox(label="Proxy URL", placeholder="http://proxy:port")
173
- request_delay = gr.Slider(minimum=0, maximum=10, value=1, label="Request Delay (seconds)")
174
- output_format = gr.Dropdown(choices=["json", "csv", "txt"], value="json", label="Output Format")
175
-
176
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
177
-
178
  with gr.Tab("URL Processing"):
179
  url_input = gr.Textbox(
180
- label="Enter URLs (comma or newline separated)",
181
  lines=5,
182
  placeholder="https://example1.com\nhttps://example2.com"
183
  )
184
-
185
  with gr.Tab("File Input"):
186
  file_input = gr.File(
187
  label="Upload text file or ZIP archive",
@@ -190,171 +273,103 @@ def create_interface():
190
 
191
  with gr.Tab("Text Input"):
192
  text_input = gr.Textbox(
193
- label="Raw Text Input",
194
  lines=5,
195
  placeholder="Paste your text here..."
196
  )
197
-
198
- with gr.Tab("JSON Editor"):
199
- json_editor = gr.Textbox(
200
- label="JSON Editor",
201
- lines=20,
202
- placeholder="View and edit your JSON data here...",
203
- interactive=True,
204
- elem_id="json-editor" # Optional: for custom styling
205
- )
206
-
207
- with gr.Tab("Scratchpad"):
208
- scratchpad = gr.Textbox(
209
- label="Scratchpad",
210
- lines=10,
211
- placeholder="Quick notes or text collections...",
212
- interactive=True
213
- )
214
-
215
  process_btn = gr.Button("Process Input", variant="primary")
216
- qr_btn = gr.Button("Generate QR Code", variant="secondary")
217
-
218
  output_text = gr.Textbox(label="Processing Results", interactive=False)
219
  output_file = gr.File(label="Processed Output")
220
- qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
221
-
222
- process_btn.click(
223
- process_all_inputs,
224
- inputs=[url_input, file_input, text_input, scratchpad],
225
- outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
226
- )
227
- qr_btn.click(
228
- generate_qr_code,
229
- inputs=json_editor,
230
- outputs=qr_output
231
- )
232
- gr.Markdown("""
233
- ### Usage Guidelines
234
- - **URL Processing**: Enter valid HTTP/HTTPS URLs
235
- - **File Input**: Upload text files or ZIP archives
236
- - ** Text Input**: Direct text processing
237
- - **JSON Editor**: View and edit your JSON data
238
- - **Scratchpad**: Quick notes or text collections
239
- - Advanced cleaning and validation included
240
- """)
241
- return interface
242
-
243
-
244
- def check_network_connectivity():
245
- """Check if the network is working properly by testing connection to common sites"""
246
- test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
247
- results = []
248
-
249
- for site in test_sites:
250
- try:
251
- response = requests.get(site, timeout=5)
252
- results.append({
253
- "site": site,
254
- "status": "OK" if response.status_code == 200 else f"Error: {response.status_code}",
255
- "response_time": response.elapsed.total_seconds()
256
- })
257
- except Exception as e:
258
- results.append({
259
- "site": site,
260
- "status": f"Error: {str(e)}",
261
- "response_time": None
262
- })
263
- # If all sites failed, there might be a network issue
264
- if all(result["status"].startswith("Error") for result in results):
265
- logger.error("Network connectivity issue detected. All test sites failed.")
266
- return False, results
267
-
268
- return True, results
269
-
270
-
271
- def validate_config(config: Dict[str, Any]) -> Dict[str, str]:
272
- """Validate configuration settings"""
273
- errors = {}
274
- if config.get('RATE_LIMIT', 0) < 1:
275
- errors['rate_limit'] = "Rate limit must be positive"
276
- if config.get('TIMEOUT', 0) < 1:
277
- errors['timeout'] = "Timeout must be positive"
278
- if config.get('USE_PROXY') and not config.get('PROXY_URL'):
279
- errors['proxy'] = "Proxy URL required when proxy is enabled"
280
- return errors
281
-
282
- def update_settings(respect_robots: bool, use_proxy: bool, proxy_url: str,
283
- request_delay: float, output_format: str) -> str:
284
- """Update application settings"""
285
- config = Config()
286
- new_settings = {
287
- 'RESPECT_ROBOTS': respect_robots,
288
- 'USE_PROXY': use_proxy,
289
- 'PROXY_URL': proxy_url,
290
- 'REQUEST_DELAY': request_delay,
291
- 'OUTPUT_FORMAT': output_format
292
- }
293
-
294
- # Validate settings before updating
295
- errors = validate_config(new_settings)
296
- if errors:
297
- return f"Configuration error: {', '.join(errors.values())}"
298
 
299
- config.update(new_settings)
300
- return "Configuration updated successfully"
301
-
302
- def create_settings_tab() -> gr.Tab:
303
- """Create settings tab with configuration controls"""
304
- with gr.Tab("Settings") as settings_tab:
305
- respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
306
- use_proxy = gr.Checkbox(label="Use Proxy", value=False)
307
- proxy_url = gr.Textbox(label="Proxy URL", placeholder="http://proxy:port")
308
- request_delay = gr.Slider(minimum=0, maximum=10, value=1, label="Request Delay (seconds)")
309
- output_format = gr.Dropdown(choices=["json", "csv", "txt"], value="json", label="Output Format")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
- settings_btn = gr.Button("Update Settings")
312
- settings_output = gr.Textbox(label="Settings Status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- settings_btn.click(
315
- update_settings,
316
- inputs=[respect_robots, use_proxy, proxy_url, request_delay, output_format],
317
- outputs=settings_output
318
  )
319
 
320
- return settings_tab
 
 
 
 
 
 
 
 
321
 
322
  def main():
323
- """Main application entry point"""
324
- try:
325
- # Initialize system settings
326
- mimetypes.init()
327
-
328
- # Validate initial configuration
329
- config = Config()
330
- errors = validate_config(config.get_all())
331
- if errors:
332
- logger.error(f"Configuration errors found: {errors}")
333
- sys.exit(1)
334
-
335
- # Check network connectivity
336
- network_ok, network_results = check_network_connectivity()
337
- if not network_ok:
338
- logger.warning("Network connectivity issues detected. Some features may not work properly.")
339
- for result in network_results:
340
- logger.warning(f"Test site {result['site']}: {result['status']}")
341
-
342
- # Create and launch interface
343
- interface = create_interface()
344
-
345
- # Launch with proper configuration
346
- interface.launch(
347
- server_name="0.0.0.0",
348
- server_port=7860,
349
- show_error=True,
350
- share=False,
351
- inbrowser=True,
352
- debug=True
353
- )
354
-
355
- except Exception as e:
356
- logger.error(f"Application startup failed: {str(e)}")
357
- sys.exit(1)
358
-
359
  if __name__ == "__main__":
360
- main()
 
1
  import json
 
 
 
2
  import os
3
  import re
4
  import time
5
  import logging
6
  import mimetypes
7
+ import concurrent.futures
8
+ import string
9
+ import zipfile
10
  import tempfile
11
  from datetime import datetime
12
+ from typing import List, Dict, Optional, Union
13
  from pathlib import Path
14
  from urllib.parse import urlparse
15
+
16
  import requests
17
  import validators
18
  import gradio as gr
19
+ from diskcache import Cache
20
  from bs4 import BeautifulSoup
21
  from fake_useragent import UserAgent
22
+ from ratelimit import limits, sleep_and_retry
23
  from cleantext import clean
 
 
 
24
 
25
  # Setup logging with detailed configuration
26
  logging.basicConfig(
 
29
  handlers=[
30
  logging.StreamHandler(),
31
  logging.FileHandler('app.log', encoding='utf-8')
32
+ ]
33
+ )
34
  logger = logging.getLogger(__name__)
35
 
36
class URLProcessor:
    """Fetch, validate, and clean remote content.

    Google Drive file links and Google Calendar ICS feeds get special
    handling; every other URL is processed as HTML.
    """

    def __init__(self):
        # One shared session so connection pooling and headers apply to all requests
        self.session = requests.Session()
        self.timeout = 10  # seconds per request
        # Browser-like headers with a randomized User-Agent to reduce blocking
        self.session.headers.update({
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def advanced_text_cleaning(self, text: str) -> str:
        """Robust text cleaning with version compatibility.

        Tries cleantext.clean() first; on any failure falls back to a
        regex-based ASCII normalization so cleaning never raises.
        """
        try:
            cleaned_text = clean(
                text,
                fix_unicode=True,
                to_ascii=True,
                lower=True,
                no_line_breaks=True,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=True,
                no_punct=False
            ).strip()
            return cleaned_text
        except Exception as e:
            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
            return text.strip()

    def validate_url(self, url: str) -> Dict:
        """Validate URL format and accessibility.

        Returns:
            Dict with 'is_valid' (bool) and 'message' (str).
        """
        try:
            if not validators.url(url):
                return {'is_valid': False, 'message': 'Invalid URL format'}

            # HEAD keeps the check cheap; raise_for_status turns 4xx/5xx into errors
            response = self.session.head(url, timeout=self.timeout)
            response.raise_for_status()
            return {'is_valid': True, 'message': 'URL is valid and accessible'}
        except Exception as e:
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

    def fetch_content(self, url: str) -> Optional[Dict]:
        """Universal content fetcher with special case handling.

        Returns a dict with 'content', 'content_type', 'timestamp' keys,
        or None on any failure.
        """
        try:
            # Google Drive document handling
            if 'drive.google.com' in url:
                return self._handle_google_drive(url)

            # Google Calendar ICS handling
            if 'calendar.google.com' in url and 'ical' in url:
                return self._handle_google_calendar(url)

            # Standard HTML processing
            return self._fetch_html_content(url)
        except Exception as e:
            logger.error(f"Content fetch failed: {e}")
            return None

    def _handle_google_drive(self, url: str) -> Optional[Dict]:
        """Process Google Drive file links via the direct-download endpoint."""
        try:
            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
            if not file_id:
                logger.error(f"Invalid Google Drive URL: {url}")
                return None

            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
            response = self.session.get(direct_url, timeout=self.timeout)
            response.raise_for_status()

            return {
                'content': response.text,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Google Drive processing failed: {e}")
            return None

    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
        """Process Google Calendar ICS feeds (fetched verbatim, no parsing)."""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            return {
                'content': response.text,
                'content_type': 'text/calendar',
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Calendar fetch failed: {e}")
            return None

    def _fetch_html_content(self, url: str) -> Optional[Dict]:
        """Standard HTML content processing: strip boilerplate, extract text."""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                element.decompose()

            # Extract main content. soup.body is None for non-HTML payloads,
            # which previously raised AttributeError (swallowed as an error);
            # fall back to the whole parse tree instead.
            main_content = soup.find('main') or soup.find('article') or soup.body or soup

            # Clean and structure content
            text_content = main_content.get_text(separator='\n', strip=True)
            cleaned_content = self.advanced_text_cleaning(text_content)

            return {
                'content': cleaned_content,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"HTML processing failed: {e}")
            return None
165
+
166
class FileProcessor:
    """Class to handle file processing (single text files and ZIP archives)."""

    def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
        self.max_file_size = max_file_size
        # Extensions accepted even when mimetypes cannot classify the file
        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}

    def is_text_file(self, filepath: str) -> bool:
        """Check if file is a text file by MIME type or known extension."""
        try:
            mime_type, _ = mimetypes.guess_type(filepath)
            return (mime_type and mime_type.startswith('text/')) or \
                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
        except Exception:
            return False

    def process_file(self, file) -> List[Dict]:
        """Process uploaded file with enhanced error handling.

        Returns a list of result dicts (possibly empty); never raises.
        """
        if not file:
            return []

        dataset = []
        try:
            file_size = os.path.getsize(file.name)
            if file_size > self.max_file_size:
                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                return []

            with tempfile.TemporaryDirectory() as temp_dir:
                if zipfile.is_zipfile(file.name):
                    dataset.extend(self._process_zip_file(file.name, temp_dir))
                else:
                    dataset.extend(self._process_single_file(file))

        except Exception as e:
            logger.error(f"Error processing file: {str(e)}")
            return []

        return dataset

    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
        """Process ZIP file contents, returning one dict per non-empty text file."""
        results = []
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # NOTE(review): extractall does not guard against zip-slip path
            # traversal; a malicious archive with '..' members could write
            # outside temp_dir — confirm uploads are trusted or sanitize names.
            zip_ref.extractall(temp_dir)
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    filepath = os.path.join(root, filename)
                    if self.is_text_file(filepath):
                        try:
                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                                content = f.read()
                                if content.strip():
                                    results.append({
                                        "source": "file",
                                        "filename": filename,
                                        "content": content,
                                        "timestamp": datetime.now().isoformat()
                                    })
                        except Exception as e:
                            # Fix: report which file failed instead of "(unknown)"
                            logger.error(f"Error reading file {filepath}: {str(e)}")
        return results

    def _process_single_file(self, file) -> List[Dict]:
        """Read one uploaded file and return its metadata plus content."""
        try:
            file_stat = os.stat(file.name)
            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            return [{
                'source': 'file',
                'filename': os.path.basename(file.name),
                'file_size': file_stat.st_size,
                'mime_type': mimetypes.guess_type(file.name)[0],
                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                'content': content,
                'timestamp': datetime.now().isoformat()
            }]
        except Exception as e:
            logger.error(f"File processing error: {e}")
            return []
248
 
249
  def create_interface():
250
+ """Create a comprehensive Gradio interface with advanced features"""
251
+
252
  css = """
253
  .container { max-width: 1200px; margin: auto; }
254
  .warning { background-color: #fff3cd; color: #856404; }
255
  .error { background-color: #f8d7da; color: #721c24; }
256
  """
257
+
258
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
 
 
 
 
 
 
 
259
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
260
+
261
  with gr.Tab("URL Processing"):
262
  url_input = gr.Textbox(
263
+ label="Enter URLs (comma or newline separated)",
264
  lines=5,
265
  placeholder="https://example1.com\nhttps://example2.com"
266
  )
267
+
268
  with gr.Tab("File Input"):
269
  file_input = gr.File(
270
  label="Upload text file or ZIP archive",
 
273
 
274
  with gr.Tab("Text Input"):
275
  text_input = gr.Textbox(
276
+ label="Raw Text Input",
277
  lines=5,
278
  placeholder="Paste your text here..."
279
  )
280
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  process_btn = gr.Button("Process Input", variant="primary")
282
+
 
283
  output_text = gr.Textbox(label="Processing Results", interactive=False)
284
  output_file = gr.File(label="Processed Output")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
+ def process_all_inputs(urls, file, text):
287
+ """Process all input types with progress tracking"""
288
+ try:
289
+ processor = URLProcessor()
290
+ file_processor = FileProcessor()
291
+ results = []
292
+
293
+ # Process URLs
294
+ if urls:
295
+ url_list = re.split(r'[,\n]', urls)
296
+ url_list = [url.strip() for url in url_list if url.strip()]
297
+
298
+ for url in url_list:
299
+ validation = processor.validate_url(url)
300
+ if validation.get('is_valid'):
301
+ content = processor.fetch_content(url)
302
+ if content:
303
+ results.append({
304
+ 'source': 'url',
305
+ 'url': url,
306
+ 'content': content,
307
+ 'timestamp': datetime.now().isoformat()
308
+ })
309
+
310
+ # Process files
311
+ if file:
312
+ results.extend(file_processor.process_file(file))
313
 
314
+ # Process text input
315
+ if text:
316
+ cleaned_text = processor.advanced_text_cleaning(text)
317
+ results.append({
318
+ 'source': 'direct_input',
319
+ 'content': cleaned_text,
320
+ 'timestamp': datetime.now().isoformat()
321
+ })
322
+
323
+ # Generate output
324
+ if results:
325
+ output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
326
+ output_dir.mkdir(parents=True, exist_ok=True)
327
+ output_path = output_dir / f'processed_{int(time.time())}.json'
328
+
329
+ with open(output_path, 'w', encoding='utf-8') as f:
330
+ json.dump(results, f, ensure_ascii=False, indent=2)
331
+
332
+ summary = f"Processed {len(results)} items successfully!"
333
+ # Convert Path object to string here
334
+ return str(output_path), summary
335
+ else:
336
+ return None, "No valid content to process."
337
+
338
+ except Exception as e:
339
+ logger.error(f"Processing error: {e}")
340
+ return None, f"Error: {str(e)}"
341
 
342
+ process_btn.click(
343
+ process_all_inputs,
344
+ inputs=[url_input, file_input, text_input],
345
+ outputs=[output_file, output_text]
346
  )
347
 
348
+ gr.Markdown("""
349
+ ### Usage Guidelines
350
+ - **URL Processing**: Enter valid HTTP/HTTPS URLs
351
+ - **File Input**: Upload text files or ZIP archives
352
+ - **Text Input**: Direct text processing
353
+ - Advanced cleaning and validation included
354
+ """)
355
+
356
+ return interface
357
 
358
  def main():
359
+ # Configure system settings
360
+ mimetypes.init()
361
+
362
+ # Create and launch interface
363
+ interface = create_interface()
364
+
365
+ # Launch with proper configuration
366
+ interface.launch(
367
+ server_name="0.0.0.0",
368
+ server_port=7860,
369
+ show_error=True,
370
+ share=False,
371
+ inbrowser=True,
372
+ debug=True
373
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  if __name__ == "__main__":
375
+ main()