Update app.py
app.py
CHANGED
@@ -6,357 +6,277 @@ import logging
 import mimetypes
 import concurrent.futures
 import string
-import zipfile
-import tempfile
-from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
-from cleantext import clean
-
-# …
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
     handlers=[
         logging.StreamHandler(),
-        logging.FileHandler('…
     ]
 )
 logger = logging.getLogger(__name__)

-class …
-    …
         self.session = requests.Session()
-        self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': …
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9…
             'Accept-Language': 'en-US,en;q=0.5',
-            '…
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
         })

-    def advanced_text_cleaning(self, text: str) -> str:
         try:
-            cleaned_text = clean(
-                text,
-                …
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
-            text = text.encode('ascii', 'ignore').decode('ascii')
-            text = re.sub(r'\s+', ' ', text)
-            return text.strip()

-    def validate_url(self, url: str) -> Dict:
-        try:
-            if not validators.url(url):
-                return {'is_valid': False, 'message': 'Invalid URL format'}
-            …

-    def fetch_content(self, url: str) -> Optional[Dict]:
-        try:
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            return self._fetch_html_content(url)
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None

-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-            …

-    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-        try:
-            …
-        except Exception as e:
-            logger.error(f"Calendar fetch failed: {e}")
-            return None

-    def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            …
-                return {
-                    'content': '',
-                    'content_type': response.headers.get('Content-Type', ''),
-                    'timestamp': datetime.now().isoformat()
-                }
-            text_content = main_content.get_text(separator='\n', strip=True)
-            cleaned_content = self.advanced_text_cleaning(text_content)
             return {
-                '…
-                '…
-                'timestamp': datetime.now().isoformat()
             }
-        except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
-            return None

-class FileProcessor:
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
-        self.max_file_size = max_file_size
-        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}

-    def is_text_file(self, filepath: str) -> bool:
-        try:
-            mime_type, _ = mimetypes.guess_type(filepath)
-            return (mime_type and mime_type.startswith('text/')) or \
-                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
-        except Exception:
-            return False

-    def process_file(self, file) -> List[Dict]:
-        if not file:
-            return []
-        dataset = []
-        try:
-            file_size = os.path.getsize(file.name)
-            if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                return []
-            …
-            qr.save(qr_path)
-            return qr_path
-        return None

-def create_interface():
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
-        with gr.Tab("File Input"):
-            file_input = gr.File(
-                label="Upload text file or ZIP archive",
-                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-            )
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
                 label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
         process_btn = gr.Button("Process Input", variant="primary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
-        def …(urls, file, text):
             try:
-                processor = …()
-                file_processor = FileProcessor()
                 results = []
                 if urls:
-                    url_list = re.split(r'[…
                     url_list = [url.strip() for url in url_list if url.strip()]
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
                             content = processor.fetch_content(url)
                             if content:
                                 results.append({
-                                    'source': 'url',
                                     'url': url,
                                     'content': content,
-                                    'timestamp': …
                                 })
-                if file:
-                    results.extend(file_processor.process_file(file))
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
-                        'timestamp': …
                     })
                 if results:
-                    …
-                    output_dir.mkdir(parents=True, exist_ok=True)
-                    output_path = output_dir / f'processed_{int(time.time())}.json'
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
                     summary = f"Processed {len(results)} items successfully!"
-                    return …
                 else:
                     return None, "No valid content to process."
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
         process_btn.click(
-            …
-            inputs=[url_input, …
             outputs=[output_file, output_text]
         )
         gr.Markdown("""
         ### Usage Guidelines
-        - **URL Processing**: …
-        - **File Input**: …
-        - **Text Input**: Direct text processing
         - Advanced cleaning and validation included
         """)
     return interface

 def main():
     mimetypes.init()
-    interface = create_interface()
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        show_error=True,
-        share=False,
-        inbrowser=True,
         debug=True
     )

 if __name__ == "__main__":
-    main()
 import mimetypes
 import concurrent.futures
 import string
 from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
+
 import requests
 import validators
 import gradio as gr
+import torch
+import cachetools
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry

+# Advanced Logging Configuration
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
     handlers=[
         logging.StreamHandler(),
+        logging.FileHandler('app_advanced.log', encoding='utf-8')
     ]
 )
 logger = logging.getLogger(__name__)

+class AdvancedURLProcessor:
+    """Enhanced URL processing with advanced features"""
+
+    def __init__(
+        self,
+        timeout: int = 15,
+        max_retries: int = 3,
+        concurrent_requests: int = 5,
+        cache_size: int = 100
+    ):
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.concurrent_requests = concurrent_requests
+        self.ua = UserAgent()
+
+        # Implement multilevel caching
+        self.url_cache = cachetools.LRUCache(maxsize=cache_size)
+        self.content_cache = cachetools.TTLCache(maxsize=cache_size, ttl=3600)  # 1-hour cache
+
         self.session = requests.Session()
         self.session.headers.update({
+            'User-Agent': self.ua.random,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
+            'Connection': 'keep-alive'
         })

+    @sleep_and_retry
+    @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
+    def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
+        """Enhanced URL validation with comprehensive checks"""
         try:
+            # Check cache first
+            if url in self.url_cache:
+                return self.url_cache[url]
+
+            # Comprehensive URL validation
+            result = urlparse(url)
+            validation_result = {
+                'is_valid': False,
+                'message': 'Invalid URL',
+                'scheme': result.scheme,
+                'netloc': result.netloc
+            }
+
+            if not all([result.scheme, result.netloc]):
+                validation_result['message'] = 'Missing scheme or network location'
+                return validation_result
+
+            # Use validators for additional checks
+            if not validators.url(url):
+                validation_result['message'] = 'URL format validation failed'
+                return validation_result
+
+            # Perform HEAD request for accessibility
+            try:
+                response = self.session.head(
+                    url,
+                    timeout=self.timeout,
+                    allow_redirects=True
+                )
+
+                validation_result['is_valid'] = response.status_code in [200, 301, 302]
+                validation_result['status_code'] = response.status_code
+                validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
+
+            except requests.RequestException as e:
+                validation_result['message'] = f"Connection error: {str(e)}"
+
+            # Cache the result
+            self.url_cache[url] = validation_result
+            return validation_result
+
+        except Exception as e:
+            logger.error(f"Unexpected error validating URL {url}: {e}")
             return {
+                'is_valid': False,
+                'message': f"Unexpected validation error: {str(e)}"
             }
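For reference, a minimal sketch (not part of the commit) of how the cached, rate-limited validator above behaves, assuming app.py is importable and its dependencies (requests, validators, cachetools, ratelimit, fake-useragent, beautifulsoup4, gradio, torch) are installed:

    # Sketch only -- not part of app.py.
    from app import AdvancedURLProcessor

    processor = AdvancedURLProcessor(timeout=10, cache_size=50)

    # First call issues a real HEAD request; @limits allows 10 calls per minute.
    result = processor.validate_url("https://example.com")
    print(result['is_valid'], result.get('status_code'))

    # A repeat call returns the dict stored in the LRUCache, skipping the
    # network round-trip (though @limits still counts the invocation).
    cached = processor.validate_url("https://example.com")
    assert cached is result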
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Sophisticated text cleaning and normalization"""
+        if not text:
+            return ""
+
+        # Remove control characters
+        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+
+        # Normalize Unicode characters
+        text = text.encode('ascii', 'ignore').decode('ascii')
+
+        # Replace multiple whitespaces
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove HTML entities
+        text = re.sub(r'&[a-zA-Z]+;', '', text)
+
+        # Normalize quotation marks
+        text = text.replace('“', '"').replace('”', '"')
+        text = text.replace('‘', "'").replace('’', "'")
+
+        # Remove excessive punctuation
+        text = re.sub(r'([.,!?]){2,}', r'\1', text)
+
+        return text.strip()
+
+    @sleep_and_retry
+    @limits(calls=20, period=60)  # Refined rate limiting
+    def fetch_content(self, url: str) -> Optional[str]:
+        """Advanced content fetching with multiple safeguards"""
+        # Check content cache first
+        if url in self.content_cache:
+            return self.content_cache[url]
+
+        for attempt in range(self.max_retries):
+            try:
+                response = self.session.get(
+                    url,
+                    timeout=self.timeout,
+                    headers={'User-Agent': self.ua.random}
+                )
+                response.raise_for_status()
+
+                # Use BeautifulSoup for more robust parsing
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove scripts, styles, comments
+                for script in soup(["script", "style"]):
+                    script.decompose()
+
+                # Extract clean text
+                text = soup.get_text(separator=' ')
+                cleaned_text = self.advanced_text_cleaning(text)
+
+                # Cache the result
+                self.content_cache[url] = cleaned_text
+                return cleaned_text
+
+            except requests.RequestException as e:
+                logger.warning(f"Fetch attempt {attempt + 1} failed for {url}: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+
+        return None
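Another sketch (not part of the commit): fetch_content retries with exponential backoff, sleeping 2**attempt seconds between failed attempts (1 s, 2 s, 4 s for the default max_retries=3) and returning None once every attempt fails, so callers should handle both outcomes:

    # Sketch only -- not part of app.py.
    from app import AdvancedURLProcessor

    processor = AdvancedURLProcessor(max_retries=3)

    # Up to 3 GET attempts; on success the cleaned text is kept in the
    # TTLCache for an hour, so a re-fetch within that window is free.
    text = processor.fetch_content("https://example.com")
    if text is None:
        print("All fetch attempts failed")
    else:
        print(text[:200])  # script- and style-free, whitespace-normalized text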
+def create_advanced_interface():
+    """Create a comprehensive Gradio interface with advanced features"""
+
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
+
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
+
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
                 label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
+
         process_btn = gr.Button("Process Input", variant="primary")
+
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
+
+        def process_input(urls, text):
             try:
+                processor = AdvancedURLProcessor()
                 results = []
+
+                # Process URLs
                 if urls:
+                    url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
+
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
                             content = processor.fetch_content(url)
                             if content:
                                 results.append({
                                     'url': url,
                                     'content': content,
+                                    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                                 })
+
+                # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
+                        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                     })
+
+                # Generate output
                 if results:
+                    output_path = 'processed_data.json'
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
+
                     summary = f"Processed {len(results)} items successfully!"
+                    return output_path, summary
                 else:
                     return None, "No valid content to process."
+
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
+
         process_btn.click(
+            process_input,
+            inputs=[url_input, text_input],
             outputs=[output_file, output_text]
         )
+
         gr.Markdown("""
         ### Usage Guidelines
+        - URL Processing: Enter valid HTTP/HTTPS URLs
+        - Text Input: Direct text processing
         - Advanced cleaning and validation included
         """)
+
     return interface

 def main():
+    # Configure system settings
     mimetypes.init()
+
+    # Create and launch interface
+    interface = create_advanced_interface()
     interface.launch(
+        share=True,
         server_name="0.0.0.0",
         server_port=7860,
         debug=True
     )

 if __name__ == "__main__":
+    main()
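For local testing, a sketch (not part of the commit) that reuses create_advanced_interface but skips the public share link that main() requests via share=True:

    # Sketch only -- not part of app.py.
    from app import create_advanced_interface

    interface = create_advanced_interface()
    interface.launch(server_name="127.0.0.1", server_port=7860, share=False)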