Update app.py

app.py CHANGED
Old version (lines removed by this commit are prefixed with "-"):

@@ -6,43 +6,37 @@ import logging
  import mimetypes
  import concurrent.futures
  import string
- import
- from typing import List, Dict, Optional, Union, Any
  from pathlib import Path
  from urllib.parse import urlparse

  import requests
  import validators
  import gradio as gr
- import torch
  import cachetools
  from bs4 import BeautifulSoup
  from ratelimit import limits, sleep_and_retry

- #
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
-         logging.FileHandler('
      ]
  )

- class
-     """

-     def __init__(
-         self,
-         timeout: int = 15,
-         max_retries: int = 3,
-         concurrent_requests: int = 5,
-         cache_size: int = 100
-     ):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
-         self.ua = UserAgent()

          # Implement multilevel caching
          self.url_cache = cachetools.LRUCache(maxsize=cache_size)
@@ -50,7 +44,7 @@ class AdvancedURLProcessor:

          self.session = requests.Session()
          self.session.headers.update({
-             'User-Agent': self.ua.random,
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
              'Connection': 'keep-alive'
@@ -59,13 +53,12 @@ class AdvancedURLProcessor:
      @sleep_and_retry
      @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
      def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
-         """
          try:
              # Check cache first
              if url in self.url_cache:
                  return self.url_cache[url]

-             # Comprehensive URL validation
              result = urlparse(url)
              validation_result = {
                  'is_valid': False,
@@ -78,23 +71,16 @@ class AdvancedURLProcessor:
                  validation_result['message'] = 'Missing scheme or network location'
                  return validation_result

-             # Use validators for additional checks
              if not validators.url(url):
                  validation_result['message'] = 'URL format validation failed'
                  return validation_result

              # Perform HEAD request for accessibility
              try:
-                 response = self.session.head(
-                     url,
-                     timeout=self.timeout,
-                     allow_redirects=True
-                 )
-
                  validation_result['is_valid'] = response.status_code in [200, 301, 302]
                  validation_result['status_code'] = response.status_code
                  validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
-
              except requests.RequestException as e:
                  validation_result['message'] = f"Connection error: {str(e)}"

@@ -110,306 +96,260 @@ class AdvancedURLProcessor:
          }

      @sleep_and_retry
-     @limits(calls=
-
-         """Fetch content from URL with retry mechanism
-
-             logger.info(f"Cache hit for URL: {url}")
-             return self.content_cache[url]
-
-         for attempt in range(self.max_retries):
-             try:
-                 response = self.session.get(url, timeout=self.timeout)
-                 response.raise_for_status()
-                 content = response.text
-
-                 # Cache the content
-                 self.content_cache[url] = content
-                 return content
-
-             except requests.RequestException as e:
-                 logger.warning(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
-                 if attempt == self.max_retries - 1:
-                     raise
-                 time.sleep(1)  # Delay between retries
-
-         except Exception as e:
-             logger.error(f"Error fetching content from {url}: {e}")
-             return None
-
- class ContentExtractor:
-     """Advanced content extraction and processing"""
-
-     def __init__(self):
-         self.cleaners = [
-             self._remove_scripts,
-             self._remove_styles,
-             self._remove_special_chars,
-             self._normalize_whitespace
-         ]

-
-             return {
-                 "success": False,
-                 "content": "",
-                 "metadata": {"error": "Empty HTML content"}
-             }

-
-                 "content": "",
-                 "metadata": {"error": str(e)}
-             }
-
-     def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
-         """Extract page metadata"""
-         metadata = {
-             "title": self._get_title(soup),
-             "description": self._get_meta_description(soup),
-             "keywords": self._get_meta_keywords(soup),
-             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-             "url": url
-         }
-         return metadata
-
-     def _process_content(self, soup: BeautifulSoup) -> str:
-         """Process and clean content through multiple passes"""
-         for cleaner in self.cleaners:
-             soup = cleaner(soup)

-

-
-     @staticmethod
-     def _remove_styles(soup: BeautifulSoup) -> BeautifulSoup:
-         for element in soup.find_all(style=True):
-             del element['style']
-         return soup
-
-     @staticmethod
-     def _remove_special_chars(soup: BeautifulSoup) -> BeautifulSoup:
-         text = soup.get_text()
-         text = re.sub(r'[^\w\s\.\,\!\?\-]', '', text)
-         new_soup = BeautifulSoup(f"<div>{text}</div>", 'html.parser')
-         return new_soup
-
-     @staticmethod
-     def _normalize_whitespace(soup: BeautifulSoup) -> BeautifulSoup:
-         text = soup.get_text()
          text = re.sub(r'\s+', ' ', text)
-
-         return
-
-     @staticmethod
-     def _get_meta_keywords(soup: BeautifulSoup) -> str:
-         meta = soup.find('meta', attrs={'name': 'keywords'})
-         return meta.get('content', '') if meta else ""

- class
-     """
-
-     def __init__(self):
-         self.url_processor = AdvancedURLProcessor()
-         self.content_extractor = ContentExtractor()
-         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)

-
          try:
-
          else:
-
-                 "error": validation_result['message']
-             })
-
-             # Process valid URLs concurrently
-             futures = []
-             for url in valid_urls:
-                 future = self.executor.submit(self._process_single_url, url)
-                 futures.append((url, future))
-
-             # Collect results
-             for url, future in futures:
-                 try:
-                     result = future.result(timeout=30)  # 30-second timeout
-                     if result["success"]:
-                         results["successful"].append(result)
-                     else:
-                         results["failed"].append({
-                             "url": url,
-                             "error": "Processing failed"
-                         })
-                 except Exception as e:
-                     logger.error(f"Error processing {url}: {e}")
-                     results["failed"].append({
-                         "url": url,
-                         "error": str(e)
-                     })
-
-             # Update metadata
-             results["metadata"].update({
-                 "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
-                 "successful_count": len(results["successful"]),
-                 "failed_count": len(results["failed"])
-             })
-
-             return results
-
          except Exception as e:
-             logger.error(f"
-

-     def
-         """Process
          try:
-
-                 url
-             )
-
-             result["url"] = url
-             result["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
-
-             return result
-
          except Exception as e:
-             logger.error(f"Error processing
-
-                 "success": False,
-                 "url": url,
-                 "error": str(e)
-             }

  def create_interface():
-     """Create Gradio interface with advanced features"""
-     processor = ContentProcessor()

-

-     with gr.
-
-         )
-
-         with gr.Row():
-             process_btn = gr.Button("Process URLs", variant="primary")
-             clear_btn = gr.Button("Clear")
-
-         with gr.Column():
-             status_output = gr.JSON(
-                 label="Processing Results",
-                 show_label=True
-             )

-     gr.
-
      )

-
-         url_list = [url.strip() for url in urls.splitlines() if url.strip()]
-         results = await processor.process_urls(url_list)
-         return results

-     def
-

      process_btn.click(
-
-         inputs=[url_input],
-         outputs=[
      )

-

      return interface

-
- #
  mimetypes.init()

  # Create and launch interface
  interface = create_interface()
  interface.launch(
      server_name="0.0.0.0",
      server_port=7860,
-     share=False,
      debug=True
- )
New version (lines added by this commit are prefixed with "+"):

@@ -6,43 +6,37 @@ import logging
  import mimetypes
  import concurrent.futures
  import string
+ from typing import List, Dict, Optional, Union
  from pathlib import Path
  from urllib.parse import urlparse

  import requests
  import validators
  import gradio as gr
  import cachetools
  from bs4 import BeautifulSoup
+ from fake_useragent import UserAgent
  from ratelimit import limits, sleep_and_retry

+ # Setup logging with detailed configuration
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
+         logging.FileHandler('app.log', encoding='utf-8')
      ]
  )
+ logger = logging.getLogger(__name__)

+ class URLProcessor:
+     """Class to handle URL processing with advanced features"""

+     def __init__(self, timeout: int = 15, max_retries: int = 3, concurrent_requests: int = 5, cache_size: int = 100):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
+         self.ua = UserAgent()  # Initialize UserAgent

          # Implement multilevel caching
          self.url_cache = cachetools.LRUCache(maxsize=cache_size)
@@ -50,7 +44,7 @@ class AdvancedURLProcessor:

          self.session = requests.Session()
          self.session.headers.update({
+             'User-Agent': self.ua.random,  # Use random User-Agent
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
              'Connection': 'keep-alive'
@@ -59,13 +53,12 @@ class AdvancedURLProcessor:
      @sleep_and_retry
      @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
      def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
+         """Validate URL format and accessibility"""
          try:
              # Check cache first
              if url in self.url_cache:
                  return self.url_cache[url]

              result = urlparse(url)
              validation_result = {
                  'is_valid': False,
@@ -78,23 +71,16 @@ class AdvancedURLProcessor:
                  validation_result['message'] = 'Missing scheme or network location'
                  return validation_result

              if not validators.url(url):
                  validation_result['message'] = 'URL format validation failed'
                  return validation_result

              # Perform HEAD request for accessibility
              try:
+                 response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
                  validation_result['is_valid'] = response.status_code in [200, 301, 302]
                  validation_result['status_code'] = response.status_code
                  validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
              except requests.RequestException as e:
                  validation_result['message'] = f"Connection error: {str(e)}"

@@ -110,306 +96,260 @@ class AdvancedURLProcessor:
          }

      @sleep_and_retry
+     @limits(calls=20, period=60)  # Refined rate limiting
+     def fetch_content(self, url: str) -> Optional[str]:
+         """Fetch content from URL with retry mechanism"""
+         # Check content cache first
+         if url in self.content_cache:
+             return self.content_cache[url]

+         for attempt in range(self.max_retries):
+             try:
+                 response = self.session.get(url, timeout=self.timeout)
+                 response.raise_for_status()

+                 # Use BeautifulSoup for more robust parsing
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 # Remove scripts, styles, comments
+                 for script in soup(["script", "style"]):
+                     script.decompose()
+
+                 # Extract clean text
+                 text = soup.get_text(separator=' ')
+                 cleaned_text = self.advanced_text_cleaning(text)
+
+                 # Cache the result
+                 self.content_cache[url] = cleaned_text
+                 return cleaned_text
+
+             except requests.RequestException as e:
+                 logger.warning(f"Fetch attempt {attempt + 1} failed for {url}: {e}")
+                 time.sleep(2 ** attempt)  # Exponential backoff

+         return None
+
+     def advanced_text_cleaning(self, text: str) -> str:
+         """Sophisticated text cleaning and normalization"""
+         if not text:
+             return ""

+         # Remove control characters
+         text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+
+         # Normalize Unicode characters
+         text = text.encode('ascii', 'ignore').decode('ascii')
+
+         # Replace multiple whitespaces
          text = re.sub(r'\s+', ' ', text)
+
+         # Remove HTML entities
+         text = re.sub(r'&[a-zA-Z]+;', '', text)
+
+         # Normalize quotation marks
+         text = text.replace('“', '"').replace('”', '"')
+         text = text.replace('‘', "'").replace('’', "'")
+
+         # Remove excessive punctuation
+         text = re.sub(r'([.,!?]){2,}', r'\1', text)
+
+         return text.strip()
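For reference, a minimal sketch of how the reworked URLProcessor could be exercised outside the Gradio UI. It assumes the diff above is saved as app.py and that the file's unchanged top section (lines 1-5, not shown in this diff) imports re and time, and that __init__ also defines the content_cache used by fetch_content.

# Illustrative sketch only; assumes app.py's unchanged lines import re/time
# and define self.content_cache in __init__.
from app import URLProcessor

processor = URLProcessor(timeout=10, max_retries=2)

raw = 'Hello\x07   world!!!   &amp;   bye...'
print(processor.advanced_text_cleaning(raw))
# -> 'Hello world!  bye.' (entity removal can leave a double space behind)

check = processor.validate_url("https://example.com")  # performs a real HEAD request
if check.get("is_valid"):
    cleaned = processor.fetch_content("https://example.com")
    print(cleaned[:200] if cleaned else "no content")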

+ class FileProcessor:
+     """Class to handle file processing"""

+     def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+         self.max_file_size = max_file_size
+         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+
+     def is_text_file(self, filepath: str) -> bool:
+         """Check if file is a text file"""
+         try:
+             mime_type, _ = mimetypes.guess_type(filepath)
+             return mime_type and mime_type.startswith('text/')
+         except Exception:
+             return False
+
+     def process_file(self, file) -> List[Dict]:
+         """Process uploaded file with enhanced error handling"""
+         if not file:
+             return []
+
+         dataset = []
          try:
+             file_size = os.path.getsize(file.name)
+             if file_size > self.max_file_size:
+                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                 return []
+
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 if zipfile.is_zipfile(file.name):
+                     dataset.extend(self._process_zip_file(file.name, temp_dir))
                  else:
+                     dataset.extend(self._process_single_file(file))
+
          except Exception as e:
+             logger.error(f"Error processing file: {str(e)}")
+             return []
+
+         return dataset
+
+     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+         """Process ZIP file contents"""
+         results = []
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             zip_ref.extractall(temp_dir)
+             for root, _, files in os.walk(temp_dir):
+                 for filename in files:
+                     filepath = os.path.join(root, filename)
+                     if self.is_text_file(filepath):
+                         try:
+                             with open(filepath, 'r', errors='ignore') as f:
+                                 content = f.read()
+                             if content.strip():
+                                 results.append({
+                                     "source": "file",
+                                     "filename": filename,
+                                     "content": content,
+                                     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                                 })
+                         except Exception as e:
+                             logger.error(f"Error reading file {filename}: {str(e)}")
+         return results

+     def _process_single_file(self, file) -> List[Dict]:
+         """Process single file"""
+         results = []
          try:
+             content = file.read().decode('utf-8', errors='ignore')
+             if content.strip():
+                 results.append({
+                     "source": "file",
+                     "filename": os.path.basename(file.name),
+                     "content": content,
+                     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                 })
          except Exception as e:
+             logger.error(f"Error processing single file: {str(e)}")
+         return results
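The ZIP branch of FileProcessor leans entirely on the standard library; the following self-contained sketch shows the same guess_type / extractall / os.walk pattern in isolation (the archive name is a placeholder).

import mimetypes, os, tempfile, zipfile

def iter_text_members(zip_path):
    """Yield (filename, content) for text-like members of a ZIP archive."""
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        for root, _, files in os.walk(temp_dir):
            for name in files:
                path = os.path.join(root, name)
                mime_type, _ = mimetypes.guess_type(path)
                if mime_type and mime_type.startswith('text/'):
                    with open(path, 'r', errors='ignore') as f:
                        yield name, f.read()

# 'upload.zip' is a hypothetical archive name:
# for name, content in iter_text_members('upload.zip'):
#     print(name, len(content), 'characters')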

  def create_interface():
+     """Create a comprehensive Gradio interface with advanced features"""

+     css = """
+     .container { max-width: 1200px; margin: auto; }
+     .warning { background-color: #fff3cd; color: #856404; }
+     .error { background-color: #f8d7da; color: #721c24; }
+     """
+
+     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
+         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

+         with gr.Tab("URL Processing"):
+             url_input = gr.Textbox(
+                 label="Enter URLs (comma or newline separated)",
+                 lines=5,
+                 placeholder="https://example1.com\nhttps://example2.com"
+             )

+         with gr.Tab("File Input"):
+             file_input = gr.File(
+                 label="Upload text file or ZIP archive",
+                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+             )
+
+         with gr.Tab("Text Input"):
+             text_input = gr.Textbox(
+                 label="Raw Text Input",
+                 lines=5,
+                 placeholder="Paste your text here..."
              )

+         process_btn = gr.Button("Process Input", variant="primary")
+
+         output_text = gr.Textbox(label="Processing Results", interactive=False)
+         output_file = gr.File(label="Processed Output")

+         def process_all_inputs(urls, file, text):
+             """Process all input types with progress tracking"""
+             try:
+                 processor = URLProcessor()
+                 file_processor = FileProcessor()
+                 results = []
+
+                 # Process URLs
+                 if urls:
+                     url_list = re.split(r'[,\n]', urls)
+                     url_list = [url.strip() for url in url_list if url.strip()]
+
+                     for url in url_list:
+                         validation = processor.validate_url(url)
+                         if validation.get('is_valid'):
+                             content = processor.fetch_content(url)
+                             if content:
+                                 results.append({
+                                     'source': 'url',
+                                     'url': url,
+                                     'content': content,
+                                     'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                                 })
+
+                 # Process files
+                 if file:
+                     results.extend(file_processor.process_file(file))
+
+                 # Process text input
+                 if text:
+                     cleaned_text = processor.advanced_text_cleaning(text)
+                     results.append({
+                         'source': 'direct_input',
+                         'content': cleaned_text,
+                         'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                     })
+
+                 # Generate output
+                 if results:
+                     output_path = 'processed_data.json'
+                     with open(output_path, 'w', encoding='utf-8') as f:
+                         json.dump(results, f, ensure_ascii=False, indent=2)
+
+                     summary = f"Processed {len(results)} items successfully!"
+                     return output_path, summary
+                 else:
+                     return None, "No valid content to process."
+
+             except Exception as e:
+                 logger.error(f"Processing error: {e}")
+                 return None, f"Error: {str(e)}"

          process_btn.click(
+             process_all_inputs,
+             inputs=[url_input, file_input, text_input],
+             outputs=[output_file, output_text]
          )

+         gr.Markdown("""
+         ### Usage Guidelines
+         - **URL Processing**: Enter valid HTTP/HTTPS URLs
+         - **File Input**: Upload text files or ZIP archives
+         - **Text Input**: Direct text processing
+         - Advanced cleaning and validation included
+         """)

      return interface

+ def main():
+     # Configure system settings
      mimetypes.init()

      # Create and launch interface
      interface = create_interface()
      interface.launch(
+         share=True,
          server_name="0.0.0.0",
          server_port=7860,
          debug=True
+     )
+
+ if __name__ == "__main__":
+     main()
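Finally, a sketch of driving the same pipeline from a plain script instead of the Gradio interface, mirroring the URL branch of process_all_inputs. It again assumes the new file is importable as app and that its unchanged top section imports json; validate_url and fetch_content hit the network and are rate-limited, so this is illustrative only.

# Illustrative script mirroring the URL branch of process_all_inputs.
import json
from app import URLProcessor

urls = ["https://example.com", "not-a-url"]
processor = URLProcessor()
results = []

for url in urls:
    validation = processor.validate_url(url)
    if validation.get('is_valid'):
        content = processor.fetch_content(url)
        if content:
            results.append({'source': 'url', 'url': url, 'content': content})

with open('processed_data.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Saved {len(results)} of {len(urls)} URLs to processed_data.json")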