acecalisto3 committed on
Commit c3b1d58 · verified · 1 Parent(s): 890dba9

Update app.py

Files changed (1)
  1. app.py +368 -45
app.py CHANGED
@@ -1,48 +1,371 @@
- def process_all_inputs(urls, file, text):
-     """Process all input types with progress tracking"""
-     try:
-         processor = URLProcessor()
-         file_processor = FileProcessor()
          results = []

-         # Process URLs
-         if urls:
-             url_list = re.split(r'[,\n]', urls)
-             url_list = [url.strip() for url in url_list if url.strip()]
-
-             for url in url_list:
-                 validation = processor.validate_url(url)
-                 if validation.get('is_valid'):
-                     content = processor.fetch_content(url)
-                     if content:
-                         results.append({
-                             'source': 'url',
-                             'url': url,
-                             'content': content,
-                             'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-                         })
-
-         # Process files
-         if file:
-             results.extend(file_processor.process_file(file))

-         # Process text input
-         if text:
-             cleaned_text = processor.advanced_text_cleaning(text)
-             results.append({
-                 'source': 'direct_input',
-                 'content': cleaned_text,
-                 'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-             })
-
-         # Generate output
-         if results:
-             output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-             output_dir.mkdir(parents=True, exist_ok=True)
-             output_path = output_dir / f'processed_{int(time.time())}.json'
-
-             with open(output_path, 'w', encoding='utf-8') as f:
-                 json.dump(results, f, ensure_ascii=False, indent=2)
-
-             summary = f"Processed {len(results)} items successfully!"
-             return output

+ import json
+ import os
+ import re
+ import time
+ import logging
+ import mimetypes
+ import concurrent.futures
+ import string
+ import zipfile
+ import tempfile
+ from datetime import datetime
+ from typing import List, Dict, Optional, Union
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ import requests
+ import validators
+ import gradio as gr
+ from diskcache import Cache
+ from bs4 import BeautifulSoup
+ from fake_useragent import UserAgent
+ from ratelimit import limits, sleep_and_retry
+ from cleantext import clean
+
+ # Setup logging with detailed configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler('app.log', encoding='utf-8')
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ class URLProcessor:
+     def __init__(self):
+         self.session = requests.Session()
+         self.timeout = 10  # seconds
+         self.session.headers.update({
+             'User-Agent': UserAgent().random,
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1'
+         })
+
+     def advanced_text_cleaning(self, text: str) -> str:
+         """Robust text cleaning with version compatibility"""
+         try:
+             cleaned_text = clean(
+                 text,
+                 fix_unicode=True,
+                 to_ascii=True,
+                 lower=True,
+                 no_line_breaks=True,
+                 no_urls=True,
+                 no_emails=True,
+                 no_phone_numbers=True,
+                 no_numbers=False,
+                 no_digits=False,
+                 no_currency_symbols=True,
+                 no_punct=False
+             ).strip()
+             return cleaned_text
+         except Exception as e:
+             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
+             text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
+             text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
+             text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+             return text.strip()
+
+     def validate_url(self, url: str) -> Dict:
+         """Validate URL format and accessibility"""
+         try:
+             if not validators.url(url):
+                 return {'is_valid': False, 'message': 'Invalid URL format'}
+
+             response = self.session.head(url, timeout=self.timeout)
+             response.raise_for_status()
+             return {'is_valid': True, 'message': 'URL is valid and accessible'}
+         except Exception as e:
+             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+
+     def fetch_content(self, url: str) -> Optional[Dict]:
+         """Universal content fetcher with special case handling"""
+         try:
+             # Google Drive document handling
+             if 'drive.google.com' in url:
+                 return self._handle_google_drive(url)
+
+             # Google Calendar ICS handling
+             if 'calendar.google.com' in url and 'ical' in url:
+                 return self._handle_google_calendar(url)
+
+             # Standard HTML processing
+             return self._fetch_html_content(url)
+         except Exception as e:
+             logger.error(f"Content fetch failed: {e}")
+             return None
+
+     def _handle_google_drive(self, url: str) -> Optional[Dict]:
+         """Process Google Drive file links"""
+         try:
+             file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+             if not file_id:
+                 logger.error(f"Invalid Google Drive URL: {url}")
+                 return None
+
+             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+             response = self.session.get(direct_url, timeout=self.timeout)
+             response.raise_for_status()
+
+             return {
+                 'content': response.text,
+                 'content_type': response.headers.get('Content-Type', ''),
+                 'timestamp': datetime.now().isoformat()
+             }
+         except Exception as e:
+             logger.error(f"Google Drive processing failed: {e}")
+             return None
+
+     def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+         """Process Google Calendar ICS feeds"""
+         try:
+             response = self.session.get(url, timeout=self.timeout)
+             response.raise_for_status()
+             return {
+                 'content': response.text,
+                 'content_type': 'text/calendar',
+                 'timestamp': datetime.now().isoformat()
+             }
+         except Exception as e:
+             logger.error(f"Calendar fetch failed: {e}")
+             return None
+
+     def _fetch_html_content(self, url: str) -> Optional[Dict]:
+         """Standard HTML content processing"""
+         try:
+             response = self.session.get(url, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Remove unwanted elements
+             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                 element.decompose()
+
+             # Extract main content
+             main_content = soup.find('main') or soup.find('article') or soup.body
+
+             # Clean and structure content
+             text_content = main_content.get_text(separator='\n', strip=True)
+             cleaned_content = self.advanced_text_cleaning(text_content)
+
+             return {
+                 'content': cleaned_content,
+                 'content_type': response.headers.get('Content-Type', ''),
+                 'timestamp': datetime.now().isoformat()
+             }
+         except Exception as e:
+             logger.error(f"HTML processing failed: {e}")
+             return None
+
+ class FileProcessor:
+     """Class to handle file processing"""
+
+     def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+         self.max_file_size = max_file_size
+         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+
+     def is_text_file(self, filepath: str) -> bool:
+         """Check if file is a text file"""
+         try:
+             mime_type, _ = mimetypes.guess_type(filepath)
+             return (mime_type and mime_type.startswith('text/')) or \
+                    (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+         except Exception:
+             return False
+
+     def process_file(self, file) -> List[Dict]:
+         """Process uploaded file with enhanced error handling"""
+         if not file:
+             return []
+
+         dataset = []
+         try:
+             file_size = os.path.getsize(file.name)
+             if file_size > self.max_file_size:
+                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                 return []
+
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 if zipfile.is_zipfile(file.name):
+                     dataset.extend(self._process_zip_file(file.name, temp_dir))
+                 else:
+                     dataset.extend(self._process_single_file(file))
+
+         except Exception as e:
+             logger.error(f"Error processing file: {str(e)}")
+             return []
+
+         return dataset
+
+     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+         """Process ZIP file contents"""
          results = []
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             zip_ref.extractall(temp_dir)
+             for root, _, files in os.walk(temp_dir):
+                 for filename in files:
+                     filepath = os.path.join(root, filename)
+                     if self.is_text_file(filepath):
+                         try:
+                             with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                 content = f.read()
+                                 if content.strip():
+                                     results.append({
+                                         "source": "file",
+                                         "filename": filename,
+                                         "content": content,
+                                         "timestamp": datetime.now().isoformat()
+                                     })
+                         except Exception as e:
+                             logger.error(f"Error reading file {filename}: {str(e)}")
+         return results
+
+     def _process_single_file(self, file) -> List[Dict]:
+         try:
+             file_stat = os.stat(file.name)
+             with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                 content = f.read()
+
+             return [{
+                 'source': 'file',
+                 'filename': os.path.basename(file.name),
+                 'file_size': file_stat.st_size,
+                 'mime_type': mimetypes.guess_type(file.name)[0],
+                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                 'content': content,
+                 'timestamp': datetime.now().isoformat()
+             }]
+         except Exception as e:
+             logger.error(f"File processing error: {e}")
+             return []
+
+ def create_interface():
+     """Create a comprehensive Gradio interface with advanced features"""
+
+     css = """
+     .container { max-width: 1200px; margin: auto; }
+     .warning { background-color: #fff3cd; color: #856404; }
+     .error { background-color: #f8d7da; color: #721c24; }
+     """
+
+     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
+         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
+
+         with gr.Tab("URL Processing"):
+             url_input = gr.Textbox(
+                 label="Enter URLs (comma or newline separated)",
+                 lines=5,
+                 placeholder="https://example1.com\nhttps://example2.com"
+             )
+
+         with gr.Tab("File Input"):
+             file_input = gr.File(
+                 label="Upload text file or ZIP archive",
+                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+             )
+
+         with gr.Tab("Text Input"):
+             text_input = gr.Textbox(
+                 label="Raw Text Input",
+                 lines=5,
+                 placeholder="Paste your text here..."
+             )
+
+         process_btn = gr.Button("Process Input", variant="primary")
+
+         output_text = gr.Textbox(label="Processing Results", interactive=False)
+         output_file = gr.File(label="Processed Output")

+         def process_all_inputs(urls, file, text):
+             """Process all input types with progress tracking"""
+             try:
+                 processor = URLProcessor()
+                 file_processor = FileProcessor()
+                 results = []
+
+                 # Process URLs
+                 if urls:
+                     url_list = re.split(r'[,\n]', urls)
+                     url_list = [url.strip() for url in url_list if url.strip()]
+
+                     for url in url_list:
+                         validation = processor.validate_url(url)
+                         if validation.get('is_valid'):
+                             content = processor.fetch_content(url)
+                             if content:
+                                 results.append({
+                                     'source': 'url',
+                                     'url': url,
+                                     'content': content,
+                                     'timestamp': datetime.now().isoformat()
+                                 })
+
+                 # Process files
+                 if file:
+                     results.extend(file_processor.process_file(file))
+
+                 # Process text input
+                 if text:
+                     cleaned_text = processor.advanced_text_cleaning(text)
+                     results.append({
+                         'source': 'direct_input',
+                         'content': cleaned_text,
+                         'timestamp': datetime.now().isoformat()
+                     })
+
+                 # Generate output
+                 if results:
+                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+                     output_dir.mkdir(parents=True, exist_ok=True)
+                     output_path = output_dir / f'processed_{int(time.time())}.json'
+
+                     with open(output_path, 'w', encoding='utf-8') as f:
+                         json.dump(results, f, ensure_ascii=False, indent=2)
+
+                     summary = f"Processed {len(results)} items successfully!"
+                     return output_path, summary
+                 else:
+                     return None, "No valid content to process."
+
+             except Exception as e:
+                 logger.error(f"Processing error: {e}")
+                 return None, f"Error: {str(e)}"
+
+ process_btn.click(
342
+ process_all_inputs,
343
+ inputs=[url_input, file_input, text_input],
344
+ outputs=[output_file, output_text]
345
+ )
346
+
347
+ gr.Markdown("""
348
+ ### Usage Guidelines
349
+ - **URL Processing**: Enter valid HTTP/HTTPS URLs
350
+ - **File Input**: Upload text files or ZIP archives
351
+ - **Text Input**: Direct text processing
352
+ - Advanced cleaning and validation included
353
+ """)
354
+
355
+ return interface
356
+
357
+ def main():
358
+ # Configure system settings
359
+ mimetypes.init()
360
 
361
+ # Create and launch interface
362
+ interface = create_interface()
363
+ interface.launch(
364
+ share=True,
365
+ server_name="0.0.0.0",
366
+ server_port=7860,
367
+ debug=True
368
+ )
369
+
370
+ if __name__ == "__main__":
371
+ main()
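
A quick local sanity check for this revision (an editor's sketch, not part of the commit): it assumes app.py from this commit is importable and that the third-party imports above come from the usual PyPI packages (requests, validators, gradio, diskcache, beautifulsoup4, fake-useragent, ratelimit, clean-text; the package names are inferred from the import statements, so treat them as assumptions).

    # smoke_test.py, a hypothetical helper that is not in the repository.
    # Importing app runs its module-level logging setup but does not launch
    # the Gradio UI, since launch() is guarded by __main__.
    from app import URLProcessor

    processor = URLProcessor()
    check = processor.validate_url("https://example.com")
    print(check['message'])  # validate_url returns {'is_valid': ..., 'message': ...}

    if check.get('is_valid'):
        result = processor.fetch_content("https://example.com")
        if result:
            # fetch_content returns a dict with 'content', 'content_type',
            # and 'timestamp' keys, or None on failure
            print(result['content'][:200])

Running python app.py directly launches the Gradio interface on 0.0.0.0:7860 with share=True and debug=True, as defined in main() above.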