acecalisto3 committed (verified)
Commit 554b5c7 · 1 Parent(s): e9b5c07

Update app.py

Files changed (1)
  1. app.py +174 -254
app.py CHANGED
@@ -6,357 +6,277 @@ import logging
 import mimetypes
 import concurrent.futures
 import string
-import zipfile
-import tempfile
-from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
-from cleantext import clean
-
 
-# Setup logging with detailed configuration
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
     handlers=[
         logging.StreamHandler(),
-        logging.FileHandler('app.log', encoding='utf-8')
     ]
 )
 logger = logging.getLogger(__name__)
 
-class URLProcessor:
-    def __init__(self):
         self.session = requests.Session()
-        self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
         })
 
-    def advanced_text_cleaning(self, text: str) -> str:
         try:
-            cleaned_text = clean(
-                text,
-                fix_unicode=True,
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
-            text = text.encode('ascii', 'ignore').decode('ascii')
-            text = re.sub(r'\s+', ' ', text)
-            return text.strip()
-
-    def validate_url(self, url: str) -> Dict:
-        try:
-            if not validators.url(url):
-                return {'is_valid': False, 'message': 'Invalid URL format'}
 
-            response = self.session.head(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {'is_valid': True, 'message': 'URL is valid and accessible'}
-        except Exception as e:
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
-
-    def fetch_content(self, url: str) -> Optional[Dict]:
-        try:
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            return self._fetch_html_content(url)
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None
-
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
 
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
 
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
-
-    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {
-                'content': response.text,
-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Calendar fetch failed: {e}")
-            return None
-
-    def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
 
-            soup = BeautifulSoup(response.text, 'html.parser')
-            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                element.decompose()
-            main_content = soup.find('main') or soup.find('article') or soup.body
-            if main_content is None:
-                logger.warning(f"No main content found for URL: {url}")
-                return {
-                    'content': '',
-                    'content_type': response.headers.get('Content-Type', ''),
-                    'timestamp': datetime.now().isoformat()
-                }
-            text_content = main_content.get_text(separator='\n', strip=True)
-            cleaned_content = self.advanced_text_cleaning(text_content)
             return {
-                'content': cleaned_content,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
             }
-        except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
-            return None
-
-class FileProcessor:
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
-        self.max_file_size = max_file_size
-        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-
-    def is_text_file(self, filepath: str) -> bool:
-        try:
-            mime_type, _ = mimetypes.guess_type(filepath)
-            return (mime_type and mime_type.startswith('text/')) or \
-                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
-        except Exception:
-            return False
-
-    def process_file(self, file) -> List[Dict]:
-        if not file:
-            return []
-
-        dataset = []
-        try:
-            file_size = os.path.getsize(file.name)
-            if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                return []
 
-            with tempfile.TemporaryDirectory() as temp_dir:
-                if zipfile.is_zipfile(file.name):
-                    dataset.extend(self._process_zip_file(file.name, temp_dir))
-                else:
-                    dataset.extend(self._process_single_file(file))
-
-        except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
-
-        return dataset
-
-    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
-        results = []
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(temp_dir)
-            for root, _, files in os.walk(temp_dir):
-                for filename in files:
-                    filepath = os.path.join(root, filename)
-                    if self.is_text_file(filepath):
-                        try:
-                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                                content = f.read()
-                            if content.strip():
-                                results.append({
-                                    "source": "file",
-                                    "filename": filename,
-                                    "content": content,
-                                    "timestamp": datetime.now().isoformat()
-                                })
-                        except Exception as e:
-                            logger.error(f"Error reading file {filename}: {str(e)}")
-        return results
-
-    def _process_single_file(self, file) -> List[Dict]:
-        try:
-            file_stat = os.stat(file.name)
-            if file_stat.st_size > 100 * 1024 * 1024:
-                logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
-                content = ""
-                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read(1 * 1024 * 1024)
-                    content += "\n...[Content truncated due to large file size]...\n"
-                    f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
-                    content += f.read()
-            else:
-                with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                    content = f.read()
-
-            return [{
-                'source': 'file',
-                'filename': os.path.basename(file.name),
-                'file_size': file_stat.st_size,
-                'mime_type': mimetypes.guess_type(file.name)[0],
-                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"File processing error: {e}")
-            return []
-
-def generate_qr(json_data):
-    if json_data:
-        qr = qrcode.make(json_data)
-        qr_path = f"output/qr_code_{int(time.time())}.png"
-        qr.save(qr_path)
-        return qr_path
-    return None
-
-def create_interface():
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
-
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
-
-        with gr.Tab("File Input"):
-            file_input = gr.File(
-                label="Upload text file or ZIP archive",
-                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-            )
-
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
                 label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
-
         process_btn = gr.Button("Process Input", variant="primary")
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
-
-        def process_all_inputs(urls, file, text):
             try:
-                processor = URLProcessor()
-                file_processor = FileProcessor()
                 results = []
-
                 if urls:
-                    url_list = re.split(r'[\,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
-
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
                             content = processor.fetch_content(url)
                             if content:
                                 results.append({
-                                    'source': 'url',
                                     'url': url,
                                     'content': content,
-                                    'timestamp': datetime.now().isoformat()
                                 })
-
-                if file:
-                    results.extend(file_processor.process_file(file))
-
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
-                        'timestamp': datetime.now().isoformat()
                     })
-
                 if results:
-                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-                    output_dir.mkdir(parents=True, exist_ok=True)
-                    output_path = output_dir / f'processed_{int(time.time())}.json'
-
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
-
                     summary = f"Processed {len(results)} items successfully!"
-                    return str(output_path), summary
                 else:
                     return None, "No valid content to process."
-
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
-
         process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input],
             outputs=[output_file, output_text]
         )
-
         gr.Markdown("""
         ### Usage Guidelines
-        - **URL Processing**: Enter valid HTTP/HTTPS URLs
-        - **File Input**: Upload text files or ZIP archives
-        - **Text Input**: Direct text processing
         - Advanced cleaning and validation included
         """)
-
     return interface
 
 def main():
     mimetypes.init()
-    interface = create_interface()
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        show_error=True,
-        share=False,
-        inbrowser=True,
         debug=True
     )
 
 if __name__ == "__main__":
-    main()
 import mimetypes
 import concurrent.futures
 import string
 from typing import List, Dict, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
+
 import requests
 import validators
 import gradio as gr
+import torch
+import cachetools
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
 
+# Advanced Logging Configuration
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
     handlers=[
         logging.StreamHandler(),
+        logging.FileHandler('app_advanced.log', encoding='utf-8')
     ]
 )
 logger = logging.getLogger(__name__)
 
+class AdvancedURLProcessor:
+    """Enhanced URL processing with advanced features"""
+
+    def __init__(
+        self,
+        timeout: int = 15,
+        max_retries: int = 3,
+        concurrent_requests: int = 5,
+        cache_size: int = 100
+    ):
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.concurrent_requests = concurrent_requests
+        self.ua = UserAgent()
+
+        # Implement multilevel caching
+        self.url_cache = cachetools.LRUCache(maxsize=cache_size)
+        self.content_cache = cachetools.TTLCache(maxsize=cache_size, ttl=3600)  # 1-hour cache
+
         self.session = requests.Session()
         self.session.headers.update({
+            'User-Agent': self.ua.random,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
+            'Connection': 'keep-alive'
         })
 
+    @sleep_and_retry
+    @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
+    def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
+        """Enhanced URL validation with comprehensive checks"""
         try:
+            # Check cache first
+            if url in self.url_cache:
+                return self.url_cache[url]
 
+            # Comprehensive URL validation
+            result = urlparse(url)
+            validation_result = {
+                'is_valid': False,
+                'message': 'Invalid URL',
+                'scheme': result.scheme,
+                'netloc': result.netloc
+            }
 
+            if not all([result.scheme, result.netloc]):
+                validation_result['message'] = 'Missing scheme or network location'
+                return validation_result
 
+            # Use validators for additional checks
+            if not validators.url(url):
+                validation_result['message'] = 'URL format validation failed'
+                return validation_result
+
+            # Perform HEAD request for accessibility
+            try:
+                response = self.session.head(
+                    url,
+                    timeout=self.timeout,
+                    allow_redirects=True
+                )
+
+                validation_result['is_valid'] = response.status_code in [200, 301, 302]
+                validation_result['status_code'] = response.status_code
+                validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
+
+            except requests.RequestException as e:
+                validation_result['message'] = f"Connection error: {str(e)}"
 
+            # Cache the result
+            self.url_cache[url] = validation_result
+            return validation_result
+
+        except Exception as e:
+            logger.error(f"Unexpected error validating URL {url}: {e}")
             return {
+                'is_valid': False,
+                'message': f"Unexpected validation error: {str(e)}"
             }
 
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Sophisticated text cleaning and normalization"""
+        if not text:
+            return ""
+
+        # Remove control characters
+        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
+
+        # Normalize Unicode characters
+        text = text.encode('ascii', 'ignore').decode('ascii')
+
+        # Replace multiple whitespaces
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove HTML entities
+        text = re.sub(r'&[a-zA-Z]+;', '', text)
+
+        # Normalize quotation marks
+        text = text.replace('“', '"').replace('”', '"')
+        text = text.replace('‘', "'").replace('’', "'")
+
+        # Remove excessive punctuation
+        text = re.sub(r'([.,!?]){2,}', r'\1', text)
+
+        return text.strip()
+
+    @sleep_and_retry
+    @limits(calls=20, period=60)  # Refined rate limiting
+    def fetch_content(self, url: str) -> Optional[str]:
+        """Advanced content fetching with multiple safeguards"""
+        # Check content cache first
+        if url in self.content_cache:
+            return self.content_cache[url]
+
+        for attempt in range(self.max_retries):
+            try:
+                response = self.session.get(
+                    url,
+                    timeout=self.timeout,
+                    headers={'User-Agent': self.ua.random}
+                )
+                response.raise_for_status()
+
+                # Use BeautifulSoup for more robust parsing
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove scripts, styles, comments
+                for script in soup(["script", "style"]):
+                    script.decompose()
+
+                # Extract clean text
+                text = soup.get_text(separator=' ')
+                cleaned_text = self.advanced_text_cleaning(text)
+
+                # Cache the result
+                self.content_cache[url] = cleaned_text
+                return cleaned_text
+
+            except requests.RequestException as e:
+                logger.warning(f"Fetch attempt {attempt + 1} failed for {url}: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+
+        return None
+
+def create_advanced_interface():
+    """Create a comprehensive Gradio interface with advanced features"""
+
     css = """
     .container { max-width: 1200px; margin: auto; }
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
+
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
                 label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
+
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
                 label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
+
         process_btn = gr.Button("Process Input", variant="primary")
+
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
+
+        def process_input(urls, text):
             try:
+                processor = AdvancedURLProcessor()
                 results = []
+
+                # Process URLs
                 if urls:
+                    url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
+
                     for url in url_list:
                         validation = processor.validate_url(url)
                         if validation.get('is_valid'):
                             content = processor.fetch_content(url)
                             if content:
                                 results.append({
                                     'url': url,
                                     'content': content,
+                                    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                                 })
+
+                # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
+                        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                     })
+
+                # Generate output
                 if results:
+                    output_path = 'processed_data.json'
                     with open(output_path, 'w', encoding='utf-8') as f:
                         json.dump(results, f, ensure_ascii=False, indent=2)
+
                     summary = f"Processed {len(results)} items successfully!"
+                    return output_path, summary
                 else:
                     return None, "No valid content to process."
+
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
+
         process_btn.click(
+            process_input,
+            inputs=[url_input, text_input],
             outputs=[output_file, output_text]
         )
+
         gr.Markdown("""
         ### Usage Guidelines
+        - URL Processing: Enter valid HTTP/HTTPS URLs
+        - Text Input: Direct text processing
         - Advanced cleaning and validation included
         """)
+
     return interface
 
 def main():
+    # Configure system settings
     mimetypes.init()
+
+    # Create and launch interface
+    interface = create_advanced_interface()
     interface.launch(
+        share=True,
         server_name="0.0.0.0",
         server_port=7860,
         debug=True
     )
 
 if __name__ == "__main__":
+    main()
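
For reference, a minimal usage sketch of the `AdvancedURLProcessor` introduced by this commit. It assumes this revision of `app.py` is importable as `app`, that its dependencies (requests, validators, cachetools, fake-useragent, ratelimit, beautifulsoup4) are installed, and that outbound network access is available; the example URL is only a placeholder.

```python
# Sketch only: exercises the class added in this commit, under the assumptions above.
from app import AdvancedURLProcessor

processor = AdvancedURLProcessor(timeout=10, max_retries=2, cache_size=50)

url = "https://example.com"  # hypothetical target URL

# validate_url issues a HEAD request and stores the result in the LRU url_cache.
validation = processor.validate_url(url)
print(validation["is_valid"], validation["message"])

if validation.get("is_valid"):
    # fetch_content GETs the page, strips scripts/styles with BeautifulSoup,
    # runs advanced_text_cleaning, and caches the text in the 1-hour TTL cache.
    text = processor.fetch_content(url)
    print(text[:200] if text else "fetch failed")

    # A second call within the TTL is served from content_cache without a new request.
    cached = processor.fetch_content(url)
    assert cached == text
```

Both methods are rate-limited by the `ratelimit` decorators (10 validations and 20 fetches per minute), so batch callers should expect `sleep_and_retry` to pause between calls once those limits are hit.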