acecalisto3 committed
Commit 890dba9 · verified · 1 Parent(s): 1336a84

Update app.py

Files changed (1)
  1. app.py +43 -318
app.py CHANGED
@@ -1,323 +1,48 @@
- import json
- import os
- import re
- import time
- import logging
- import mimetypes
- import concurrent.futures
- import string
- from typing import List, Dict, Optional, Union
- from pathlib import Path
- from urllib.parse import urlparse
-
- import requests
- import validators
- import gradio as gr
- from diskcache import Cache
- from bs4 import BeautifulSoup
- from fake_useragent import UserAgent
- from ratelimit import limits, sleep_and_retry
- from cleantext import clean
-
- # Setup logging with detailed configuration
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
-     handlers=[
-         logging.StreamHandler(),
-         logging.FileHandler('app.log', encoding='utf-8')
-     ]
- )
- logger = logging.getLogger(__name__)
-
- class URLProcessor:
-     def advanced_text_cleaning(self, text: str) -> str:
-         """Robust text cleaning with version compatibility"""
-         try:
-             # Modern clean-text parameters
-             return clean(text,
-                          fix_unicode=True,
-                          to_ascii=True,
-                          lower=True,
-                          no_line_breaks=True,
-                          no_urls=True,
-                          no_emails=True,
-                          no_phone_numbers=True,
-                          no_numbers=False,
-                          no_digits=False,
-                          no_currency_symbols=True,
-                          no_punct=False
-                          ).strip()
-         except TypeError as e:
-             # Fallback to basic cleaning
-             logger.warning("Using fallback text cleaning method")
-             text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Control chars
-             text = text.encode('ascii', 'ignore').decode('ascii')  # Unicode
-             text = re.sub(r'\s+', ' ', text)  # Whitespace
-             return text.strip()
-
-     def fetch_content(self, url: str) -> Optional[Dict]:
-         """Universal content fetcher with special case handling"""
-         # Google Drive document handling
-         if 'drive.google.com' in url:
-             return self._handle_google_drive(url)
-
-         # Google Calendar ICS handling
-         if 'calendar.google.com' in url and 'ical' in url:
-             return self._handle_google_calendar(url)
-
-         # Standard HTML processing
-         return self._fetch_html_content(url)
-
-     def _handle_google_drive(self, url: str) -> Optional[Dict]:
-         """Process Google Drive file links"""
-         try:
-             file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-             if not file_id:
-                 logger.error(f"Invalid Google Drive URL: {url}")
-                 return None
-
-             direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-             response = self.session.get(direct_url, timeout=self.timeout)
-             response.raise_for_status()
-
-             return {
-                 'content': response.text,
-                 'content_type': response.headers.get('Content-Type', ''),
-                 'timestamp': datetime.now().isoformat()
-             }
-         except Exception as e:
-             logger.error(f"Google Drive processing failed: {e}")
-             return None
-
-     def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-         """Process Google Calendar ICS feeds"""
-         try:
-             response = self.session.get(url, timeout=self.timeout)
-             response.raise_for_status()
-             return {
-                 'content': response.text,
-                 'content_type': 'text/calendar',
-                 'timestamp': datetime.now().isoformat()
-             }
-         except Exception as e:
-             logger.error(f"Calendar fetch failed: {e}")
-             return None
-
-     def _fetch_html_content(self, url: str) -> Optional[Dict]:
-         """Standard HTML content processing"""
-         try:
-             response = self.session.get(url, timeout=self.timeout)
-             soup = BeautifulSoup(response.text, 'html.parser')
-             # ... existing HTML processing logic ...
-             return structured_data
-         except Exception as e:
-             logger.error(f"HTML processing failed: {e}")
-             return None
-
- class FileProcessor:
-     """Class to handle file processing"""
-
-     def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
-         self.max_file_size = max_file_size
-         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-
-     def is_text_file(self, filepath: str) -> bool:
-         """Check if file is a text file"""
-         try:
-             mime_type, _ = mimetypes.guess_type(filepath)
-             return mime_type and mime_type.startswith('text/')
-         except Exception:
-             return False
-
-     def process_file(self, file) -> List[Dict]:
-         """Process uploaded file with enhanced error handling"""
-         if not file:
-             return []
-
-         dataset = []
-         try:
-             file_size = os.path.getsize(file.name)
-             if file_size > self.max_file_size:
-                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                 return []
-
-             with tempfile.TemporaryDirectory() as temp_dir:
-                 if zipfile.is_zipfile(file.name):
-                     dataset.extend(self._process_zip_file(file.name, temp_dir))
-                 else:
-                     dataset.extend(self._process_single_file(file))
-
-         except Exception as e:
-             logger.error(f"Error processing file: {str(e)}")
-             return []
-
-         return dataset
-
-     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
-         """Process ZIP file contents"""
-         results = []
-         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-             zip_ref.extractall(temp_dir)
-             for root, _, files in os.walk(temp_dir):
-                 for filename in files:
-                     filepath = os.path.join(root, filename)
-                     if self.is_text_file(filepath):
-                         try:
-                             with open(filepath, 'r', errors='ignore') as f:
-                                 content = f.read()
-                                 if content.strip():
-                                     results.append({
-                                         "source": "file",
-                                         "filename": filename,
-                                         "content": content,
-                                         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                                     })
-                         except Exception as e:
-                             logger.error(f"Error reading file {filename}: {str(e)}")
-         return results
-
-     def _process_single_file(self, file) -> List[Dict]:
-         try:
-             file_stat = os.stat(file.name)
-             content = file.read().decode('utf-8', errors='ignore')
-
-             return [{
-                 'source': 'file',
-                 'filename': os.path.basename(file.name),
-                 'file_size': file_stat.st_size,
-                 'mime_type': mimetypes.guess_type(file.name)[0],
-                 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-                 'content': content,
-                 'timestamp': datetime.now().isoformat()
-             }]
-         except Exception as e:
-             logger.error(f"File processing error: {e}")
-             return []
-
- def create_interface():
-     """Create a comprehensive Gradio interface with advanced features"""
-
-     css = """
-     .container { max-width: 1200px; margin: auto; }
-     .warning { background-color: #fff3cd; color: #856404; }
-     .error { background-color: #f8d7da; color: #721c24; }
-     """
-
-     with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
-         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
-         with gr.Tab("URL Processing"):
-             url_input = gr.Textbox(
-                 label="Enter URLs (comma or newline separated)",
-                 lines=5,
-                 placeholder="https://example1.com\nhttps://example2.com"
-             )
-
-         with gr.Tab("File Input"):
-             file_input = gr.File(
-                 label="Upload text file or ZIP archive",
-                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-             )
-
-         with gr.Tab("Text Input"):
-             text_input = gr.Textbox(
-                 label="Raw Text Input",
-                 lines=5,
-                 placeholder="Paste your text here..."
-             )
-
-         process_btn = gr.Button("Process Input", variant="primary")
-
-         output_text = gr.Textbox(label="Processing Results", interactive=False)
-         output_file = gr.File(label="Processed Output")
-
-         def process_all_inputs(urls, file, text):
-             """Process all input types with progress tracking"""
-             try:
-                 processor = URLProcessor()
-                 file_processor = FileProcessor()
-                 results = []
-
-                 # Process URLs
-                 if urls:
-                     url_list = re.split(r'[,\n]', urls)
-                     url_list = [url.strip() for url in url_list if url.strip()]
-
-                     for url in url_list:
-                         validation = processor.validate_url(url)
-                         if validation.get('is_valid'):
-                             content = processor.fetch_content(url)
-                             if content:
-                                 results.append({
-                                     'source': 'url',
-                                     'url': url,
-                                     'content': content,
-                                     'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-                                 })
-
-                 # Process files
-                 if file:
-                     results.extend(file_processor.process_file(file))
-
-                 # Process text input
-                 if text:
-                     cleaned_text = processor.advanced_text_cleaning(text)
-                     results.append({
-                         'source': 'direct_input',
-                         'content': cleaned_text,
-                         'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
-                     })
-
-                 # Generate output
-                 if results:
-                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
-                     output_dir.mkdir(parents=True, exist_ok=True)
-                     output_path = output_dir / f'processed_{int(time.time())}.json'
-
-                     with open(output_path, 'w', encoding='utf-8') as f:
-                         json.dump(results, f, ensure_ascii=False, indent=2)
-
-                     summary = f"Processed {len(results)} items successfully!"
-                     return output_path, summary
-                 else:
-                     return None, "No valid content to process."
-
-             except Exception as e:
-                 logger.error(f"Processing error: {e}")
-                 return None, f"Error: {str(e)}"
-
-         process_btn.click(
-             process_all_inputs,
-             inputs=[url_input, file_input, text_input],
-             outputs=[output_file, output_text]
-         )
-
-         gr.Markdown("""
-         ### Usage Guidelines
-         - **URL Processing**: Enter valid HTTP/HTTPS URLs
-         - **File Input**: Upload text files or ZIP archives
-         - **Text Input**: Direct text processing
-         - Advanced cleaning and validation included
-         """)
-
-     return interface
-
- def main():
-     # Configure system settings
-     mimetypes.init()
-
-     # Create and launch interface
-     interface = create_interface()
-     interface.launch(
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860,
-         debug=True
-     )
-
- if __name__ == "__main__":
-     main()
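The removed URLProcessor above reads self.session and self.timeout and is driven through a validate_url() call, yet neither the constructor nor that method appears anywhere in this hunk. A minimal sketch of what a compatible version might look like, built only from libraries the removed header already imports (requests, validators, fake_useragent); the timeout value, headers, and return shape are illustrative assumptions, not part of the commit:

# Hypothetical sketch, not part of this commit: a constructor and validator
# consistent with the attributes and calls the removed code relies on.
import requests
import validators
from fake_useragent import UserAgent

class URLProcessor:
    def __init__(self, timeout: int = 10):
        self.timeout = timeout                     # per-request timeout in seconds (assumed default)
        self.session = requests.Session()          # session reused across fetches
        self.session.headers.update({'User-Agent': UserAgent().random})

    def validate_url(self, url: str) -> dict:
        """Return a dict with an 'is_valid' key, as process_all_inputs expects."""
        try:
            if not validators.url(url):
                return {'is_valid': False, 'message': 'Invalid URL format'}
            return {'is_valid': True, 'message': 'URL is valid'}
        except Exception as e:
            return {'is_valid': False, 'message': str(e)}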
 
+ def process_all_inputs(urls, file, text):
+     """Process all input types with progress tracking"""
+     try:
+         processor = URLProcessor()
+         file_processor = FileProcessor()
+         results = []
+
+         # Process URLs
+         if urls:
+             url_list = re.split(r'[,\n]', urls)
+             url_list = [url.strip() for url in url_list if url.strip()]
+
+             for url in url_list:
+                 validation = processor.validate_url(url)
+                 if validation.get('is_valid'):
+                     content = processor.fetch_content(url)
+                     if content:
+                         results.append({
+                             'source': 'url',
+                             'url': url,
+                             'content': content,
+                             'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+                         })
+
+         # Process files
+         if file:
+             results.extend(file_processor.process_file(file))
+
+         # Process text input
+         if text:
+             cleaned_text = processor.advanced_text_cleaning(text)
+             results.append({
+                 'source': 'direct_input',
+                 'content': cleaned_text,
+                 'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
+             })
+
+         # Generate output
+         if results:
+             output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
+             output_dir.mkdir(parents=True, exist_ok=True)
+             output_path = output_dir / f'processed_{int(time.time())}.json'
+
+             with open(output_path, 'w', encoding='utf-8') as f:
+                 json.dump(results, f, ensure_ascii=False, indent=2)
+
+             summary = f"Processed {len(results)} items successfully!"
+             return output
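The added hunk is truncated here at line 48 of the new file. As shown, it keeps process_all_inputs but none of the imports or definitions the function calls. A minimal sketch of what the function would still need in order to run, inferred only from the names referenced above; whether and where the new app.py actually provides them is not visible in this diff:

# Editorial sketch, not part of the commit: names process_all_inputs still
# references (re.split, time.strftime, time.time, Path, datetime.now, json.dump).
# Note that datetime was not imported in the removed header either.
import json
import re
import time
from datetime import datetime
from pathlib import Path

# URLProcessor and FileProcessor (and a logger, if the except-block logging is
# retained) must also be defined or imported; they are not shown in this hunk.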