acecalisto3 committed
Commit c92df66 · verified · 1 Parent(s): 163699e

Update app.py

Files changed (1):
  1. app.py +110 -183
app.py CHANGED
@@ -4,16 +4,22 @@ import re
 import time
 import logging
 import mimetypes
+import concurrent.futures
+import string
 import zipfile
+import tempfile
 from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
+from urllib.parse import urlparse
+
 import requests
 import validators
-
 import gradio as gr
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 
 # Setup logging with detailed configuration
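
This hunk only pulls in the new dependencies; none of the hunks below wire up Cache or the ratelimit decorators. A minimal sketch of how they are typically composed around a fetch — the cache path, expiry, and call limits here are illustrative assumptions, not values from this commit:

import requests
from diskcache import Cache
from ratelimit import limits, sleep_and_retry

cache = Cache('.url_cache')  # hypothetical on-disk cache location

@sleep_and_retry                 # block until a slot frees up instead of raising
@limits(calls=10, period=60)     # at most 10 requests per minute
def fetch_cached(url: str) -> str:
    cached = cache.get(url)      # diskcache returns None on a miss
    if cached is not None:
        return cached
    text = requests.get(url, timeout=10).text
    cache.set(url, text, expire=3600)  # keep for an hour
    return text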
@@ -80,10 +86,15 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
+            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
+
+            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
+
+            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
@@ -96,7 +107,7 @@ class URLProcessor:
         if not file_id:
             logger.error(f"Invalid Google Drive URL: {url}")
             return None
-
+
         direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
         response = self.session.get(direct_url, timeout=self.timeout)
         response.raise_for_status()
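
The file_id match comes from a regex applied above this hunk (not shown in the diff). A common pattern for pulling the id out of a Drive share link, written here as an assumption rather than the commit's own regex:

import re

def drive_file_id(url: str):
    # Capture group 1 holds the id, matching the file_id.group(1) usage above
    return re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)

m = drive_file_id("https://drive.google.com/file/d/abc123XYZ/view")
if m:
    direct_url = f"https://drive.google.com/uc?export=download&id={m.group(1)}"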
@@ -132,151 +143,108 @@ class URLProcessor:
 
             soup = BeautifulSoup(response.text, 'html.parser')
 
+            # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
+
+            # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
 
-            if main_content:
-                text_content = main_content.get_text(separator='\n', strip=True)
-                cleaned_content = self.advanced_text_cleaning(text_content)
-
-                return {
-                    'content': cleaned_content,
-                    'content_type': response.headers.get('Content-Type', ''),
-                    'timestamp': datetime.now().isoformat()
-                }
-            else:
-                logger.warning(f"No main content found for URL: {url}")
-                return None
-
+            # Clean and structure content
+            text_content = main_content.get_text(separator='\n', strip=True)
+            cleaned_content = self.advanced_text_cleaning(text_content)
+
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
 
 class FileProcessor:
     """Class to handle file processing"""
-
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
+
+    def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-        self.processed_zip_count = 0
-        self.max_zip_files = 5
 
-    def is_text_file(self, file_path: str) -> bool:
-        """Check if the file is a text file based on its extension."""
-        return any(file_path.lower().endswith(ext) for ext in self.supported_text_extensions)
-
-
-    def validate_filepath(path: Path) -> bool:
-        """Validate file exists and has supported extension"""
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file"""
         try:
-            return path.exists() and path.is_file() and path.suffix.lower() in valid_extensions
-        except Exception as e:
-            logger.error(f"Validation error for {path}: {str(e)}")
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return (mime_type and mime_type.startswith('text/')) or \
+                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+        except Exception:
             return False
-
-    def process_files(base_path: str = "/app/data") -> list:
-        """Process files with validation and error handling"""
-        combined_data = []
-
-        base_dir = Path(base_path)
-        if not base_dir.exists():
-            base_dir.mkdir(parents=True, exist_ok=True)
-            logger.info(f"Created data directory at {base_dir}")
-
-        for item in base_dir.glob('**/*'):
-            try:
-                # Skip directories immediately
-                if item.is_dir():
-                    logger.debug(f"Skipping directory: {item}")
-                    continue
-
-                # Validate file using shared function
-                if not validate_filepath(item):
-                    logger.warning(f"Invalid file skipped: {item}")
-                    continue
-
-                logger.info(f"Processing valid file: {item.name}")
-
-                # Add actual processing logic here
-                file_data = process_single_file(item)  # Your processing function
-                combined_data.append(file_data)
-
-            except Exception as e:
-                logger.error(f"Failed processing {item}: {str(e)}")
-                continue
-
-        return combined_data
-
-    def process_single_file(file_path: Path) -> dict:
-        """Example processing function"""
-        # Add your actual file processing logic here
-        return {
-            'filename': file_path.name,
-            'content': "processed content",  # Replace with real content
-            'metadata': {}  # Add actual metadata
-        }
-        except Exception as e:
-            logger.error(f"File processing error: {e}")
+
+    def process_file(self, file) -> List[Dict]:
+        """Process uploaded file with enhanced error handling"""
+        if not file:
             return []
-
-    def _process_zip_file(self, zip_file_path: str) -> List[Dict]:
-        """Process a ZIP file and extract data from text files within."""
-        extracted_data = []
+
+        dataset = []
         try:
-            with zipfile.ZipFile(zip_file_path, 'r') as zf:
-                for name in zf.namelist():
-                    if self.is_text_file(name):
-                        try:
-                            file_info = zf.getinfo(name)
-                            with zf.open(name) as f:
-                                content = f.read().decode('utf-8', errors='ignore')
-
-                            # Use file_info for file size and date/time
-                            extracted_data.append({
-                                'source': 'zip',
-                                'filename': name,
-                                'file_size': file_info.file_size,  # Get file size from ZipInfo
-                                'mime_type': mimetypes.guess_type(name)[0],
-                                'created': datetime(*file_info.date_time).isoformat(),  # Get date from ZipInfo
-                                'modified': datetime(*file_info.date_time).isoformat(),
-                                'content': content,
-                                'timestamp': datetime.now().isoformat()
-                            })
-                        except Exception as e:
-                            logger.error(f"Error processing file {name} from ZIP: {e}")
-        except zipfile.BadZipFile:
-            logger.error(f"Error: {zip_file_path} is not a valid ZIP file.")
-        except Exception as e:
-            logger.error(f"Error processing ZIP file {zip_file_path}: {e}")
-        return extracted_data
-
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
 
-class Chatbot:
-    """Simple chatbot that uses provided JSON data for responses."""
-
-    def __init__(self):
-        self.data = None
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
+                else:
+                    dataset.extend(self._process_single_file(file))
 
-    def load_data(self, json_data: str):
-        """Load JSON data into the chatbot."""
-        try:
-            self.data = json.loads(json_data)
-            return "Data loaded successfully!"
-        except json.JSONDecodeError:
-            return "Invalid JSON data. Please check your input."
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
 
-    def chat(self, user_input: str) -> str:
-        """Generate a response based on user input and loaded data."""
-        if not self.data:
-            return "No data loaded. Please load your JSON data first."
+        return dataset
 
-        for key, value in self.data.items():
-            if key.lower() in user_input.lower():
-                return f"{key}: {value}"
-
-        return "I don't have information on that. Please ask about something else."
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+
+    def _process_single_file(self, file) -> List[Dict]:
+        try:
+            file_stat = os.stat(file.name)
+            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
 
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
@@ -292,7 +260,7 @@ def create_interface():
 
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
+                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
@@ -305,31 +273,16 @@ def create_interface():
 
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
-                label="Raw Text Input",
+                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
 
-        with gr.Tab("Chat"):
-            json_input = gr.Textbox(
-                label="Load JSON Data",
-                placeholder="Paste your JSON data here...",
-                lines=5
-            )
-            load_btn = gr.Button("Load Data", variant="primary")
-            chat_input = gr.Textbox(
-                label="Chat with your data",
-                placeholder="Type your question here..."
-            )
-            chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
-
         process_btn = gr.Button("Process Input", variant="primary")
 
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
 
-        chatbot = Chatbot()
-
         def process_all_inputs(urls, file, text):
             """Process all input types with progress tracking"""
             try:
@@ -337,6 +290,7 @@ def create_interface():
                 file_processor = FileProcessor()
                 results = []
 
+                # Process URLs
                 if urls:
                     url_list = re.split(r'[,\n]', urls)
                     url_list = [url.strip() for url in url_list if url.strip()]
@@ -352,10 +306,12 @@ def create_interface():
                             'content': content,
                             'timestamp': datetime.now().isoformat()
                         })
-
-                if file:
-                    results.extend(file_processor.process_files(file))
 
+                # Process files
+                if file:
+                    results.extend(file_processor.process_file(file))
+
+                # Process text input
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
@@ -364,6 +320,7 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
+                # Generate output
                 if results:
                     output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                     output_dir.mkdir(parents=True, exist_ok=True)
@@ -373,6 +330,7 @@ def create_interface():
                         json.dump(results, f, ensure_ascii=False, indent=2)
 
                     summary = f"Processed {len(results)} items successfully!"
+                    # Convert Path object to string here
                     return str(output_path), summary
                 else:
                     return None, "No valid content to process."
@@ -380,52 +338,23 @@ def create_interface():
             except Exception as e:
                 logger.error(f"Processing error: {e}")
                 return None, f"Error: {str(e)}"
-
-        def load_chat_data(json_data):
-            """Load JSON data into the chatbot."""
-            return chatbot.load_data(json_data)
-
-        def chat_with_data(user_input):
-            """Chat with the loaded data."""
-            return chatbot.chat(user_input)
-
+
         process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input],
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input],
             outputs=[output_file, output_text]
         )
-
-        load_btn.click(
-            load_chat_data,
-            inputs=json_input,
-            outputs=chat_output
-        )
-
-        chat_input.submit(
-            chat_with_data,
-            inputs=chat_input,
-            outputs=chat_output
-        )
 
         gr.Markdown("""
         ### Usage Guidelines
         - **URL Processing**: Enter valid HTTP/HTTPS URLs
         - **File Input**: Upload text files or ZIP archives
         - **Text Input**: Direct text processing
-        - **Chat**: Load JSON data and ask questions about it
         - Advanced cleaning and validation included
         """)
 
     return interface
 
-def gradio_interface_handler(input_path: str):
-    """Example Gradio handler function"""
-    if not validate_filepath(Path(input_path)):
-        raise ValueError("Invalid file path provided")
-
-    processed_data = process_files(input_path)
-    return format_output(processed_data)
-
 def main():
     # Configure system settings
     mimetypes.init()
@@ -437,10 +366,8 @@ def main():
     interface.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        show_error=True,
         share=False,
-        inbrowser=False,  # Disable browser opening in container
-        debug=False  # Disable debug mode for production
+        inbrowser=True,
+        debug=True
     )
-
-if __name__ == "__main__":
-    main()
 
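Downstream of the interface, results are written as dated JSON under output/. A sketch of reading them back — the output/<date> directory comes from the code above, while the exact filename is built outside the hunks shown, hence the glob:

import json
from pathlib import Path
from datetime import datetime

out_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
for path in sorted(out_dir.glob('*.json')):
    with open(path, encoding='utf-8') as f:
        items = json.load(f)
    print(path.name, f"{len(items)} item(s)")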