acecalisto3 committed on
Commit
3244846
·
verified ·
1 Parent(s): 6a91fa4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -65
app.py CHANGED
@@ -7,7 +7,7 @@ import mimetypes
7
  import zipfile
8
  import tempfile
9
  from datetime import datetime
10
- from typing import List, Dict, Optional, Union
11
  from pathlib import Path
12
  import requests
13
  import validators
@@ -32,7 +32,7 @@ class URLProcessor:
32
  self.session = requests.Session()
33
  self.timeout = 10 # seconds
34
  self.session.headers.update({
35
- 'User-Agent': UserAgent().random,
36
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
37
  'Accept-Language': 'en-US,en;q=0.5',
38
  'Accept-Encoding': 'gzip, deflate, br',
@@ -96,7 +96,7 @@ class URLProcessor:
96
  if not file_id:
97
  logger.error(f"Invalid Google Drive URL: {url}")
98
  return None
99
-
100
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
101
  response = self.session.get(direct_url, timeout=self.timeout)
102
  response.raise_for_status()
@@ -109,7 +109,7 @@ class URLProcessor:
109
  except Exception as e:
110
  logger.error(f"Google Drive processing failed: {e}")
111
  return None
112
-
113
  def _handle_google_calendar(self, url: str) -> Optional[Dict]:
114
  """Process Google Calendar ICS feeds"""
115
  try:
@@ -165,68 +165,64 @@ class FileProcessor:
165
  except Exception:
166
  return False
167
 
168
- def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
169
- """Process multiple uploaded files and return a single JSON extraction"""
170
- if not files:
171
  return []
172
 
173
- combined_data = []
174
  try:
175
- for file in files:
176
- # Check if the file is a Gradio File object or a string path
177
- file_name = file.name if isinstance(file, gr.File) else file
178
- if os.path.isfile(file_name):
179
- file_size = os.path.getsize(file_name)
180
- if file_size > self.max_file_size:
181
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
182
- continue # Skip this file
183
 
184
- if zipfile.is_zipfile(file_name):
185
- combined_data.extend(self._process_zip_file(file_name))
186
- else:
187
- combined_data.extend(self._process_single_file(file_name))
188
  else:
189
- logger.warning(f"Skipping directory: {file_name}")
190
 
191
  except Exception as e:
192
- logger.error(f"Error processing files: {str(e)}")
193
  return []
194
 
195
- def _process_zip_file(self, zip_path: str) -> List[Dict]:
 
 
196
  """Process ZIP file contents"""
197
  results = []
198
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
199
- with tempfile.TemporaryDirectory() as temp_dir:
200
- zip_ref.extractall(temp_dir)
201
- for root, _, files in os.walk(temp_dir):
202
- for filename in files:
203
- filepath = os.path.join(root, filename)
204
- if self.is_text_file(filepath):
205
- try:
206
- with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
207
- content = f.read()
208
- if content.strip():
209
- results.append({
210
- "source": "file",
211
- "filename": filename,
212
- "content": content,
213
- "timestamp": datetime.now().isoformat()
214
- })
215
- except Exception as e:
216
- logger.error(f"Error reading file {filename}: {str(e)}")
217
  return results
218
 
219
- def _process_single_file(self, file_path: str) -> List[Dict]:
220
  try:
221
- file_stat = os.stat(file_path)
222
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
223
  content = f.read()
224
 
225
  return [{
226
  'source': 'file',
227
- 'filename': os.path.basename(file_path),
228
  'file_size': file_stat.st_size,
229
- 'mime_type': mimetypes.guess_type(file_path)[0],
230
  'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
231
  'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
232
  'content': content,
@@ -266,18 +262,13 @@ def create_interface():
266
  """Create a comprehensive Gradio interface with advanced features"""
267
 
268
  css = """
269
- body { background-color: #f0f4f8; font-family: 'Arial', sans-serif; }
270
- .container { max-width: 1200px; margin: auto; padding: 20px; border-radius: 8px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); }
271
- h1 { color: #333; }
272
- .tab { background-color: #ffffff; border-radius: 8px; padding: 20px; margin-bottom: 20px; }
273
- .button { background-color: #007bff; color: white; border: none; border-radius: 5px; padding: 10px 20px; cursor: pointer; }
274
- .button:hover { background-color: #0056b3; }
275
- .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 5px; }
276
- .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 5px; }
277
  """
278
 
279
- with gr.Blocks(css=css, title="Advanced Data Processing App") as interface:
280
- gr.Markdown("# 🌐 Advanced Data Processing Toolkit")
281
 
282
  with gr.Tab("URL Processing"):
283
  url_input = gr.Textbox(
@@ -288,7 +279,7 @@ def create_interface():
288
 
289
  with gr.Tab("File Input"):
290
  file_input = gr.File(
291
- label="Upload text files or ZIP archives",
292
  file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
293
  )
294
 
@@ -309,7 +300,7 @@ def create_interface():
309
  placeholder="Paste your JSON data here...",
310
  lines=5
311
  )
312
- load_btn = gr.Button("Load Data", variant="primary")
313
  chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
314
 
315
  process_btn = gr.Button("Process Input", variant="primary")
@@ -320,7 +311,7 @@ def create_interface():
320
  # Initialize chatbot
321
  chatbot = Chatbot()
322
 
323
- def process_all_inputs(urls, files, text):
324
  """Process all input types with progress tracking"""
325
  try:
326
  processor = URLProcessor()
@@ -338,16 +329,15 @@ def create_interface():
338
  content = processor.fetch_content(url)
339
  if content:
340
  results.append({
341
- 'source': ' url',
342
  'url': url,
343
  'content': content,
344
  'timestamp': datetime.now().isoformat()
345
  })
346
 
347
  # Process files
348
- if files:
349
- combined_data = file_processor.process_files(files)
350
- results.extend(combined_data)
351
 
352
  # Process text input
353
  if text:
@@ -405,7 +395,7 @@ def create_interface():
405
  gr.Markdown("""
406
  ### Usage Guidelines
407
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
408
- - **File Input**: Upload multiple text files or ZIP archives
409
  - **Text Input**: Direct text processing
410
  - **Chat**: Load your JSON data and ask questions about it
411
  - Advanced cleaning and validation included
@@ -430,4 +420,4 @@ def main():
430
  )
431
 
432
  if __name__ == "__main__":
433
- main()
 
7
  import zipfile
8
  import tempfile
9
  from datetime import datetime
10
+ from typing import List, Dict, Optional
11
  from pathlib import Path
12
  import requests
13
  import validators
 
32
  self.session = requests.Session()
33
  self.timeout = 10 # seconds
34
  self.session.headers.update({
35
+ 'User-Agent': UserAgent().random,
36
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
37
  'Accept-Language': 'en-US,en;q=0.5',
38
  'Accept-Encoding': 'gzip, deflate, br',
 
96
  if not file_id:
97
  logger.error(f"Invalid Google Drive URL: {url}")
98
  return None
99
+
100
  direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
101
  response = self.session.get(direct_url, timeout=self.timeout)
102
  response.raise_for_status()
 
109
  except Exception as e:
110
  logger.error(f"Google Drive processing failed: {e}")
111
  return None
112
+
113
  def _handle_google_calendar(self, url: str) -> Optional[Dict]:
114
  """Process Google Calendar ICS feeds"""
115
  try:
 
165
  except Exception:
166
  return False
167
 
168
+ def process_file(self, file) -> List[Dict]:
169
+ """Process uploaded file with enhanced error handling"""
170
+ if not file:
171
  return []
172
 
173
+ dataset = []
174
  try:
175
+ file_size = os.path.getsize(file.name)
176
+ if file_size > self.max_file_size:
177
+ logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
178
+ return [{"error": f"File size ({file_size} bytes) exceeds maximum allowed size of {self.max_file_size} bytes."}]
 
 
 
 
179
 
180
+ with tempfile.TemporaryDirectory() as temp_dir:
181
+ if zipfile.is_zipfile(file.name):
182
+ dataset.extend(self._process_zip_file(file.name, temp_dir))
 
183
  else:
184
+ dataset.extend(self._process_single_file(file))
185
 
186
  except Exception as e:
187
+ logger.error(f"Error processing file: {str(e)}")
188
  return []
189
 
190
+ return dataset
191
+
192
+ def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
193
  """Process ZIP file contents"""
194
  results = []
195
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
196
+ zip_ref.extractall(temp_dir)
197
+ for root, _, files in os.walk(temp_dir):
198
+ for filename in files:
199
+ filepath = os.path.join(root, filename)
200
+ if self.is_text_file(filepath):
201
+ try:
202
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
203
+ content = f.read()
204
+ if content.strip():
205
+ results.append({
206
+ "source": "file",
207
+ "filename": filename,
208
+ "content": content,
209
+ "timestamp": datetime.now().isoformat()
210
+ })
211
+ except Exception as e:
212
+ logger.error(f"Error reading file {filename}: {str(e)}")
 
213
  return results
214
 
215
+ def _process_single_file(self, file) -> List[Dict]:
216
  try:
217
+ file_stat = os.stat(file.name)
218
+ with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
219
  content = f.read()
220
 
221
  return [{
222
  'source': 'file',
223
+ 'filename': os.path.basename(file.name),
224
  'file_size': file_stat.st_size,
225
+ 'mime_type': mimetypes.guess_type(file.name)[0],
226
  'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
227
  'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
228
  'content': content,
 
262
  """Create a comprehensive Gradio interface with advanced features"""
263
 
264
  css = """
265
+ .container { max-width: 1200px; margin: auto; }
266
+ .warning { background-color: #fff3cd; color: #856404; }
267
+ .error { background-color: #f8d7da; color: #721c24; }
 
 
 
 
 
268
  """
269
 
270
+ with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
271
+ gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
272
 
273
  with gr.Tab("URL Processing"):
274
  url_input = gr.Textbox(
 
279
 
280
  with gr.Tab("File Input"):
281
  file_input = gr.File(
282
+ label="Upload text file or ZIP archive",
283
  file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
284
  )
285
 
 
300
  placeholder="Paste your JSON data here...",
301
  lines=5
302
  )
303
+ load_btn = gr.Button("Load Data")
304
  chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
305
 
306
  process_btn = gr.Button("Process Input", variant="primary")
 
311
  # Initialize chatbot
312
  chatbot = Chatbot()
313
 
314
+ def process_all_inputs(urls, file, text):
315
  """Process all input types with progress tracking"""
316
  try:
317
  processor = URLProcessor()
 
329
  content = processor.fetch_content(url)
330
  if content:
331
  results.append({
332
+ 'source': 'url',
333
  'url': url,
334
  'content': content,
335
  'timestamp': datetime.now().isoformat()
336
  })
337
 
338
  # Process files
339
+ if file:
340
+ results.extend(file_processor.process_file(file))
 
341
 
342
  # Process text input
343
  if text:
 
395
  gr.Markdown("""
396
  ### Usage Guidelines
397
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
398
+ - **File Input**: Upload text files or ZIP archives
399
  - **Text Input**: Direct text processing
400
  - **Chat**: Load your JSON data and ask questions about it
401
  - Advanced cleaning and validation included
 
420
  )
421
 
422
  if __name__ == "__main__":
423
+ main()