acecalisto3 committed
Commit f1041ef · verified · 1 Parent(s): dad6950

Update app.py

Files changed (1):
  1. app.py (+10, -22)
app.py CHANGED
@@ -5,7 +5,6 @@ import time
 import logging
 import mimetypes
 import zipfile
-import tempfile
 from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
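Note: this hunk drops 'import tempfile', while a later hunk in the same commit adds a call to tempfile.mkdtemp() inside _process_zip_file (see below). Assuming this rendering of the diff is faithful, the import still has to be present for that call to resolve:

    import tempfile  # still needed: _process_zip_file calls tempfile.mkdtemp() after this commit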
@@ -32,7 +31,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
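The replacement header key contains a stray space: 'User -Agent' is sent as a literal, unrecognized header name, so servers effectively see requests' default User-Agent instead of the randomized one. A minimal sketch of the intended setup, assuming the fake_useragent package that provides UserAgent:

    from fake_useragent import UserAgent
    import requests

    session = requests.Session()
    session.headers.update({
        # The header name must be exactly 'User-Agent'; with the embedded
        # space it is transmitted as a different, unrecognized header.
        'User-Agent': UserAgent().random,
        'Accept-Language': 'en-US,en;q=0.5',
    })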
@@ -96,8 +95,8 @@ class URLProcessor:
             if not file_id:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
-
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+
+            direct_url = f"https://drive.google.com/uc? export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
 
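Similarly, the rewritten download URL gains a space after the '?', which corrupts the query string: Google Drive receives ' export' as the parameter name and ignores it, so the direct-download export never triggers. The unspaced form, with file_id standing in for the regex match object used above (its pattern is not shown in this diff):

    # file_id is assumed to be a re.Match whose group(1) is the Drive file ID
    direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"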
 
@@ -135,7 +134,6 @@ class URLProcessor:
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
 
-            # Try to find the main content in a more robust way
             main_content = soup.find('main') or soup.find('article') or soup.body
 
             if main_content:
@@ -178,29 +176,23 @@ class FileProcessor:
 
         try:
             for file in files:
-                # Check if the file is a Gradio File object or a string path
                 file_path = file.name if isinstance(file, gr.File) else file
 
-                # Log the file path being processed
                 logger.info(f"Processing file: {file_path}")
 
-                # Skip if it's a directory
                 if os.path.isdir(file_path):
                     logger.warning(f"Skipping directory: {file_path}")
                     continue
 
-                # Skip if file doesn't exist
                 if not os.path.exists(file_path):
                     logger.warning(f"File does not exist: {file_path}")
                     continue
 
-                # Check file size
                 file_size = os.path.getsize(file_path)
                 if file_size > self.max_file_size:
                     logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                    continue  # Skip this file
+                    continue
 
-                # Process based on file type
                 if zipfile.is_zipfile(file_path):
                     if self.processed_zip_count >= self.max_zip_files:
                         logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
@@ -218,9 +210,11 @@ class FileProcessor:
             logger.error(f"Error processing files: {str(e)}")
 
         return combined_data
-    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+
+    def _process_zip_file(self, zip_path: str) -> List[Dict]:
         """Process ZIP file contents"""
         results = []
+        temp_dir = tempfile.mkdtemp()
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(temp_dir)
         for root, _, files in os.walk(temp_dir):
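This hunk is where the import removed in the first hunk bites: _process_zip_file now allocates its own directory with tempfile.mkdtemp(), which raises NameError once 'import tempfile' is gone, and even with the import restored the directory is never deleted. A self-contained sketch of the same logic using a self-cleaning temporary directory (written as a module-level function for illustration):

    import os
    import tempfile
    import zipfile
    from typing import Dict, List

    def process_zip_file(zip_path: str) -> List[Dict]:
        """Extract a ZIP into a throwaway directory and collect non-empty text files."""
        results = []
        # TemporaryDirectory removes the extraction tree on exit,
        # unlike a bare tempfile.mkdtemp() result, which lingers on disk.
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    filepath = os.path.join(root, filename)
                    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    if content.strip():
                        results.append({
                            "source": "file",
                            "filename": filename,
                            "content": content,
                        })
        return results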
@@ -230,7 +224,7 @@ class FileProcessor:
                 try:
                     with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                         content = f.read()
-                        if content.strip():
+                    if content.strip():
                         results.append({
                             "source": "file",
                             "filename": filename,
@@ -280,7 +274,6 @@ class Chatbot:
         if not self.data:
             return "No data loaded. Please load your JSON data first."
 
-        # Simple keyword-based response logic
         for key, value in self.data.items():
             if key.lower() in user_input.lower():
                 return f"{key}: {value}"
@@ -337,7 +330,6 @@ def create_interface():
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
 
-        # Initialize chatbot
         chatbot = Chatbot()
 
         def process_all_inputs(urls, file, text):
@@ -347,7 +339,6 @@ def create_interface():
             file_processor = FileProcessor()
             results = []
 
-            # Process URLs
             if urls:
                 url_list = re.split(r'[,\n]', urls)
                 url_list = [url.strip() for url in url_list if url.strip()]
@@ -364,20 +355,17 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
-            # Process files
             if file:
-                results.extend(file_processor.process_file(file))
+                results.extend(file_processor.process_files(file))
 
-            # Process text input
             if text:
                 cleaned_text = processor.advanced_text_cleaning(text)
                 results.append({
                     'source': 'direct_input',
                     'content': cleaned_text,
-                    'timestamp': datetime.now().isoformat()
+                    'timestamp': datetime.now(). isoformat()
                 })
 
-            # Generate output
             if results:
                 output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                 output_dir.mkdir(parents=True, exist_ok=True)
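Two things stand out in this last hunk. The call site switches from process_file to process_files, which only works if FileProcessor actually defines process_files; the method definition is outside this diff, so the rename is taken on trust here. And the rewritten timestamp line inserts a space after the dot: 'datetime.now(). isoformat()' still parses, since whitespace around '.' is legal Python, but the conventional spelling avoids linter noise:

    from datetime import datetime

    # Both spellings are accepted by the parser; this one is idiomatic.
    timestamp = datetime.now().isoformat()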
 