acecalisto3 committed on
Commit
345d19b
·
verified ·
1 Parent(s): 815015e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -35
app.py CHANGED
@@ -32,7 +32,7 @@ class URLProcessor:
32
  self.session = requests.Session()
33
  self.timeout = 10 # seconds
34
  self.session.headers.update({
35
- 'User -Agent': UserAgent().random,
36
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
37
  'Accept-Language': 'en-US,en;q=0.5',
38
  'Accept-Encoding': 'gzip, deflate, br',
@@ -97,7 +97,7 @@ class URLProcessor:
97
  logger.error(f"Invalid Google Drive URL: {url}")
98
  return None
99
 
100
- direct_url = f"https://drive.google.com/uc? export=download&id={file_id.group(1)}"
101
  response = self.session.get(direct_url, timeout=self.timeout)
102
  response.raise_for_status()
103
 
@@ -149,7 +149,7 @@ class URLProcessor:
149
  else:
150
  logger.warning(f"No main content found for URL: {url}")
151
  return None
152
-
153
  except Exception as e:
154
  logger.error(f"HTML processing failed: {e}")
155
  return None
@@ -202,7 +202,7 @@ class FileProcessor:
202
  zip_results = self._process_zip_file(file_path)
203
  combined_data.extend(zip_results)
204
  elif self.is_text_file(file_path):
205
- file_results = self._process_single_file(file_path)
206
  combined_data.extend(file_results)
207
  else:
208
  logger.warning(f"Unsupported file type: {file_path}")
@@ -212,29 +212,7 @@ class FileProcessor:
212
 
213
  return combined_data
214
 
215
- def _process_zip_file(self, zip_path: str) -> List[Dict]:
216
- """Process ZIP file contents"""
217
- results = []
218
- temp_dir = tempfile.mkdtemp()
219
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
220
- zip_ref.extractall(temp_dir)
221
- for root, _, files in os.walk(temp_dir):
222
- for filename in files:
223
- filepath = os.path.join(root, filename)
224
- if self.is_text_file(filepath):
225
- try:
226
- with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
227
- content = f.read()
228
- if content.strip():
229
- results.append({
230
- "source": "file",
231
- "filename": filename,
232
- "content": content,
233
- "timestamp": datetime.now().isoformat()
234
- })
235
- except Exception as e:
236
- logger.error(f"Error reading file {filename}: {str(e)}")
237
- return results
238
 
239
  def _process_single_file(self, file) -> List[Dict]:
240
  try:
@@ -255,6 +233,38 @@ class FileProcessor:
255
  except Exception as e:
256
  logger.error(f"File processing error: {e}")
257
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
  class Chatbot:
260
  """Simple chatbot that uses provided JSON data for responses."""
@@ -295,7 +305,7 @@ def create_interface():
295
 
296
  with gr.Tab("URL Processing"):
297
  url_input = gr.Textbox(
298
- label="Enter URLs (comma or newline separated)",
299
  lines=5,
300
  placeholder="https://example1.com\nhttps://example2.com"
301
  )
@@ -308,7 +318,7 @@ def create_interface():
308
 
309
  with gr.Tab("Text Input"):
310
  text_input = gr.Textbox(
311
- label="Raw Text Input",
312
  lines=5,
313
  placeholder="Paste your text here..."
314
  )
@@ -358,13 +368,13 @@ def create_interface():
358
 
359
  if file:
360
  results.extend(file_processor.process_files(file))
361
-
362
  if text:
363
  cleaned_text = processor.advanced_text_cleaning(text)
364
  results.append({
365
  'source': 'direct_input',
366
  'content': cleaned_text,
367
- 'timestamp': datetime.now(). isoformat()
368
  })
369
 
370
  if results:
@@ -393,8 +403,8 @@ def create_interface():
393
  return chatbot.chat(user_input)
394
 
395
  process_btn.click(
396
- process_all_inputs,
397
- inputs=[url_input, file_input, text_input],
398
  outputs=[output_file, output_text]
399
  )
400
 
@@ -434,8 +444,8 @@ def main():
434
  server_port=7860,
435
  share=False,
436
  inbrowser=False, # Disable browser opening in container
437
- debug=False # Disable debug mode for production
438
  )
439
 
440
  if __name__ == "__main__":
441
- main()
 
32
  self.session = requests.Session()
33
  self.timeout = 10 # seconds
34
  self.session.headers.update({
35
+ 'User-Agent': UserAgent().random,
36
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
37
  'Accept-Language': 'en-US,en;q=0.5',
38
  'Accept-Encoding': 'gzip, deflate, br',
 
97
  logger.error(f"Invalid Google Drive URL: {url}")
98
  return None
99
 
100
+ direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
101
  response = self.session.get(direct_url, timeout=self.timeout)
102
  response.raise_for_status()
103
 
 
149
  else:
150
  logger.warning(f"No main content found for URL: {url}")
151
  return None
152
+
153
  except Exception as e:
154
  logger.error(f"HTML processing failed: {e}")
155
  return None
 
202
  zip_results = self._process_zip_file(file_path)
203
  combined_data.extend(zip_results)
204
  elif self.is_text_file(file_path):
205
+ file_results = self._process_single_file(file) # Changed file_path to file
206
  combined_data.extend(file_results)
207
  else:
208
  logger.warning(f"Unsupported file type: {file_path}")
 
212
 
213
  return combined_data
214
 
215
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  def _process_single_file(self, file) -> List[Dict]:
218
  try:
 
233
  except Exception as e:
234
  logger.error(f"File processing error: {e}")
235
  return []
236
+
237
+ def _process_zip_file(self, zip_file_path: str) -> List[Dict]:
238
+ """Process a ZIP file and extract data from text files within."""
239
+ extracted_data = []
240
+ try:
241
+ with zipfile.ZipFile(zip_file_path, 'r') as zf:
242
+ for name in zf.namelist():
243
+ if self.is_text_file(name):
244
+ try:
245
+ file_info = zf.getinfo(name)
246
+ with zf.open(name) as f:
247
+ content = f.read().decode('utf-8', errors='ignore')
248
+
249
+ # Use file_info for file size and date/time
250
+ extracted_data.append({
251
+ 'source': 'zip',
252
+ 'filename': name,
253
+ 'file_size': file_info.file_size, # Get file size from ZipInfo
254
+ 'mime_type': mimetypes.guess_type(name)[0],
255
+ 'created': datetime(*file_info.date_time).isoformat(), # Get date from ZipInfo
256
+ 'modified': datetime(*file_info.date_time).isoformat(),
257
+ 'content': content,
258
+ 'timestamp': datetime.now().isoformat()
259
+ })
260
+ except Exception as e:
261
+ logger.error(f"Error processing file {name} from ZIP: {e}")
262
+ except zipfile.BadZipFile:
263
+ logger.error(f"Error: {zip_file_path} is not a valid ZIP file.")
264
+ except Exception as e:
265
+ logger.error(f"Error processing ZIP file {zip_file_path}: {e}")
266
+ return extracted_data
267
+
268
 
269
  class Chatbot:
270
  """Simple chatbot that uses provided JSON data for responses."""
 
305
 
306
  with gr.Tab("URL Processing"):
307
  url_input = gr.Textbox(
308
+ label="Enter URLs (comma or newline separated)",
309
  lines=5,
310
  placeholder="https://example1.com\nhttps://example2.com"
311
  )
 
318
 
319
  with gr.Tab("Text Input"):
320
  text_input = gr.Textbox(
321
+ label="Raw Text Input",
322
  lines=5,
323
  placeholder="Paste your text here..."
324
  )
 
368
 
369
  if file:
370
  results.extend(file_processor.process_files(file))
371
+
372
  if text:
373
  cleaned_text = processor.advanced_text_cleaning(text)
374
  results.append({
375
  'source': 'direct_input',
376
  'content': cleaned_text,
377
+ 'timestamp': datetime.now().isoformat()
378
  })
379
 
380
  if results:
 
403
  return chatbot.chat(user_input)
404
 
405
  process_btn.click(
406
+ process_all_inputs,
407
+ inputs=[url_input, file_input, text_input],
408
  outputs=[output_file, output_text]
409
  )
410
 
 
444
  server_port=7860,
445
  share=False,
446
  inbrowser=False, # Disable browser opening in container
447
+ debug=False # Disable debug mode for production
448
  )
449
 
450
  if __name__ == "__main__":
451
+ main()