acecalisto3 committed
Commit 61eaf76 · verified · 1 Parent(s): 3a075bd

Update app.py

Files changed (1):
  1. app.py (+62 -34)

app.py CHANGED
@@ -151,10 +151,12 @@ class URLProcessor:
 
 class FileProcessor:
     """Class to handle file processing"""
-
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
+
+    def __init__(self, max_file_size: int = 1536 * 1024 * 1024):  # 1.5GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+        self.processed_zip_count = 0
+        self.max_zip_files = 5
 
     def is_text_file(self, filepath: str) -> bool:
         """Check if file is a text file"""
@@ -171,57 +173,83 @@ class FileProcessor:
             return []
 
         combined_data = []
+        self.processed_zip_count = 0
+
         try:
             for file in files:
                 # Check if the file is a Gradio File object or a string path
-                file_name = file.name if isinstance(file, gr.File) else file
-                file_size = os.path.getsize(file_name)
+                file_path = file.name if isinstance(file, gr.File) else file
+
+                # Skip if it's a directory
+                if os.path.isdir(file_path):
+                    logger.warning(f"Skipping directory: {file_path}")
+                    continue
+
+                # Skip if file doesn't exist
+                if not os.path.exists(file_path):
+                    logger.warning(f"File does not exist: {file_path}")
+                    continue
+
+                # Check file size
+                file_size = os.path.getsize(file_path)
                 if file_size > self.max_file_size:
                     logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                     continue  # Skip this file
 
-                if zipfile.is_zipfile(file_name):
-                    combined_data.extend(self._process_zip_file(file_name))
+                # Process based on file type
+                if zipfile.is_zipfile(file_path):
+                    if self.processed_zip_count >= self.max_zip_files:
+                        logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
+                        continue
+                    self.processed_zip_count += 1
+                    zip_results = self._process_zip_file(file_path)
+                    combined_data.extend(zip_results)
                 else:
-                    combined_data.extend(self._process_single_file(file_name))
+                    file_results = self._process_single_file(file_path)
+                    combined_data.extend(file_results)
 
         except Exception as e:
             logger.error(f"Error processing files: {str(e)}")
-            return []
-        finally:
-            return combined_data
+
+        return combined_data
 
     def _process_zip_file(self, zip_path: str) -> List[Dict]:
-        """Process ZIP file contents"""
+        """Process ZIP file contents more efficiently"""
         results = []
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                zip_ref.extractall(temp_dir)
-                for root, _, files in os.walk(temp_dir):
-                    for filename in files:
-                        filepath = os.path.join(root, filename)
-                        if self.is_text_file(filepath):
-                            try:
-                                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                                    content = f.read()
-                                if content.strip():
-                                    results.append({
-                                        "source": "file",
-                                        "filename": filename,
-                                        "content": content,
-                                        "timestamp": datetime.now().isoformat()
-                                    })
-                            except Exception as e:
-                                logger.error(f"Error reading file {filename}: {str(e)}")
+        try:
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                # Get list of files in the zip
+                file_list = [file for file in zip_ref.namelist()
+                             if not file.endswith('/') and not file.startswith('__MACOSX')]
+
+                # Process each file directly from the zip without extracting all files
+                for filename in file_list:
+                    # Check if it's a text file by extension
+                    if any(filename.lower().endswith(ext) for ext in self.supported_text_extensions):
+                        try:
+                            with zip_ref.open(filename) as file:
+                                content = file.read().decode('utf-8', errors='ignore')
+                            if content.strip():
+                                results.append({
+                                    "source": "zip_file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename} from zip: {str(e)}")
+        except Exception as e:
+            logger.error(f"Error processing ZIP file {zip_path}: {str(e)}")
         return results
 
     def _process_single_file(self, file_path: str) -> List[Dict]:
+        """Process a single file and return its content"""
+        results = []
         try:
             file_stat = os.stat(file_path)
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 content = f.read()
-
-            return [{
+            results.append({
                 'source': 'file',
                 'filename': os.path.basename(file_path),
                 'file_size': file_stat.st_size,
@@ -230,10 +258,10 @@ class FileProcessor:
                 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                 'content': content,
                 'timestamp': datetime.now().isoformat()
-            }]
+            })
         except Exception as e:
             logger.error(f"File processing error: {e}")
-            return []
+        return results
 
 class Chatbot:
     """Simple chatbot that uses provided JSON data for responses."""