acecalisto3 committed on
Commit
3c395db
·
verified ·
1 Parent(s): 14b5f32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -18
app.py CHANGED
@@ -151,35 +151,67 @@ class URLProcessor:
151
 
152
  class FileProcessor:
153
  """Class to handle file processing"""
154
-
155
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
156
  self.max_file_size = max_file_size
157
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
 
 
 
 
 
 
158
 
159
def process_file(self, file) -> List[Dict]:
    """Turn one uploaded file into a list of extraction records.

    An oversized file yields a one-element error list; any other failure
    is logged and an empty list is returned instead of raising.
    """
    if not file:
        return []

    records: List[Dict] = []
    try:
        file_size = os.path.getsize(file.name)
        if file_size > self.max_file_size:
            logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
            return [{"error": f"File size ({file_size} bytes) exceeds maximum allowed size of {self.max_file_size} bytes."}]

        # ZIP archives are extracted into a scratch directory that is
        # cleaned up automatically; anything else is handled directly.
        with tempfile.TemporaryDirectory() as scratch_dir:
            if not zipfile.is_zipfile(file.name):
                records.extend(self._process_single_file(file))
            else:
                records.extend(self._process_zip_file(file.name, scratch_dir))
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return []

    return records
182
-
183
  def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
184
  """Process ZIP file contents"""
185
  results = []
 
151
 
152
  class FileProcessor:
153
  """Class to handle file processing"""
154
+
155
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
156
  self.max_file_size = max_file_size
157
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
158
+ self.processed_zip_count = 0
159
+ self.max_zip_files = 5
160
+
161
def is_text_file(self, file_path: str) -> bool:
    """Return True when *file_path* ends with a supported text extension."""
    # str.endswith accepts a tuple of suffixes, which is exactly
    # equivalent to any(...endswith(ext)...) over the set.
    return file_path.lower().endswith(tuple(self.supported_text_extensions))
164
 
165
def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
    """Process multiple uploaded files and return a single combined extraction.

    Args:
        files: Gradio File objects or plain filesystem path strings.

    Returns:
        A list of per-file result dicts. Missing, oversized, or unsupported
        entries are skipped with a warning rather than aborting the batch;
        an unexpected error ends the batch early and returns what was
        collected so far.
    """
    if not files:
        return []

    combined_data = []
    self.processed_zip_count = 0

    try:
        for file in files:
            # Gradio may hand us File objects or plain path strings.
            file_path = file.name if isinstance(file, gr.File) else file
            logger.info(f"Processing file: {file_path}")

            # Skip directories and paths that no longer exist.
            if os.path.isdir(file_path):
                logger.warning(f"Skipping directory: {file_path}")
                continue
            if not os.path.exists(file_path):
                logger.warning(f"File does not exist: {file_path}")
                continue

            # Enforce the per-file size cap.
            file_size = os.path.getsize(file_path)
            if file_size > self.max_file_size:
                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                continue  # Skip this file

            # Dispatch on file type.
            if zipfile.is_zipfile(file_path):
                if self.processed_zip_count >= self.max_zip_files:
                    logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
                    continue
                self.processed_zip_count += 1
                # BUG FIX: _process_zip_file(zip_path, temp_dir) requires an
                # extraction directory; the previous single-argument call
                # raised TypeError (silently swallowed below) for every ZIP.
                # Extract into a scratch dir that is cleaned up automatically.
                with tempfile.TemporaryDirectory() as temp_dir:
                    combined_data.extend(self._process_zip_file(file_path, temp_dir))
            elif self.is_text_file(file_path):
                combined_data.extend(self._process_single_file(file_path))
            else:
                logger.warning(f"Unsupported file type: {file_path}")

    except Exception as e:
        logger.error(f"Error processing files: {str(e)}")

    return combined_data
 
 
215
  def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
216
  """Process ZIP file contents"""
217
  results = []