acecalisto3 committed on
Commit
6a91fa4
·
verified ·
1 Parent(s): 172a6d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -65
app.py CHANGED
@@ -15,9 +15,6 @@ import gradio as gr
15
  from bs4 import BeautifulSoup
16
  from fake_useragent import UserAgent
17
  from cleantext import clean
18
- from starlette.applications import Starlette
19
- from starlette.responses import JSONResponse
20
- from starlette.routing import Route
21
 
22
  # Setup logging with detailed configuration
23
  logging.basicConfig(
@@ -154,12 +151,19 @@ class URLProcessor:
154
 
155
  class FileProcessor:
156
  """Class to handle file processing"""
157
-
158
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
159
  self.max_file_size = max_file_size
160
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
161
- self.processed_zip_count = 0
162
- self.max_zip_files = 5
 
 
 
 
 
 
 
163
 
164
  def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
165
  """Process multiple uploaded files and return a single JSON extraction"""
@@ -167,76 +171,70 @@ class FileProcessor:
167
  return []
168
 
169
  combined_data = []
170
- self.processed_zip_count = 0
171
-
172
  try:
173
  for file in files:
174
  # Check if the file is a Gradio File object or a string path
175
- file_path = file.name if isinstance(file, gr.File) else file
176
-
177
- # Log the file path being processed
178
- logger.info(f"Processing file: {file_path}")
179
-
180
- # Skip if it's a directory
181
- if os.path.isdir(file_path):
182
- logger.warning(f"Skipping directory: {file_path}")
183
- continue
184
-
185
- # Skip if file doesn't exist
186
- if not os.path.exists(file_path):
187
- logger.warning(f"File does not exist: {file_path}")
188
- continue
189
-
190
- # Check file size
191
- file_size = os.path.getsize(file_path)
192
- if file_size > self.max_file_size:
193
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
194
- continue # Skip this file
195
 
196
- # Process based on file type
197
- if zipfile.is_zipfile(file_path):
198
- if self.processed_zip_count >= self.max_zip_files:
199
- logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
200
- continue
201
- self.processed_zip_count += 1
202
- zip_results = self._process_zip_file(file_path)
203
- combined_data.extend(zip_results)
204
  else:
205
- file_results = self._process_single_file(file_path)
206
- combined_data.extend(file_results)
207
 
208
  except Exception as e:
209
  logger.error(f"Error processing files: {str(e)}")
210
-
211
- return combined_data
212
 
213
  def _process_zip_file(self, zip_path: str) -> List[Dict]:
214
- """Process ZIP file contents more efficiently"""
215
  results = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  try:
217
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
218
- # Get list of files in the zip
219
- file_list = [file for file in zip_ref.namelist()
220
- if not file.endswith('/') and not file.startswith('__MACOSX')]
221
-
222
- # Process each file directly from the zip without extracting all files
223
- for filename in file_list:
224
- if any(filename.lower().endswith(ext) for ext in self.supported_text_extensions):
225
- try:
226
- with zip_ref.open(filename) as file:
227
- content = file.read().decode('utf-8', errors='ignore')
228
- if content.strip():
229
- results.append({
230
- "source": "zip_file",
231
- "filename": filename,
232
- "content": content,
233
- "timestamp": datetime.now().isoformat()
234
- })
235
- except Exception as e:
236
- logger.error(f"Error reading file {filename} from zip: {str(e)}")
237
  except Exception as e:
238
- logger.error(f"Error processing ZIP file {zip_path}: {str(e)}")
239
- return results
240
 
241
  class Chatbot:
242
  """Simple chatbot that uses provided JSON data for responses."""
@@ -418,10 +416,10 @@ def create_interface():
418
  def main():
419
  # Configure system settings
420
  mimetypes.init()
421
-
422
  # Create and launch interface
423
  interface = create_interface()
424
-
425
  # Launch with proper configuration
426
  interface.launch(
427
  server_name="0.0.0.0",
 
15
  from bs4 import BeautifulSoup
16
  from fake_useragent import UserAgent
17
  from cleantext import clean
 
 
 
18
 
19
  # Setup logging with detailed configuration
20
  logging.basicConfig(
 
151
 
152
  class FileProcessor:
153
  """Class to handle file processing"""
154
+
155
  def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
156
  self.max_file_size = max_file_size
157
  self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
158
+
159
+ def is_text_file(self, filepath: str) -> bool:
160
+ """Check if file is a text file"""
161
+ try:
162
+ mime_type, _ = mimetypes.guess_type(filepath)
163
+ return (mime_type and mime_type.startswith('text/')) or \
164
+ (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
165
+ except Exception:
166
+ return False
167
 
168
  def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
169
  """Process multiple uploaded files and return a single JSON extraction"""
 
171
  return []
172
 
173
  combined_data = []
 
 
174
  try:
175
  for file in files:
176
  # Check if the file is a Gradio File object or a string path
177
+ file_name = file.name if isinstance(file, gr.File) else file
178
+ if os.path.isfile(file_name):
179
+ file_size = os.path.getsize(file_name)
180
+ if file_size > self.max_file_size:
181
+ logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
182
+ continue # Skip this file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ if zipfile.is_zipfile(file_name):
185
+ combined_data.extend(self._process_zip_file(file_name))
186
+ else:
187
+ combined_data.extend(self._process_single_file(file_name))
 
 
 
 
188
  else:
189
+ logger.warning(f"Skipping directory: {file_name}")
 
190
 
191
  except Exception as e:
192
  logger.error(f"Error processing files: {str(e)}")
193
+ return []
 
194
 
195
  def _process_zip_file(self, zip_path: str) -> List[Dict]:
196
+ """Process ZIP file contents"""
197
  results = []
198
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
199
+ with tempfile.TemporaryDirectory() as temp_dir:
200
+ zip_ref.extractall(temp_dir)
201
+ for root, _, files in os.walk(temp_dir):
202
+ for filename in files:
203
+ filepath = os.path.join(root, filename)
204
+ if self.is_text_file(filepath):
205
+ try:
206
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
207
+ content = f.read()
208
+ if content.strip():
209
+ results.append({
210
+ "source": "file",
211
+ "filename": filename,
212
+ "content": content,
213
+ "timestamp": datetime.now().isoformat()
214
+ })
215
+ except Exception as e:
216
+ logger.error(f"Error reading file {filename}: {str(e)}")
217
+ return results
218
+
219
+ def _process_single_file(self, file_path: str) -> List[Dict]:
220
  try:
221
+ file_stat = os.stat(file_path)
222
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
223
+ content = f.read()
224
+
225
+ return [{
226
+ 'source': 'file',
227
+ 'filename': os.path.basename(file_path),
228
+ 'file_size': file_stat.st_size,
229
+ 'mime_type': mimetypes.guess_type(file_path)[0],
230
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
231
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
232
+ 'content': content,
233
+ 'timestamp': datetime.now().isoformat()
234
+ }]
 
 
 
 
 
 
235
  except Exception as e:
236
+ logger.error(f"File processing error: {e}")
237
+ return []
238
 
239
  class Chatbot:
240
  """Simple chatbot that uses provided JSON data for responses."""
 
416
  def main():
417
  # Configure system settings
418
  mimetypes.init()
419
+
420
  # Create and launch interface
421
  interface = create_interface()
422
+
423
  # Launch with proper configuration
424
  interface.launch(
425
  server_name="0.0.0.0",