acecalisto3 committed on
Commit
dad6950
·
verified ·
1 Parent(s): 4dd743a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -25
app.py CHANGED
@@ -134,17 +134,23 @@ class URLProcessor:
134
 
135
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
  element.decompose()
137
-
138
- main_content = soup.find('main') or soup.find('article') or soup.body
139
 
140
- text_content = main_content.get_text(separator='\n', strip=True)
141
- cleaned_content = self.advanced_text_cleaning(text_content)
142
 
143
- return {
144
- 'content': cleaned_content,
145
- 'content_type': response.headers.get('Content-Type', ''),
146
- 'timestamp': datetime.now().isoformat()
147
- }
 
 
 
 
 
 
 
 
148
  except Exception as e:
149
  logger.error(f"HTML processing failed: {e}")
150
  return None
@@ -203,7 +209,7 @@ class FileProcessor:
203
  zip_results = self._process_zip_file(file_path)
204
  combined_data.extend(zip_results)
205
  elif self.is_text_file(file_path):
206
- file_results = self.process_single_file(file_path)
207
  combined_data.extend(file_results)
208
  else:
209
  logger.warning(f"Unsupported file type: {file_path}")
@@ -212,22 +218,27 @@ class FileProcessor:
212
  logger.error(f"Error processing files: {str(e)}")
213
 
214
  return combined_data
215
-
216
- def process_single_file(self, file_path: str) -> List[Dict]:
217
- """Process a single file and extract its content."""
218
  results = []
219
- try:
220
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
221
- content = file.read()
222
- if content.strip():
223
- results.append({
224
- "source": "file",
225
- "filename": os.path.basename(file_path),
226
- "content": content,
227
- "timestamp": datetime.now().isoformat()
228
- })
229
- except Exception as e:
230
- logger.error(f"Error reading file {file_path}: {str(e)}")
 
 
 
 
 
 
231
  return results
232
 
233
  def _process_single_file(self, file) -> List[Dict]:
 
134
 
135
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
  element.decompose()
 
 
137
 
138
+ # Try to find the main content in a more robust way
139
+ main_content = soup.find('main') or soup.find('article') or soup.body
140
 
141
+ if main_content:
142
+ text_content = main_content.get_text(separator='\n', strip=True)
143
+ cleaned_content = self.advanced_text_cleaning(text_content)
144
+
145
+ return {
146
+ 'content': cleaned_content,
147
+ 'content_type': response.headers.get('Content-Type', ''),
148
+ 'timestamp': datetime.now().isoformat()
149
+ }
150
+ else:
151
+ logger.warning(f"No main content found for URL: {url}")
152
+ return None
153
+
154
  except Exception as e:
155
  logger.error(f"HTML processing failed: {e}")
156
  return None
 
209
  zip_results = self._process_zip_file(file_path)
210
  combined_data.extend(zip_results)
211
  elif self.is_text_file(file_path):
212
+ file_results = self._process_single_file(file_path)
213
  combined_data.extend(file_results)
214
  else:
215
  logger.warning(f"Unsupported file type: {file_path}")
 
218
  logger.error(f"Error processing files: {str(e)}")
219
 
220
  return combined_data
221
+ def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
222
+ """Process ZIP file contents"""
 
223
  results = []
224
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
225
+ zip_ref.extractall(temp_dir)
226
+ for root, _, files in os.walk(temp_dir):
227
+ for filename in files:
228
+ filepath = os.path.join(root, filename)
229
+ if self.is_text_file(filepath):
230
+ try:
231
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
232
+ content = f.read()
233
+ if content.strip():
234
+ results.append({
235
+ "source": "file",
236
+ "filename": filename,
237
+ "content": content,
238
+ "timestamp": datetime.now().isoformat()
239
+ })
240
+ except Exception as e:
241
+ logger.error(f"Error reading file {filename}: {str(e)}")
242
  return results
243
 
244
  def _process_single_file(self, file) -> List[Dict]: