Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -134,17 +134,23 @@ class URLProcessor:
|
|
134 |
|
135 |
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
136 |
element.decompose()
|
137 |
-
|
138 |
-
main_content = soup.find('main') or soup.find('article') or soup.body
|
139 |
|
140 |
-
|
141 |
-
|
142 |
|
143 |
-
|
144 |
-
'
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
except Exception as e:
|
149 |
logger.error(f"HTML processing failed: {e}")
|
150 |
return None
|
@@ -203,7 +209,7 @@ class FileProcessor:
|
|
203 |
zip_results = self._process_zip_file(file_path)
|
204 |
combined_data.extend(zip_results)
|
205 |
elif self.is_text_file(file_path):
|
206 |
-
file_results = self.
|
207 |
combined_data.extend(file_results)
|
208 |
else:
|
209 |
logger.warning(f"Unsupported file type: {file_path}")
|
@@ -212,22 +218,27 @@ class FileProcessor:
|
|
212 |
logger.error(f"Error processing files: {str(e)}")
|
213 |
|
214 |
return combined_data
|
215 |
-
|
216 |
-
|
217 |
-
"""Process a single file and extract its content."""
|
218 |
results = []
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
return results
|
232 |
|
233 |
def _process_single_file(self, file) -> List[Dict]:
|
|
|
134 |
|
135 |
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
136 |
element.decompose()
|
|
|
|
|
137 |
|
138 |
+
# Try to find the main content in a more robust way
|
139 |
+
main_content = soup.find('main') or soup.find('article') or soup.body
|
140 |
|
141 |
+
if main_content:
|
142 |
+
text_content = main_content.get_text(separator='\n', strip=True)
|
143 |
+
cleaned_content = self.advanced_text_cleaning(text_content)
|
144 |
+
|
145 |
+
return {
|
146 |
+
'content': cleaned_content,
|
147 |
+
'content_type': response.headers.get('Content-Type', ''),
|
148 |
+
'timestamp': datetime.now().isoformat()
|
149 |
+
}
|
150 |
+
else:
|
151 |
+
logger.warning(f"No main content found for URL: {url}")
|
152 |
+
return None
|
153 |
+
|
154 |
except Exception as e:
|
155 |
logger.error(f"HTML processing failed: {e}")
|
156 |
return None
|
|
|
209 |
zip_results = self._process_zip_file(file_path)
|
210 |
combined_data.extend(zip_results)
|
211 |
elif self.is_text_file(file_path):
|
212 |
+
file_results = self._process_single_file(file_path)
|
213 |
combined_data.extend(file_results)
|
214 |
else:
|
215 |
logger.warning(f"Unsupported file type: {file_path}")
|
|
|
218 |
logger.error(f"Error processing files: {str(e)}")
|
219 |
|
220 |
return combined_data
|
221 |
+
def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
|
222 |
+
"""Process ZIP file contents"""
|
|
|
223 |
results = []
|
224 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
225 |
+
zip_ref.extractall(temp_dir)
|
226 |
+
for root, _, files in os.walk(temp_dir):
|
227 |
+
for filename in files:
|
228 |
+
filepath = os.path.join(root, filename)
|
229 |
+
if self.is_text_file(filepath):
|
230 |
+
try:
|
231 |
+
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
232 |
+
content = f.read()
|
233 |
+
if content.strip():
|
234 |
+
results.append({
|
235 |
+
"source": "file",
|
236 |
+
"filename": filename,
|
237 |
+
"content": content,
|
238 |
+
"timestamp": datetime.now().isoformat()
|
239 |
+
})
|
240 |
+
except Exception as e:
|
241 |
+
logger.error(f"Error reading file {filename}: {str(e)}")
|
242 |
return results
|
243 |
|
244 |
def _process_single_file(self, file) -> List[Dict]:
|