Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -131,6 +131,7 @@ class URLProcessor:
|
|
131 |
logger.error(f"Calendar fetch failed: {e}")
|
132 |
return None
|
133 |
|
|
|
134 |
def _fetch_html_content(self, url: str) -> Optional[Dict]:
|
135 |
"""Standard HTML content processing"""
|
136 |
try:
|
@@ -146,6 +147,15 @@ class URLProcessor:
|
|
146 |
# Extract main content
|
147 |
main_content = soup.find('main') or soup.find('article') or soup.body
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
# Clean and structure content
|
150 |
text_content = main_content.get_text(separator='\n', strip=True)
|
151 |
cleaned_content = self.advanced_text_cleaning(text_content)
|
|
|
131 |
logger.error(f"Calendar fetch failed: {e}")
|
132 |
return None
|
133 |
|
134 |
+
|
135 |
def _fetch_html_content(self, url: str) -> Optional[Dict]:
|
136 |
"""Standard HTML content processing"""
|
137 |
try:
|
|
|
147 |
# Extract main content
|
148 |
main_content = soup.find('main') or soup.find('article') or soup.body
|
149 |
|
150 |
+
# Check if main_content is None
|
151 |
+
if main_content is None:
|
152 |
+
logger.warning(f"No main content found in the HTML for URL: {url}")
|
153 |
+
return {
|
154 |
+
'content': "No main content found.",
|
155 |
+
'content_type': response.headers.get('Content-Type', ''),
|
156 |
+
'timestamp': datetime.now().isoformat()
|
157 |
+
}
|
158 |
+
|
159 |
# Clean and structure content
|
160 |
text_content = main_content.get_text(separator='\n', strip=True)
|
161 |
cleaned_content = self.advanced_text_cleaning(text_content)
|