Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -39,7 +39,7 @@ class URLProcessor:
|
|
39 |
self.session = requests.Session()
|
40 |
self.timeout = 10 # seconds
|
41 |
self.session.headers.update({
|
42 |
-
'User -Agent': UserAgent().random,
|
43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
44 |
'Accept-Language': 'en-US,en;q=0.5',
|
45 |
'Accept-Encoding': 'gzip, deflate, br',
|
@@ -87,20 +87,43 @@ class URLProcessor:
|
|
87 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
88 |
"""Universal content fetcher with special case handling"""
|
89 |
try:
|
90 |
-
#
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
#
|
95 |
-
|
96 |
-
return self._handle_google_calendar(url)
|
97 |
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
except Exception as e:
|
101 |
logger.error(f"Content fetch failed: {e}")
|
102 |
return None
|
103 |
-
|
104 |
def _handle_google_drive(self, url: str) -> Optional[Dict]:
|
105 |
"""Process Google Drive file links"""
|
106 |
try:
|
|
|
39 |
self.session = requests.Session()
|
40 |
self.timeout = 10 # seconds
|
41 |
self.session.headers.update({
|
42 |
+
'User -Agent': UserAgent().random, # BUG: header name still contains a space ('User -Agent'); should be 'User-Agent' — the claimed correction was not applied
|
43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
44 |
'Accept-Language': 'en-US,en;q=0.5',
|
45 |
'Accept-Encoding': 'gzip, deflate, br',
|
|
|
87 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
88 |
"""Universal content fetcher with special case handling"""
|
89 |
try:
|
90 |
+
logger.info(f"Fetching content from URL: {url}") # Log the URL being fetched
|
91 |
+
response = self.session.get(url, timeout=self.timeout)
|
92 |
+
response.raise_for_status() # Raise an error for bad responses
|
93 |
+
|
94 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
95 |
+
|
96 |
+
# Remove unwanted elements
|
97 |
+
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
98 |
+
element.decompose()
|
99 |
|
100 |
+
# Extract main content
|
101 |
+
main_content = soup.find('main') or soup.find('article') or soup.body
|
|
|
102 |
|
103 |
+
if main_content is None:
|
104 |
+
logger.warning(f"No main content found for URL: {url}")
|
105 |
+
return {
|
106 |
+
'content': response.text, # Return the full HTML if no main content found
|
107 |
+
'content_type': response.headers.get('Content-Type', ''),
|
108 |
+
'timestamp': datetime.now().isoformat()
|
109 |
+
}
|
110 |
+
|
111 |
+
# Clean and structure content
|
112 |
+
text_content = main_content.get_text(separator='\n', strip=True)
|
113 |
+
cleaned_content = self.advanced_text_cleaning(text_content)
|
114 |
+
|
115 |
+
return {
|
116 |
+
'content': cleaned_content,
|
117 |
+
'content_type': response.headers.get('Content-Type', ''),
|
118 |
+
'timestamp': datetime.now().isoformat()
|
119 |
+
}
|
120 |
+
except requests.RequestException as e:
|
121 |
+
logger.error(f"Request failed: {e}")
|
122 |
+
return None
|
123 |
except Exception as e:
|
124 |
logger.error(f"Content fetch failed: {e}")
|
125 |
return None
|
126 |
+
|
127 |
def _handle_google_drive(self, url: str) -> Optional[Dict]:
|
128 |
"""Process Google Drive file links"""
|
129 |
try:
|