acecalisto3 commited on
Commit
4aedfb3
·
verified ·
1 Parent(s): 5e4b404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -10
app.py CHANGED
@@ -39,7 +39,7 @@ class URLProcessor:
39
  self.session = requests.Session()
40
  self.timeout = 10 # seconds
41
  self.session.headers.update({
42
- 'User -Agent': UserAgent().random,
43
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
44
  'Accept-Language': 'en-US,en;q=0.5',
45
  'Accept-Encoding': 'gzip, deflate, br',
@@ -87,20 +87,43 @@ class URLProcessor:
87
  def fetch_content(self, url: str) -> Optional[Dict]:
88
  """Universal content fetcher with special case handling"""
89
  try:
90
- # Google Drive document handling
91
- if 'drive.google.com' in url:
92
- return self._handle_google_drive(url)
 
 
 
 
 
 
93
 
94
- # Google Calendar ICS handling
95
- if 'calendar.google.com' in url and 'ical' in url:
96
- return self._handle_google_calendar(url)
97
 
98
- # Standard HTML processing
99
- return self._fetch_html_content(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  except Exception as e:
101
  logger.error(f"Content fetch failed: {e}")
102
  return None
103
-
104
  def _handle_google_drive(self, url: str) -> Optional[Dict]:
105
  """Process Google Drive file links"""
106
  try:
 
39
  self.session = requests.Session()
40
  self.timeout = 10 # seconds
41
  self.session.headers.update({
42
+ 'User-Agent': UserAgent().random, # set a random User-Agent header
43
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
44
  'Accept-Language': 'en-US,en;q=0.5',
45
  'Accept-Encoding': 'gzip, deflate, br',
 
87
  def fetch_content(self, url: str) -> Optional[Dict]:
88
  """Universal content fetcher with special case handling"""
89
  try:
90
+ logger.info(f"Fetching content from URL: {url}") # Log the URL being fetched
91
+ response = self.session.get(url, timeout=self.timeout)
92
+ response.raise_for_status() # Raise an error for bad responses
93
+
94
+ soup = BeautifulSoup(response.text, 'html.parser')
95
+
96
+ # Remove unwanted elements
97
+ for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
98
+ element.decompose()
99
 
100
+ # Extract main content
101
+ main_content = soup.find('main') or soup.find('article') or soup.body
 
102
 
103
+ if main_content is None:
104
+ logger.warning(f"No main content found for URL: {url}")
105
+ return {
106
+ 'content': response.text, # Return the full HTML if no main content found
107
+ 'content_type': response.headers.get('Content-Type', ''),
108
+ 'timestamp': datetime.now().isoformat()
109
+ }
110
+
111
+ # Clean and structure content
112
+ text_content = main_content.get_text(separator='\n', strip=True)
113
+ cleaned_content = self.advanced_text_cleaning(text_content)
114
+
115
+ return {
116
+ 'content': cleaned_content,
117
+ 'content_type': response.headers.get('Content-Type', ''),
118
+ 'timestamp': datetime.now().isoformat()
119
+ }
120
+ except requests.RequestException as e:
121
+ logger.error(f"Request failed: {e}")
122
+ return None
123
  except Exception as e:
124
  logger.error(f"Content fetch failed: {e}")
125
  return None
126
+
127
  def _handle_google_drive(self, url: str) -> Optional[Dict]:
128
  """Process Google Drive file links"""
129
  try: