Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -39,7 +39,7 @@ class URLProcessor:
|
|
39 |
self.session = requests.Session()
|
40 |
self.timeout = 10 # seconds
|
41 |
self.session.headers.update({
|
42 |
-
'User -Agent': UserAgent().random,
|
43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
44 |
'Accept-Language': 'en-US,en;q=0.5',
|
45 |
'Accept-Encoding': 'gzip, deflate, br',
|
@@ -87,20 +87,43 @@ class URLProcessor:
|
|
87 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
88 |
"""Universal content fetcher with special case handling"""
|
89 |
try:
|
90 |
-
#
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
#
|
95 |
-
|
96 |
-
return self._handle_google_calendar(url)
|
97 |
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
except Exception as e:
|
101 |
logger.error(f"Content fetch failed: {e}")
|
102 |
return None
|
103 |
-
|
104 |
def _handle_google_drive(self, url: str) -> Optional[Dict]:
|
105 |
"""Process Google Drive file links"""
|
106 |
try:
|
|
|
39 |
self.session = requests.Session()
|
40 |
self.timeout = 10 # seconds
|
41 |
self.session.headers.update({
|
42 |
+
'User -Agent': UserAgent().random, # BUG: header name still contains a space ('User -Agent'); should be 'User-Agent' — the claimed correction was not applied
|
43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
44 |
'Accept-Language': 'en-US,en;q=0.5',
|
45 |
'Accept-Encoding': 'gzip, deflate, br',
|
|
|
87 |
def fetch_content(self, url: str) -> Optional[Dict]:
|
88 |
"""Universal content fetcher with special case handling"""
|
89 |
try:
|
90 |
+
logger.info(f"Fetching content from URL: {url}") # Log the URL being fetched
|
91 |
+
response = self.session.get(url, timeout=self.timeout)
|
92 |
+
response.raise_for_status() # Raise an error for bad responses
|
93 |
+
|
94 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
95 |
+
|
96 |
+
# Remove unwanted elements
|
97 |
+
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
|
98 |
+
element.decompose()
|
99 |
|
100 |
+
# Extract main content
|
101 |
+
main_content = soup.find('main') or soup.find('article') or soup.body
|
|
|
102 |
|
103 |
+
if main_content is None:
|
104 |
+
logger.warning(f"No main content found for URL: {url}")
|
105 |
+
return {
|
106 |
+
'content': response.text, # Return the full HTML if no main content found
|
107 |
+
'content_type': response.headers.get('Content-Type', ''),
|
108 |
+
'timestamp': datetime.now().isoformat()
|
109 |
+
}
|
110 |
+
|
111 |
+
# Clean and structure content
|
112 |
+
text_content = main_content.get_text(separator='\n', strip=True)
|
113 |
+
cleaned_content = self.advanced_text_cleaning(text_content)
|
114 |
+
|
115 |
+
return {
|
116 |
+
'content': cleaned_content,
|
117 |
+
'content_type': response.headers.get('Content-Type', ''),
|
118 |
+
'timestamp': datetime.now().isoformat()
|
119 |
+
}
|
120 |
+
except requests.RequestException as e:
|
121 |
+
logger.error(f"Request failed: {e}")
|
122 |
+
return None
|
123 |
except Exception as e:
|
124 |
logger.error(f"Content fetch failed: {e}")
|
125 |
return None
|
126 |
+
|
127 |
def _handle_google_drive(self, url: str) -> Optional[Dict]:
|
128 |
"""Process Google Drive file links"""
|
129 |
try:
|