Update app.py

app.py CHANGED
@@ -1,31 +1,19 @@
-
 import json
 import os
 import re
 import time
-try:
-    import matplotlib
-except ImportError:
-    import subprocess
-    subprocess.run(["pip", "install", "matplotlib"])
-    import matplotlib
 import logging
 import mimetypes
-import concurrent.futures
-import string
 import zipfile
 import tempfile
 from datetime import datetime
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional
 from pathlib import Path
-from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
-from ratelimit import limits, sleep_and_retry
 from cleantext import clean
 
 # Setup logging with detailed configuration
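The block removed above installed matplotlib by shelling out to pip at import time. For reference, a minimal sketch of a guard that fails loudly instead, assuming the dependency belongs in the Space's requirements.txt (this sketch is not part of the commit):

import importlib.util

# Verify an optional dependency without invoking pip at import time.
# "matplotlib" mirrors the package the deleted try/except handled.
if importlib.util.find_spec("matplotlib") is None:
    raise ImportError(
        "matplotlib is missing; declare it in requirements.txt so the "
        "Space installs it at build time."
    )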
@@ -44,7 +32,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10 # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
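One detail worth flagging in the hunk above: the new key 'User -Agent' (note the embedded space) no longer overrides the session's default User-Agent, because requests.Session ships a default 'User-Agent: python-requests/…' entry that update() only replaces on an exact key match. The randomized agent is therefore silently dropped. A short sketch with the conventional spelling, using the fake_useragent import already present:

import requests
from fake_useragent import UserAgent

session = requests.Session()
# An exact key match replaces the default python-requests User-Agent;
# a misspelled key such as 'User -Agent' would be sent alongside it instead.
session.headers.update({'User-Agent': UserAgent().random})
print(session.headers['User-Agent'])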
@@ -92,15 +80,10 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
-            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
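For orientation, a minimal usage sketch of the dispatcher above; it assumes only the class name and method signature visible in this diff, and the URL is a hypothetical example:

processor = URLProcessor()

# Substring checks route Drive and Calendar URLs to their dedicated
# handlers; everything else goes through the generic HTML fetcher.
result = processor.fetch_content("https://example.com/article")
if result is not None:
    print(sorted(result.keys()))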
@@ -115,7 +98,7 @@ class URLProcessor:
             return None
 
         direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-        response = self.session.get(direct_url, timeout=self.timeout)
+        response = self.session.get(direct_url, timeout = self.timeout)
         response.raise_for_status()
 
         return {
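The file_id match referenced on the unchanged line sits outside this hunk's context, so the pattern below is a plausible reconstruction rather than the committed code; the Drive URL is hypothetical:

import re

url = "https://drive.google.com/file/d/1AbCdEfGhIjK/view"  # hypothetical URL
# Assumed pattern: capture the ID segment that follows /file/d/.
file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
if file_id:
    direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
    print(direct_url)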
@@ -149,14 +132,11 @@ class URLProcessor:
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
         for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
             element.decompose()
 
-        # Extract main content
         main_content = soup.find('main') or soup.find('article') or soup.body
 
-        # Clean and structure content
         text_content = main_content.get_text(separator='\n', strip=True)
         cleaned_content = self.advanced_text_cleaning(text_content)
 
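One edge case in the extraction above: with html.parser, a fragment or non-HTML response can leave soup.body as None, and since <main> and <article> may be absent too, main_content.get_text() would then raise AttributeError. A defensive sketch, not part of the commit:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>fragment with no body tag</p>", "html.parser")
main_content = soup.find("main") or soup.find("article") or soup.body
if main_content is None:
    main_content = soup  # fall back to the whole parsed document
print(main_content.get_text(separator="\n", strip=True))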
@@ -336,7 +316,6 @@ def create_interface():
                 json.dump(results, f, ensure_ascii=False, indent=2)
 
             summary = f"Processed {len(results)} items successfully!"
-            # Convert Path object to string here
             return str(output_path), summary
         else:
             return None, "No valid content to process."
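The deleted comment explained the surviving str() call: Gradio file outputs pass through JSON serialization, and plain string paths have historically been safer there than pathlib.Path objects (a general observation; the diff itself only shows the conversion). A minimal sketch of the returned shape, with a hypothetical path:

import tempfile
from pathlib import Path

output_path = Path(tempfile.gettempdir()) / "results.json"  # hypothetical
# The tuple feeds a (file, text) pair of Gradio outputs; note the str().
outputs = (str(output_path), "summary text")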