acecalisto3 committed (verified)
Commit fa62c34 · 1 Parent(s): 17fdb3b

Update app.py

Files changed (1)
  1. app.py +93 -124
app.py CHANGED
@@ -30,11 +30,26 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


+# Add these imports at the top
+from config import Config
+from proxy_handler import ProxyHandler
+from robots_handler import RobotsHandler
+import asyncio
+import aiohttp
+from tqdm import tqdm
+
 class URLProcessor:
     def __init__(self):
-        self.session = requests.Session()
-        self.timeout = 10  # seconds
-        self.session.headers.update({
+        self.config = Config()
+        self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
+        self.robots_handler = RobotsHandler()
+        self.session = self._create_session()
+
+    def _create_session(self):
+        session = requests.Session()
+        if self.config.get('USE_PROXY'):
+            session.proxies = self.proxy_handler.get_proxy_config()
+        session.headers.update({
             'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
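
The hunk above imports config and proxy_handler modules that are not part of this commit. A minimal sketch of the interface that __init__ and _create_session appear to assume (an environment-backed Config.get(key) and a ProxyHandler.get_proxy_config() returning a requests-style proxies mapping); the class and method names come from the diff, the bodies below are assumptions:

# config.py -- hypothetical sketch; only Config.get() is implied by the diff
import os

class Config:
    """Looks up settings such as PROXY_URL, USE_PROXY and RESPECT_ROBOTS."""

    def get(self, key, default=None):
        value = os.environ.get(key, default)
        # Interpret "true"/"false" strings so flags like USE_PROXY behave as booleans
        if isinstance(value, str) and value.lower() in ("true", "false"):
            return value.lower() == "true"
        return value


# proxy_handler.py -- hypothetical sketch; only get_proxy_config() is implied by the diff
class ProxyHandler:
    def __init__(self, proxy_url=None):
        self.proxy_url = proxy_url

    def get_proxy_config(self):
        # requests expects a mapping of scheme to proxy URL
        if not self.proxy_url:
            return {}
        return {"http": self.proxy_url, "https": self.proxy_url}
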
@@ -42,133 +57,86 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+        return session

-    def advanced_text_cleaning(self, text: str) -> str:
-        """Robust text cleaning with version compatibility"""
-        try:
-            cleaned_text = clean(
-                text,
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
-            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
-            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-            return text.strip()
-
-    def validate_url(self, url: str) -> Dict:
-        """Validate URL format and accessibility"""
+    def _fetch_with_selenium(self, url: str) -> Optional[str]:
         try:
-            if not validators.url(url):
-                return {'is_valid': False, 'message': 'Invalid URL format'}
-            # Try with DNS resolution retry
-            for attempt in range(3):  # Try up to 3 times
-                try:
-                    # Some sites block HEAD requests but allow GET
-                    try:
-                        response = self.session.head(url, timeout=self.timeout)
-                        response.raise_for_status()
-                    except (requests.exceptions.RequestException, Exception) as e:
-                        logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
-                        # Try with GET request if HEAD fails
-                        response = self.session.get(url, timeout=self.timeout, stream=True)
-                        response.raise_for_status()
-                        # Close the connection to avoid downloading the entire content
-                        response.close()
-
-                    return {'is_valid': True, 'message': 'URL is valid and accessible'}
-                except requests.exceptions.ConnectionError as e:
-                    if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
-                        logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
-                        time.sleep(1)  # Wait a bit before retrying
-                        continue
-                    else:
-                        raise
-                except Exception as e:
-                    raise
-            # If we get here, all attempts failed
-            return {'is_valid': False,
-                    'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-        except Exception as e:
-            logger.error(f"URL validation failed for {url}: {str(e)}")
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
+            chrome_options = Options()
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.common.exceptions import TimeoutException
+            import time

-    def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with special case handling"""
-        try:
-            logger.info(f"Fetching content from: {url}")
-
-            # Google Drive document handling
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            # Google Calendar ICS handling
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            # Try standard HTML processing first
-            result = self._fetch_html_content(url)
-
-            # If standard processing failed or returned minimal content, try with Selenium
-            if not result or len(result.get('content', '')) < 100:
-                logger.info(
-                    f"Standard processing failed or returned minimal content for {url}, trying Selenium")
-                selenium_html = self._fetch_with_selenium(url)
-                if selenium_html:
-                    # Process the Selenium HTML
-                    soup = BeautifulSoup(selenium_html, 'html.parser')
-                    # Remove unwanted elements
-                    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                        element.decompose()
+            logger.info(f"Attempting to fetch {url} with Selenium")

-                    # Apply the same content extraction strategies as in _fetch_html_content
-                    # Strategy 1: Look for semantic HTML5 elements
-                    main_content = None
-                    for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
-                                     '.entry', '.page']:
-                        elements = soup.select(selector)
-                        if elements:
-                            main_content = elements[0]
-                            logger.info(f"Found content with selector: {selector}")
-                            break
-
-                    # If no main content found, use body
-                    if not main_content or not main_content.get_text(strip=True):
-                        main_content = soup.body if soup.body else soup
-
-                    # Extract text
-                    text_content = main_content.get_text(separator='\n', strip=True)
-
-                    # Clean content
-                    cleaned_content = self.advanced_text_cleaning(text_content)
-
-                    if len(cleaned_content) >= 20:
-                        result = {
-                            'content': cleaned_content,
-                            'content_type': 'text/html',
-                            'timestamp': datetime.now().isoformat(),
-                            'url': url,
-                            'source': 'selenium'  # Mark that this came from Selenium
-                        }
-            # Log the result status
-            if result:
-                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
-            else:
-                logger.error(f"Failed to extract content from {url}")
-            return result
+            # Set up Chrome options
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument(
+                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            # Initialize the driver
+            driver = webdriver.Chrome(options=chrome_options)
+
+            try:
+                # Navigate to the URL
+                driver.get(url)
+
+                # Wait for the page to load
+                WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+
+                # Simulate pressing ESC key to dismiss overlays
+                from selenium.webdriver.common.keys import Keys
+                action_chains = webdriver.ActionChains(driver)
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()  # Clear actions
+
+                # try again
+                action_chains.send_keys(Keys.ESCAPE).perform()
+                time.sleep(1)  # give it a moment to take effect
+                action_chains.reset_actions()
+
+                # Get the page source
+                page_source = driver.page_source
+
+                # Save the Selenium HTML for debugging
+                debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                logger.info(f"Saved Selenium HTML to {debug_path}")
+
+                return page_source
+            finally:
+                driver.quit()
+
+        except ImportError:
+            logger.error("Selenium is not installed. Cannot use browser automation.")
+            return None
         except Exception as e:
-            logger.error(f"Content fetch failed for {url}: {e}")
+            logger.error(f"Selenium processing failed for {url}: {e}")
             return None

+    async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for url in urls:
+                if self.config.get('RESPECT_ROBOTS'):
+                    if not self.robots_handler.can_fetch(url, self.session.headers['User-Agent']):
+                        logger.warning(f"Skipping {url} due to robots.txt restrictions")
+                        continue
+                tasks.append(self.fetch_content_async(session, url))
+            return await asyncio.gather(*tasks)
+
     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
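
fetch_urls_async in the hunk above skips URLs disallowed by robots.txt through self.robots_handler.can_fetch(url, user_agent), but the robots_handler module is not included in this commit either. A plausible sketch built on the standard-library urllib.robotparser; only the class and method names are taken from the diff, the implementation is an assumption:

# robots_handler.py -- hypothetical sketch using urllib.robotparser
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

class RobotsHandler:
    def __init__(self):
        self._parsers = {}  # cache one parser per scheme://host

    def can_fetch(self, url: str, user_agent: str) -> bool:
        root = "{0.scheme}://{0.netloc}".format(urlparse(url))
        parser = self._parsers.get(root)
        if parser is None:
            parser = RobotFileParser(urljoin(root, "/robots.txt"))
            try:
                parser.read()
            except Exception:
                return True  # treat an unreachable robots.txt as allowing the fetch
            self._parsers[root] = parser
        return parser.can_fetch(user_agent, url)
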
@@ -682,6 +650,7 @@ def create_interface():
     .warning { background-color: #fff3cd; color: #856404; }
     .error { background-color: #f8d7da; color: #721c24; }
     """
+
     with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
         gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

@@ -805,4 +774,4 @@ def main():

 if __name__ == "__main__":
     main()
-
+
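
fetch_urls_async also awaits self.fetch_content_async(session, url), which does not appear in any hunk of this commit. A self-contained sketch of what such a coroutine could look like with aiohttp, plus how the batch fetch can be driven from synchronous code; the standalone form and the helper name fetch_urls are illustrative assumptions, not the app's actual method:

# Hypothetical sketch of the missing coroutine; in app.py it would be a method on URLProcessor.
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Optional

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_content_async(session: aiohttp.ClientSession, url: str) -> Optional[Dict]:
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
            response.raise_for_status()
            text = await response.text()
            return {
                'content': text,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat(),
                'url': url,
            }
    except Exception as e:
        logger.error(f"Async fetch failed for {url}: {e}")
        return None


async def fetch_urls(urls: List[str]) -> List[Optional[Dict]]:
    # Mirrors the gather pattern used by fetch_urls_async in the diff
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_content_async(session, u) for u in urls))


if __name__ == "__main__":
    print(asyncio.run(fetch_urls(["https://example.com"])))
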
 