n0v33n committed
Commit 8724d52 · 1 Parent(s): ff3a25c

updated changes

Files changed (2):
  1. WebScraper.py +215 -89
  2. requirements.txt +2 -1
WebScraper.py CHANGED
@@ -3,20 +3,13 @@ import os
 import re
 import urllib.parse
 from datetime import datetime
-# from selenium import webdriver
-# from selenium.webdriver.chrome.options import Options
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
-try:
-    from selenium import webdriver
-    from selenium.webdriver.chrome.options import Options
-    SELENIUM_AVAILABLE = True
-except ImportError:
-    SELENIUM_AVAILABLE = False
-    print("Selenium not available. Some features may not work.")
-
+
 class WebsiteScraper:
     def __init__(self, base_url, site_name, site_description="", site_category="General",
                  output_dir=None, max_depth=3, max_pages=50, delay=2, headless=True,
@@ -55,14 +48,19 @@ class WebsiteScraper:
         self.visited_links = set()
         self.page_count = 0
         self.start_time = datetime.now()
+
+        # Create output directory
         if output_dir is None:
             domain_safe = self.domain_name.replace(".", "_").replace(":", "_")
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             self.output_dir = f"{site_name}_{domain_safe}_{timestamp}"
         else:
             self.output_dir = output_dir
+
         if not os.path.exists(self.output_dir):
             os.makedirs(self.output_dir)
+
+        # Initialize log file
         self.log_path = os.path.join(self.output_dir, "scraping_log.txt")
         with open(self.log_path, "w", encoding="utf-8") as log_file:
             log_file.write(f"Website scraping started at: {self.start_time}\n")
@@ -74,63 +72,127 @@ class WebsiteScraper:
             log_file.write(f"Max depth: {self.max_depth}\n")
             log_file.write(f"Max pages: {self.max_pages}\n")
             log_file.write(f"External links: {self.scrape_external_links}\n\n")
-        self.setup_driver(headless)
+
+        # Initialize driver
+        self.driver = None
         self.documents = []
-
-    def setup_driver(self, headless):
-        """Setup Chrome driver with options."""
-        try:
-            chrome_options = Options()
-            if headless:
-                chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--no-sandbox")
-            chrome_options.add_argument("--disable-dev-shm-usage")
-            chrome_options.add_argument("--disable-logging")
-            chrome_options.add_argument("--log-level=3")
-            chrome_options.add_argument("--disable-extensions")
-            chrome_options.add_argument("--disable-gpu")
-            chrome_options.add_argument("--window-size=1920,1080")
-            chrome_options.add_argument("--disable-web-security")
-            chrome_options.add_argument("--allow-running-insecure-content")
-            chrome_options.add_argument("--disable-features=VizDisplayCompositor")
-            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
-            chrome_options.binary_location = "/usr/bin/chromium"

-            chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
-            chrome_options.add_experimental_option('useAutomationExtension', False)
-            try:
-                self.driver = webdriver.Chrome(
-                    executable_path="/usr/bin/chromedriver",
-                    options=chrome_options
-                )
-            except:
-                from webdriver_manager.chrome import ChromeDriverManager
-                self.driver = webdriver.Chrome(
-                    ChromeDriverManager().install(),
-                    options=chrome_options
-                )
-
-            self.log_message("Chrome driver initialized successfully")
-        except Exception as e:
-            self.log_message(f"Error setting up Chrome driver: {e}")
-            raise
+        # Set up the Chrome driver
+        self.setup_driver(headless)
+
+    def setup_driver(self, headless=True):
+        """Setup Chrome driver with options for Gradio Spaces compatibility."""
+        try:
+            chrome_options = Options()
+
+            # Essential headless options
+            if headless:
+                chrome_options.add_argument("--headless")
+
+            # Core stability options
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--disable-features=VizDisplayCompositor")
+            chrome_options.add_argument("--disable-web-security")
+            chrome_options.add_argument("--allow-running-insecure-content")
+
+            # Logging and extension options
+            chrome_options.add_argument("--disable-logging")
+            chrome_options.add_argument("--log-level=3")
+            chrome_options.add_argument("--disable-extensions")
+            chrome_options.add_argument("--disable-plugins")
+
+            # Window and display options
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument("--start-maximized")
+
+            # User agent
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
+
+            # Experimental options
+            chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
+            chrome_options.add_experimental_option('useAutomationExtension', False)
+
+            # Memory and performance options
+            chrome_options.add_argument("--memory-pressure-off")
+            chrome_options.add_argument("--max_old_space_size=4096")
+
+            # Try different Chrome binary locations for different environments
+            chrome_binaries = [
+                "/usr/bin/chromium",
+                "/usr/bin/chromium-browser",
+                "/usr/bin/google-chrome",
+                "/usr/bin/google-chrome-stable"
+            ]
+
+            for binary in chrome_binaries:
+                if os.path.exists(binary):
+                    chrome_options.binary_location = binary
+                    break
+
+            # Try different driver approaches
+            try:
+                # Try system chromedriver first
+                if os.path.exists("/usr/bin/chromedriver"):
+                    from selenium.webdriver.chrome.service import Service
+                    service = Service("/usr/bin/chromedriver")
+                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
+                else:
+                    # Fallback to webdriver-manager
+                    try:
+                        from webdriver_manager.chrome import ChromeDriverManager
+                        from selenium.webdriver.chrome.service import Service
+                        service = Service(ChromeDriverManager().install())
+                        self.driver = webdriver.Chrome(service=service, options=chrome_options)
+                    except:
+                        # Last resort - try without service
+                        self.driver = webdriver.Chrome(options=chrome_options)
+
+            except Exception as e:
+                self.log_message(f"Chrome setup failed: {e}")
+                # Try with minimal options as last resort
+                minimal_options = Options()
+                minimal_options.add_argument("--headless")
+                minimal_options.add_argument("--no-sandbox")
+                minimal_options.add_argument("--disable-dev-shm-usage")
+                self.driver = webdriver.Chrome(options=minimal_options)
+
+            self.log_message("Chrome driver initialized successfully")
+
+        except Exception as e:
+            self.log_message(f"Error setting up Chrome driver: {e}")
+            raise Exception(f"Failed to initialize Chrome driver: {e}")

     def log_message(self, message):
         """Write message to console and log file."""
         print(message)
-        with open(self.log_path, "a", encoding="utf-8") as log_file:
-            log_file.write(f"{message}\n")
+        try:
+            with open(self.log_path, "a", encoding="utf-8") as log_file:
+                log_file.write(f"{datetime.now()}: {message}\n")
+        except:
+            pass  # Continue even if logging fails

     def is_valid_url(self, url):
         """Check if URL should be scraped."""
+        if not url:
+            return False
+
+        # Check external links
         if not self.scrape_external_links and not url.startswith(self.base_domain):
             return False
+
+        # Skip file extensions
         if re.search(r"\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|svg|ico|css|js|xml|json|zip|tar|gz|rar|7z|exe|dmg|mp3|mp4|avi|mov|wmv)$", url, re.IGNORECASE):
             return False
+
+        # Handle fragments
         if "#" in url:
             url = url.split("#")[0]
         if url in self.visited_links:
             return False
+
+        # Skip common patterns
         skip_patterns = [
             '/login', '/register', '/signup', '/sign-up', '/signin', '/sign-in',
             '/logout', '/password', '/forgot', '/reset',
@@ -139,14 +201,18 @@ def setup_driver(self, headless):
             '/terms', '/privacy', '/legal', '/disclaimer',
             '/sitemap', '/robots.txt', '/favicon'
         ]
+
         url_lower = url.lower()
         for pattern in skip_patterns:
             if pattern in url_lower:
                 return False
+
+        # Skip spam patterns
         spam_patterns = ['popup', 'advertisement', 'tracking', 'analytics']
         for pattern in spam_patterns:
             if pattern in url_lower:
                 return False
+
         return True

     def sanitize_filename(self, text):
@@ -160,23 +226,27 @@ def setup_driver(self, headless):

     def extract_links(self):
         """Extract valid links from current page."""
-        links = self.driver.find_elements(By.TAG_NAME, "a")
-        valid_links = []
-        for link in links:
-            try:
-                href = link.get_attribute("href")
-                if href:
-                    if href.startswith('/'):
-                        href = self.base_domain + href
-                    elif href.startswith('./') or not href.startswith('http'):
-                        current_url = self.driver.current_url
-                        base_path = '/'.join(current_url.split('/')[:-1])
-                        href = base_path + '/' + href.lstrip('./')
-                    if self.is_valid_url(href) and href not in self.visited_links:
-                        valid_links.append(href)
-            except Exception:
-                continue
-        return list(set(valid_links))
+        links = []
+        try:
+            link_elements = self.driver.find_elements(By.TAG_NAME, "a")
+            for link in link_elements:
+                try:
+                    href = link.get_attribute("href")
+                    if href:
+                        if href.startswith('/'):
+                            href = self.base_domain + href
+                        elif href.startswith('./') or not href.startswith('http'):
+                            current_url = self.driver.current_url
+                            base_path = '/'.join(current_url.split('/')[:-1])
+                            href = base_path + '/' + href.lstrip('./')
+                        if self.is_valid_url(href) and href not in self.visited_links:
+                            links.append(href)
+                except Exception:
+                    continue
+        except Exception as e:
+            self.log_message(f"Error extracting links: {e}")
+
+        return list(set(links))

     def extract_main_content(self, soup):
         """Extract main content using various selectors."""
@@ -192,12 +262,15 @@ def setup_driver(self, headless):
                     break
             except:
                 continue
+
         if not content_element:
             content_element = soup.find('body')
+
         return content_element

     def extract_clean_text(self, soup):
         """Extract and clean text from BeautifulSoup object."""
+        # Remove unwanted tags
         unwanted_tags = [
             "script", "style", "nav", "footer", "header", "aside",
             "advertisement", "ads", "popup", "modal", "cookie-notice"
@@ -205,6 +278,8 @@ def setup_driver(self, headless):
         for tag in unwanted_tags:
             for element in soup.find_all(tag):
                 element.decompose()
+
+        # Remove unwanted classes and IDs
         unwanted_classes = [
             "sidebar", "menu", "navigation", "nav", "footer", "header",
             "advertisement", "ad", "ads", "popup", "modal", "cookie",
@@ -215,42 +290,62 @@ def setup_driver(self, headless):
                 element.decompose()
             for element in soup.find_all(id=re.compile(class_name, re.I)):
                 element.decompose()
+
+        # Extract main content
         main_content = self.extract_main_content(soup)
         if main_content:
             text = main_content.get_text(separator=" ", strip=True)
         else:
             text = soup.get_text(separator=" ", strip=True)
+
+        # Clean up text
         lines = [line.strip() for line in text.split('\n') if line.strip()]
         cleaned_text = '\n'.join(lines)
         cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)
         cleaned_text = re.sub(r' +', ' ', cleaned_text)
+
         return cleaned_text

     def scrape_page(self, url):
         """Scrape content from a single page and save as markdown."""
         if url in self.visited_links:
             return []
+
         self.page_count += 1
         self.visited_links.add(url)
+
         status = f"Scraping [{self.page_count}/{self.max_pages}]: {url}"
         self.log_message(status)
+
         try:
             self.driver.get(url)
-            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
             time.sleep(self.delay)
+
+            # Get page title
             try:
                 page_title = self.driver.title or f"Page_{self.page_count}"
             except:
                 page_title = f"Page_{self.page_count}"
+
+            # Parse page content
             soup = BeautifulSoup(self.driver.page_source, "html.parser")
             cleaned_text = self.extract_clean_text(soup)
+
+            # Skip if insufficient content
             if len(cleaned_text.strip()) < 50:
                 self.log_message(f"Skipping {url}: insufficient content")
                 return self.extract_links()
+
+            # Extract meta description
             meta_desc = ""
             meta_tag = soup.find("meta", attrs={"name": "description"})
             if meta_tag:
                 meta_desc = meta_tag.get("content", "")
+
+            # Create document
             doc = {
                 "text": cleaned_text,
                 "metadata": {
@@ -265,14 +360,20 @@ def setup_driver(self, headless):
                 }
             }
             self.documents.append(doc)
+
+            # Save to file
             safe_filename = self.sanitize_filename(page_title)
             file_path = os.path.join(self.output_dir, f"{safe_filename}.md")
+
+            # Handle duplicate filenames
             counter = 1
             original_path = file_path
             while os.path.exists(file_path):
                 base, ext = os.path.splitext(original_path)
                 file_path = f"{base}_{counter}{ext}"
                 counter += 1
+
+            # Write markdown file
             with open(file_path, "w", encoding="utf-8") as file:
                 file.write(f"# {page_title}\n\n")
                 file.write(f"**URL:** {url}\n")
@@ -283,10 +384,15 @@ def setup_driver(self, headless):
                 file.write(f"**Scraped:** {datetime.now()}\n\n")
                 file.write("---\n\n")
                 file.write(cleaned_text)
+
             self.log_message(f"Saved: {os.path.basename(file_path)}")
+
+            # Extract new links
             new_links = self.extract_links()
             self.log_message(f"Found {len(new_links)} new links")
+
             return new_links
+
         except Exception as e:
             self.log_message(f"Error scraping {url}: {str(e)}")
             return []
@@ -294,25 +400,29 @@ def setup_driver(self, headless):
     def create_summary(self):
         """Create a summary of the scraped content."""
         summary_path = os.path.join(self.output_dir, "scraping_summary.md")
-        with open(summary_path, "w", encoding="utf-8") as f:
-            f.write(f"# Scraping Summary: {self.site_name}\n\n")
-            f.write(f"**Website:** {self.site_name}\n")
-            f.write(f"**URL:** {self.base_url}\n")
-            f.write(f"**Domain:** {self.domain_name}\n")
-            f.write(f"**Category:** {self.site_category}\n")
-            f.write(f"**Description:** {self.site_description}\n\n")
-            f.write(f"**Scraping Details:**\n")
-            f.write(f"- Start time: {self.start_time}\n")
-            f.write(f"- End time: {datetime.now()}\n")
-            f.write(f"- Duration: {datetime.now() - self.start_time}\n")
-            f.write(f"- Pages scraped: {len(self.documents)}\n")
-            f.write(f"- Max pages allowed: {self.max_pages}\n")
-            f.write(f"- Max depth: {self.max_depth}\n")
-            f.write(f"- External links allowed: {self.scrape_external_links}\n\n")
-            if self.documents:
-                f.write("**Scraped Pages:**\n")
-                for i, doc in enumerate(self.documents, 1):
-                    f.write(f"{i}. [{doc['metadata']['title']}]({doc['metadata']['source']})\n")
+        try:
+            with open(summary_path, "w", encoding="utf-8") as f:
+                f.write(f"# Scraping Summary: {self.site_name}\n\n")
+                f.write(f"**Website:** {self.site_name}\n")
+                f.write(f"**URL:** {self.base_url}\n")
+                f.write(f"**Domain:** {self.domain_name}\n")
+                f.write(f"**Category:** {self.site_category}\n")
+                f.write(f"**Description:** {self.site_description}\n\n")
+                f.write(f"**Scraping Details:**\n")
+                f.write(f"- Start time: {self.start_time}\n")
+                f.write(f"- End time: {datetime.now()}\n")
+                f.write(f"- Duration: {datetime.now() - self.start_time}\n")
+                f.write(f"- Pages scraped: {len(self.documents)}\n")
+                f.write(f"- Max pages allowed: {self.max_pages}\n")
+                f.write(f"- Max depth: {self.max_depth}\n")
+                f.write(f"- External links allowed: {self.scrape_external_links}\n\n")
+
+                if self.documents:
+                    f.write("**Scraped Pages:**\n")
+                    for i, doc in enumerate(self.documents, 1):
+                        f.write(f"{i}. [{doc['metadata']['title']}]({doc['metadata']['source']})\n")
+        except Exception as e:
+            self.log_message(f"Error creating summary: {e}")

     def start(self):
         """Start the website scraping process."""
@@ -320,31 +430,47 @@ def setup_driver(self, headless):
             self.log_message(f"Starting website scraping for {self.site_name}")
             self.log_message(f"Target: {self.base_url}")
             self.log_message(f"Limits: max_depth={self.max_depth}, max_pages={self.max_pages}")
+
             urls_to_scrape = [(self.base_url, 0)]
+
             while urls_to_scrape and self.page_count < self.max_pages:
                 current_url, current_depth = urls_to_scrape.pop(0)
+
                 if current_url in self.visited_links or current_depth > self.max_depth:
                     continue
+
                 new_links = self.scrape_page(current_url)
+
                 # Add new links for next depth level
                 if current_depth + 1 <= self.max_depth:
                     for link in new_links:
                         if link not in self.visited_links:
                             urls_to_scrape.append((link, current_depth + 1))
+
+            # Create summary
             self.create_summary()
-            self.driver.quit()
+
+            # Clean up
+            if self.driver:
+                self.driver.quit()
+
             end_time = datetime.now()
             duration = end_time - self.start_time
+
             self.log_message(f"Scraping completed for {self.site_name}")
             self.log_message(f"Total pages scraped: {self.page_count}")
             self.log_message(f"Duration: {duration}")
+
             return {
                 "success": True,
                 "pages_scraped": self.page_count,
                 "duration": str(duration),
                 "output_dir": self.output_dir
             }
+
         except Exception as e:
-            self.driver.quit()
+            if self.driver:
+                self.driver.quit()
             self.log_message(f"Scraping failed: {str(e)}")
             return {
                 "success": False,
@@ -352,4 +478,4 @@ def setup_driver(self, headless):
                 "pages_scraped": self.page_count,
                 "duration": "0",
                 "output_dir": self.output_dir
-            }
+            }
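For orientation, here is a minimal usage sketch of the updated class. The constructor keywords and the result dictionary mirror what the diff above shows; the target URL and site values are hypothetical placeholders, and a working Chrome/Chromium plus chromedriver install is assumed.

# Hypothetical example values; WebsiteScraper comes from the WebScraper.py shown above.
from WebScraper import WebsiteScraper

scraper = WebsiteScraper(
    base_url="https://example.com",   # placeholder target site
    site_name="ExampleSite",
    site_description="Demo crawl",
    site_category="General",
    max_depth=2,
    max_pages=10,
    delay=1,
    headless=True,
)
result = scraper.start()
print(result["success"], result["pages_scraped"], result["output_dir"])

On success, start() writes one markdown file per scraped page plus scraping_summary.md and scraping_log.txt into the timestamped output directory.
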
requirements.txt CHANGED
@@ -9,4 +9,5 @@ convertapi
 python-dotenv
 pathlib
 urllib3
-webdriver-manager
+webdriver-manager
+lxml
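
requirements.txt keeps webdriver-manager (used by the chromedriver fallback above) and adds lxml. The scraper itself still calls BeautifulSoup with "html.parser", so the lxml entry is presumably there as an optional, faster parser backend; a small sketch of that assumption:

# Hypothetical: verify the new dependency and use lxml as the BeautifulSoup backend.
# The committed code keeps "html.parser"; switching parsers is an assumption, not part of this diff.
from bs4 import BeautifulSoup

html = "<html><body><p>hello</p></body></html>"
soup = BeautifulSoup(html, "lxml")   # requires the lxml package added in requirements.txt
print(soup.p.get_text())             # -> hello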