ahmednoorx committed on
Commit
2e93556
·
verified ·
1 Parent(s): 7319f65

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +51 -7
scraper.py CHANGED
@@ -4,12 +4,19 @@ import time
4
  import re
5
  from urllib.parse import urlparse, urljoin
6
  import sqlite3
7
- from selenium import webdriver
8
- from selenium.webdriver.chrome.options import Options
9
- from selenium.webdriver.common.by import By
10
- from selenium.webdriver.support.ui import WebDriverWait
11
- from selenium.webdriver.support import expected_conditions as EC
12
- from webdriver_manager.chrome import ChromeDriverManager
 
 
 
 
 
 
 
13
 
14
  class LinkedInScraper:
15
  def __init__(self, timeout=10, use_selenium=False):
@@ -25,11 +32,17 @@ class LinkedInScraper:
25
  'Upgrade-Insecure-Requests': '1',
26
  })
27
 
28
- if self.use_selenium:
29
  self._setup_selenium()
 
 
30
 
31
  def _setup_selenium(self):
32
  """Setup Selenium WebDriver"""
 
 
 
 
33
  try:
34
  chrome_options = Options()
35
  chrome_options.add_argument('--headless')
@@ -285,3 +298,34 @@ class LinkedInScraper:
285
  self.driver.quit()
286
  except:
287
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import re
5
  from urllib.parse import urlparse, urljoin
6
  import sqlite3
7
+
8
+ # Optional Selenium imports for advanced scraping
9
+ try:
10
+ from selenium import webdriver
11
+ from selenium.webdriver.chrome.options import Options
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from webdriver_manager.chrome import ChromeDriverManager
16
+ SELENIUM_AVAILABLE = True
17
+ except ImportError:
18
+ SELENIUM_AVAILABLE = False
19
+ print("⚠️ Selenium not available. Company research will use basic scraping only.")
20
 
21
  class LinkedInScraper:
22
  def __init__(self, timeout=10, use_selenium=False):
 
32
  'Upgrade-Insecure-Requests': '1',
33
  })
34
 
35
+ if self.use_selenium and SELENIUM_AVAILABLE:
36
  self._setup_selenium()
37
+ elif self.use_selenium and not SELENIUM_AVAILABLE:
38
+ print("⚠️ Selenium requested but not available. Falling back to basic scraping.")
39
 
40
  def _setup_selenium(self):
41
  """Setup Selenium WebDriver"""
42
+ if not SELENIUM_AVAILABLE:
43
+ print("⚠️ Selenium not available. Cannot setup WebDriver.")
44
+ return
45
+
46
  try:
47
  chrome_options = Options()
48
  chrome_options.add_argument('--headless')
 
298
  self.driver.quit()
299
  except:
300
  pass
301
+
302
+
303
+ # Standalone function for easy import
304
def scrape_company_info(input_data):
    """Look up company information for a LinkedIn URL or a company name.

    Standalone convenience wrapper that builds a ``LinkedInScraper`` with
    its default settings and routes the request to the matching method.

    Args:
        input_data (str): A LinkedIn URL or a plain company name. Any value
            containing ``linkedin.com`` (case-insensitive) is handled as a
            LinkedIn URL; everything else is handled as a company name.

    Returns:
        str: One of the following:
            * a fixed setup-instructions message when the optional Selenium
              imports are unavailable (no scraping is attempted);
            * whatever the selected scraper method returns, if truthy;
            * an empty string when the scraper returns a falsy value or
              any exception is raised along the way.
    """
    # Without the optional Selenium stack, answer with setup guidance
    # rather than attempting any scraping.
    if not SELENIUM_AVAILABLE:
        return (
            "Company research feature requires additional setup. "
            "Please install selenium and webdriver-manager for enterprise features."
        )

    try:
        scraper = LinkedInScraper()

        if 'linkedin.com' not in input_data.lower():
            # No LinkedIn host in the input: treat it as a company name.
            scraped = scraper.scrape_company_website(input_data)
        else:
            # LinkedIn URL: the second argument is passed as an empty string.
            scraped = scraper.scrape_linkedin_or_company(input_data, "")

        # Normalise falsy results (None, empty containers, ...) to "".
        return scraped or ""

    except Exception as e:
        # Best-effort API: report the failure and hand back an empty result.
        print(f"Error in scrape_company_info: {e}")
        return ""