fixed bugs
scraper.py  CHANGED  (+283 -279)
@@ -1,279 +1,283 @@
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urlparse, urljoin
import sqlite3
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

class LinkedInScraper:
    def __init__(self, timeout=10, use_selenium=False):
        self.timeout = timeout
        self.use_selenium = use_selenium
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        if self.use_selenium:
            self._setup_selenium()

    def _setup_selenium(self):
        """Setup Selenium WebDriver"""
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')

            self.driver = webdriver.Chrome(
                ChromeDriverManager().install(),
                options=chrome_options
            )
        except Exception as e:
            print(f"Error setting up Selenium: {e}")
            self.use_selenium = False

    def _get_cached_data(self, url):
        """Check if URL data is cached in database"""
        try:
            conn = sqlite3.connect('leads.db')
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS scraped_cache (
                    url TEXT PRIMARY KEY,
                    content TEXT,
                    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            cursor.execute('SELECT content FROM scraped_cache WHERE url = ?', (url,))
            result = cursor.fetchone()
            conn.close()

            return result[0] if result else None
        except Exception as e:
            print(f"Cache error: {e}")
            return None

    def _cache_data(self, url, content):
        """Cache scraped data"""
        try:
            conn = sqlite3.connect('leads.db')
            cursor = conn.cursor()

            cursor.execute('''
                INSERT OR REPLACE INTO scraped_cache (url, content)
                VALUES (?, ?)
            ''', (url, content))

            conn.commit()
            conn.close()
        except Exception as e:
            print(f"Cache save error: {e}")

    def scrape_with_requests(self, url):
        """Scrape URL using requests and BeautifulSoup"""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract various content types
            content_parts = []

            # Try to get meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc:
                content_parts.append(f"Description: {meta_desc.get('content', '')}")

            # Try to get title
            title = soup.find('title')
            if title:
                content_parts.append(f"Title: {title.get_text().strip()}")

            # Try to get about section or main content
            about_selectors = [
                '.about-section',
                '.company-description',
                '.about-us',
                '[class*="about"]',
                '.description',
                '.summary',
                'main',
                '.content'
            ]

            for selector in about_selectors:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text().strip()
                    if len(text) > 50:  # Only meaningful content
                        content_parts.append(text[:500])  # Limit length
                        break
                if content_parts:
                    break

            # If no specific content found, get paragraphs
            if not content_parts:
                paragraphs = soup.find_all('p')
                for p in paragraphs[:3]:  # First 3 paragraphs
                    text = p.get_text().strip()
                    if len(text) > 30:
                        content_parts.append(text[:300])

            return ' | '.join(content_parts) if content_parts else "No content extracted"

        except Exception as e:
            return f"Error scraping {url}: {str(e)}"

    def scrape_with_selenium(self, url):
        """Scrape URL using Selenium"""
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Wait a bit for dynamic content
            time.sleep(2)

            content_parts = []

            # Try different selectors for LinkedIn
            linkedin_selectors = [
                '[data-test-id="about-us-description"]',
                '.company-about-us-description',
                '.about-section',
                '[class*="about"]'
            ]

            for selector in linkedin_selectors:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        text = element.text.strip()
                        if len(text) > 50:
                            content_parts.append(text[:500])
                            break
                except:
                    continue

            # If no LinkedIn-specific content, try general selectors
            if not content_parts:
                general_selectors = ['main', '.content', 'article', '.description']
                for selector in general_selectors:
                    try:
                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                        for element in elements:
                            text = element.text.strip()
                            if len(text) > 50:
                                content_parts.append(text[:500])
                                break
                    except:
                        continue

            return ' | '.join(content_parts) if content_parts else "No content extracted"

        except Exception as e:
            return f"Error scraping {url} with Selenium: {str(e)}"

    def scrape_linkedin_profile(self, linkedin_url):
        """Scrape LinkedIn company profile"""
        if not linkedin_url or not linkedin_url.strip():
            return "No LinkedIn URL provided"

        # Check cache first
        cached_content = self._get_cached_data(linkedin_url)
        if cached_content:
            return cached_content

        try:
            # Clean URL
            linkedin_url = linkedin_url.strip()
            if not linkedin_url.startswith('http'):
                linkedin_url = 'https://' + linkedin_url

            # Use appropriate scraping method
            if self.use_selenium:
                content = self.scrape_with_selenium(linkedin_url)
            else:
                content = self.scrape_with_requests(linkedin_url)

            # Cache the result
            self._cache_data(linkedin_url, content)

            return content

        except Exception as e:
            return f"Error accessing LinkedIn: {str(e)}"

-    def
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def scrape_linkedin_company(self, linkedin_url):
+        """Alias for scrape_linkedin_profile - for compatibility"""
+        return self.scrape_linkedin_profile(linkedin_url)
+
+    def scrape_company_website(self, company_name):
+        """Scrape company website as fallback"""
+        try:
+            # Try to construct company website URL
+            company_clean = re.sub(r'[^\w\s-]', '', company_name.lower())
+            company_clean = re.sub(r'\s+', '', company_clean)
+
+            possible_urls = [
+                f"https://{company_clean}.com",
+                f"https://www.{company_clean}.com",
+                f"https://{company_clean}.org",
+                f"https://www.{company_clean}.org"
+            ]
+
+            for url in possible_urls:
+                cached_content = self._get_cached_data(url)
+                if cached_content:
+                    return cached_content
+
+                try:
+                    if self.use_selenium:
+                        content = self.scrape_with_selenium(url)
+                    else:
+                        content = self.scrape_with_requests(url)
+
+                    if "Error" not in content and len(content) > 50:
+                        self._cache_data(url, content)
+                        return content
+                except:
+                    continue
+
+            return f"Could not find website for {company_name}"
+
+        except Exception as e:
+            return f"Error finding company website: {str(e)}"
+
+    def scrape_linkedin_or_company(self, linkedin_url, company_name):
+        """Main method to scrape LinkedIn or fallback to company website"""
+        # First try LinkedIn
+        if linkedin_url and linkedin_url.strip():
+            linkedin_content = self.scrape_linkedin_profile(linkedin_url)
+            if "Error" not in linkedin_content and len(linkedin_content) > 50:
+                return f"LinkedIn: {linkedin_content}"
+
+        # Fallback to company website
+        company_content = self.scrape_company_website(company_name)
+        return f"Company Website: {company_content}"
+
+    def __del__(self):
+        """Clean up Selenium driver"""
+        if hasattr(self, 'driver'):
+            try:
+                self.driver.quit()
+            except:
+                pass
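
Below is a minimal usage sketch of the fallback entry point this commit adds, assuming scraper.py is importable from the working directory; the URL and company name are placeholders, not values from the commit. It sticks to use_selenium=False because the webdriver.Chrome(ChromeDriverManager().install(), ...) call in _setup_selenium passes the driver path positionally in the Selenium 3 style, and Selenium 4 would expect a Service object there instead.

# Hypothetical example - not part of the commit.
from scraper import LinkedInScraper

scraper = LinkedInScraper(timeout=10, use_selenium=False)  # requests-only mode

# Tries the LinkedIn page first, then falls back to guessing the company
# website; successful results are cached in leads.db.
summary = scraper.scrape_linkedin_or_company(
    "https://www.linkedin.com/company/example",  # placeholder URL
    "Example Inc",                               # placeholder company name
)
print(summary)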