File size: 12,722 Bytes
18f9528 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 |
# agent_api/serpjob.py
import json
import os
import re
import time
from typing import Dict, List, Optional, Tuple

from dotenv import load_dotenv
from serpapi import GoogleSearch
load_dotenv()
def test_api_connection(api_key: str = None) -> Tuple[bool, str]:
    """Check whether the given SerpAPI key can run a search.

    Sends a single-result Google query using the key and reports the outcome
    as a ``(success, message)`` pair. A missing key, an ``error`` entry in
    the response, or any raised exception all produce ``success=False`` with
    an explanatory message; nothing is raised to the caller.
    """
    try:
        if not api_key:
            return False, "API key is required"

        # Smallest request we can make: one result for a throwaway query.
        probe = GoogleSearch({
            "api_key": api_key,
            "engine": "google",
            "q": "test",
            "num": 1,
        })
        response = probe.get_dict()

        if "error" not in response:
            return True, "Connection successful"
        return False, f"API Error: {response['error']}"
    except Exception as exc:
        return False, f"Connection failed: {str(exc)}"
def clean_salary_text(text: str) -> str:
    """Normalize a raw salary string for display.

    Runs of whitespace are collapsed to single spaces, and the period
    phrases "a/an/per year", "... hour" and "... month" are shortened to
    "/year", "/hour" and "/month".

    Args:
        text: Raw salary text. ``None``, empty, whitespace-only, or
            placeholder values ("n/a", "not specified", in any case and with
            any surrounding whitespace) are treated as missing. Non-string
            values are converted with ``str()``.

    Returns:
        The cleaned salary text, or "Not specified" when no salary is given.
    """
    if not text:
        return "Not specified"

    # Collapse whitespace BEFORE the placeholder check so that padded or
    # whitespace-only input is recognised as missing rather than returned
    # as "" or " N/A ".
    cleaned = " ".join(str(text).split())
    if cleaned.lower() in ("", "n/a", "not specified"):
        return "Not specified"

    # Whole-word, case-insensitive match: "Per Year" is handled, while text
    # such as "extra yearly" is no longer mangled into "extr/yearly".
    return re.sub(
        r"\b(?:an?|per) (year|hour|month)\b",
        lambda match: "/" + match.group(1).lower(),
        cleaned,
        flags=re.IGNORECASE,
    )
def is_job_remote(job: Dict) -> bool:
    """Decide whether a job posting looks like a remote position.

    A job counts as remote when its ``detected_extensions`` carry a true
    ``work_from_home`` flag, or when any remote-work keyword appears
    (case-insensitively, as a substring) in its title, description,
    location, company name, or the text form of ``detected_extensions``.

    Args:
        job: One job result dict. Missing or ``None`` fields are ignored.

    Returns:
        True if the posting appears to be remote, otherwise False.
    """
    extensions = job.get("detected_extensions") or {}

    # The explicit flag is spelled with underscores, so the keyword scan
    # below ("work from home") can never match it; check it directly.
    if isinstance(extensions, dict) and extensions.get("work_from_home") is True:
        return True

    fields = (
        job.get("title"),
        job.get("description"),
        job.get("location"),
        job.get("company_name"),
        extensions,
    )
    # Skip empty/None fields and stringify the rest: a key that is present
    # with a None value would otherwise make str.join raise TypeError.
    full_text = " ".join(str(field) for field in fields if field).lower()

    remote_keywords = (
        "remote", "work from home", "wfh", "virtual",
        "anywhere", "fully remote", "remote-first", "100% remote",
        "work remotely", "distributed team", "telecommute",
        "home-based", "remote position", "flexible location",
        "work from anywhere", "remote-friendly", "home office",
    )
    return any(keyword in full_text for keyword in remote_keywords)
def scrape_job_profile(query: str, location: str = "Canada", api_key: str = None) -> str:
    """Search Google Jobs through SerpAPI and return the matching jobs as JSON.

    The query gets "remote" appended when it does not already contain it,
    plus " in <location>" for every location except "Remote Worldwide".
    Results are kept when they are remote (for "Remote Worldwide") or when
    they are remote or their location text contains ``location`` (for any
    other location), then ordered newest first by their relative posting age.

    Args:
        query: Free-text job search, e.g. "Python developer".
        location: One of the keys of the location table below. An unknown
            value silently uses the Canada search settings.
            NOTE(review): in that case the per-job filter still compares
            against the caller's original string - confirm this is intended.
        api_key: SerpAPI key. Required.

    Returns:
        A JSON array (string) of job dicts with the keys ``title``,
        ``company_name``, ``location``, ``salary``, ``remote``,
        ``posted_at`` and ``link``. Any failure (missing key, failed
        connection check, API error, no results, unexpected exception)
        yields the empty array "[]"; progress is reported via ``print``.
    """
    print(f"\n{'='*50}")
    print(f"π STARTING JOB SEARCH")
    print(f"Query: {query}")
    print(f"Location: {location}")
    print(f"{'='*50}")

    try:
        # Validate API key
        if not api_key:
            print("No API key provided")
            return json.dumps([])

        # Pre-flight check; note that it issues its own minimal search.
        api_connected, api_message = test_api_connection(api_key)
        if not api_connected:
            print(f"API Connection Failed: {api_message}")
            return json.dumps([])
        print(f"API Connection: {api_message}")

        # Clean the query and bias it towards remote postings
        search_query = query.strip()
        if "remote" not in search_query.lower():
            search_query = f"{search_query} remote"

        # Location configuration mapping
        location_settings = {
            "United States": {"location": "United States", "gl": "us", "hl": "en"},
            "Canada": {"location": "Canada", "gl": "ca", "hl": "en"},
            "United Kingdom": {"location": "United Kingdom", "gl": "gb", "hl": "en"},
            "Australia": {"location": "Australia", "gl": "au", "hl": "en"},
            "Germany": {"location": "Germany", "gl": "de", "hl": "en"},
            "Netherlands": {"location": "Netherlands", "gl": "nl", "hl": "en"},
            "Remote Worldwide": {"location": "", "gl": "us", "hl": "en"},
        }
        loc_config = location_settings.get(location, location_settings["Canada"])

        # Add location to query if not worldwide
        if location != "Remote Worldwide" and loc_config["location"]:
            search_query = f"{search_query} in {loc_config['location']}"
        print(f"π Final search query: '{search_query}'")

        # Build search parameters
        search_params = {
            "api_key": api_key,
            "engine": "google_jobs",
            "q": search_query,
            "hl": loc_config["hl"],
            "gl": loc_config["gl"],
            "chips": "date_posted:month",
            "num": 12,  # Reasonable number of results
        }
        if loc_config["location"]:
            search_params["location"] = loc_config["location"]

        print(f"π§ Search parameters:")
        for key, value in search_params.items():
            if key != "api_key":  # Don't log API key
                print(f" {key}: {value}")

        # Execute search. With max_attempts = 1 the retry/backoff branches
        # below are dormant; they only take effect if the limit is raised.
        max_attempts = 1
        results = None
        for attempt in range(max_attempts):
            try:
                print(f"Search attempt {attempt + 1}/{max_attempts}")
                search = GoogleSearch(search_params)
                results = search.get_dict()

                # Check for API errors
                if "error" in results:
                    error_msg = results["error"]
                    print(f"API Error on attempt {attempt + 1}: {error_msg}")
                    if attempt < max_attempts - 1:
                        print("β³ Waiting before retry...")
                        time.sleep(2 ** attempt)  # Exponential backoff
                        continue
                    else:
                        print("All attempts failed")
                        return json.dumps([])

                # Success
                print(f"Search successful on attempt {attempt + 1}")
                break
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_attempts - 1:
                    print("β³ Waiting before retry...")
                    time.sleep(2 ** attempt)
                    continue
                else:
                    print("All search attempts exhausted")
                    return json.dumps([])

        # Extract job results
        raw_jobs = results.get("jobs_results", [])
        print(f"π Raw jobs found: {len(raw_jobs)}")
        if not raw_jobs:
            print("No jobs found in search results")
            return json.dumps([])

        # Process and filter jobs
        processed_jobs = []
        for idx, job in enumerate(raw_jobs):
            try:
                print(f"\n Processing job {idx + 1}: {job.get('title', 'Unknown Title')}")

                # `or "N/A"` also covers keys that are present but None/empty,
                # which would otherwise crash the location filter below.
                title = job.get("title") or "N/A"
                company = job.get("company_name") or "N/A"
                job_location = job.get("location") or "N/A"
                print(f" Company: {company}")
                print(f" Location: {job_location}")

                # Extract salary information
                salary = "Not specified"
                if job.get("salary_snippet"):
                    salary = clean_salary_text(job["salary_snippet"].get("text", ""))
                elif job.get("detected_extensions", {}).get("salary"):
                    salary = clean_salary_text(job["detected_extensions"]["salary"])
                print(f" Salary: {salary}")

                # Determine if job is remote
                remote_status = is_job_remote(job)
                print(f" Remote: {'Yes' if remote_status else 'No'}")

                # Extract apply link, trying several sources in order
                apply_link = "N/A"
                if job.get("apply_options") and len(job["apply_options"]) > 0:
                    apply_link = job["apply_options"][0].get("link", "N/A")
                elif job.get("share_link"):
                    apply_link = job["share_link"]
                elif job.get("link"):
                    apply_link = job["link"]

                # Extract posting date
                posted_date = "Recently"
                if job.get("detected_extensions", {}).get("posted_at"):
                    posted_date = job["detected_extensions"]["posted_at"]
                elif job.get("posted_at"):
                    posted_date = job["posted_at"]

                job_entry = {
                    "title": title,
                    "company_name": company,
                    "location": job_location,
                    "salary": salary,
                    "remote": "Yes" if remote_status else "No",
                    "posted_at": posted_date,
                    "link": apply_link,
                }

                # Apply filtering logic
                should_include = False
                if location == "Remote Worldwide":
                    # For worldwide remote, only include remote jobs
                    should_include = remote_status
                    filter_reason = "remote status" if remote_status else "not remote"
                else:
                    # For specific locations, include if matches location OR is remote
                    location_match = location.lower() in job_location.lower()
                    should_include = location_match or remote_status
                    if location_match and remote_status:
                        filter_reason = "location match + remote"
                    elif location_match:
                        filter_reason = "location match"
                    elif remote_status:
                        filter_reason = "remote job"
                    else:
                        filter_reason = "no match"

                if should_include:
                    processed_jobs.append(job_entry)
                    print(f" INCLUDED ({filter_reason})")
                else:
                    print(f" FILTERED OUT ({filter_reason})")
            except Exception as e:
                # One malformed result must not abort the whole batch.
                print(f"Error processing job {idx + 1}: {str(e)}")
                continue

        print(f"\n{'='*50}")
        print(f"π FINAL RESULTS")
        print(f"Raw jobs found: {len(raw_jobs)}")
        print(f"Jobs after filtering: {len(processed_jobs)}")
        print(f"{'='*50}")

        # Newest first. The age text ("3 days ago", "21 hours ago") must be
        # parsed: comparing the raw strings would rank "5 days ago" ahead of
        # "2 hours ago". The sort is stable, so ties keep API order.
        processed_jobs.sort(key=lambda entry: _posted_age_hours(entry.get("posted_at")))

        # Return results as JSON
        result_json = json.dumps(processed_jobs, indent=2)
        print(f"Returning {len(processed_jobs)} jobs")
        return result_json
    except Exception as e:
        print(f"\n CRITICAL ERROR in scrape_job_profile:")
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return json.dumps([])


def _posted_age_hours(posted_at) -> float:
    """Convert a relative age such as "3 days ago" into a number of hours.

    Text that does not follow the "<number> <unit>(s) ago" shape (for
    example the "Recently" placeholder) and non-string values map to
    infinity, so such jobs sort after every job with a known age.
    """
    if not isinstance(posted_at, str):
        return float("inf")
    match = re.search(
        r"(\d+)\+?\s*(minute|hour|day|week|month|year)s?\s+ago",
        posted_at,
        flags=re.IGNORECASE,
    )
    if match is None:
        return float("inf")
    # Approximate lengths: a month as 30 days and a year as 365 days.
    hours_per_unit = {
        "minute": 1 / 60,
        "hour": 1,
        "day": 24,
        "week": 24 * 7,
        "month": 24 * 30,
        "year": 24 * 365,
    }
    return int(match.group(1)) * hours_per_unit[match.group(2).lower()]
def quick_test(api_key: str = None):
    """Smoke-test the scraper end to end with the given SerpAPI key.

    Runs the connection check first and stops with ``False`` if it fails.
    Otherwise performs a sample "Python developer" search for Canada, prints
    how many jobs came back (plus the first one, if any) and returns
    ``True``. Any exception during the sample search is printed and turns
    the result into ``False``.
    """
    print("π§ͺ TESTING SCRAPER")
    print("=" * 30)

    # Step 1: connectivity
    connected, message = test_api_connection(api_key)
    print(f"Connection test: {message}")
    if not connected:
        return False

    # Step 2: one real search, decoded from its JSON form
    try:
        jobs = json.loads(scrape_job_profile("Python developer", "Canada", api_key))
        print(f"Test completed: Found {len(jobs)} jobs")
        if jobs:
            first = jobs[0]
            print(f"Sample job: {first['title']} at {first['company_name']}")
    except Exception as exc:
        print(f"Test failed: {str(exc)}")
        return False
    else:
        return True
# if __name__ == "__main__":
#     # Only run test if API key is provided
#     api_key = os.environ.get("SERP_API_KEY")
#     if api_key:
#         quick_test(api_key)
#     else:
#         print("No API key found in environment variables")