ak0601 committed
Commit 995c3a9 · verified · 1 Parent(s): 4cee115

Upload 3 files

Files changed (3)
  1. db.py +36 -0
  2. reccomendation.py +949 -0
  3. requirements.txt +14 -0
db.py ADDED
@@ -0,0 +1,36 @@
+ import pandas as pd
+ import os
+ import psycopg2
+ from sqlalchemy import create_engine, inspect, text
+ import re
+ from datetime import datetime
+ import logging
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Database connection parameters - read from environment variables
+ DB_PARAMS = {
+     'dbname': os.getenv("DB_NAME"),
+     'user': os.getenv("DB_USER"),
+     'password': os.getenv("DB_PASSWORD"),
+     'host': os.getenv("DB_HOST"),
+     'port': os.getenv("DB_PORT")
+ }
+
+ # Create SQLAlchemy engine
+ def get_engine():
+     conn_string = f"postgresql://{DB_PARAMS['user']}:{DB_PARAMS['password']}@{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
+     return create_engine(conn_string)
+
+ def get_jobs():
+     engine = get_engine()
+     return pd.read_sql_table("jobs", con=engine)
+
+ def submissions():
+     engine = get_engine()
+     return pd.read_sql_table("candidate_submissions", con=engine)
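For reference, a minimal sketch of how these helpers might be called once the DB_* environment variables point at a reachable Postgres instance; the table names come from the code above, everything else here is illustrative:

    from db import get_jobs, submissions

    jobs_df = get_jobs()         # DataFrame of the "jobs" table
    subs_df = submissions()      # DataFrame of the "candidate_submissions" table
    print(jobs_df.columns.tolist())            # inspect available columns
    print(len(subs_df), "submissions loaded")

Note that the original get_jobs/submissions bodies assigned the DataFrame to a local without returning it; the diff above returns it directly so callers like this sketch actually receive data.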
reccomendation.py ADDED
@@ -0,0 +1,949 @@
+ import pandas as pd
+ import requests
+ from pydantic import BaseModel, Field
+ from typing import List, Tuple, Optional
+ from langchain_openai import ChatOpenAI
+ from langchain_core.prompts import ChatPromptTemplate
+ import os
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Depends, Header, Request
+ from fastapi.responses import JSONResponse
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+ from fastapi.middleware.cors import CORSMiddleware
+ import json
+ import tempfile
+ import shutil
+ import PyPDF2
+ from dotenv import load_dotenv
+ import pdfplumber
+ import re
+ from db import *  # provides DB_PARAMS (plus create_engine/text via db's own imports)
+ import time
+ import asyncio
+ from contextlib import asynccontextmanager
+ import logging
+ from sqlalchemy.pool import NullPool
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configure logging for Cloud Run
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Global variable to store access token
+ access_token = None
+
+ # Startup/shutdown events
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     logger.info("Starting up Job Recommendation API...")
+     # You can initialize connection pools here if needed
+     yield
+     # Shutdown
+     logger.info("Shutting down Job Recommendation API...")
+     # Close any open connections here
+
+ # Initialize FastAPI app with lifespan
+ app = FastAPI(
+     title="Job Recommendation API",
+     description="API for processing resumes and recommending jobs",
+     lifespan=lifespan
+ )
+
+ # Add CORS middleware for cloud deployment
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Configure based on your needs
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Add request ID middleware for better tracing
+ @app.middleware("http")
+ async def add_request_id(request: Request, call_next):
+     request_id = f"{time.time()}-{request.client.host}"
+     request.state.request_id = request_id
+
+     # Log the request
+     logger.info(f"Request ID: {request_id} - {request.method} {request.url.path}")
+
+     try:
+         response = await call_next(request)
+         response.headers["X-Request-ID"] = request_id
+         return response
+     except Exception as e:
+         logger.error(f"Request ID: {request_id} - Error: {str(e)}")
+         raise
+
+ # Security configuration
+ API_KEY = os.getenv("API_KEY")
+ security = HTTPBearer()
+
+ def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
+     """
+     Verify the API key from the Authorization header
+     """
+     if not API_KEY:
+         logger.error("API key not configured")
+         raise HTTPException(
+             status_code=500,
+             detail="API key not configured",
+         )
+
+     if credentials.credentials != API_KEY:
+         logger.warning("Invalid API key attempt")
+         raise HTTPException(
+             status_code=401,
+             detail="Invalid API key",
+             headers={"WWW-Authenticate": "Bearer"},
+         )
+     return credentials.credentials
+
+ # Initialize OpenAI client with error handling
+ try:
+     llm = ChatOpenAI(
+         model="gpt-4o-mini",
+         temperature=0,
+         api_key=os.getenv("OPENAI_API_KEY")
+     )
+     logger.info("OpenAI client initialized successfully")
+ except Exception as e:
+     logger.error(f"Failed to initialize OpenAI client: {e}")
+     raise
+
+ # Initialize database engine with connection pooling suitable for Cloud Run
+ def get_engine():  # overrides the db.get_engine pulled in by the star import
+     """
+     Get database engine with NullPool for Cloud Run
+     """
+     try:
+         conn_string = f"postgresql://{DB_PARAMS['user']}:{DB_PARAMS['password']}@{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
+         # Use NullPool for Cloud Run to avoid connection issues
+         engine = create_engine(conn_string, poolclass=NullPool, pool_pre_ping=True)
+         logger.info("Database engine created successfully")
+         return engine
+     except Exception as e:
+         logger.error(f"Failed to create database engine: {e}")
+         raise
+
+ # Initialize database engine
+ engine = get_engine()
+
+ def get_access_token():
+     """
+     Get access token for the external API with better error handling
+     """
+     global access_token
+
+     # If we already have a token, return it
+     if access_token:
+         return access_token
+
+     try:
+         login_url = "https://fitscore-agent-535960463668.us-central1.run.app/auth/login"
+         login_data = {
+             "email": "[email protected]",
+             "password": "Password@123"
+         }
+         login_headers = {
+             'accept': 'application/json',
+             'Content-Type': 'application/json'
+         }
+
+         # Add timeout to prevent hanging
+         login_response = requests.post(login_url, headers=login_headers, json=login_data, timeout=30)
+
+         if login_response.status_code == 200:
+             login_result = login_response.json()
+             access_token = login_result.get('data', {}).get('tokens', {}).get('accessToken')
+             if access_token:
+                 logger.info("Successfully obtained access token")
+                 return access_token
+             else:
+                 logger.error("Login successful but no access token found in response")
+                 return None
+         else:
+             logger.error(f"Login failed with status {login_response.status_code}: {login_response.text}")
+             return None
+     except requests.exceptions.Timeout:
+         logger.error("Login request timed out")
+         return None
+     except requests.exceptions.RequestException as e:
+         logger.error(f"Network error during login: {e}")
+         return None
+     except Exception as e:
+         logger.error(f"Unexpected error getting access token: {e}")
+         return None
+
+ class structure(BaseModel):
+     name: str = Field(description="Name of the candidate")
+     location: str = Field(description="The location of the candidate. Extract city and state if possible.")
+     skills: List[str] = Field(description="List of individual skills of the candidate")
+     ideal_jobs: str = Field(description="List of ideal jobs for the candidate based on past experience.")
+     email: str = Field(description="The email of the candidate")
+     yoe: str = Field(description="Years of experience of the candidate.")
+     experience: str = Field(description="A brief summary of the candidate's past experience.")
+     industry: str = Field(description="The industry the candidate has experience in (Tech, Legal, Finance/Accounting, Healthcare, Industrial, Logistics, Telecom, Admin, Other)")
+
+ class JobAnalysis(BaseModel):
+     job_title: str
+     company_name: str
+     analysis: dict
+
+ def extract_text_from_pdf(pdf_file_path: str) -> str:
+     """
+     Extract text from PDF file using multiple methods for better accuracy
+     """
+     text = ""
+
+     # Method 1: Try pdfplumber (better for complex layouts)
+     try:
+         with pdfplumber.open(pdf_file_path) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+         if text.strip():
+             logger.info(f"Successfully extracted text using pdfplumber: {len(text)} characters")
+             return text.strip()
+     except Exception as e:
+         logger.warning(f"pdfplumber failed: {e}")
+
+     # Method 2: Try PyPDF2 (fallback)
+     try:
+         with open(pdf_file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             for page in pdf_reader.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+         if text.strip():
+             logger.info(f"Successfully extracted text using PyPDF2: {len(text)} characters")
+             return text.strip()
+     except Exception as e:
+         logger.error(f"PyPDF2 failed: {e}")
+
+     # If both methods fail, return empty string
+     logger.error("Failed to extract text from PDF")
+     return ""
+
+ def extract_resume_info(resume_text: str) -> dict:
+     """
+     Extract structured information from resume using LLM
+     """
+     prompt = ChatPromptTemplate.from_template("""
+     You are an expert resume parser. Extract the following information from the resume text provided and return it in a structured JSON format.
+
+     Resume Text:
+     {resume_text}
+
+     Please extract and structure the information according to the following schema:
+     - name: Full name of the candidate
+     - location: City and state if available, otherwise general location
+     - skills: List of technical skills, tools, technologies, programming languages, etc.
+     - ideal_jobs: Based on their experience, what types of jobs would be ideal for this candidate
+     - email: Email address of the candidate (if found in resume)
+     - yoe: Years of experience (extract from work history)
+     - experience: Brief summary of their work experience and background
+     - industry: Categorize into one of these industries: Tech, Legal, Finance/Accounting, Healthcare, Industrial, Logistics, Telecom, Admin, Other
+
+     Return ONLY a valid JSON object with these fields. Do not include any other text or explanations.
+     """)
+
+     try:
+         str_llm = llm.with_structured_output(structure)
+         chain = prompt | str_llm
+         response = chain.invoke({"resume_text": resume_text})
+
+         validated_data = {
+             'name': response.name,
+             'location': response.location,
+             'email': response.email,
+             'skills': response.skills,
+             'ideal_jobs': response.ideal_jobs,
+             'yoe': response.yoe,
+             'experience': response.experience,
+             'industry': response.industry
+         }
+
+         logger.info(f"Successfully extracted resume info for: {validated_data['name']}")
+         return validated_data
+
+     except Exception as e:
+         logger.error(f"Failed to extract resume info: {e}")
+         return {
+             'name': "Unknown",
+             'location': "Unknown",
+             'email': "",
+             'skills': [],
+             'ideal_jobs': "Software Engineer",
+             'yoe': "0",
+             'experience': "No experience listed",
+             'industry': "Tech"
+         }
+
+ def filter_jobs_by_industry(jobs_df: pd.DataFrame, target_industry: str) -> pd.DataFrame:
+     """
+     Filter jobs by industry
+     """
+     # Map the extracted industry to database industry values
+     industry_mapping = {
+         'Tech': ['technology', 'VC Tech'],
+         'Legal': ['Legal'],
+         'Finance/Accounting': ['finance/Accounting'],
+         'Healthcare': ['healthcare'],
+         'Industrial': ['industrial'],
+         'Logistics': ['logistics'],
+         'Telecom': ['telecom'],
+         'Admin': ['admin'],
+         'Other': ['Other']
+     }
+
+     target_industries = industry_mapping.get(target_industry, ['Tech'])
+
+     # Filter jobs by industry (using database column name 'industry')
+     filtered_jobs = jobs_df[jobs_df['industry'].isin(target_industries)]
+
+     logger.info(f"Filtered {len(filtered_jobs)} jobs for industry: {target_industry}")
+     return filtered_jobs
+
+ def filter_jobs_by_location(jobs_df: pd.DataFrame, candidate_location: str) -> pd.DataFrame:
+     """
+     Filter jobs by location matching the candidate's location
+     """
+     if not candidate_location or candidate_location.lower() in ['unknown', 'n/a', '']:
+         logger.info(f"No location info provided, returning all {len(jobs_df)} jobs")
+         return jobs_df  # Return all jobs if no location info
+
+     # Clean and normalize candidate location
+     candidate_location = candidate_location.lower().strip()
+     logger.info(f"Filtering jobs for candidate location: {candidate_location}")
+
+     # Extract state abbreviations and full names
+     state_mapping = {
+         'alabama': 'al', 'alaska': 'ak', 'arizona': 'az', 'arkansas': 'ar', 'california': 'ca',
+         'colorado': 'co', 'connecticut': 'ct', 'delaware': 'de', 'district of columbia': 'dc', 'florida': 'fl', 'georgia': 'ga',
+         'hawaii': 'hi', 'idaho': 'id', 'illinois': 'il', 'indiana': 'in', 'iowa': 'ia',
+         'kansas': 'ks', 'kentucky': 'ky', 'louisiana': 'la', 'maine': 'me', 'maryland': 'md',
+         'massachusetts': 'ma', 'michigan': 'mi', 'minnesota': 'mn', 'mississippi': 'ms', 'missouri': 'mo',
+         'montana': 'mt', 'nebraska': 'ne', 'nevada': 'nv', 'new hampshire': 'nh', 'new jersey': 'nj',
+         'new mexico': 'nm', 'new york': 'ny', 'north carolina': 'nc', 'north dakota': 'nd', 'ohio': 'oh',
+         'oklahoma': 'ok', 'oregon': 'or', 'pennsylvania': 'pa', 'rhode island': 'ri', 'south carolina': 'sc',
+         'south dakota': 'sd', 'tennessee': 'tn', 'texas': 'tx', 'utah': 'ut', 'vermont': 'vt',
+         'virginia': 'va', 'washington': 'wa', 'west virginia': 'wv', 'wisconsin': 'wi', 'wyoming': 'wy'
+     }
+
+     # Create location patterns to match
+     location_patterns = []
+
+     # Add the original location
+     location_patterns.append(candidate_location)
+
+     # Add state variations
+     for state_name, state_abbr in state_mapping.items():
+         if state_name in candidate_location or state_abbr in candidate_location:
+             location_patterns.extend([state_name, state_abbr])
+
+     # Add common city variations (extract city name)
+     city_match = re.search(r'^([^,]+)', candidate_location)
+     if city_match:
+         city_name = city_match.group(1).strip()
+         location_patterns.append(city_name)
+
+     # Add remote/anywhere patterns if location is remote
+     if 'remote' in candidate_location or 'anywhere' in candidate_location:
+         location_patterns.extend(['remote', 'anywhere', 'work from home', 'wfh'])
+
+     logger.info(f"Location patterns to match: {location_patterns}")
+
+     # Filter jobs by location
+     matching_jobs = []
+
+     for _, job_row in jobs_df.iterrows():
+         job_location = str(job_row.get('job_location', '')).lower()
+
+         # Check if any location pattern matches
+         location_matches = any(pattern in job_location for pattern in location_patterns)
+
+         # Also check for remote jobs if candidate location includes remote
+         if 'remote' in candidate_location and any(remote_term in job_location for remote_term in ['remote', 'anywhere', 'work from home', 'wfh']):
+             location_matches = True
+
+         # Check for exact city/state matches
+         if candidate_location in job_location or job_location in candidate_location:
+             location_matches = True
+
+         if location_matches:
+             matching_jobs.append(job_row)
+
+     result_df = pd.DataFrame(matching_jobs) if matching_jobs else jobs_df
+     logger.info(f"Found {len(matching_jobs)} jobs matching location out of {len(jobs_df)} total jobs")
+
+     return result_df
+
+ def extract_experience_requirement(requirements_text: str) -> dict:
+     """
+     Extract experience requirements from job requirements text
+     Returns a dictionary with min_years, max_years, and level
+     """
+     if not requirements_text or pd.isna(requirements_text):
+         return {'min_years': 0, 'max_years': 999, 'level': 'any'}
+
+     requirements_text = str(requirements_text).lower()
+
+     # Common experience patterns
+     experience_patterns = [
+         # Specific year ranges
+         r'(\d+)[\-\+]\s*(\d+)\s*years?\s*experience',
+         r'(\d+)\s*to\s*(\d+)\s*years?\s*experience',
+         r'(\d+)\s*-\s*(\d+)\s*years?\s*experience',
+
+         # Minimum years
+         r'(\d+)\+?\s*years?\s*experience',
+         r'minimum\s*(\d+)\s*years?\s*experience',
+         r'at\s*least\s*(\d+)\s*years?\s*experience',
+
+         # Level-based patterns
+         r'(entry\s*level|junior|associate)',
+         r'(mid\s*level|intermediate|mid\s*senior)',
+         r'(senior|lead|principal|staff)',
+         r'(executive|director|vp|chief|c\s*level)',
+
+         # Specific year mentions
+         r'(\d+)\s*years?\s*in\s*the\s*field',
+         r'(\d+)\s*years?\s*of\s*professional\s*experience',
+         r'(\d+)\s*years?\s*of\s*relevant\s*experience'
+     ]
+
+     min_years = 0
+     max_years = 999
+     level = 'any'
+
+     # Check for specific year ranges
+     for pattern in experience_patterns[:3]:  # First 3 patterns are for ranges
+         matches = re.findall(pattern, requirements_text)
+         if matches:
+             try:
+                 min_years = int(matches[0][0])
+                 max_years = int(matches[0][1])
+                 break
+             except (ValueError, IndexError):
+                 continue
+
+     # Check for minimum years if no range found
+     if min_years == 0:
+         for pattern in experience_patterns[3:6]:  # Minimum year patterns
+             matches = re.findall(pattern, requirements_text)
+             if matches:
+                 try:
+                     min_years = int(matches[0])
+                     break
+                 except (ValueError, IndexError):
+                     continue
+
+     # Check for level-based requirements
+     for pattern in experience_patterns[6:10]:  # Level patterns
+         matches = re.findall(pattern, requirements_text)
+         if matches:
+             level_match = matches[0].lower()
+             if 'entry' in level_match or 'junior' in level_match or 'associate' in level_match:
+                 level = 'entry'
+                 if min_years == 0:
+                     min_years = 0
+                     max_years = 2
+             elif 'mid' in level_match or 'intermediate' in level_match:
+                 level = 'mid'
+                 if min_years == 0:
+                     min_years = 2
+                     max_years = 5
+             elif 'senior' in level_match or 'lead' in level_match or 'principal' in level_match or 'staff' in level_match:
+                 level = 'senior'
+                 if min_years == 0:
+                     min_years = 5
+                     max_years = 10
+             elif 'executive' in level_match or 'director' in level_match or 'vp' in level_match or 'chief' in level_match:
+                 level = 'executive'
+                 if min_years == 0:
+                     min_years = 10
+                     max_years = 999
+             break
+
+     # Check for specific year mentions if still no match
+     if min_years == 0:
+         for pattern in experience_patterns[10:]:  # Specific year mention patterns
+             matches = re.findall(pattern, requirements_text)
+             if matches:
+                 try:
+                     min_years = int(matches[0])
+                     max_years = min_years + 2  # Add buffer
+                     break
+                 except (ValueError, IndexError):
+                     continue
+
+     return {
+         'min_years': min_years,
+         'max_years': max_years,
+         'level': level
+     }
+
+ def filter_jobs_by_experience(jobs_df: pd.DataFrame, candidate_yoe: str) -> pd.DataFrame:
+     """
+     Filter jobs by experience level matching the candidate's years of experience
+     """
+     if not candidate_yoe or candidate_yoe.lower() in ['unknown', 'n/a', '']:
+         logger.info(f"No experience info provided, returning all {len(jobs_df)} jobs")
+         return jobs_df
+
+     # Extract numeric years from candidate experience
+     try:
+         # Handle various formats like "5 years", "5+ years", "5-7 years", etc.
+         yoe_match = re.search(r'(\d+(?:\.\d+)?)', str(candidate_yoe))
+         if yoe_match:
+             candidate_years = float(yoe_match.group(1))
+         else:
+             logger.warning(f"Could not extract years from: {candidate_yoe}")
+             return jobs_df
+     except (ValueError, TypeError):
+         logger.error(f"Invalid experience format: {candidate_yoe}")
+         return jobs_df
+
+     logger.info(f"Filtering jobs for candidate with {candidate_years} years of experience")
+
+     # Filter jobs by experience requirements
+     matching_jobs = []
+
+     for _, job_row in jobs_df.iterrows():
+         requirements_text = str(job_row.get('requirements', ''))
+         experience_req = extract_experience_requirement(requirements_text)
+
+         # Check if candidate's experience matches the job requirements
+         if (candidate_years >= experience_req['min_years'] and
+                 candidate_years <= experience_req['max_years']):
+             matching_jobs.append(job_row)
+
+     result_df = pd.DataFrame(matching_jobs) if matching_jobs else jobs_df
+     logger.info(f"Found {len(matching_jobs)} jobs matching experience out of {len(jobs_df)} total jobs")
+
+     return result_df
+
+ def filter_jobs_by_priority(jobs_df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Filter jobs to only include high priority jobs
+     """
+     if jobs_df.empty:
+         logger.info("No jobs to filter by priority")
+         return jobs_df
+
+     # Filter jobs by priority - only include high priority jobs
+     priority_filtered_jobs = jobs_df[jobs_df['priority'].str.lower() == 'high']
+
+     logger.info(f"Found {len(priority_filtered_jobs)} high priority jobs out of {len(jobs_df)} total jobs")
+
+     return priority_filtered_jobs
+
+ def create_job_description(job_row: pd.Series) -> str:
+     """
+     Create a comprehensive job description from job data
+     """
+     description_parts = []
+
+     if pd.notna(job_row.get('company_blurb')):
+         description_parts.append(f"Company: {job_row['company_blurb']}")
+
+     if pd.notna(job_row.get('company_culture')):
+         description_parts.append(f"Company Culture: {job_row['company_culture']}")
+
+     if pd.notna(job_row.get('requirements')):
+         description_parts.append(f"Requirements: {job_row['requirements']}")
+
+     if pd.notna(job_row.get('role_responsibilities')):
+         description_parts.append(f"Role Responsibilities: {job_row['role_responsibilities']}")
+
+     if pd.notna(job_row.get('job_location')):
+         description_parts.append(f"Location: {job_row['job_location']}")
+
+     return "\n\n".join(description_parts)
+
+ def clean_analysis_result(analysis_result: dict) -> dict:
+     """
+     Clean up the analysis result to only include final_score and summary
+     """
+     if not isinstance(analysis_result, dict):
+         return analysis_result
+
+     # Remove user_context if present
+     if 'user_context' in analysis_result:
+         del analysis_result['user_context']
+
+     # Clean up final_response if present
+     if 'final_response' in analysis_result:
+         try:
+             # Handle both string and dict formats
+             if isinstance(analysis_result['final_response'], str):
+                 final_response = json.loads(analysis_result['final_response'])
+             else:
+                 final_response = analysis_result['final_response']
+
+             # Extract and format the evaluation data
+             if 'evaluation' in final_response and len(final_response['evaluation']) > 0:
+                 evaluation = final_response['evaluation'][0]
+
+                 # Create a minimal structure with only final_score and summary
+                 cleaned_response = {
+                     'final_score': evaluation.get('final_score', 0),
+                     'summary': {}
+                 }
+
+                 # Extract summary information
+                 if 'summary' in evaluation and len(evaluation['summary']) > 0:
+                     summary = evaluation['summary'][0]
+                     cleaned_response['summary'] = {
+                         'strengths': summary.get('strengths', []),
+                         'weaknesses': summary.get('weaknesses', []),
+                         'opportunities': summary.get('opportunities', []),
+                         'recommendations': summary.get('recommendations', [])
+                     }
+
+                 analysis_result['final_response'] = cleaned_response
+
+         except (json.JSONDecodeError, KeyError, IndexError) as e:
+             logger.error(f"Error cleaning analysis result: {e}")
+             # Keep original if cleaning fails
+             pass
+
+     return analysis_result
+
+ def sort_jobs_by_score(job_analyses: list) -> list:
+     """
+     Sort jobs by final_score in descending order (highest scores first)
+     """
+     def extract_score(job_analysis):
+         try:
+             analysis = job_analysis.get('analysis', {})
+             if 'final_response' in analysis and isinstance(analysis['final_response'], dict):
+                 return analysis['final_response'].get('final_score', 0)
+             return 0
+         except Exception:
+             return 0
+
+     return sorted(job_analyses, key=extract_score, reverse=True)
+
+ async def analyze_job_fit_with_retry(job_description: str, resume_file_path: str, max_retries: int = 3) -> dict:
+     """
+     Analyze job-candidate fit with retry logic for resilience
+     """
+     for attempt in range(max_retries):
+         try:
+             result = analyze_job_fit(job_description, resume_file_path)
+             if "error" not in result:
+                 return result
+
+             # If authentication error and not last attempt, retry
+             if "Authentication failed" in result.get("error", "") and attempt < max_retries - 1:
+                 logger.warning(f"Authentication failed, retrying... (attempt {attempt + 1}/{max_retries})")
+                 global access_token
+                 access_token = None  # Reset token to force refresh
+                 await asyncio.sleep(2 ** attempt)  # Exponential backoff
+                 continue
+
+             return result
+         except Exception as e:
+             logger.error(f"Attempt {attempt + 1}/{max_retries} failed: {str(e)}")
+             if attempt == max_retries - 1:
+                 return {"error": f"Failed after {max_retries} attempts: {str(e)}"}
+             await asyncio.sleep(2 ** attempt)
+
+ def analyze_job_fit(job_description: str, resume_file_path: str) -> dict:
+     """
+     Analyze job-candidate fit using the external API
+     """
+
+     url = "https://fitscore-agent-535960463668.us-central1.run.app/analyze"
+
+     # Check if resume file exists
+     if not os.path.exists(resume_file_path):
+         logger.error(f"Resume file not found: {resume_file_path}")
+         return {"error": f"Resume file not found: {resume_file_path}"}
+
+     # Prepare headers with authentication
+     headers = {
+         'accept': 'application/json',
+         'Authorization': f'Bearer {get_access_token()}'
+     }
+
+     # Prepare form data
+     files = {
+         'resume': (os.path.basename(resume_file_path), open(resume_file_path, 'rb'), 'application/pdf')
+     }
+
+     data = {
+         'jd_text': job_description
+     }
+
+     try:
+         # Make the API request with longer timeout for cloud environments
+         response = requests.post(url, headers=headers, files=files, data=data, timeout=60)
+
+         # If we get an authentication error, try to get a fresh token and retry once
+         if response.status_code == 401:
+             logger.warning("Authentication failed, getting fresh token...")
+             global access_token
+             access_token = None  # Reset the token
+             new_token = get_access_token()
+             if new_token:
+                 headers['Authorization'] = f'Bearer {new_token}'
+                 # Close the previous file and reopen
+                 files['resume'][1].close()
+                 files['resume'] = (os.path.basename(resume_file_path), open(resume_file_path, 'rb'), 'application/pdf')
+                 response = requests.post(url, headers=headers, files=files, data=data, timeout=60)
+             else:
+                 # If we can't get a fresh token, return error
+                 return {"error": "Authentication failed and could not obtain fresh token"}
+
+         if response.status_code == 200:
+             logger.info("Job fit analysis completed successfully")
+             return response.json()
+         elif response.status_code == 401:
+             # If we still get 401 after fresh token, return error
+             return {"error": "Authentication failed even with fresh token"}
+         else:
+             logger.error(f"API call failed with status {response.status_code}")
+             return {"error": f"API call failed with status {response.status_code}", "details": response.text}
+
+     except requests.exceptions.Timeout:
+         logger.error("API request timed out")
+         return {"error": "API request timed out"}
+     except Exception as e:
+         logger.error(f"Exception occurred: {str(e)}")
+         return {"error": f"Exception occurred: {str(e)}"}
+     finally:
+         # Ensure the file is closed
+         if 'resume' in files:
+             try:
+                 files['resume'][1].close()
+             except Exception:
+                 pass
+
+ @app.post("/process_resume_and_recommend_jobs")
+ async def process_resume_and_recommend_jobs(
+     resume: UploadFile = File(...),
+     resume_text: str = Form(""),
+     api_key: str = Depends(verify_api_key)
+ ):
+     """
+     Process resume, extract information, filter jobs by industry, and analyze fit
+     """
+     request_start_time = time.time()
+
+     try:
+         logger.info(f"Processing resume: {resume.filename}")
+
+         # Save uploaded file temporarily
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+             shutil.copyfileobj(resume.file, tmp_file)
+             tmp_file_path = tmp_file.name
+
+         try:
+             # Extract text from PDF if no resume_text provided
+             if not resume_text:
+                 resume_text = extract_text_from_pdf(tmp_file_path)
+                 if not resume_text:
+                     logger.error("Could not extract text from PDF file")
+                     return JSONResponse(
+                         status_code=400,
+                         content={"error": "Could not extract text from PDF file"}
+                     )
+
+             # Extract resume information using LLM
+             resume_info = extract_resume_info(resume_text)
+
+             # Load jobs data from PostgreSQL database
+             try:
+                 jobs_df = pd.read_sql_table("jobs", con=engine)
+                 candidates_df = pd.read_sql_table("candidates", con=engine)
+                 submissions_df = pd.read_sql_table("candidate_submissions", con=engine)
+                 logger.info(f"Loaded {len(jobs_df)} jobs, {len(candidates_df)} candidates, {len(submissions_df)} submissions")
+             except Exception as db_error:
+                 logger.error(f"Database error: {db_error}")
+                 return JSONResponse(
+                     status_code=500,
+                     content={"error": "Database connection error"}
+                 )
+
+             # Filter jobs by industry
+             filtered_jobs = filter_jobs_by_industry(jobs_df, resume_info['industry'])
+
+             if filtered_jobs.empty:
+                 logger.warning(f"No jobs found for industry: {resume_info['industry']}")
+                 return JSONResponse(
+                     status_code=404,
+                     content={"message": f"No jobs found for industry: {resume_info['industry']}"}
+                 )
+
+             # Filter jobs by location
+             location_filtered_jobs = filter_jobs_by_location(filtered_jobs, resume_info['location'])
+
+             # Filter jobs by experience level
+             experience_filtered_jobs = filter_jobs_by_experience(location_filtered_jobs, resume_info['yoe'])
+
+             # Filter jobs by priority
+             priority_filtered_jobs = filter_jobs_by_priority(experience_filtered_jobs)
+
+             # Use priority filtered jobs if available, otherwise fall back to experience filtered jobs, then location filtered jobs
+             if not priority_filtered_jobs.empty:
+                 jobs_to_analyze = priority_filtered_jobs
+             elif not experience_filtered_jobs.empty:
+                 jobs_to_analyze = experience_filtered_jobs
+             else:
+                 jobs_to_analyze = location_filtered_jobs
+
+             # Create filtered_submission_df with job_ids from jobs_to_analyze
+             job_ids_to_analyze = jobs_to_analyze['id'].tolist()
+             filtered_submission_df = submissions_df[submissions_df['jobId'].isin(job_ids_to_analyze)]
+
+             # Check if candidate email exists in candidates_df
+             candidate_id = None
+             if resume_info.get('email'):
+                 candidate_match = candidates_df[candidates_df['email'] == resume_info['email']]
+                 if not candidate_match.empty:
+                     candidate_id = candidate_match.iloc[0]['id']
+                     logger.info(f"Found existing candidate with ID: {candidate_id}")
+
+             # Analyze job fit for each filtered job
+             job_analyses = []
+
+             for _, job_row in jobs_to_analyze.head(20).iterrows():  # Analyze top 20 jobs
+                 job_id = job_row.get('id')
+
+                 # Check if we have an existing submission for this candidate and job
+                 existing_submission = None
+                 if candidate_id and job_id:
+                     submission_match = filtered_submission_df[
+                         (filtered_submission_df['candidate_id'] == candidate_id) &
+                         (filtered_submission_df['jobId'] == job_id)
+                     ]
+                     if not submission_match.empty:
+                         existing_submission = submission_match.iloc[0]
+                         logger.info(f"Found existing submission for job_id: {job_id}, candidate_id: {candidate_id}")
+
+                 if existing_submission is not None:
+                     # Use existing fit score from submission
+                     fit_score = existing_submission.get('fit_score', 0)
+                     existing_analysis = {
+                         'final_response': {
+                             'final_score': fit_score,
+                             'summary': {
+                                 'strengths': [],
+                                 'weaknesses': [],
+                                 'opportunities': [],
+                                 'recommendations': []
+                             }
+                         },
+                         'source': 'existing_submission'
+                     }
+                     analysis_result = existing_analysis
+                 else:
+                     # Call API for new analysis with retry logic
+                     job_description = create_job_description(job_row)
+                     analysis_result = await analyze_job_fit_with_retry(job_description, tmp_file_path)
+                     analysis_result['source'] = 'api_call'
+
+                 # Clean up the analysis result
+                 cleaned_analysis = clean_analysis_result(analysis_result)
+
+                 job_analysis = JobAnalysis(
+                     job_title=job_row.get('job_title', 'Unknown'),
+                     company_name=job_row.get('company_name', 'Unknown'),
+                     analysis=cleaned_analysis
+                 )
+                 job_analyses.append(job_analysis.dict())
+
+             # Sort jobs by final_score in descending order (highest scores first)
+             job_analyses = sort_jobs_by_score(job_analyses)
+
+             # Count existing submissions vs API calls
+             existing_submissions_count = sum(1 for analysis in job_analyses if analysis.get('analysis', {}).get('source') == 'existing_submission')
+             api_calls_count = sum(1 for analysis in job_analyses if analysis.get('analysis', {}).get('source') == 'api_call')
+
+             # Clean up temporary file
+             os.unlink(tmp_file_path)
+
+             # Calculate processing time
+             processing_time = time.time() - request_start_time
+             logger.info(f"Request completed in {processing_time:.2f} seconds")
+
+             return {
+                 "resume_info": resume_info,
+                 "industry": resume_info['industry'],
+                 "location": resume_info['location'],
+                 "experience_years": resume_info['yoe'],
+                 "jobs_analyzed": len(job_analyses),
+                 "location_filtered": not location_filtered_jobs.empty,
+                 "experience_filtered": not experience_filtered_jobs.empty,
+                 "priority_filtered": not priority_filtered_jobs.empty,
+                 "existing_submissions_used": existing_submissions_count,
+                 "api_calls_made": api_calls_count,
+                 "candidate_found": candidate_id is not None,
+                 "processing_time_seconds": round(processing_time, 2),
+                 "job_analyses": job_analyses
+             }
+
+         except Exception as e:
+             # Clean up temporary file in case of error
+             if os.path.exists(tmp_file_path):
+                 os.unlink(tmp_file_path)
+             raise e
+
+     except Exception as e:
+         logger.error(f"Processing failed: {str(e)}", exc_info=True)
+         return JSONResponse(
+             status_code=500,
+             content={"error": f"Processing failed: {str(e)}"}
+         )
+
+ @app.get("/health")
+ async def health_check(api_key: str = Depends(verify_api_key)):
+     """
+     Health check endpoint with database connectivity check
+     """
+     health_status = {
+         "status": "healthy",
+         "message": "Job Recommendation API is running",
+         "timestamp": time.time()
+     }
+
+     # Check database connectivity
+     try:
+         with engine.connect() as conn:
+             result = conn.execute(text("SELECT 1"))
+         health_status["database"] = "connected"
+     except Exception as e:
+         logger.error(f"Database health check failed: {e}")
+         health_status["database"] = "disconnected"
+         health_status["status"] = "degraded"
+
+     return health_status
+
+ @app.get("/")
+ async def root():
+     """
+     Root endpoint
+     """
+     return {
+         "message": "Job Recommendation API",
+         "version": "1.0.0",
+         "docs": "/docs",
+         "health": "/health"
+     }
+
+ if __name__ == "__main__":
+     import uvicorn
+     port = int(os.getenv("PORT", 8080))
+     logger.info(f"Starting server on port {port}")
+     uvicorn.run(app, host="0.0.0.0", port=port)
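
A hedged sketch of a client call against the endpoint above; the path, bearer scheme, and form field names (resume, resume_text) come from the code, while the host, port, and key are placeholders:

    import requests

    API_URL = "http://localhost:8080/process_resume_and_recommend_jobs"  # assumed local run on the default PORT
    headers = {"Authorization": "Bearer YOUR_API_KEY"}  # must match the service's API_KEY env var

    with open("resume.pdf", "rb") as f:  # any candidate resume PDF
        resp = requests.post(
            API_URL,
            headers=headers,
            files={"resume": ("resume.pdf", f, "application/pdf")},
            data={"resume_text": ""},  # empty string triggers server-side PDF text extraction
            timeout=120,  # the endpoint fans out to an external scoring API, so allow time
        )
    print(resp.status_code, resp.json().get("jobs_analyzed"))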
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ fastapi
+ uvicorn[standard]
+ pandas
+ requests
+ pydantic
+ langchain-openai
+ langchain-core
+ python-multipart
+ python-dotenv
+ PyPDF2
+ pdfplumber
+ reportlab
+ psycopg2-binary
+ sqlalchemy
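
After installing these dependencies, a quick smoke test of a running instance might look like the following sketch; the /health path and bearer auth are taken from reccomendation.py, while the URL and key are placeholders:

    import requests

    resp = requests.get(
        "http://localhost:8080/health",  # assumed local deployment
        headers={"Authorization": "Bearer YOUR_API_KEY"},  # placeholder key
        timeout=10,
    )
    print(resp.json())  # expect "database": "connected" when Postgres is reachable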