# Codingo: backend/services/resume_parser.py
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import spacy
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import nltk
# Download required NLTK data
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
except Exception:
    pass
# Load spaCy model for better NER
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Please install the spaCy model: python -m spacy download en_core_web_sm")
    nlp = None
MODEL_NAME = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
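# aggregation_strategy="simple" merges word-piece tokens back into whole-entity
# spans, so downstream code can read ent["entity_group"] and ent["word"]
# instead of raw per-token labels.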
# Expanded keyword lists
SKILL_KEYWORDS = {
# Programming Languages
"python", "java", "javascript", "typescript", "c++", "c#", "ruby", "go", "rust", "kotlin", "swift",
"php", "r", "matlab", "scala", "perl", "bash", "powershell", "sql", "html", "css",
# Frameworks & Libraries
"react", "angular", "vue", "node.js", "express", "django", "flask", "spring", "spring boot",
".net", "laravel", "rails", "fastapi", "pytorch", "tensorflow", "keras", "scikit-learn",
# Databases
"mysql", "postgresql", "mongodb", "redis", "elasticsearch", "cassandra", "oracle", "sql server",
# Cloud & DevOps
"aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "terraform", "ansible", "ci/cd",
# Other Technical Skills
"machine learning", "deep learning", "data science", "nlp", "computer vision", "ai",
"rest api", "graphql", "microservices", "agile", "scrum", "git", "linux", "windows"
}
EDUCATION_PATTERNS = [
# Degrees
r"\b(bachelor|b\.?s\.?c?\.?|b\.?a\.?|b\.?tech|b\.?e\.?)\b",
r"\b(master|m\.?s\.?c?\.?|m\.?a\.?|m\.?tech|m\.?e\.?|mba)\b",
r"\b(ph\.?d\.?|doctorate|doctoral)\b",
r"\b(diploma|certificate|certification)\b",
# Fields of Study
r"\b(computer science|software engineering|information technology|it|cs)\b",
r"\b(electrical engineering|mechanical engineering|civil engineering)\b",
r"\b(data science|artificial intelligence|machine learning)\b",
r"\b(business administration|finance|accounting|marketing)\b",
# Institution indicators
r"\b(university|college|institute|school)\s+of\s+\w+",
r"\b\w+\s+(university|college|institute)\b"
]
JOB_TITLE_PATTERNS = [
r"\b(software|senior|junior|lead|principal|staff)\s*(engineer|developer|programmer)\b",
r"\b(data|business|system|security)\s*(analyst|scientist|engineer)\b",
r"\b(project|product|program|engineering)\s*manager\b",
r"\b(devops|cloud|ml|ai|backend|frontend|full[\s-]?stack)\s*(engineer|developer)\b",
r"\b(consultant|architect|specialist|coordinator|administrator)\b"
]
def extract_text(file_path: str) -> str:
"""Extract text from PDF or DOCX files"""
path = Path(file_path)
if path.suffix.lower() == ".pdf":
text = pdf_extract_text(file_path)
elif path.suffix.lower() == ".docx":
doc = Document(file_path)
text = "\n".join([p.text for p in doc.paragraphs])
    else:
        raise ValueError(f"Unsupported file format: {path.suffix}")
return text
def clean_text(text: str) -> str:
    """Clean and normalize text"""
    # Collapse runs of spaces and tabs, but preserve line breaks:
    # extract_sections relies on newlines to spot section headers
    text = re.sub(r'[ \t]+', ' ', text)
    # Collapse excessive blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def extract_sections(text: str) -> Dict[str, str]:
"""Extract different sections from resume"""
sections = {
'education': '',
'experience': '',
'skills': '',
'summary': ''
}
# Common section headers
section_patterns = {
'education': r'(education|academic|qualification|degree)',
'experience': r'(experience|employment|work\s*history|professional\s*experience|career)',
'skills': r'(skills|technical\s*skills|competencies|expertise)',
'summary': r'(summary|objective|profile|about)'
}
    lines = text.split('\n')
    current_section = None
    for line in lines:
        line_lower = line.lower().strip()
        # Treat short lines containing a known keyword as section headers
        header_matched = False
        for section, pattern in section_patterns.items():
            if re.search(pattern, line_lower) and len(line_lower) < 50:
                current_section = section
                header_matched = True
                break
        # Add content lines (but not the header itself) to the current section
        if current_section and not header_matched:
            sections[current_section] += line + '\n'
return sections
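# Example: given the lines "EDUCATION" and "B.Sc. Computer Science, 2019",
# extract_sections returns {'education': 'B.Sc. Computer Science, 2019\n', ...}
# with the remaining section values left empty.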
def extract_name(text: str, entities: List[Dict]) -> str:
"""Extract name using multiple methods"""
# Method 1: Use transformer model
name_parts = []
for ent in entities:
if ent["entity_group"].upper() in ["NAME", "PERSON", "PER"]:
name_parts.append(ent["word"].strip())
if name_parts:
# Clean and join name parts
full_name = " ".join(dict.fromkeys(name_parts))
full_name = re.sub(r'\s+', ' ', full_name).strip()
if len(full_name) > 3 and len(full_name.split()) <= 4:
return full_name
# Method 2: Use spaCy if available
if nlp:
doc = nlp(text[:500]) # Check first 500 chars
for ent in doc.ents:
if ent.label_ == "PERSON":
name = ent.text.strip()
if len(name) > 3 and len(name.split()) <= 4:
return name
# Method 3: Pattern matching for first few lines
first_lines = text.split('\n')[:5]
for line in first_lines:
line = line.strip()
# Look for name pattern (2-4 words, title case)
if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]+){1,3}$', line):
return line
return "Not Found"
def extract_skills(text: str, skill_section: str = "") -> List[str]:
"""Extract skills using multiple methods"""
skills_found = set()
# Prioritize skills section if available
search_text = skill_section + " " + text if skill_section else text
search_text = search_text.lower()
# Method 1: Direct keyword matching
for skill in SKILL_KEYWORDS:
if re.search(rf'\b{re.escape(skill.lower())}\b', search_text):
skills_found.add(skill)
# Method 2: Pattern-based extraction
# Look for skills in bullet points or comma-separated lists
skill_patterns = [
r'[•·▪▫◦‣⁃]\s*([A-Za-z\s\+\#\.]+)', # Bullet points
r'(?:skills?|technologies|tools?)[\s:]*([A-Za-z\s,\+\#\.]+)', # After keywords
]
for pattern in skill_patterns:
matches = re.findall(pattern, search_text, re.IGNORECASE)
for match in matches:
# Check each word/phrase in the match
potential_skills = re.split(r'[,;]', match)
for ps in potential_skills:
ps = ps.strip().lower()
if ps in SKILL_KEYWORDS:
skills_found.add(ps)
return list(skills_found)
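# Example: extract_skills("Built REST API services in Python with Django and Docker")
# returns ["python", "django", "docker", "rest api"] in arbitrary order (a set is used).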
def extract_education(text: str, edu_section: str = "") -> List[str]:
"""Extract education information"""
education_info = []
search_text = edu_section + " " + text if edu_section else text
# Extract degrees
for pattern in EDUCATION_PATTERNS:
matches = re.findall(pattern, search_text, re.IGNORECASE)
for match in matches:
if isinstance(match, tuple):
match = match[0]
education_info.append(match)
    # Extract graduation years (most recent first)
    year_pattern = r'\b(19[0-9]{2}|20[0-9]{2})\b'
    years = re.findall(year_pattern, search_text)
    education_info.extend(sorted(set(years), reverse=True)[:2])
    # Extract GPA if mentioned
    gpa_pattern = r'(?:gpa|cgpa|grade)[\s:]*([0-9]\.[0-9]+)'
    gpa_matches = re.findall(gpa_pattern, search_text, re.IGNORECASE)
    if gpa_matches:
        education_info.append(f"GPA: {gpa_matches[0]}")
    return list(dict.fromkeys(education_info))  # Remove duplicates, keep order
def extract_experience(text: str, exp_section: str = "") -> List[str]:
"""Extract experience information"""
experience_info = []
search_text = exp_section + " " + text if exp_section else text
# Extract job titles
for pattern in JOB_TITLE_PATTERNS:
matches = re.findall(pattern, search_text, re.IGNORECASE)
for match in matches:
if isinstance(match, tuple):
match = ' '.join(match).strip()
experience_info.append(match)
# Extract years of experience
exp_patterns = [
r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+experience',
r'experience\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)',
r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:as|in|of)',
]
for pattern in exp_patterns:
matches = re.findall(pattern, search_text, re.IGNORECASE)
if matches:
years = max(map(int, matches))
experience_info.append(f"{years}+ years experience")
break
# Extract company names (common patterns)
    # Note: the findall below is case-sensitive, so company suffixes are capitalized;
    # literal spaces (not \s) keep matches from spanning line breaks
    company_patterns = [
        r'(?:at|@|company|employer)\s*:?\s*([A-Z][A-Za-z&\.\- ]+)',
        r'([A-Z][A-Za-z&\.\- ]+?)\s*\b(?:Inc|LLC|Ltd|Corp|Company)\b',
    ]
for pattern in company_patterns:
matches = re.findall(pattern, search_text)
experience_info.extend(matches[:3]) # Limit to avoid false positives
return list(dict.fromkeys(experience_info))
def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Parse a resume into name, skills, education and experience (filename is currently unused)"""
# Extract and clean text
raw_text = extract_text(file_path)
text = clean_text(raw_text)
# Extract sections
sections = extract_sections(text)
# Get NER entities
entities = ner_pipeline(text[:1024]) # Limit for performance
# Extract information
name = extract_name(text, entities)
skills = extract_skills(text, sections.get('skills', ''))
education = extract_education(text, sections.get('education', ''))
experience = extract_experience(text, sections.get('experience', ''))
return {
"name": name,
"skills": ", ".join(skills[:15]) if skills else "Not Found", # Limit to 15 skills
"education": ", ".join(education[:5]) if education else "Not Found",
"experience": ", ".join(experience[:5]) if experience else "Not Found"
}
# Optional: Add confidence scores
def parse_resume_with_confidence(file_path: str) -> Dict[str, Tuple[str, float]]:
"""Parse resume with confidence scores for each field"""
result = parse_resume(file_path)
# Simple confidence calculation based on whether data was found
confidence_scores = {
"name": 0.9 if result["name"] != "Not Found" else 0.1,
"skills": min(0.9, len(result["skills"].split(",")) * 0.1) if result["skills"] != "Not Found" else 0.1,
"education": 0.8 if result["education"] != "Not Found" else 0.2,
"experience": 0.8 if result["experience"] != "Not Found" else 0.2
}
return {
key: (value, confidence_scores[key])
for key, value in result.items()
}
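# Example usage: a minimal sketch for manual testing. "sample_resume.pdf" is a
# hypothetical path, not a file shipped with the project.
if __name__ == "__main__":
    import json

    parsed = parse_resume("sample_resume.pdf")
    print(json.dumps(parsed, indent=2))

    # Each field paired with its rough confidence score
    scored = parse_resume_with_confidence("sample_resume.pdf")
    for field, (value, confidence) in scored.items():
        print(f"{field}: {value} (confidence: {confidence:.1f})")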