import re
from pathlib import Path
from typing import Dict, List, Tuple

import spacy
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import nltk
from nltk.corpus import stopwords
from dateutil.parser import parse as date_parse

# Download required NLTK data (a bare except would also swallow KeyboardInterrupt)
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
except Exception:
    pass

# Load spaCy model for better NER; spacy.load raises OSError if the model is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Please install the spaCy model: python -m spacy download en_core_web_sm")
    nlp = None

MODEL_NAME = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
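# With aggregation_strategy="simple", the pipeline merges word pieces and returns
# one dict per entity span, roughly:
#   [{'entity_group': 'NAME', 'word': 'Jane Doe', 'score': 0.98, 'start': 0, 'end': 8}, ...]
# The exact label set ('NAME' here is illustrative) depends on the fine-tuned model;
# extract_name() below checks for NAME, PERSON, and PER.
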
# Expanded keyword lists
SKILL_KEYWORDS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "ruby", "go", "rust", "kotlin", "swift",
    "php", "r", "matlab", "scala", "perl", "bash", "powershell", "sql", "html", "css",
    # Frameworks & libraries
    "react", "angular", "vue", "node.js", "express", "django", "flask", "spring", "spring boot",
    ".net", "laravel", "rails", "fastapi", "pytorch", "tensorflow", "keras", "scikit-learn",
    # Databases
    "mysql", "postgresql", "mongodb", "redis", "elasticsearch", "cassandra", "oracle", "sql server",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "terraform", "ansible", "ci/cd",
    # Other technical skills
    "machine learning", "deep learning", "data science", "nlp", "computer vision", "ai",
    "rest api", "graphql", "microservices", "agile", "scrum", "git", "linux", "windows"
}

EDUCATION_PATTERNS = [
    # Degrees
    r"\b(bachelor|b\.?s\.?c?\.?|b\.?a\.?|b\.?tech|b\.?e\.?)\b",
    r"\b(master|m\.?s\.?c?\.?|m\.?a\.?|m\.?tech|m\.?e\.?|mba)\b",
    r"\b(ph\.?d\.?|doctorate|doctoral)\b",
    r"\b(diploma|certificate|certification)\b",
    # Fields of study
    r"\b(computer science|software engineering|information technology|it|cs)\b",
    r"\b(electrical engineering|mechanical engineering|civil engineering)\b",
    r"\b(data science|artificial intelligence|machine learning)\b",
    r"\b(business administration|finance|accounting|marketing)\b",
    # Institution indicators
    r"\b(university|college|institute|school)\s+of\s+\w+",
    r"\b\w+\s+(university|college|institute)\b"
]

JOB_TITLE_PATTERNS = [
    r"\b(software|senior|junior|lead|principal|staff)\s*(engineer|developer|programmer)\b",
    r"\b(data|business|system|security)\s*(analyst|scientist|engineer)\b",
    r"\b(project|product|program|engineering)\s*manager\b",
    r"\b(devops|cloud|ml|ai|backend|frontend|full[\s-]?stack)\s*(engineer|developer)\b",
    r"\b(consultant|architect|specialist|coordinator|administrator)\b"
]


def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX files"""
    path = Path(file_path)
    if path.suffix.lower() == ".pdf":
        text = pdf_extract_text(file_path)
    elif path.suffix.lower() == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    else:
        raise ValueError("Unsupported file format")
    return text


def clean_text(text: str) -> str:
    """Clean and normalize text"""
    # Collapse runs of spaces/tabs only; newlines must survive because
    # extract_sections() splits on them to detect section headers
    text = re.sub(r'[ \t]+', ' ', text)
    # Collapse 3+ consecutive newlines into a single blank line
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


def extract_sections(text: str) -> Dict[str, str]:
    """Extract different sections from resume"""
    sections = {
        'education': '',
        'experience': '',
        'skills': '',
        'summary': ''
    }
    # Common section headers
    section_patterns = {
        'education': r'(education|academic|qualification|degree)',
        'experience': r'(experience|employment|work\s*history|professional\s*experience|career)',
        'skills': r'(skills|technical\s*skills|competencies|expertise)',
        'summary': r'(summary|objective|profile|about)'
    }
    current_section = None
    for line in text.split('\n'):
        line_lower = line.lower().strip()
        # A short line matching a header pattern starts a new section
        is_header = False
        for section, pattern in section_patterns.items():
            if re.search(pattern, line_lower) and len(line_lower) < 50:
                current_section = section
                is_header = True
                break
        # Accumulate body lines under the active section, skipping the header itself
        if current_section and not is_header:
            sections[current_section] += line + '\n'
    return sections
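# A quick illustration on a hypothetical two-section input:
#   extract_sections("Skills\npython, sql\nEducation\nB.S. CS")
#   -> {'education': 'B.S. CS\n', 'experience': '', 'skills': 'python, sql\n', 'summary': ''}

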
def extract_name(text: str, entities: List) -> str:
    """Extract name using multiple methods"""
    # Method 1: use the transformer NER entities
    name_parts = []
    for ent in entities:
        if ent["entity_group"].upper() in ["NAME", "PERSON", "PER"]:
            name_parts.append(ent["word"].strip())
    if name_parts:
        # Deduplicate while preserving order, then normalize whitespace
        full_name = " ".join(dict.fromkeys(name_parts))
        full_name = re.sub(r'\s+', ' ', full_name).strip()
        if len(full_name) > 3 and len(full_name.split()) <= 4:
            return full_name
    # Method 2: use spaCy if available
    if nlp:
        doc = nlp(text[:500])  # names almost always appear near the top
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                name = ent.text.strip()
                if len(name) > 3 and len(name.split()) <= 4:
                    return name
    # Method 3: pattern matching on the first few lines (2-4 title-case words)
    first_lines = text.split('\n')[:5]
    for line in first_lines:
        line = line.strip()
        if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]+){1,3}$', line):
            return line
    return "Not Found"


def extract_skills(text: str, skill_section: str = "") -> List[str]:
    """Extract skills using multiple methods"""
    skills_found = set()
    # Prioritize the skills section if available
    search_text = skill_section + " " + text if skill_section else text
    search_text = search_text.lower()
    # Method 1: direct keyword matching. Lookarounds replace \b here because
    # \b never matches after a trailing non-word character, so keywords like
    # "c++", "c#", and ".net" would otherwise be missed
    for skill in SKILL_KEYWORDS:
        if re.search(rf'(?<!\w){re.escape(skill.lower())}(?!\w)', search_text):
            skills_found.add(skill)
    # Method 2: pattern-based extraction from bullet points and
    # comma-separated lists
    skill_patterns = [
        r'[•·▪▫◦‣⁃]\s*([A-Za-z\s\+\#\.]+)',  # bullet points
        r'(?:skills?|technologies|tools?)[\s:]*([A-Za-z\s,\+\#\.]+)',  # after keywords
    ]
    for pattern in skill_patterns:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        for match in matches:
            # Split on list separators and keep exact keyword hits
            for ps in re.split(r'[,;]', match):
                ps = ps.strip().lower()
                if ps in SKILL_KEYWORDS:
                    skills_found.add(ps)
    return list(skills_found)


def extract_education(text: str, edu_section: str = "") -> List[str]:
    """Extract education information"""
    education_info = []
    search_text = edu_section + " " + text if edu_section else text
    # Extract degrees, fields of study, and institutions
    for pattern in EDUCATION_PATTERNS:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        for match in matches:
            if isinstance(match, tuple):
                match = match[0]
            education_info.append(match)
    # Extract graduation years
    year_pattern = r'\b(19[0-9]{2}|20[0-9]{2})\b'
    years = re.findall(year_pattern, search_text)
    education_info.extend(years[:3])
    # Extract GPA if mentioned
    gpa_pattern = r'(?:gpa|cgpa|grade)[\s:]*([0-9]\.[0-9]+)'
    gpa_matches = re.findall(gpa_pattern, search_text, re.IGNORECASE)
    if gpa_matches:
        education_info.append(f"GPA: {gpa_matches[0]}")
    return list(dict.fromkeys(education_info))  # de-duplicate, preserving order


def extract_experience(text: str, exp_section: str = "") -> List[str]:
    """Extract experience information"""
    experience_info = []
    search_text = exp_section + " " + text if exp_section else text
    # Extract job titles
    for pattern in JOB_TITLE_PATTERNS:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        for match in matches:
            if isinstance(match, tuple):
                match = ' '.join(match).strip()
            experience_info.append(match)
    # Extract total years of experience
    exp_patterns = [
        r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+experience',
        r'experience\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:as|in|of)',
    ]
    for pattern in exp_patterns:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        if matches:
            years = max(map(int, matches))
            experience_info.append(f"{years}+ years experience")
            break
    # Extract company names (common patterns)
    company_patterns = [
        r'(?:at|@|company|employer)\s*:?\s*([A-Z][A-Za-z\s&\.\-]+)',
        r'([A-Z][A-Za-z\s&\.\-]+)\s*(?:inc|llc|ltd|corp|company)',
    ]
    for pattern in company_patterns:
        matches = re.findall(pattern, search_text)
        experience_info.extend(matches[:3])  # cap per pattern to limit false positives
    return list(dict.fromkeys(experience_info))


def parse_resume(file_path: str) -> Dict[str, str]:
    """Main function to parse a resume"""
    # Extract and clean text
    raw_text = extract_text(file_path)
    text = clean_text(raw_text)
    # Extract sections
    sections = extract_sections(text)
    # Run NER on the top of the document, where the name appears; this also
    # keeps inference fast and within the model's input limit
    entities = ner_pipeline(text[:1024])
    # Extract information
    name = extract_name(text, entities)
    skills = extract_skills(text, sections.get('skills', ''))
    education = extract_education(text, sections.get('education', ''))
    experience = extract_experience(text, sections.get('experience', ''))
    return {
        "name": name,
        "skills": ", ".join(skills[:15]) if skills else "Not Found",  # cap at 15 skills
        "education": ", ".join(education[:5]) if education else "Not Found",
        "experience": ", ".join(experience[:5]) if experience else "Not Found"
    }


# Optional: add confidence scores
def parse_resume_with_confidence(file_path: str) -> Dict[str, Tuple[str, float]]:
    """Parse resume with confidence scores for each field"""
    result = parse_resume(file_path)
    # Simple heuristic confidence based on whether data was found
    confidence_scores = {
        "name": 0.9 if result["name"] != "Not Found" else 0.1,
        "skills": min(0.9, len(result["skills"].split(",")) * 0.1) if result["skills"] != "Not Found" else 0.1,
        "education": 0.8 if result["education"] != "Not Found" else 0.2,
        "experience": 0.8 if result["experience"] != "Not Found" else 0.2
    }
    return {
        key: (value, confidence_scores[key])
        for key, value in result.items()
    }
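

# Minimal usage sketch; "resume.pdf" is a hypothetical path -- substitute any
# local PDF or DOCX file to try the parser end to end.
if __name__ == "__main__":
    parsed = parse_resume("resume.pdf")
    for field, value in parsed.items():
        print(f"{field}: {value}")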