# Source: Codingo/backend/models/resume_parser/resume_to_features.py
# Page header captured with the file: husseinelsaadi · "updated" · 9e72d2c · 9.32 kB
import os
import re
import json
from pathlib import Path
import PyPDF2
from docx import Document
import textract
class SimpleResumeParser:
    """Heuristic resume parser built on keyword lists and regular expressions.

    Text is pulled out of PDF, DOCX or DOC files and then scanned for contact
    details, skills, education, experience and a summary. Every extractor is
    best-effort: when nothing is found it returns an empty string or list
    rather than raising.
    """

    def __init__(self):
        """Set up the keyword lists used by the extractors."""
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]
        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]
        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line block.

        Any failure (missing file, unreadable PDF, ...) is reported on stdout
        and results in an empty string.
        """
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # A page that yields no text (None) must not discard the
                # text already gathered from the other pages.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraphs of a DOCX file joined by newlines.

        Any failure is reported on stdout and results in an empty string.
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a legacy DOC file, decoded as UTF-8, via textract.

        Any failure is reported on stdout and results in an empty string.
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text with the reader that matches the file extension.

        The extension is compared case-insensitively. Extensions other than
        .pdf, .docx and .doc yield an empty string.
        """
        extractors = {
            '.pdf': self.extract_text_from_pdf,
            '.docx': self.extract_text_from_docx,
            '.doc': self.extract_text_from_doc,
        }
        extractor = extractors.get(Path(file_path).suffix.lower())
        if extractor is None:
            return ""
        return extractor(file_path)

    def extract_email(self, text):
        """Return the first e-mail address found in ``text``, or ""."""
        # The TLD class is letters only; the previous "[A-Z|a-z]" also accepted
        # a literal "|" and could glue two addresses together.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        match = re.search(email_pattern, text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or "".

        Patterns are tried in order and the captured groups of the first one
        that matches are concatenated. For the first two patterns this means
        the digits only, without separators (and, for the first pattern,
        without a leading "+1").
        """
        phone_patterns = [
            # US style with optional "+1" prefix and optional parentheses
            # around the area code.
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            # International style: country code followed by three digit groups.
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
        ]
        for pattern in phone_patterns:
            match = re.search(pattern, text)
            if match:
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Guess the candidate's name from the first five lines (simple heuristic).

        A line qualifies when it consists of exactly two alphabetic words and
        neither word is a document-title word such as "resume" or "cv". The
        line is returned title-cased; "" is returned when no line qualifies.
        """
        title_words = {'resume', 'cv', 'curriculum'}
        for line in text.split('\n')[:5]:
            line = line.strip()
            words = line.split()
            if len(words) != 2 or not line.replace(' ', '').isalpha():
                continue
            # Compare whole words: a substring test would reject real names
            # that merely contain "cv" (e.g. "McVoy").
            if any(word.lower() in title_words for word in words):
                continue
            return line.title()
        return ""

    def extract_skills(self, text):
        """Return the known skills mentioned in ``text``.

        Skills are reported title-cased, without duplicates, in the order they
        appear in ``self.skills_keywords``.
        """
        text_lower = text.lower()
        found_skills = []
        for skill in self.skills_keywords:
            # Require a non-letter on both sides so that short keywords such as
            # "go", "rest" or "api" do not fire inside unrelated words
            # ("google", "interest", "rapid"). Digits are allowed to follow
            # ("python3", "html5") and so is a plural "s" ("APIs").
            pattern = rf'(?<![a-z]){re.escape(skill.lower())}s?(?![a-z])'
            if re.search(pattern, text_lower):
                label = skill.title()
                if label not in found_skills:
                    found_skills.append(label)
        return found_skills

    def extract_education(self, text):
        """Return up to three text snippets that mention an education keyword.

        For each keyword in ``self.education_keywords`` the surrounding context
        (at most 50 characters on each side, within one line) is collected.
        Snippets are stripped and exact duplicates are dropped, so a single
        line matching several keywords is reported once.
        """
        education = []
        seen = set()
        for keyword in self.education_keywords:
            # Find context around the keyword
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for snippet in re.findall(pattern, text, re.IGNORECASE):
                snippet = snippet.strip()
                if snippet and snippet not in seen:
                    seen.add(snippet)
                    education.append(snippet)
        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three work-experience entries.

        The section starts after the first line that mentions an experience
        keyword and ends at the first line that mentions education, skills or
        projects. Inside the section, blank lines separate entries and the
        lines of one entry are joined with single spaces.
        """
        start_keywords = ('experience', 'work', 'employment', 'career')
        end_keywords = ('education', 'skill', 'project')
        # Inside the section, a keyword line this short is taken to be a
        # repeated heading; longer ones are ordinary content.
        max_heading_words = 4

        experience = []
        current_lines = []
        in_experience_section = False

        for line in text.split('\n'):
            line_lower = line.lower()
            mentions_start = any(k in line_lower for k in start_keywords)

            if not in_experience_section:
                if mentions_start:
                    in_experience_section = True
                continue

            if mentions_start:
                if len(line.split()) <= max_heading_words:
                    continue
                # Longer lines such as "Worked on the billing platform ..."
                # are content and must not be dropped.
            elif any(k in line_lower for k in end_keywords):
                # The pending entry is flushed once, after the loop.
                break

            if line.strip():
                current_lines.append(line)
            elif current_lines:
                experience.append(' '.join(current_lines).strip())
                current_lines = []

        if current_lines:
            experience.append(' '.join(current_lines).strip())
        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return the summary/objective text, limited to 200 characters.

        The non-blank lines among the three lines that follow the first line
        mentioning "summary", "objective", "profile" or "about" are joined with
        spaces. "" is returned when no such line exists.
        """
        markers = ('summary', 'objective', 'profile', 'about')
        lines = text.split('\n')
        for index, line in enumerate(lines):
            line_lower = line.lower()
            if any(marker in line_lower for marker in markers):
                following = (part.strip() for part in lines[index + 1:index + 4])
                return ' '.join(part for part in following if part)[:200]
        return ""
def extract_resume_features(file_path):
    """
    Parse the resume at ``file_path`` and return its features as a dictionary.

    The result always carries the keys 'name', 'email', 'mobile_number',
    'skills', 'experience', 'education' and 'summary'. When no text can be
    extracted, or when anything goes wrong along the way, every field is
    empty ('' for strings, [] for lists).
    """
    # Built per call so callers never share the list objects.
    blank = {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'education': [],
        'summary': ''
    }
    try:
        resume = SimpleResumeParser()
        raw_text = resume.extract_text(file_path)
        if raw_text:
            # Run every extractor over the same text
            return {
                'name': resume.extract_name(raw_text),
                'email': resume.extract_email(raw_text),
                'mobile_number': resume.extract_phone(raw_text),
                'skills': resume.extract_skills(raw_text),
                'experience': resume.extract_experience(raw_text),
                'education': resume.extract_education(raw_text),
                'summary': resume.extract_summary(raw_text)
            }
        return blank
    except Exception as exc:
        print(f"Error extracting resume features: {exc}")
        return blank
# For backward compatibility
def parse_resume(file_path):
    """Alias kept for older callers; returns whatever extract_resume_features returns."""
    features = extract_resume_features(file_path)
    return features