Codingo / backend /services /resume_parser.py
husseinelsaadi's picture
updated
c0dac84
raw
history blame
3.98 kB
import re
from pathlib import Path
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
class ResumeParser:
    """Heuristic, regex-based extractor for PDF and DOCX resumes.

    The parser pulls plain text out of the file and then uses simple
    patterns to guess the candidate's name and the contents of the skills,
    education and experience sections.
    """

    # Words that mark a line as a document title rather than a person's name.
    _TITLE_WORDS = frozenset({"resume", "cv", "curriculum"})

    # Regex alternations of the heading keywords that introduce each section.
    _SECTION_KEYWORDS = {
        "skills": r"skills|technologies|expertise",
        "education": r"education|degrees?",
        "experience": r"experience|work history|employment",
    }

    # Every known heading keyword; used to detect where the next section starts.
    _ANY_HEADING = "|".join(_SECTION_KEYWORDS.values())

    def extract_text(self, file_path: str) -> str:
        """Extract text from PDF or DOCX files.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` file (suffix is matched
                case-insensitively).

        Returns:
            The document text with line breaks preserved, because name and
            section detection both depend on line structure.

        Raises:
            ValueError: If the file suffix is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()
        if suffix == ".pdf":
            return self._normalize_whitespace(pdf_extract_text(file_path))
        if suffix == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        raise ValueError("Unsupported file format")

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse whitespace noise while keeping line and paragraph breaks."""
        # Runs of spaces/tabs/form feeds/carriage returns become one space.
        text = re.sub(r"[^\S\n]+", " ", text)
        # Drop the spaces left dangling on either side of a line break.
        text = re.sub(r" ?\n ?", "\n", text)
        # Keep at most one blank line between paragraphs.
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    @classmethod
    def _is_title_line(cls, line: str) -> bool:
        """Return True if the line contains a word such as "Resume" or "CV"."""
        return any(word in cls._TITLE_WORDS for word in re.findall(r"[a-z]+", line.lower()))

    def extract_name(self, text: str) -> str:
        """Extract name from resume text.

        Only the first ten non-empty lines are considered. Lines that look
        like a document title ("Resume", "Curriculum Vitae", ...) are never
        returned.

        Args:
            text: Resume text with one logical line per ``\\n``.

        Returns:
            The best-guess name line, or ``"Not Found"``.
        """
        first_lines = [line.strip() for line in text.split("\n") if line.strip()][:10]
        candidates = [line for line in first_lines if not self._is_title_line(line)]

        # Preferred: 2-4 capitalised words, e.g. "John Smith".
        for line in candidates:
            if re.match(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$", line):
                return line

        # Fallback: a short line starting with a capital (covers "JOHN DOE",
        # "Mary O'Neil"). Digits or "@" indicate contact details, not a name.
        for line in candidates:
            if (
                2 <= len(line.split()) <= 4
                and line[0].isupper()
                and not re.search(r"[\d@]", line)
            ):
                return line
        return "Not Found"

    @classmethod
    def _section_body(cls, text: str, section: str) -> str:
        """Return the raw text following the first heading of ``section``.

        The body ends at the first blank line, at a line that begins with any
        known section heading, or at the end of the text. An empty string is
        returned when no heading for the section is found.
        """
        pattern = (
            r"(?:" + cls._SECTION_KEYWORDS[section] + r")[:\s]*"
            r"(.*?)"
            r"(?:\n\s*\n|\n(?=[ \t]*(?:" + cls._ANY_HEADING + r")\b)|$)"
        )
        # DOTALL lets the body span several lines; without it a section with
        # more than one line could never match.
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        return match.group(1) if match else ""

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education, and experience using regex.

        Args:
            text: Resume text with line breaks preserved.

        Returns:
            A dict with the keys ``"skills"``, ``"education"`` and
            ``"experience"``, each mapping to a list of stripped, non-empty
            strings. Skills are split on commas, semicolons and line breaks;
            the other sections are split per line.
        """
        skills_body = self._section_body(text, "skills")
        education_body = self._section_body(text, "education")
        experience_body = self._section_body(text, "experience")
        return {
            "skills": [s.strip() for s in re.split(r"[,;\n]", skills_body) if s.strip()],
            "education": [e.strip() for e in education_body.split("\n") if e.strip()],
            "experience": [x.strip() for x in experience_body.split("\n") if x.strip()],
        }

    @staticmethod
    def _result(name: str, skills=(), education=(), experience=()) -> dict:
        """Build the result dict returned by ``parse_resume``."""
        return {
            "name": name,
            "skills": list(skills),
            "education": list(education),
            "experience": list(experience),
        }

    def parse_resume(self, file_path: str) -> dict:
        """Main parsing function.

        Never raises: any failure is reported through the ``"name"`` field as
        ``"Error: <message>"`` with empty lists for the other keys.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` resume.

        Returns:
            A dict with ``"name"``, ``"skills"`` (at most 10 items),
            ``"education"`` (at most 3) and ``"experience"`` (at most 3).
        """
        try:
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                return self._result("Error: Empty file")
            name = self.extract_name(text)
            sections = self.extract_sections(text)
            return self._result(
                name,
                skills=sections["skills"][:10],  # Limit to 10 skills
                education=sections["education"][:3],  # Limit to 3 items
                experience=sections["experience"][:3],  # Limit to 3 items
            )
        except Exception as e:
            # Deliberate catch-all: callers rely on always getting a dict back.
            return self._result(f"Error: {e}")
# Global instance: a single ResumeParser created at import time and used by
# the module-level parse_resume() function.
resume_parser = ResumeParser()
def parse_resume(file_path: str) -> dict:
    """Parse the resume at ``file_path`` with the module-level parser.

    Delegates to ``resume_parser.parse_resume`` and returns its result
    dictionary unchanged.
    """
    parsed = resume_parser.parse_resume(file_path)
    return parsed