# Codingo — backend/services/resume_parser.py
# Resume parsing service: extracts structured fields from PDF/DOCX resumes.
import json
import re
from pathlib import Path
from typing import Dict, List, Optional
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import logging
# Set up logging: configure the root logger once at import time and use a
# module-scoped logger for every message emitted from this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ResumeParser:
    """Parse resumes (PDF/DOCX) into structured fields.

    Extraction strategy, in order of preference:
      1. A HuggingFace token-classification (NER) pipeline, when one of the
         candidate models can be loaded.
      2. Regex patterns and first-lines heuristics as a fallback/supplement.

    On total model failure the parser degrades gracefully to regex-only mode.
    """

    def __init__(self):
        # Populated by _load_model(); stays None when no model could load.
        self.ner_pipeline = None
        self.model_loaded = False
        self._load_model()

    def _load_model(self):
        """Load the NER model with error handling and fallbacks.

        Tries the resume-specific model first, then a general-purpose English
        NER model. Any failure (download error, missing dependency, ...) is
        logged and leaves ``model_loaded`` False instead of raising.
        """
        try:
            # Try the original, resume-specific model first.
            MODEL_NAME = "manishiitg/resume-ner"
            logger.info(f"Attempting to load model: {MODEL_NAME}")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
            self.ner_pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
            )
            self.model_loaded = True
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.warning(f"Failed to load primary model: {e}")
            try:
                # Fallback to a more reliable, general-purpose model.
                MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
                logger.info(f"Trying fallback model: {MODEL_NAME}")
                self.ner_pipeline = pipeline(
                    "ner",
                    model=MODEL_NAME,
                    aggregation_strategy="simple",
                )
                self.model_loaded = True
                logger.info("Fallback model loaded successfully")
            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.model_loaded = False

    def extract_text(self, file_path: str) -> str:
        """Extract raw text from a PDF or DOCX file.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
            ValueError: for any extension other than .pdf/.docx.
        """
        try:
            path = Path(file_path)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            suffix = path.suffix.lower()
            if suffix == ".pdf":
                text = pdf_extract_text(file_path)
                logger.info(f"Extracted {len(text)} characters from PDF")
                return text
            elif suffix == ".docx":
                doc = Document(file_path)
                # Skip empty paragraphs so the joined text stays compact.
                text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
                logger.info(f"Extracted {len(text)} characters from DOCX")
                return text
            else:
                raise ValueError(f"Unsupported file format: {path.suffix}")
        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            raise

    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
        """Fallback extraction using regex patterns.

        Fixes over the previous revision:
        * email: the TLD class was ``[A-Z|a-z]``, which wrongly admitted '|'.
        * phone: the escaped parentheses had been mangled into ``$$`` (a
          doubled end-of-line anchor, which can never match a paren), and the
          optional country-code group was *capturing*, so ``re.findall``
          returned only the country code instead of the whole number. All
          phone groups are now non-capturing so findall yields full matches.
        """
        patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            # The section patterns keep ONE capturing group on purpose:
            # findall then returns the text after the keyword, not the keyword.
            'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
            'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)'
        }
        results = {}
        for key, pattern in patterns.items():
            matches = re.findall(pattern, text, re.MULTILINE)
            results[key] = [match.strip() for match in matches if match.strip()]
        return results

    def extract_name_from_text(self, text: str) -> str:
        """Heuristically pick the candidate's name from the top of the resume.

        Scans the first five lines for a short (<= 4 words) line that does
        not look like an email/phone or a "Resume"/"CV" heading.
        Returns "Not Found" when no line qualifies.
        """
        lines = text.split('\n')
        # Usually the name appears in the first few lines.
        for line in lines[:5]:
            line = line.strip()
            if line and len(line.split()) <= 4 and len(line) > 2:
                # Reject anything with '@' or digits (emails, phone numbers)
                # and common document headings.
                if not re.search(r'[@\d]', line) and not line.lower().startswith(('resume', 'cv', 'curriculum')):
                    return line
        return "Not Found"

    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
        """Bucket NER entities into name/skills/education/experience lists.

        Entities with confidence below 0.5 are discarded. Label sets cover
        both the resume-specific model and generic CoNLL-style labels.
        """
        name, skills, education, experience = [], [], [], []
        logger.info(f"Processing {len(entities)} entities")
        for ent in entities:
            label = ent.get("entity_group", "").upper()
            value = ent.get("word", "").strip()
            confidence = ent.get("score", 0)
            logger.debug(f"Entity: {label} = {value} (confidence: {confidence:.2f})")
            # Only consider high-confidence entities.
            if confidence < 0.5:
                continue
            if label in ["PERSON", "PER", "NAME"]:
                name.append(value)
            elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
                skills.append(value)
            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"]:
                education.append(value)
            elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
                experience.append(value)
        return {
            "name": name,
            "skills": skills,
            "education": education,
            "experience": experience
        }

    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
        """Parse a resume file using NER first, then regex fallbacks.

        Args:
            file_path: Path to a .pdf or .docx file.
            filename: Accepted for API compatibility; currently unused.

        Returns:
            Dict with "name"/"skills"/"education"/"experience" (always) and
            "email"/"phone" when found; on failure every field is "Error"
            plus an "error" key with the message.
        """
        try:
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                raise ValueError("Extracted text is too short or empty")
            logger.info(f"Text preview: {text[:200]}...")
            # Initialize results with sentinel values.
            results = {
                "name": "Not Found",
                "skills": "Not Found",
                "education": "Not Found",
                "experience": "Not Found"
            }
            # Method 1: NER model, if it loaded.
            if self.model_loaded and self.ner_pipeline:
                try:
                    logger.info("Using NER model for extraction")
                    entities = self.ner_pipeline(text)
                    ner_results = self.process_ner_entities(entities)
                    # Update results with NER findings, deduplicating while
                    # preserving order.
                    for key in results:
                        if ner_results.get(key):
                            unique_items = list(dict.fromkeys(ner_results[key]))
                            results[key] = ", ".join(unique_items)
                except Exception as e:
                    logger.warning(f"NER extraction failed: {e}")
            # Method 2: regex fallback (always run to fill in the gaps).
            logger.info("Using regex patterns for extraction")
            regex_results = self.extract_with_regex(text)
            if results["name"] == "Not Found":
                results["name"] = self.extract_name_from_text(text)
            if results["skills"] == "Not Found" and regex_results.get("skills"):
                results["skills"] = ", ".join(regex_results["skills"][:3])  # Limit to first 3
            if results["education"] == "Not Found" and regex_results.get("education"):
                results["education"] = ", ".join(regex_results["education"][:2])  # Limit to first 2
            if results["experience"] == "Not Found" and regex_results.get("experience"):
                results["experience"] = ", ".join(regex_results["experience"][:3])  # Limit to first 3
            # Add email and phone if found.
            if regex_results.get("email"):
                results["email"] = regex_results["email"][0]
            if regex_results.get("phone"):
                results["phone"] = regex_results["phone"][0]
            logger.info("Parsing completed successfully")
            return results
        except Exception as e:
            logger.error(f"Error parsing resume: {e}")
            return {
                "name": "Error",
                "skills": "Error",
                "education": "Error",
                "experience": "Error",
                "error": str(e)
            }
# Create global instance shared by the module-level parse_resume() wrapper.
# NOTE: instantiating here attempts NER model loading at import time, which
# may download weights on first run.
resume_parser = ResumeParser()
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Parse a resume file via the shared module-level :class:`ResumeParser`.

    Thin convenience wrapper so callers can use a plain function instead of
    holding a parser instance themselves.
    """
    parsed = resume_parser.parse_resume(file_path, filename)
    return parsed
# Test function
def test_parser():
    """Smoke-test the regex and heuristic extraction paths with sample text.

    The previous revision wrote the sample to a temporary .txt file that was
    never read back (and extract_text rejects .txt anyway), so that dead
    temp-file plumbing has been removed.
    """
    sample_text = """
John Doe
Software Engineer
[email protected]
(555) 123-4567
Skills: Python, JavaScript, React, Node.js, SQL
Education:
Bachelor of Science in Computer Science
University of Technology, 2020
Experience:
Senior Software Developer at Tech Corp (2021-2023)
- Developed web applications using React and Node.js
- Managed database systems and APIs
"""
    try:
        # Regex-based field extraction.
        regex_results = resume_parser.extract_with_regex(sample_text)
        print("Regex Results:", json.dumps(regex_results, indent=2))
        # Heuristic name extraction.
        name = resume_parser.extract_name_from_text(sample_text)
        print(f"Extracted Name: {name}")
    except Exception as e:
        print(f"Test error: {e}")