# Codingo: backend/services/resume_parser.py
import json
import re
import os
from pathlib import Path
from typing import Dict, List, Optional
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ResumeParser:
def __init__(self):
self.ner_pipeline = None
self.model_loaded = False
self._load_model()
def _load_model(self):
"""Load the NER model with error handling and fallbacks"""
try:
# Try the original model first
MODEL_NAME = "manishiitg/resume-ner"
logger.info(f"Attempting to load model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
self.ner_pipeline = pipeline(
"ner",
model=model,
tokenizer=tokenizer,
aggregation_strategy="simple",
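                # Use GPU (device 0) only when the deployment-specific
                # L4_GPU env flag is set; otherwise run on CPU (-1)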
device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
)
self.model_loaded = True
logger.info("Model loaded successfully")
except Exception as e:
logger.warning(f"Failed to load primary model: {e}")
try:
# Fallback to a more reliable model
MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
logger.info(f"Trying fallback model: {MODEL_NAME}")
self.ner_pipeline = pipeline(
"ner",
model=MODEL_NAME,
aggregation_strategy="simple",
device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
)
self.model_loaded = True
logger.info("Fallback model loaded successfully")
except Exception as e2:
logger.error(f"Failed to load fallback model: {e2}")
self.model_loaded = False
def extract_text(self, file_path: str) -> str:
"""Extract text from PDF or DOCX files with error handling"""
try:
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if path.suffix.lower() == ".pdf":
text = pdf_extract_text(file_path)
                # Collapse runs of spaces/tabs but keep line breaks: the
                # section regexes and name heuristics below rely on "\n\n"
                # boundaries and per-line structure
                text = re.sub(r'[ \t]+', ' ', text)
                text = re.sub(r'\n{3,}', '\n\n', text).strip()
logger.info(f"Extracted {len(text)} characters from PDF")
return text
elif path.suffix.lower() == ".docx":
doc = Document(file_path)
text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
logger.info(f"Extracted {len(text)} characters from DOCX")
return text
else:
raise ValueError(f"Unsupported file format: {path.suffix}")
except Exception as e:
logger.error(f"Error extracting text: {e}")
raise
def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
"""Improved regex patterns for extraction"""
        patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            # Section patterns capture the text between a heading keyword
            # and the next blank line
            'skills': r'(?i)(?:skills?|technologies?|tools?|expertise)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            'experience': r'(?i)(?:experience|work\shistory|employment|job\shistory)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            # Non-capturing lookahead group so re.findall returns the full
            # match instead of the (always empty) lookahead group
            'name': r'^(?!(?:resume|cv|curriculum vitae|\d))[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+'
        }
results = {}
for key, pattern in patterns.items():
            # MULTILINE only: the section patterns embed (?i) where needed,
            # and a global IGNORECASE would defeat the capitalization check
            # in the name pattern
            matches = re.findall(pattern, text, re.MULTILINE)
if key == 'name' and matches:
# Take the first likely name match
results[key] = [matches[0].strip()]
else:
# Clean and filter matches
cleaned = [m.strip() for m in matches if m.strip()]
if cleaned:
results[key] = cleaned
return results
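    # Illustrative example (hypothetical input), assuming line breaks survive
    # text extraction:
    #   extract_with_regex("Jane Doe\njane.doe@example.com\nSkills: Python, SQL\n\n")
    #   -> {'email': ['jane.doe@example.com'], 'skills': ['Python, SQL'],
    #       'name': ['Jane Doe']}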
def extract_name_from_text(self, text: str) -> str:
"""Improved name extraction heuristics"""
        # First try to find a capitalized full name at the start of a line
        # (MULTILINE only: IGNORECASE would defeat the [A-Z][a-z]+ heuristic)
        name_match = re.search(
            r'^(?!(?:resume|cv|curriculum vitae|\d))[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+',
            text,
            re.MULTILINE
        )
if name_match:
return name_match.group(0).strip()
# Fallback to line-based approach
lines = text.split('\n')
for line in lines[:10]: # Check first 10 lines
line = line.strip()
if line and 2 <= len(line.split()) <= 4:
# Check if it looks like a name (not email, phone, etc.)
if not re.search(r'[@\d+\-\(\)]', line):
if line[0].isupper() and not line.lower().startswith(('resume', 'cv', 'curriculum')):
return line
return "Not Found"
def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
"""Process NER entities with improved logic"""
results = {
"name": [],
"skills": [],
"education": [],
"experience": []
}
logger.info(f"Processing {len(entities)} entities")
for ent in entities:
label = ent.get("entity_group", "").upper()
value = ent.get("word", "").strip()
confidence = ent.get("score", 0)
# Skip low confidence entities and empty values
if confidence < 0.7 or not value:
continue
# Normalize labels
if label in ["PERSON", "PER", "NAME"]:
results["name"].append(value)
elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
results["skills"].append(value)
elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"] and "university" not in value.lower():
results["education"].append(value)
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
results["experience"].append(value)
# Deduplicate and clean results
for key in results:
results[key] = list(dict.fromkeys(results[key])) # Preserve order
return results
def merge_results(self, ner_results: Dict, regex_results: Dict) -> Dict[str, str]:
"""Merge NER and regex results intelligently"""
merged = {
"name": "Not Found",
"email": "Not Found",
"phone": "Not Found",
"skills": "Not Found",
"education": "Not Found",
"experience": "Not Found"
}
        # Name: prioritize NER, then regex (parse_resume falls back to a
        # text scan and the filename if both miss)
        if ner_results.get("name"):
            merged["name"] = ner_results["name"][0]  # First detected name entity
elif regex_results.get("name"):
merged["name"] = regex_results["name"][0]
# Email and phone - only from regex
if regex_results.get("email"):
merged["email"] = regex_results["email"][0]
if regex_results.get("phone"):
merged["phone"] = regex_results["phone"][0]
# Skills - combine both sources
all_skills = []
if ner_results.get("skills"):
all_skills.extend(ner_results["skills"])
if regex_results.get("skills"):
all_skills.extend(regex_results["skills"])
if all_skills:
merged["skills"] = ", ".join(list(dict.fromkeys(all_skills))[:10]) # Limit to 10 skills
# Education - combine both sources
all_edu = []
if ner_results.get("education"):
all_edu.extend(ner_results["education"])
if regex_results.get("education"):
all_edu.extend(regex_results["education"])
if all_edu:
merged["education"] = ", ".join(list(dict.fromkeys(all_edu))[:3] # Limit to 3 items
# Experience - combine both sources
all_exp = []
if ner_results.get("experience"):
all_exp.extend(ner_results["experience"])
if regex_results.get("experience"):
all_exp.extend(regex_results["experience"])
if all_exp:
merged["experience"] = ", ".join(list(dict.fromkeys(all_exp))[:3] # Limit to 3 items
return merged
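    # Illustrative merge (hypothetical values): NER name ["Jane Doe"] plus
    # regex email ["jane@example.com"] yields
    #   {"name": "Jane Doe", "email": "jane@example.com",
    #    "phone": "Not Found", "skills": "Not Found", ...}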
    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
"""Parse resume with multiple extraction methods"""
try:
# Extract text
text = self.extract_text(file_path)
if not text or len(text.strip()) < 10:
raise ValueError("Extracted text is too short or empty")
logger.info(f"Text preview: {text[:200]}...")
# Initialize results
ner_results = {
"name": [],
"skills": [],
"education": [],
"experience": []
}
# Method 1: Try NER model if available
if self.model_loaded and self.ner_pipeline:
try:
logger.info("Using NER model for extraction")
entities = self.ner_pipeline(text[:5120]) # Limit input size for NER
ner_results = self.process_ner_entities(entities)
logger.info(f"NER results: {json.dumps(ner_results, indent=2)}")
except Exception as e:
logger.warning(f"NER extraction failed: {e}")
# Method 2: Regex extraction
logger.info("Using regex patterns for extraction")
regex_results = self.extract_with_regex(text)
logger.info(f"Regex results: {json.dumps(regex_results, indent=2)}")
# Method 3: Name extraction fallback
if not ner_results.get("name") and not regex_results.get("name"):
name = self.extract_name_from_text(text)
if name != "Not Found":
regex_results["name"] = [name]
# Merge all results
final_results = self.merge_results(ner_results, regex_results)
            # If name still not found, try the filename
            if final_results["name"] == "Not Found" and filename:
                # Common pattern: "Firstname Lastname - Resume.pdf". Strip the
                # extension first so it cannot leak into the extracted name
                name_from_file = re.sub(r'[-_].*', '', Path(filename).stem).strip()
                if len(name_from_file.split()) >= 2:
                    final_results["name"] = name_from_file
logger.info("Parsing completed successfully")
return final_results
except Exception as e:
logger.error(f"Error parsing resume: {e}")
return {
"name": "Error",
"email": "Error",
"phone": "Error",
"skills": "Error",
"education": "Error",
"experience": "Error",
"error": str(e)
}
# Create global instance
resume_parser = ResumeParser()
def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
"""Main function to parse resume"""
return resume_parser.parse_resume(file_path, filename)
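# Example usage from application code (hypothetical import path):
#   from backend.services.resume_parser import parse_resume
#   results = parse_resume("/tmp/uploads/jane_doe_resume.pdf",
#                          "jane_doe_resume.pdf")
#   print(results["email"])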
if __name__ == "__main__":
# Test the parser
test_file = input("Enter path to resume file: ")
if os.path.exists(test_file):
results = parse_resume(test_file, os.path.basename(test_file))
print("\nParsing Results:")
print(json.dumps(results, indent=2))
else:
print("File not found")