import json
import re
import os
import logging
from pathlib import Path
from typing import Dict, List, Optional
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ResumeParser:
    def __init__(self):
        self.ner_pipeline = None
        self.model_loaded = False
        self._load_model()
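
    # Note: _load_model() runs at construction time, and from_pretrained()
    # downloads and caches model weights on first use, so the first
    # instantiation may be slow and needs network access unless the models
    # are already in the local Hugging Face cache.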

    def _load_model(self):
        """Load the NER model with error handling and fallbacks."""
        try:
            # Try the resume-specific model first
            MODEL_NAME = "manishiitg/resume-ner"
            logger.info(f"Attempting to load model: {MODEL_NAME}")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
            self.ner_pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
                device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1,
            )
            self.model_loaded = True
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.warning(f"Failed to load primary model: {e}")
            try:
                # Fall back to a more widely available general-purpose NER model
                MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
                logger.info(f"Trying fallback model: {MODEL_NAME}")
                self.ner_pipeline = pipeline(
                    "ner",
                    model=MODEL_NAME,
                    aggregation_strategy="simple",
                    device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1,
                )
                self.model_loaded = True
                logger.info("Fallback model loaded successfully")
            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.model_loaded = False
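
    # With aggregation_strategy="simple", the pipeline groups sub-word tokens
    # and returns one dict per entity with "entity_group", "score", "word",
    # "start", and "end" keys. Illustrative shape (not real model output):
    #   [{"entity_group": "PER", "score": 0.99, "word": "Jane Doe",
    #     "start": 0, "end": 8}, ...]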

    def extract_text(self, file_path: str) -> str:
        """Extract text from PDF or DOCX files with error handling."""
        try:
            path = Path(file_path)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            if path.suffix.lower() == ".pdf":
                text = pdf_extract_text(file_path)
                # Clean up PDF text extraction artifacts
                text = re.sub(r'\s+', ' ', text).strip()
                logger.info(f"Extracted {len(text)} characters from PDF")
                return text
            elif path.suffix.lower() == ".docx":
                doc = Document(file_path)
                text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
                logger.info(f"Extracted {len(text)} characters from DOCX")
                return text
            else:
                raise ValueError(f"Unsupported file format: {path.suffix}")
        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            raise
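
    # Example usage (illustrative paths; the files are assumed to exist):
    #   text = parser.extract_text("resume.pdf")   # single whitespace-collapsed line
    #   text = parser.extract_text("resume.docx")  # newline-joined paragraphs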

    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
        """Improved regex patterns for extraction."""
        patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            'skills': r'(?i)(?:skills?|technologies?|tools?|expertise)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            'experience': r'(?i)(?:experience|work\shistory|employment|job\shistory)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            # The exclusion words match case-insensitively via a scoped (?i:...)
            # group, and the lookahead is non-capturing so findall() returns
            # the full match instead of an always-empty lookahead group.
            'name': r'^(?!(?i:resume|cv|curriculum vitae)|\d)[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+',
        }
        results = {}
        for key, pattern in patterns.items():
            # No global IGNORECASE here: the section patterns carry inline (?i)
            # flags, and the name pattern relies on capitalization.
            matches = re.findall(pattern, text, re.MULTILINE)
            if key == 'name' and matches:
                # Take the first likely name match
                results[key] = [matches[0].strip()]
            else:
                # Clean and filter matches
                cleaned = [m.strip() for m in matches if m.strip()]
                if cleaned:
                    results[key] = cleaned
        return results
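
    # Illustrative return shape for a hypothetical resume:
    #   {"name": ["Jane Doe"],
    #    "email": ["jane.doe@example.com"],
    #    "phone": ["(555) 123-4567"],
    #    "skills": ["Python, SQL, Docker"]}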

    def extract_name_from_text(self, text: str) -> str:
        """Improved name extraction heuristics."""
        # First try to find a capitalized full name with the same pattern as
        # extract_with_regex (case-sensitive apart from the exclusion words).
        name_match = re.search(
            r'^(?!(?i:resume|cv|curriculum vitae)|\d)[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+',
            text,
            re.MULTILINE,
        )
        if name_match:
            return name_match.group(0).strip()
        # Fallback to a line-based approach
        lines = text.split('\n')
        for line in lines[:10]:  # Check the first 10 lines
            line = line.strip()
            if line and 2 <= len(line.split()) <= 4:
                # Check that it looks like a name (no email, phone, etc.)
                if not re.search(r'[@\d+\-()]', line):
                    if line[0].isupper() and not line.lower().startswith(('resume', 'cv', 'curriculum')):
                        return line
        return "Not Found"

    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
        """Process NER entities with improved logic."""
        results = {
            "name": [],
            "skills": [],
            "education": [],
            "experience": []
        }
        logger.info(f"Processing {len(entities)} entities")
        for ent in entities:
            label = ent.get("entity_group", "").upper()
            value = ent.get("word", "").strip()
            confidence = ent.get("score", 0)
            # Skip low-confidence entities and empty values
            if confidence < 0.7 or not value:
                continue
            # Normalize labels
            if label in ["PERSON", "PER", "NAME"]:
                results["name"].append(value)
            elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
                results["skills"].append(value)
            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"] and "university" not in value.lower():
                results["education"].append(value)
            elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
                results["experience"].append(value)
        # Deduplicate while preserving order
        for key in results:
            results[key] = list(dict.fromkeys(results[key]))
        return results
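
    # Example (illustrative values): {"entity_group": "PER", "score": 0.95,
    # "word": "Jane Doe"} clears the 0.7 confidence threshold and lands in
    # results["name"]; the same entity with score 0.6 would be dropped.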

    def merge_results(self, ner_results: Dict, regex_results: Dict) -> Dict[str, str]:
        """Merge NER and regex results intelligently."""
        merged = {
            "name": "Not Found",
            "email": "Not Found",
            "phone": "Not Found",
            "skills": "Not Found",
            "education": "Not Found",
            "experience": "Not Found"
        }
        # Name - prioritize NER, then regex, then text extraction
        if ner_results.get("name"):
            merged["name"] = " ".join(ner_results["name"][:1])  # Take the first name only
        elif regex_results.get("name"):
            merged["name"] = regex_results["name"][0]
        # Email and phone - only from regex
        if regex_results.get("email"):
            merged["email"] = regex_results["email"][0]
        if regex_results.get("phone"):
            merged["phone"] = regex_results["phone"][0]
        # Skills - combine both sources
        all_skills = []
        if ner_results.get("skills"):
            all_skills.extend(ner_results["skills"])
        if regex_results.get("skills"):
            all_skills.extend(regex_results["skills"])
        if all_skills:
            merged["skills"] = ", ".join(list(dict.fromkeys(all_skills))[:10])  # Limit to 10 skills
        # Education - combine both sources
        all_edu = []
        if ner_results.get("education"):
            all_edu.extend(ner_results["education"])
        if regex_results.get("education"):
            all_edu.extend(regex_results["education"])
        if all_edu:
            merged["education"] = ", ".join(list(dict.fromkeys(all_edu))[:3])  # Limit to 3 items
        # Experience - combine both sources
        all_exp = []
        if ner_results.get("experience"):
            all_exp.extend(ner_results["experience"])
        if regex_results.get("experience"):
            all_exp.extend(regex_results["experience"])
        if all_exp:
            merged["experience"] = ", ".join(list(dict.fromkeys(all_exp))[:3])  # Limit to 3 items
        return merged
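
    # Illustrative merged output for hypothetical inputs:
    #   {"name": "Jane Doe", "email": "jane.doe@example.com",
    #    "phone": "(555) 123-4567", "skills": "Python, SQL, Docker",
    #    "education": "BSc Computer Science", "experience": "Acme Corp"}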

    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
        """Parse a resume with multiple extraction methods."""
        try:
            # Extract text
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                raise ValueError("Extracted text is too short or empty")
            logger.info(f"Text preview: {text[:200]}...")
            # Initialize results
            ner_results = {
                "name": [],
                "skills": [],
                "education": [],
                "experience": []
            }
            # Method 1: Try the NER model if available
            if self.model_loaded and self.ner_pipeline:
                try:
                    logger.info("Using NER model for extraction")
                    entities = self.ner_pipeline(text[:5120])  # Limit input size for NER
                    ner_results = self.process_ner_entities(entities)
                    logger.info(f"NER results: {json.dumps(ner_results, indent=2)}")
                except Exception as e:
                    logger.warning(f"NER extraction failed: {e}")
            # Method 2: Regex extraction
            logger.info("Using regex patterns for extraction")
            regex_results = self.extract_with_regex(text)
            logger.info(f"Regex results: {json.dumps(regex_results, indent=2)}")
            # Method 3: Name extraction fallback
            if not ner_results.get("name") and not regex_results.get("name"):
                name = self.extract_name_from_text(text)
                if name != "Not Found":
                    regex_results["name"] = [name]
            # Merge all results
            final_results = self.merge_results(ner_results, regex_results)
            # If the name is still not found, try the filename
            if final_results["name"] == "Not Found" and filename:
                # Common pattern: "Firstname Lastname - Resume.pdf"; strip the
                # extension first so it cannot leak into the extracted name
                stem = Path(filename).stem
                name_from_file = re.sub(r'[-_].*', '', stem).strip()
                if len(name_from_file.split()) >= 2:
                    final_results["name"] = name_from_file
            logger.info("Parsing completed successfully")
            return final_results
        except Exception as e:
            logger.error(f"Error parsing resume: {e}")
            return {
                "name": "Error",
                "email": "Error",
                "phone": "Error",
                "skills": "Error",
                "education": "Error",
                "experience": "Error",
                "error": str(e)
            }

# Create a global instance
resume_parser = ResumeParser()


def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Main entry point for parsing a resume."""
    return resume_parser.parse_resume(file_path, filename)
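
# Example module-level usage (hypothetical module name and file path):
#   from resume_parser import parse_resume
#   results = parse_resume("/tmp/jane_doe_resume.pdf", "jane_doe_resume.pdf")
#   print(results["name"], results["email"])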

if __name__ == "__main__":
    # Test the parser
    test_file = input("Enter path to resume file: ")
    if os.path.exists(test_file):
        results = parse_resume(test_file, os.path.basename(test_file))
        print("\nParsing Results:")
        print(json.dumps(results, indent=2))
    else:
        print("File not found")