"""Resume parsing utilities.

Extracts a candidate's name, skills, education, experience, e-mail address and
phone number from PDF or DOCX resumes.  A transformers NER pipeline is used
when it can be loaded; regular expressions and a first-lines heuristic fill in
whatever the model did not provide.
"""
import json
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional

# The third-party readers/models are optional at import time: the regex-based
# extraction only needs the standard library, and the parser already treats
# "no NER model" as a supported state.  A missing package is reported when the
# corresponding feature is actually requested.
try:
    from pdfminer.high_level import extract_text as pdf_extract_text
except ImportError:
    pdf_extract_text = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
except ImportError:
    AutoTokenizer = AutoModelForTokenClassification = pipeline = None

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ResumeParser:
    """Extract structured fields from a resume file.

    Attributes:
        ner_pipeline: The loaded transformers NER pipeline, or ``None`` when
            no model could be loaded.
        model_loaded: ``True`` when ``ner_pipeline`` is usable.
    """

    PRIMARY_MODEL = "manishiitg/resume-ner"
    FALLBACK_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # Entities scored below this value by the NER pipeline are ignored.
    MIN_CONFIDENCE = 0.5

    # NER label -> result field.  "ORG" is counted as education, which is a
    # heuristic: generic NER models tag universities (and employers) as ORG.
    _LABEL_TO_FIELD = {
        "PERSON": "name",
        "PER": "name",
        "NAME": "name",
        "SKILL": "skills",
        "TECH": "skills",
        "TECHNOLOGY": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EDU": "education",
        "ORG": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
        "WORK": "experience",
    }

    # email/phone contain no capturing group, so re.findall returns the whole
    # match; the keyword patterns have exactly one group (the rest of the
    # line after the keyword), which is what re.findall returns for them.
    _PATTERNS = {
        'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # Optional country code, optional parentheses around the area code.
        'phone': r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
        'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
        'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)',
    }

    def __init__(self):
        """Create the parser and try to load an NER model immediately."""
        self.ner_pipeline = None
        self.model_loaded = False
        self._load_model()

    def _load_model(self):
        """Load the NER model, falling back to a generic model on failure.

        Sets ``self.ner_pipeline`` and ``self.model_loaded``.  Never raises:
        if neither model can be loaded the parser stays in regex-only mode.
        """
        if pipeline is None:
            logger.warning("transformers is not installed; NER extraction is disabled")
            self.model_loaded = False
            return

        try:
            # Try the resume-specific model first
            logger.info("Attempting to load model: %s", self.PRIMARY_MODEL)
            tokenizer = AutoTokenizer.from_pretrained(self.PRIMARY_MODEL)
            model = AutoModelForTokenClassification.from_pretrained(self.PRIMARY_MODEL)
            self.ner_pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
            )
            self.model_loaded = True
            logger.info("Model loaded successfully")
            return
        except Exception as e:  # any load failure means "try the fallback"
            logger.warning("Failed to load primary model: %s", e)

        try:
            # Fallback to a general-purpose NER model
            logger.info("Trying fallback model: %s", self.FALLBACK_MODEL)
            self.ner_pipeline = pipeline(
                "ner",
                model=self.FALLBACK_MODEL,
                aggregation_strategy="simple",
            )
            self.model_loaded = True
            logger.info("Fallback model loaded successfully")
        except Exception as e2:
            logger.error("Failed to load fallback model: %s", e2)
            self.model_loaded = False

    def extract_text(self, file_path: str) -> str:
        """Extract plain text from a PDF or DOCX file.

        Args:
            file_path: Path to the resume; the format is chosen from the file
                suffix (case-insensitive).

        Returns:
            The extracted text.  For DOCX, non-empty paragraphs joined by
            newlines.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: The suffix is neither ``.pdf`` nor ``.docx``.
            ImportError: The reader library for the format is not installed.
            Exception: Any error raised by the reader is logged and re-raised.
        """
        try:
            path = Path(file_path)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")

            suffix = path.suffix.lower()
            if suffix == ".pdf":
                if pdf_extract_text is None:
                    raise ImportError("pdfminer.six is required to read PDF files")
                text = pdf_extract_text(file_path)
                logger.info("Extracted %d characters from PDF", len(text))
                return text
            if suffix == ".docx":
                if Document is None:
                    raise ImportError("python-docx is required to read DOCX files")
                doc = Document(file_path)
                text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
                logger.info("Extracted %d characters from DOCX", len(text))
                return text
            raise ValueError(f"Unsupported file format: {path.suffix}")
        except Exception as e:
            logger.error("Error extracting text: %s", e)
            raise

    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
        """Extract fields from ``text`` with regular expressions.

        Args:
            text: Plain resume text.

        Returns:
            A dict with the keys ``email``, ``phone``, ``skills``,
            ``education`` and ``experience``.  Each value is the list of
            non-empty, stripped matches in order of appearance: full matches
            for e-mail and phone, and the text following the keyword for the
            other fields.
        """
        results = {}
        for key, pattern in self._PATTERNS.items():
            matches = re.findall(pattern, text, re.MULTILINE)
            results[key] = [match.strip() for match in matches if match.strip()]
        return results

    def extract_name_from_text(self, text: str) -> str:
        """Guess the candidate's name from the first lines of ``text``.

        The first of the first five lines that has at most four words, is
        longer than two characters, contains neither ``@`` nor a digit, and
        does not start with "resume", "cv" or "curriculum" is returned.

        Returns:
            The stripped line, or ``"Not Found"`` when no line qualifies.
        """
        # Usually the name is in the first few lines
        for raw_line in text.split('\n')[:5]:
            line = raw_line.strip()
            if not line or len(line.split()) > 4 or len(line) <= 2:
                continue
            # Reject lines that look like contact details or a document title
            if re.search(r'[@\d]', line):
                continue
            if line.lower().startswith(('resume', 'cv', 'curriculum')):
                continue
            return line
        return "Not Found"

    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
        """Group NER pipeline output by result field.

        Args:
            entities: Dicts read via the keys ``entity_group``, ``word`` and
                ``score``; missing keys default to ``""``, ``""`` and ``0``.

        Returns:
            A dict with the keys ``name``, ``skills``, ``education`` and
            ``experience``, each a list of entity texts in input order.
            Entities with a score below ``MIN_CONFIDENCE``, an empty text or
            an unmapped label are dropped.
        """
        grouped = {"name": [], "skills": [], "education": [], "experience": []}

        logger.info("Processing %d entities", len(entities))

        for ent in entities:
            label = ent.get("entity_group", "").upper()
            value = ent.get("word", "").strip()
            confidence = ent.get("score", 0)

            logger.debug("Entity: %s = %s (confidence: %.2f)", label, value, confidence)

            # Only consider high-confidence entities
            if confidence < self.MIN_CONFIDENCE:
                continue
            # An empty word would only add a stray ", " to the joined output
            if not value:
                continue

            field = self._LABEL_TO_FIELD.get(label)
            if field is not None:
                grouped[field].append(value)

        return grouped

    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
        """Parse a resume file into a flat dict of strings.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` file.
            filename: Accepted for API compatibility; not used.

        Returns:
            A dict with ``name``, ``skills``, ``education`` and ``experience``
            (each a comma-joined string or ``"Not Found"``), plus ``email``
            and ``phone`` when found.  On any failure every field is
            ``"Error"`` and ``error`` holds the exception message; this
            method does not raise.
        """
        try:
            text = self.extract_text(file_path)

            if not text or len(text.strip()) < 10:
                raise ValueError("Extracted text is too short or empty")

            # Resume text is personal data, so keep the preview out of INFO logs.
            logger.debug("Text preview: %s...", text[:200])

            results = {
                "name": "Not Found",
                "skills": "Not Found",
                "education": "Not Found",
                "experience": "Not Found",
            }

            # Method 1: Try NER model if available
            if self.model_loaded and self.ner_pipeline:
                try:
                    logger.info("Using NER model for extraction")
                    entities = self.ner_pipeline(text)
                    ner_results = self.process_ner_entities(entities)

                    for key in results:
                        found = ner_results.get(key)
                        if found:
                            # dict.fromkeys de-duplicates while keeping order
                            results[key] = ", ".join(dict.fromkeys(found))
                except Exception as e:
                    # Best effort: a model failure falls through to the regexes
                    logger.warning("NER extraction failed: %s", e)

            # Method 2: Regex fallback
            logger.info("Using regex patterns for extraction")
            regex_results = self.extract_with_regex(text)

            # Fill in missing information with regex results
            if results["name"] == "Not Found":
                results["name"] = self.extract_name_from_text(text)

            # Cap how many regex matches are kept per field
            for key, limit in (("skills", 3), ("education", 2), ("experience", 3)):
                if results[key] == "Not Found" and regex_results.get(key):
                    results[key] = ", ".join(regex_results[key][:limit])

            # Add email and phone if found
            if regex_results.get("email"):
                results["email"] = regex_results["email"][0]
            if regex_results.get("phone"):
                results["phone"] = regex_results["phone"][0]

            logger.info("Parsing completed successfully")
            return results

        except Exception as e:
            logger.error("Error parsing resume: %s", e)
            return {
                "name": "Error",
                "skills": "Error",
                "education": "Error",
                "experience": "Error",
                "error": str(e),
            }


# Create global instance (loads the NER model, if available, at import time)
resume_parser = ResumeParser()


def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Parse a resume with the module-level ``resume_parser`` instance.

    See ``ResumeParser.parse_resume`` for the arguments and the result.
    """
    return resume_parser.parse_resume(file_path, filename)


# Test function
def test_parser():
    """Run the regex and name extraction on a sample resume and print the results."""
    sample_text = """
    John Doe
    Software Engineer
    john.doe@email.com
    (555) 123-4567

    Skills: Python, JavaScript, React, Node.js, SQL

    Education:
    Bachelor of Science in Computer Science
    University of Technology, 2020

    Experience:
    Senior Software Developer at Tech Corp (2021-2023)
    - Developed web applications using React and Node.js
    - Managed database systems and APIs
    """

    try:
        # Test regex extraction
        regex_results = resume_parser.extract_with_regex(sample_text)
        print("Regex Results:", json.dumps(regex_results, indent=2))

        # Test name extraction
        name = resume_parser.extract_name_from_text(sample_text)
        print(f"Extracted Name: {name}")
    except Exception as e:
        print(f"Test error: {e}")


if __name__ == "__main__":
    test_parser()