# Spaces: husseinelsaadi / Codingo (Paused)
# File size: 11,262 Bytes

import re
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import nltk
import spacy
from dateutil.parser import parse as date_parse
from docx import Document
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text as pdf_extract_text
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Download required NLTK data. This is best effort: a failure here (for
# example when running offline) must not prevent the module from importing.
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
except Exception as exc:  # not a bare except: Ctrl-C / SystemExit still propagate
    print(f"Warning: could not download NLTK data: {exc}")

# Load spaCy model for better NER. When loading fails, `nlp` is set to None;
# extract_name checks `if nlp:` and skips its spaCy step in that case.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:  # not a bare except: Ctrl-C / SystemExit still propagate
    print("Please install spacy model: python -m spacy download en_core_web_sm")
    nlp = None

# Model identifier handed to the transformers `from_pretrained` loaders below.
MODEL_NAME = "manishiitg/resume-ner"
# NOTE: the tokenizer, model and pipeline are created when this module is
# imported, so importing the module triggers the model load.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# Token-classification ("ner") pipeline built from the objects above.
# extract_name in this file reads the "entity_group" and "word" keys of each
# result, which is the shape produced when an aggregation strategy is set.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Skill vocabulary (all lower-case), assembled from one set per category.
SKILL_KEYWORDS = set().union(
    # Programming languages
    {
        "python", "java", "javascript", "typescript", "c++", "c#", "ruby",
        "go", "rust", "kotlin", "swift", "php", "r", "matlab", "scala",
        "perl", "bash", "powershell", "sql", "html", "css",
    },
    # Frameworks and libraries
    {
        "react", "angular", "vue", "node.js", "express", "django", "flask",
        "spring", "spring boot", ".net", "laravel", "rails", "fastapi",
        "pytorch", "tensorflow", "keras", "scikit-learn",
    },
    # Databases
    {
        "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
        "cassandra", "oracle", "sql server",
    },
    # Cloud and DevOps
    {
        "aws", "azure", "gcp", "docker", "kubernetes", "jenkins",
        "terraform", "ansible", "ci/cd",
    },
    # Other technical skills
    {
        "machine learning", "deep learning", "data science", "nlp",
        "computer vision", "ai", "rest api", "graphql", "microservices",
        "agile", "scrum", "git", "linux", "windows",
    },
)

# Regexes for education-related phrases; each one has a single capture group.
# NOTE(review): because every dot is optional, short alternatives such as
# `b\.?e\.?`, `m\.?e\.?` or `m\.?a\.?` also match the plain words "be", "me"
# and "ma" when searched case-insensitively, and `it` matches the pronoun.
EDUCATION_PATTERNS = (
    # Degree titles and their abbreviations
    [
        r"\b(bachelor|b\.?s\.?c?\.?|b\.?a\.?|b\.?tech|b\.?e\.?)\b",
        r"\b(master|m\.?s\.?c?\.?|m\.?a\.?|m\.?tech|m\.?e\.?|mba)\b",
        r"\b(ph\.?d\.?|doctorate|doctoral)\b",
        r"\b(diploma|certificate|certification)\b",
    ]
    # Fields of study
    + [
        r"\b(computer science|software engineering|information technology|it|cs)\b",
        r"\b(electrical engineering|mechanical engineering|civil engineering)\b",
        r"\b(data science|artificial intelligence|machine learning)\b",
        r"\b(business administration|finance|accounting|marketing)\b",
    ]
    # Institution indicators: "<kind> of <word>" and "<word> <kind>"
    + [
        r"\b(university|college|institute|school)\s+of\s+\w+",
        r"\b\w+\s+(university|college|institute)\b",
    ]
)

# Regexes for job titles. The first group of patterns pairs a modifier with a
# role word; the last one matches role words that stand on their own.
JOB_TITLE_PATTERNS = (
    # "<modifier> <role>" titles
    [
        r"\b(software|senior|junior|lead|principal|staff)\s*(engineer|developer|programmer)\b",
        r"\b(data|business|system|security)\s*(analyst|scientist|engineer)\b",
        r"\b(project|product|program|engineering)\s*manager\b",
        r"\b(devops|cloud|ml|ai|backend|frontend|full[\s-]?stack)\s*(engineer|developer)\b",
    ]
    # Single-word titles
    + [
        r"\b(consultant|architect|specialist|coordinator|administrator)\b",
    ]
)

def extract_text(file_path: str) -> str:
    """Return the text content of a ``.pdf`` or ``.docx`` file.

    The format is chosen from the file extension, compared case-insensitively.
    For DOCX files the paragraph texts are joined with newlines.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    extension = Path(file_path).suffix.lower()

    if extension == ".pdf":
        return pdf_extract_text(file_path)

    if extension == ".docx":
        paragraphs = Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    raise ValueError("Unsupported file format")

def clean_text(text: str) -> str:
    """Normalize whitespace while keeping the line structure of the text.

    Runs of spaces, tabs and other horizontal whitespace collapse to a single
    space, whitespace around line breaks is trimmed, and three or more
    consecutive line breaks are reduced to one blank line.

    Line breaks are preserved on purpose: the section and name heuristics in
    this file split the text on ``'\\n'``. (Collapsing *all* whitespace with
    ``\\s+`` would also remove every newline and make the blank-line rule a
    no-op.)

    Args:
        text: Raw text extracted from a document.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    # Treat Windows / old-Mac line endings as plain newlines.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse horizontal whitespace only ([^\S\n] = whitespace except newline).
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Drop the spaces left hanging at line starts/ends.
    text = re.sub(r' *\n *', '\n', text)
    # Keep at most one blank line between blocks.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

def extract_sections(text: str) -> Dict[str, str]:
    """Split resume text into education / experience / skills / summary parts.

    The text is scanned line by line. A line shorter than 50 characters (after
    lower-casing and stripping) that contains one of the header keywords
    switches the active section. From then on every line, the header line
    included, is appended (with a trailing newline) to the active section.
    The very first line of the text is never stored, although it can still
    switch the active section.

    Returns:
        A dict with the keys ``education``, ``experience``, ``skills`` and
        ``summary``; sections that were not found map to ``''``.
    """
    # Insertion order doubles as priority: the first regex that matches wins.
    header_regexes = {
        'education': r'(education|academic|qualification|degree)',
        'experience': r'(experience|employment|work\s*history|professional\s*experience|career)',
        'skills': r'(skills|technical\s*skills|competencies|expertise)',
        'summary': r'(summary|objective|profile|about)',
    }
    collected = {'education': [], 'experience': [], 'skills': [], 'summary': []}

    active = None
    for index, raw_line in enumerate(text.split('\n')):
        probe = raw_line.lower().strip()

        # Only short lines are candidates for being a header.
        if len(probe) < 50:
            hit = next(
                (name for name, regex in header_regexes.items()
                 if re.search(regex, probe)),
                None,
            )
            if hit is not None:
                active = hit

        if active is not None and index > 0:
            collected[active].append(raw_line + '\n')

    return {name: ''.join(chunks) for name, chunks in collected.items()}

def extract_name(text: str, entities: List) -> str:
    """Find the candidate's name, trying three strategies in order.

    1. Entities whose ``entity_group`` is NAME / PERSON / PER (any case):
       their ``word`` values are de-duplicated, joined and accepted if the
       result is plausible.
    2. The first plausible PERSON entity spaCy finds in the first 500
       characters, when the module-level ``nlp`` model is available.
    3. The first of the first five lines consisting of 2-4 title-case words.

    A candidate from steps 1 and 2 is plausible when it is longer than three
    characters and has at most four words.

    Returns:
        The name, or ``"Not Found"`` when no strategy succeeds.
    """
    def _plausible(candidate: str) -> bool:
        return len(candidate) > 3 and len(candidate.split()) <= 4

    # Strategy 1: transformer entities
    person_labels = ("NAME", "PERSON", "PER")
    tokens = [
        entity["word"].strip()
        for entity in entities
        if entity["entity_group"].upper() in person_labels
    ]
    if tokens:
        unique_tokens = dict.fromkeys(tokens)  # drops repeats, keeps order
        joined = re.sub(r'\s+', ' ', " ".join(unique_tokens)).strip()
        if _plausible(joined):
            return joined

    # Strategy 2: spaCy over the head of the document
    if nlp:
        for span in nlp(text[:500]).ents:
            if span.label_ != "PERSON":
                continue
            candidate = span.text.strip()
            if _plausible(candidate):
                return candidate

    # Strategy 3: a title-case line near the top
    title_case_line = r'^[A-Z][a-z]+(\s+[A-Z][a-z]+){1,3}$'
    for raw_line in text.split('\n')[:5]:
        candidate = raw_line.strip()
        if re.match(title_case_line, candidate):
            return candidate

    return "Not Found"

def extract_skills(text: str, skill_section: str = "",
                   keywords: Optional[Iterable[str]] = None) -> List[str]:
    """Return the known skills mentioned in a resume, sorted alphabetically.

    Matching is case-insensitive and uses two passes over the lower-cased
    text: a whole-keyword search for every vocabulary entry, and a scan of
    bullet points and "skills:/tools:/technologies" lists whose comma- or
    semicolon-separated items are looked up in the vocabulary.

    Args:
        text: Full resume text.
        skill_section: Text of the skills section; when non-empty it is
            searched together with ``text``.
        keywords: Vocabulary to look for. Defaults to the module-level
            ``SKILL_KEYWORDS``.

    Returns:
        The matched keywords in lower case, sorted so that the result (and any
        truncation a caller applies) is deterministic.
    """
    vocabulary = {
        keyword.strip().lower()
        for keyword in (SKILL_KEYWORDS if keywords is None else keywords)
    }
    vocabulary.discard("")

    # Prioritize skills section if available
    search_text = skill_section + " " + text if skill_section else text
    search_text = search_text.lower()

    skills_found = set()

    # Method 1: whole-keyword matching.
    # `\b` cannot be used around keywords that start or end with punctuation:
    # `\bc\+\+\b` only matches when a letter FOLLOWS "c++". Look-arounds state
    # the real intent: no word character directly next to the keyword.
    for skill in vocabulary:
        # Words of a multi-word keyword may be separated by any whitespace,
        # including a line break.
        body = r'\s+'.join(re.escape(part) for part in skill.split())
        # A keyword starting with punctuation (".net") may follow a word
        # ("asp.net"), so only word-initial keywords get a left boundary.
        lead = r'(?<!\w)' if re.match(r'\w', skill) else ''
        if re.search(rf'{lead}{body}(?!\w)', search_text):
            skills_found.add(skill)

    # Method 2: items of bullet points or keyword-introduced lists
    skill_patterns = [
        r'[•·▪▫◦‣⁃]\s*([A-Za-z\s\+\#\.]+)',  # Bullet points
        r'(?:skills?|technologies|tools?)[\s:]*([A-Za-z\s,\+\#\.]+)',  # After keywords
    ]
    for pattern in skill_patterns:
        for captured in re.findall(pattern, search_text, re.IGNORECASE):
            for item in re.split(r'[,;]', captured):
                item = item.strip().lower()
                if item in vocabulary:
                    skills_found.add(item)

    return sorted(skills_found)

def extract_education(text: str, edu_section: str = "",
                      patterns: Optional[Iterable[str]] = None) -> List[str]:
    """Return education-related phrases found in a resume.

    Every pattern is searched case-insensitively and the *whole* matched
    phrase is reported, so an institution pattern yields
    "University of Toronto" rather than only its captured keyword
    "University". For patterns that consist of a single group between word
    boundaries the whole match and the group are the same text.

    Args:
        text: Full resume text.
        edu_section: Text of the education section; when non-empty it is
            searched together with ``text``.
        patterns: Regular expressions to apply. Defaults to the module-level
            ``EDUCATION_PATTERNS``.

    Returns:
        Matched phrases in order of first appearance (pattern by pattern),
        with internal whitespace collapsed and case-insensitive duplicates
        removed.
    """
    search_text = edu_section + " " + text if edu_section else text
    active_patterns = EDUCATION_PATTERNS if patterns is None else patterns

    education_info = []
    seen = set()
    for pattern in active_patterns:
        for found in re.finditer(pattern, search_text, re.IGNORECASE):
            phrase = re.sub(r'\s+', ' ', found.group(0)).strip()
            # "University" and "university" are the same hit; keep the first.
            key = phrase.lower()
            if phrase and key not in seen:
                seen.add(key)
                education_info.append(phrase)

    return education_info

def extract_experience(text: str, exp_section: str = "",
                       title_patterns: Optional[Iterable[str]] = None) -> List[str]:
    """Return job titles, years of experience and company names from a resume.

    Args:
        text: Full resume text.
        exp_section: Text of the experience section; when non-empty it is
            searched together with ``text``.
        title_patterns: Job-title regular expressions, applied
            case-insensitively. Defaults to the module-level
            ``JOB_TITLE_PATTERNS``.

    Returns:
        A de-duplicated list, in this order: job titles, at most one
        ``"<n>+ years experience"`` entry, then up to three company-name
        candidates per company heuristic.
    """
    experience_info = []

    search_text = exp_section + " " + text if exp_section else text
    active_patterns = JOB_TITLE_PATTERNS if title_patterns is None else title_patterns

    # Job titles. Patterns with several groups ("senior" + "engineer") are
    # reported as the groups joined by a space. Patterns with a single group
    # are reported as the whole match, so `(project|...)\s*manager` yields
    # "project manager" and not just the captured "project".
    for pattern in active_patterns:
        for found in re.finditer(pattern, search_text, re.IGNORECASE):
            groups = found.groups()
            if len(groups) > 1:
                title = ' '.join(part for part in groups if part)
            else:
                title = found.group(0)
            title = re.sub(r'\s+', ' ', title).strip()
            if title:
                experience_info.append(title)

    # Years of experience: the first phrasing that matches wins, and the
    # largest number mentioned in that phrasing is reported.
    exp_patterns = [
        r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+experience',
        r'experience\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:as|in|of)',
    ]
    for pattern in exp_patterns:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        if matches:
            years = max(map(int, matches))
            experience_info.append(f"{years}+ years experience")
            break

    # Company names (heuristics; names must start with a capital letter).
    company_patterns = [
        # Keyword-introduced names. The word boundaries stop "at" from
        # matching inside other words such as "format".
        r'(?:\b(?:at|company|employer)\b|@)\s*:?\s*([A-Z][A-Za-z\s&\.\-]+)',
        # Names followed by a legal suffix. The suffix is matched in any case
        # ("Inc", "LLC") and as a whole word, and the name is matched lazily
        # so that each suffix produces its own candidate.
        r'([A-Z][A-Za-z\s&\.\-]+?)\s*\b(?i:inc|llc|ltd|corp|company)\b',
    ]
    for pattern in company_patterns:
        candidates = [name.strip() for name in re.findall(pattern, search_text)]
        candidates = [name for name in candidates if name]
        experience_info.extend(candidates[:3])  # Limit to avoid false positives

    return list(dict.fromkeys(experience_info))

def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Parse a resume file into name, skills, education and experience.

    The file's text is extracted and cleaned, split into sections, and the
    first 1024 characters are run through the NER pipeline. Each field is then
    produced by its dedicated extractor. ``filename`` is accepted but not used.

    Returns:
        A dict with the keys ``name``, ``skills``, ``education`` and
        ``experience``. List-valued fields are joined with ``", "`` (at most
        15 skills, 5 education and 5 experience entries) or set to
        ``"Not Found"`` when empty.
    """
    def _joined(items, limit):
        # Comma-separated head of the list, or the placeholder when empty.
        return ", ".join(items[:limit]) if items else "Not Found"

    text = clean_text(extract_text(file_path))
    sections = extract_sections(text)

    # Only the head of the document goes through the model (performance).
    entities = ner_pipeline(text[:1024])

    name = extract_name(text, entities)
    skills = extract_skills(text, sections.get('skills', ''))
    education = extract_education(text, sections.get('education', ''))
    experience = extract_experience(text, sections.get('experience', ''))

    return {
        "name": name,
        "skills": _joined(skills, 15),
        "education": _joined(education, 5),
        "experience": _joined(experience, 5),
    }

# Optional: Add confidence scores
def parse_resume_with_confidence(file_path: str) -> Dict[str, Tuple[str, float]]:
    """Parse a resume and pair every field with a heuristic confidence.

    Confidence only reflects whether a value was found: name 0.9 / 0.1,
    education and experience 0.8 / 0.2, and skills 0.1 per listed skill,
    capped at 0.9 (0.1 when no skills were found).

    Returns:
        A dict mapping each field of ``parse_resume`` to ``(value, confidence)``.
    """
    parsed = parse_resume(file_path)

    def _flat(field, when_found, when_missing):
        # Fixed score that depends only on whether the field holds a value.
        return when_found if parsed[field] != "Not Found" else when_missing

    skills_text = parsed["skills"]
    if skills_text != "Not Found":
        skills_score = min(0.9, len(skills_text.split(",")) * 0.1)
    else:
        skills_score = 0.1

    scores = {
        "name": _flat("name", 0.9, 0.1),
        "skills": skills_score,
        "education": _flat("education", 0.8, 0.2),
        "experience": _flat("experience", 0.8, 0.2),
    }

    return {field: (value, scores[field]) for field, value in parsed.items()}