Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 12 days ago

Commit

45f0a42

1 Parent(s): 471f933

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +275 -30

backend/services/resume_parser.py CHANGED Viewed

@@ -1,23 +1,82 @@
 import re
 from pathlib import Path
-from typing import Dict
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-MODEL_NAME = "manishiitg/resume-ner"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
-# Basic keyword lists (you can expand dynamically if needed)
-SKILL_KEYWORDS = ["python", "java", "sql", "docker", "aws", "machine learning", "flask", "django", "react"]
-EDU_KEYWORDS = ["bachelor", "master", "phd", "bsc", "msc", "mba", "computer science", "engineering"]
-JOB_KEYWORDS = ["engineer", "developer", "manager", "analyst", "consultant", "specialist"]
 def extract_text(file_path: str) -> str:
     path = Path(file_path)
     if path.suffix.lower() == ".pdf":
         text = pdf_extract_text(file_path)
@@ -26,34 +85,220 @@ def extract_text(file_path: str) -> str:
         text = "\n".join([p.text for p in doc.paragraphs])
     else:
         raise ValueError("Unsupported file format")
-    return text.replace("\n", " ").replace("\r", " ").strip()
-def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
-    text = extract_text(file_path)
-    entities = ner_pipeline(text)
-    # Model extraction (Name only works well)
-    name_parts = [ent["word"].strip() for ent in entities if ent["entity_group"].upper() in ["NAME", "PERSON"]]
-    full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
-    # Skills fallback
-    skills_found = [skill for skill in SKILL_KEYWORDS if re.search(rf"\b{skill}\b", text, re.IGNORECASE)]
-    # Education fallback
-    education_found = [edu for edu in EDU_KEYWORDS if re.search(rf"\b{edu}\b", text, re.IGNORECASE)]
-    # Experience fallback
-    experience_found = []
-    for job in JOB_KEYWORDS:
-        if re.search(rf"\b{job}\b", text, re.IGNORECASE):
-            experience_found.append(job)
-    years_match = re.findall(r"(\d+)\s*(?:years|yrs)", text, re.IGNORECASE)
-    if years_match:
-        experience_found.append(f"{max(map(int, years_match))} years")
     return {
-        "name": full_name,
-        "skills": ", ".join(set(skills_found)) or "Not Found",
-        "education": ", ".join(set(education_found)) or "Not Found",
-        "experience": ", ".join(set(experience_found)) or "Not Found"
     }

 import re
 from pathlib import Path
+from typing import Dict, List, Tuple
+import spacy
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import nltk
+from nltk.corpus import stopwords
+from dateutil.parser import parse as date_parse
# Download required NLTK data. This is best-effort: a failed download
# (e.g. no network) must not stop the module from importing, but only
# ordinary errors are swallowed -- KeyboardInterrupt/SystemExit propagate.
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
except Exception:
    pass
# Load spaCy model for better NER. When loading fails, `nlp` is set to None
# so callers can test it before use.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    print("Please install spacy model: python -m spacy download en_core_web_sm")
    nlp = None
+MODEL_NAME = "manishiitg/resume-ner"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+# Expanded keyword lists
+SKILL_KEYWORDS = {
+    # Programming Languages
+    "python", "java", "javascript", "typescript", "c++", "c#", "ruby", "go", "rust", "kotlin", "swift",
+    "php", "r", "matlab", "scala", "perl", "bash", "powershell", "sql", "html", "css",
+    # Frameworks & Libraries
+    "react", "angular", "vue", "node.js", "express", "django", "flask", "spring", "spring boot",
+    ".net", "laravel", "rails", "fastapi", "pytorch", "tensorflow", "keras", "scikit-learn",
+    # Databases
+    "mysql", "postgresql", "mongodb", "redis", "elasticsearch", "cassandra", "oracle", "sql server",
+    # Cloud & DevOps
+    "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "terraform", "ansible", "ci/cd",
+    # Other Technical Skills
+    "machine learning", "deep learning", "data science", "nlp", "computer vision", "ai",
+    "rest api", "graphql", "microservices", "agile", "scrum", "git", "linux", "windows"
+}
# Regular expressions that flag education-related phrases.
#
# Consumers apply these with re.IGNORECASE.  Two- and three-letter
# abbreviations (BS, BA, BE, MS, MA, ME, IT, CS) are therefore wrapped in a
# case-sensitive scoped group ``(?-i:...)``: matched case-insensitively they
# would hit ordinary words such as "be", "me", "ma" and "it".
# Every pattern keeps exactly one capturing group.
EDUCATION_PATTERNS = [
    # Degrees
    r"\b(bachelor|b\.?s\.?c|b\.?tech|(?-i:B\.?S|B\.?A|B\.?E))\b",
    r"\b(master|m\.?s\.?c|m\.?tech|mba|(?-i:M\.?S|M\.?A|M\.?E))\b",
    r"\b(ph\.?d\.?|doctorate|doctoral)\b",
    r"\b(diploma|certificate|certification)\b",
    # Fields of Study
    r"\b(computer science|software engineering|information technology|(?-i:IT|CS))\b",
    r"\b(electrical engineering|mechanical engineering|civil engineering)\b",
    r"\b(data science|artificial intelligence|machine learning)\b",
    r"\b(business administration|finance|accounting|marketing)\b",
    # Institution indicators
    r"\b(university|college|institute|school)\s+of\s+\w+",
    r"\b\w+\s+(university|college|institute)\b"
]
+JOB_TITLE_PATTERNS = [
+    r"\b(software|senior|junior|lead|principal|staff)\s*(engineer|developer|programmer)\b",
+    r"\b(data|business|system|security)\s*(analyst|scientist|engineer)\b",
+    r"\b(project|product|program|engineering)\s*manager\b",
+    r"\b(devops|cloud|ml|ai|backend|frontend|full[\s-]?stack)\s*(engineer|developer)\b",
+    r"\b(consultant|architect|specialist|coordinator|administrator)\b"
+]
 def extract_text(file_path: str) -> str:
+    """Extract text from PDF or DOCX files"""
     path = Path(file_path)
     if path.suffix.lower() == ".pdf":
         text = pdf_extract_text(file_path)
         text = "\n".join([p.text for p in doc.paragraphs])
     else:
         raise ValueError("Unsupported file format")
+    return text
def clean_text(text: str) -> str:
    """Normalize whitespace while keeping line breaks.

    Line endings are unified to ``\\n``, runs of horizontal whitespace
    (spaces, tabs, form feeds, ...) collapse to a single space, spaces
    touching a line break are removed, and three or more consecutive line
    breaks shrink to two.  Leading and trailing whitespace is stripped.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text; an empty string stays empty.
    """
    # Unify Windows / old-Mac line endings first so "\n" is the only break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse horizontal whitespace only: [^\S\n] is "whitespace except \n",
    # so line breaks survive this step.
    text = re.sub(r"[^\S\n]+", " ", text)
    # Drop the single space that may now sit on either side of a line break.
    text = re.sub(r" ?\n ?", "\n", text)
    # Keep at most one blank line between blocks.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def extract_sections(text: str) -> Dict[str, str]:
    """Split resume text into education / experience / skills / summary.

    The text is scanned line by line.  A line shorter than 50 characters
    (after stripping) that contains one of the header keywords switches the
    current section; the header line itself and every following line are
    appended to that section until another header is seen.  Lines that come
    before the first header are not assigned to any section.

    Args:
        text: Resume text with ``\\n`` line breaks.

    Returns:
        A dict with the keys ``education``, ``experience``, ``skills`` and
        ``summary``.  Each value is the section's lines, each terminated by
        ``\\n``, or an empty string when the section was not found.
    """
    # Header keywords per section; checked in this order, first hit wins.
    header_patterns = {
        'education': re.compile(r'(education|academic|qualification|degree)'),
        'experience': re.compile(
            r'(experience|employment|work\s*history|professional\s*experience|career)'
        ),
        'skills': re.compile(r'(skills|technical\s*skills|competencies|expertise)'),
        'summary': re.compile(r'(summary|objective|profile|about)'),
    }
    collected: Dict[str, List[str]] = {name: [] for name in header_patterns}

    current = None
    for line in text.split('\n'):
        lowered = line.lower().strip()
        # Only short lines are considered headers; long lines are body text
        # even when they mention a keyword.
        if len(lowered) < 50:
            for name, pattern in header_patterns.items():
                if pattern.search(lowered):
                    current = name
                    break
        # A header on the very first line is kept too (it may carry content,
        # e.g. "Skills: Python, Docker").
        if current is not None:
            collected[current].append(line)

    return {
        name: "".join(f"{entry}\n" for entry in lines)
        for name, lines in collected.items()
    }
def extract_name(text: str, entities: List) -> str:
    """Extract the candidate's name, trying three methods in order.

    1. Name entities from the transformer NER output (``entities``).
    2. The first PERSON entity spaCy finds in the first 500 characters,
       when the module-level ``nlp`` pipeline is available.
    3. A title-case line of 2-4 words among the first five lines.

    Args:
        text: Resume text.
        entities: NER results as dicts with at least ``entity_group`` and
            ``word``; ``start``/``end`` character offsets are used when
            present.

    Returns:
        The extracted name, or ``"Not Found"``.
    """
    # Method 1: Use transformer model
    name_labels = {"NAME", "PERSON", "PER"}
    name_entities = [
        ent for ent in entities
        if str(ent.get("entity_group", "")).upper() in name_labels and ent.get("word")
    ]
    if name_entities:
        # Join only the first run of adjacent name entities.  When offsets
        # are available, a gap of more than two characters means the next
        # entity belongs to a different mention (e.g. a referee), which must
        # not be glued onto the candidate's name.  Without offsets every
        # name entity is joined.
        name_parts = []
        previous_end = None
        for ent in name_entities:
            start = ent.get("start")
            if (
                name_parts
                and previous_end is not None
                and start is not None
                and start - previous_end > 2
            ):
                break
            name_parts.append(ent["word"].strip())
            previous_end = ent.get("end")

        # dict.fromkeys drops repeated parts while keeping their order.
        full_name = " ".join(dict.fromkeys(name_parts))
        full_name = re.sub(r'\s+', ' ', full_name).strip()
        if len(full_name) > 3 and len(full_name.split()) <= 4:
            return full_name

    # Method 2: Use spaCy if available
    if nlp:
        doc = nlp(text[:500])  # Check first 500 chars
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                name = ent.text.strip()
                if len(name) > 3 and len(name.split()) <= 4:
                    return name

    # Method 3: Pattern matching for first few lines
    for line in text.split('\n')[:5]:
        line = line.strip()
        # Look for name pattern (2-4 words, title case)
        if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]+){1,3}$', line):
            return line

    return "Not Found"
def extract_skills(text: str, skill_section: str = "", keywords=None) -> List[str]:
    """Extract known skills from resume text.

    The skills section (when given) is searched ahead of the full text, and
    results are ordered by where each skill first appears in that combined
    text, so skills listed in the skills section come first.

    Args:
        text: Full resume text.
        skill_section: Text of the skills section, if one was detected.
        keywords: Optional iterable of skill names to look for; defaults to
            the module-level ``SKILL_KEYWORDS``.  Matching is
            case-insensitive.

    Returns:
        Lower-cased skill names, without duplicates, in order of first
        appearance (ties broken alphabetically).
    """
    vocabulary = {kw.lower() for kw in (SKILL_KEYWORDS if keywords is None else keywords)}

    # Prioritize skills section if available
    search_text = skill_section + " " + text if skill_section else text
    search_text = search_text.lower()

    first_seen = {}  # skill -> offset of its first occurrence in search_text

    # Method 1: Direct keyword matching.
    # Lookarounds are used instead of \b because \b needs a word character
    # on one side, which fails for skills that begin or end with
    # punctuation ("c++", "c#", ".net").
    for skill in vocabulary:
        hit = re.search(rf'(?<!\w){re.escape(skill)}(?!\w)', search_text)
        if hit:
            first_seen[skill] = hit.start()

    # Method 2: Pattern-based extraction
    # Look for skills in bullet points or comma-separated lists
    skill_patterns = [
        r'[•·▪▫◦‣⁃]\s*([A-Za-z\s\+\#\.]+)',  # Bullet points
        r'(?:skills?|technologies|tools?)[\s:]*([A-Za-z\s,\+\#\.]+)',  # After keywords
    ]
    for pattern in skill_patterns:
        for hit in re.finditer(pattern, search_text, re.IGNORECASE):
            # Check each comma/semicolon-separated phrase in the match
            for piece in re.split(r'[,;]', hit.group(1)):
                candidate = piece.strip().lower()
                if candidate in vocabulary:
                    # Keep the earlier offset when Method 1 already found it.
                    first_seen.setdefault(candidate, hit.start(1))

    # Sorting makes the result deterministic (a plain set has no stable
    # order), which matters when callers truncate the list.
    return sorted(first_seen, key=lambda skill: (first_seen[skill], skill))
def extract_education(text: str, edu_section: str = "", patterns=None) -> List[str]:
    """Extract education-related phrases (degrees, fields, institutions).

    Each pattern is applied case-insensitively and the *whole* matched
    phrase is kept, so a pattern such as ``(university)\\s+of\\s+\\w+``
    yields "University of Toronto" rather than only its captured group.

    Args:
        text: Full resume text.
        edu_section: Text of the education section, if one was detected; it
            is searched ahead of the full text.
        patterns: Optional list of regular expressions; defaults to the
            module-level ``EDUCATION_PATTERNS``.

    Returns:
        Matched phrases with internal whitespace normalized, without exact
        duplicates, in pattern order and then text order.
    """
    active_patterns = EDUCATION_PATTERNS if patterns is None else patterns
    search_text = edu_section + " " + text if edu_section else text

    education_info = []
    for pattern in active_patterns:
        for hit in re.finditer(pattern, search_text, re.IGNORECASE):
            # group(0) is the full match; a phrase may span a line break,
            # so collapse its whitespace to single spaces.
            education_info.append(" ".join(hit.group(0).split()))

    return list(dict.fromkeys(education_info))  # Remove duplicates
def extract_experience(text: str, exp_section: str = "", title_patterns=None) -> List[str]:
    """Extract job titles, years of experience and company names.

    Args:
        text: Full resume text.
        exp_section: Text of the experience section, if one was detected;
            it is searched ahead of the full text.
        title_patterns: Optional list of job-title regular expressions;
            defaults to the module-level ``JOB_TITLE_PATTERNS``.

    Returns:
        Job titles (full matched phrase), then at most one
        ``"<n>+ years experience"`` entry, then up to three company names
        per company pattern -- without exact duplicates.
    """
    active_titles = JOB_TITLE_PATTERNS if title_patterns is None else title_patterns
    search_text = exp_section + " " + text if exp_section else text
    experience_info = []

    # Extract job titles.  The full match is used so that patterns whose
    # group covers only part of the title (e.g. "(project|product)\s*manager")
    # still yield the complete title.
    for pattern in active_titles:
        for hit in re.finditer(pattern, search_text, re.IGNORECASE):
            experience_info.append(" ".join(hit.group(0).split()))

    # Extract years of experience: the first pattern that matches wins and
    # the largest number it found is reported.
    exp_patterns = [
        r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+experience',
        r'experience\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:as|in|of)',
    ]
    for pattern in exp_patterns:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        if matches:
            years = max(map(int, matches))
            experience_info.append(f"{years}+ years experience")
            break

    # Extract company names (common patterns).
    # A company name is modelled as 1-4 capitalized words on one line.  The
    # cue words are anchored with \b so they cannot match inside other words
    # ("at" in "Great", "inc" in "Principal"), and "@" must not follow a
    # word character so e-mail addresses are skipped.
    company_word = r"[A-Z][\w&.\-]*"
    company_patterns = [
        rf"(?:\b(?i:at|company|employer)\b|(?<!\w)@)\s*:?\s*"
        rf"({company_word}(?:[ \t]+{company_word}){{0,3}})",
        rf"\b((?:{company_word},?[ \t]+){{1,4}}(?i:inc|llc|ltd|corp|company))\b",
    ]
    for pattern in company_patterns:
        names = [
            hit.group(1).rstrip(" .,-&")  # drop trailing sentence punctuation
            for hit in re.finditer(pattern, search_text)
        ]
        # Limit to avoid false positives
        experience_info.extend(name for name in names[:3] if name)

    return list(dict.fromkeys(experience_info))
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Parse a resume file into name, skills, education and experience.

    Args:
        file_path: Path of the resume file handed to ``extract_text``.
        filename: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the keys ``name``, ``skills``, ``education`` and
        ``experience``.  The three list fields are comma-joined strings
        (at most 15 skills, 5 education items, 5 experience items) or
        ``"Not Found"`` when nothing was extracted.
    """

    def _joined(items: List[str], limit: int) -> str:
        # Comma-join the first `limit` items, or report that none were found.
        if not items:
            return "Not Found"
        return ", ".join(items[:limit])

    # Text extraction and normalization
    text = clean_text(extract_text(file_path))

    # Section split, then NER over the first 1024 characters only
    sections = extract_sections(text)
    entities = ner_pipeline(text[:1024])

    # Field extraction
    name = extract_name(text, entities)
    skills = extract_skills(text, sections.get('skills', ''))
    education = extract_education(text, sections.get('education', ''))
    experience = extract_experience(text, sections.get('experience', ''))

    parsed = {"name": name}
    parsed["skills"] = _joined(skills, 15)
    parsed["education"] = _joined(education, 5)
    parsed["experience"] = _joined(experience, 5)
    return parsed
+# Optional: Add confidence scores
def parse_resume_with_confidence(file_path: str) -> Dict[str, Tuple[str, float]]:
    """Parse a resume and attach a heuristic confidence to every field.

    The confidence only reflects whether ``parse_resume`` found a value:
    name 0.9 / 0.1, education and experience 0.8 / 0.2, and skills 0.1 per
    comma-separated skill capped at 0.9 (0.1 when none were found).

    Args:
        file_path: Path of the resume file, passed to ``parse_resume``.

    Returns:
        A dict mapping each field of the ``parse_resume`` result to a
        ``(value, confidence)`` tuple.
    """
    parsed = parse_resume(file_path)
    missing = "Not Found"

    def _flat(field: str, when_found: float, when_missing: float) -> float:
        # Fixed score depending only on whether the field was extracted.
        if parsed[field] != missing:
            return when_found
        return when_missing

    # Skills scale with the number of comma-separated entries.
    if parsed["skills"] != missing:
        skills_score = min(0.9, len(parsed["skills"].split(",")) * 0.1)
    else:
        skills_score = 0.1

    scores = {
        "name": _flat("name", 0.9, 0.1),
        "skills": skills_score,
        "education": _flat("education", 0.8, 0.2),
        "experience": _flat("experience", 0.8, 0.2),
    }

    paired = {}
    for field, value in parsed.items():
        paired[field] = (value, scores[field])
    return paired