File size: 2,124 Bytes
288175b
33fa314
288175b
 
33fa314
 
288175b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6248af7
288175b
 
 
 
 
 
6248af7
288175b
 
 
 
6248af7
288175b
 
 
6248af7
288175b
 
 
 
 
 
 
 
6248af7
288175b
b336194
288175b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
from pathlib import Path
from typing import Dict

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# --------------------
# Load PyTorch Resume NER Model
# --------------------
# Hugging Face Hub id of the resume NER checkpoint used by this module.
MODEL_NAME = "manishiitg/resume-ner"  # Works with PyTorch on Hugging Face Spaces

# Tokenizer and token-classification weights are loaded once, at import time,
# and shared by every parse_resume() call.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# parse_resume() reads the "entity_group" and "word" keys of each result
# produced by this pipeline.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# --------------------
# Extract Text from PDF/DOCX
# --------------------
def extract_text(file_path: str) -> str:
    """Return the plain text of a PDF or DOCX file.

    The format is chosen from the file extension, compared
    case-insensitively. PDFs are handed to pdfminer's ``extract_text``; for
    DOCX files the text of each document paragraph is joined with newlines.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    extension = Path(file_path).suffix.lower()

    if extension == ".pdf":
        return pdf_extract_text(file_path)

    if extension != ".docx":
        raise ValueError("Unsupported file format")

    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

# --------------------
# Parse Resume (returns only: full name, skills, education, experience)
# --------------------
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The file's text is obtained with ``extract_text`` and run through the
    module-level ``ner_pipeline``. Each recognised entity is routed to one of
    four output fields by its upper-cased ``entity_group`` label; entities
    with any other label are ignored.

    Args:
        file_path: Path of the PDF or DOCX resume to parse.
        filename: Unused; kept so existing callers that pass it keep working.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Name parts are joined with spaces and the other
        fields with ``", "``; duplicates are dropped while keeping first-seen
        order. A field with no entities is ``"Not Found"``.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported formats.
    """
    text = extract_text(file_path)

    # Entity label -> output field it feeds.
    field_for_label = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    collected = {"name": [], "skills": [], "education": [], "experience": []}

    # Blank text (e.g. nothing extractable from the file) cannot yield
    # entities, so skip model inference entirely in that case.
    # NOTE(review): text is passed to the pipeline in one piece; confirm how
    # the pipeline behaves when it exceeds the model's maximum input length.
    entities = ner_pipeline(text) if text and text.strip() else []

    for ent in entities:
        field = field_for_label.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Skip unknown labels, and values that are empty after stripping:
        # the latter would otherwise leave stray separators in the output.
        if field is None or not value:
            continue
        collected[field].append(value)

    def _joined(field: str, separator: str) -> str:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return separator.join(dict.fromkeys(collected[field])) or "Not Found"

    return {
        "name": _joined("name", " "),
        "skills": _joined("skills", ", "),
        "education": _joined("education", ", "),
        "experience": _joined("experience", ", "),
    }