File size: 1,955 Bytes
682910e
33fa314
 
 
 
 
efffc2e
b336194
682910e
2c6f29c
682910e
2c6f29c
682910e
864c2ae
2c6f29c
6d286f1
a511250
682910e
33fa314
682910e
af02e64
33fa314
 
 
 
 
 
 
 
864c2ae
682910e
6d286f1
682910e
33fa314
864c2ae
efffc2e
d4b2339
33fa314
 
 
 
 
efffc2e
 
33fa314
682910e
864c2ae
33fa314
864c2ae
33fa314
864c2ae
33fa314
 
 
efffc2e
 
33fa314
 
 
 
efffc2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
from pathlib import Path
from typing import Dict

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# --------------------
# Load PyTorch Resume NER Model
# --------------------
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "manishiitg/resume-ner"

# Tokenizer and model are loaded once, when this module is imported, and are
# reused by every call that goes through `ner_pipeline`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" asks the pipeline to merge token-level
# predictions into grouped entities (the results are read through their
# "entity_group" and "word" keys further down in this file).
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# --------------------
# Extract Text from PDF/DOCX
# --------------------
def extract_text(file_path: str) -> str:
    """Return the plain text of a PDF or DOCX document.

    The format is chosen from the file extension, compared case-insensitively.

    Args:
        file_path: Location of the document to read.

    Returns:
        For ``.pdf`` files, the result of ``pdf_extract_text``. For ``.docx``
        files, the text of every paragraph joined with newlines.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    extension = Path(file_path).suffix.lower()

    if extension == ".pdf":
        return pdf_extract_text(file_path)

    if extension == ".docx":
        paragraphs = Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    raise ValueError("Unsupported file format")

# --------------------
# Parse Resume
# --------------------
def parse_resume(file_path: str) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The document text is obtained with ``extract_text`` and tagged with
    ``ner_pipeline``; each recognised entity is routed to one of four output
    fields according to its upper-cased ``entity_group`` label. Values are
    de-duplicated while keeping the order in which they first appeared.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` resume.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Name parts are joined with a space, the other
        fields with ``", "``. A field with no entities is ``"Not Found"``.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported formats.
    """
    # NOTE(review): these label names are matched against whatever the model
    # emits; confirm them against the checkpoint's id2label mapping.
    field_for_label = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    # Plain dicts serve as insertion-ordered sets, so duplicates collapse
    # without losing first-seen order.
    found = {"name": {}, "skills": {}, "education": {}, "experience": {}}

    text = extract_text(file_path)
    # Blank text (e.g. a document without an extractable text layer) cannot
    # yield entities, so skip inference instead of feeding the model nothing.
    entities = ner_pipeline(text) if text and text.strip() else []

    for ent in entities:
        field = field_for_label.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Ignore labels we do not report and entities that are empty after
        # stripping; the latter would otherwise leave dangling separators.
        if field is None or not value:
            continue
        found[field][value] = None

    # Only the name is space-separated; every other field is a comma list.
    separators = {"name": " "}
    return {
        field: separators.get(field, ", ").join(values) or "Not Found"
        for field, values in found.items()
    }