File size: 1,960 Bytes
682910e
33fa314
 
 
 
 
efffc2e
b336194
682910e
33fa314
682910e
33fa314
682910e
864c2ae
 
6d286f1
a511250
682910e
33fa314
682910e
af02e64
33fa314
 
 
 
 
 
 
 
864c2ae
682910e
6d286f1
682910e
33fa314
864c2ae
efffc2e
d4b2339
33fa314
 
 
 
 
efffc2e
 
33fa314
682910e
864c2ae
33fa314
864c2ae
33fa314
864c2ae
33fa314
 
 
efffc2e
 
33fa314
 
 
 
efffc2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
from pathlib import Path
from typing import Dict

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# --------------------
# Load Resume NER Model
# --------------------
# Identifier of the pretrained resume-NER checkpoint handed to from_pretrained().
MODEL_NAME = "Ioana23/bert-finetuned-resumes-ner"

# These objects are created once, when the module is imported, and the
# resulting pipeline is shared by every parse_resume() call.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" makes each result carry the "entity_group"
# and "word" keys that parse_resume() reads.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# --------------------
# Extract Text from PDF/DOCX
# --------------------
def extract_text(file_path: str) -> str:
    """Return the text content of a PDF or DOCX file.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to the document to read.

    Returns:
        For ``.pdf`` files, whatever pdfminer's ``extract_text`` returns.
        For ``.docx`` files, the text of every paragraph joined with
        newlines.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return pdf_extract_text(file_path)

    if suffix == ".docx":
        paragraphs = Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    raise ValueError("Unsupported file format")

# --------------------
# Parse Resume
# --------------------
def parse_resume(file_path: str) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The file's text is run through the module-level ``ner_pipeline`` and
    each returned entity is sorted into an output field by its upper-cased
    ``entity_group`` label. Entities with any other label are ignored.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` resume.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Each value is the de-duplicated entity texts in
        first-seen order, joined with a space for the name and with
        ``", "`` for the other fields, or ``"Not Found"`` when the joined
        string is empty.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported file
            extensions.
    """
    # Which output field each recognised entity label feeds into.
    field_for_label = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    # Insertion order here fixes the key order of the returned dict.
    collected = {"name": [], "skills": [], "education": [], "experience": []}

    # NOTE(review): the whole document is passed to the pipeline in one call;
    # confirm how the model handles text beyond its maximum input length.
    resume_text = extract_text(file_path)
    for entity in ner_pipeline(resume_text):
        group = entity["entity_group"].upper()
        word = entity["word"].strip()
        field = field_for_label.get(group)
        if field is not None:
            collected[field].append(word)

    summary = {}
    for field, words in collected.items():
        # Name fragments are space-joined; every other field is comma-joined.
        separator = " " if field == "name" else ", "
        # dict.fromkeys drops repeats while keeping first-seen order.
        summary[field] = separator.join(dict.fromkeys(words)) or "Not Found"
    return summary