from __future__ import annotations
import os
import re
import subprocess
import zipfile
from typing import Dict
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ===============================
# Load Model & Tokenizer
# ===============================
MODEL_ID = "sravya-abburi/ResumeParserBERT"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge sub-word pieces into whole entity spans
    device=0 if torch.cuda.is_available() else -1  # GPU 0 if available, else CPU
)

# ===============================
# Text Extraction
# ===============================
def extract_text(file_path: str) -> str:
    """Extract raw text from PDF or DOCX."""
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode("utf-8", errors="ignore")

        elif lower_name.endswith(".docx"):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_bytes = docx_xml.read()
                    xml_text = xml_bytes.decode("utf-8", errors="ignore")
                    xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
                    text = re.sub(r"<[^>]+>", " ", xml_text)
                    return re.sub(r"\s+", " ", text)
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Parse resume text into structured fields using BERT NER."""
    # NOTE: BERT-style encoders typically cap inputs at ~512 tokens, so very long
    # resumes may be truncated. See the chunking sketch after this function.
    entities = ner_pipeline(text)

    name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], []

    for ent in entities:
        # With aggregation_strategy="simple" each entity carries an "entity_group"
        # label and its merged surface form in "word".
        label = ent["entity_group"].upper()
        word = ent["word"].strip()

        if label == "NAME" and word not in name_tokens:
            name_tokens.append(word)
        elif label == "SKILL" and word not in skill_tokens:
            skill_tokens.append(word)
        elif label == "EDUCATION" and word not in edu_tokens:
            edu_tokens.append(word)
        elif label == "EXPERIENCE" and word not in exp_tokens:
            exp_tokens.append(word)

    return {
        "name": " ".join(name_tokens),
        "skills": ", ".join(skill_tokens),
        "education": ", ".join(edu_tokens),
        "experience": ", ".join(exp_tokens)
    }
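
# ===============================
# Optional: Long-Resume Chunking (sketch)
# ===============================
# A minimal sketch, not part of the original parser: it splits the text into
# overlapping word windows so each window stays under the encoder's token
# limit, runs parse_with_bert on each window, and merges the results.
# The window/overlap sizes below are illustrative guesses, not tuned values.
def parse_long_text_with_bert(text: str, window_words: int = 250, overlap: int = 50) -> Dict[str, str]:
    words = text.split()
    merged = {"name": "", "skills": "", "education": "", "experience": ""}
    seen = {key: set() for key in ("skills", "education", "experience")}

    step = max(window_words - overlap, 1)
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + window_words])
        partial = parse_with_bert(chunk)

        # Keep the first name found; union the comma-joined fields without duplicates.
        if partial["name"] and not merged["name"]:
            merged["name"] = partial["name"]
        for key in ("skills", "education", "experience"):
            for item in (piece.strip() for piece in partial[key].split(",")):
                if item and item not in seen[key]:
                    seen[key].add(item)
                    merged[key] = (merged[key] + ", " + item) if merged[key] else item
    return merged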

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main function for resume parsing."""
    text = extract_text(file_path)
    if not text:
        return {"name": "", "skills": "", "education": "", "experience": ""}

    ents = parse_with_bert(text)

    # Fallback: use filename for name if model doesn't find one
    if not ents["name"]:
        base = os.path.basename(filename)
        base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
        ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip()

    return ents
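
# ===============================
# Example Usage (illustrative)
# ===============================
# A minimal usage sketch; "resume.pdf" is a placeholder path, not a file that
# ships with this project.
if __name__ == "__main__":
    sample_path = "resume.pdf"  # hypothetical input file
    parsed = parse_resume(sample_path, os.path.basename(sample_path))
    for field, value in parsed.items():
        print(f"{field}: {value}")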