File size: 2,584 Bytes
682910e
 
 
 
 
efffc2e
b336194
682910e
 
 
 
 
864c2ae
 
682910e
6d286f1
a511250
682910e
 
 
af02e64
682910e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
864c2ae
 
682910e
6d286f1
682910e
a511250
682910e
864c2ae
efffc2e
d4b2339
864c2ae
efffc2e
 
6d286f1
682910e
864c2ae
 
 
 
 
 
 
 
efffc2e
 
a511250
 
 
 
efffc2e
682910e
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import html
import json
import os
import re
import subprocess
import zipfile

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# --------------------
# Load Model
# --------------------
# Checkpoint identifier handed to both from_pretrained() calls below.
# NOTE(review): presumably resolved from the Hugging Face Hub or a local
# cache -- confirm the deployment has access to it.
MODEL_NAME = "sravya-abburi/ResumeParserBERT"

# Tokenizer and model are created at import time, so importing this module
# performs the model load as a side effect.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Shared NER pipeline used by parse_resume(), which reads the "entity_group"
# and "word" keys of every result it returns.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# --------------------
# Extract Text
# --------------------
# A real paragraph start is "<w:p>", "<w:p attr=...>" or "<w:p/>". Requiring
# whitespace, "/" or ">" right after "w:p" keeps "<w:pPr>", "<w:pStyle>" and
# similar property tags from being mistaken for paragraph boundaries.
_DOCX_PARAGRAPH_RE = re.compile(r"<w:p(?:\s[^>]*)?/?>")
_XML_TAG_RE = re.compile(r"<[^>]+>")


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    The file type is chosen from the (case-insensitive) file extension.

    * ``.pdf``  -- runs the external ``pdftotext -layout`` command and returns
      whatever it wrote to stdout, decoded as UTF-8 (undecodable bytes are
      dropped). The exit status is not checked.
    * ``.docx`` -- reads ``word/document.xml`` from the archive, turns each
      paragraph start into a newline, replaces every remaining tag with a
      space and unescapes XML character entities (``&amp;`` -> ``&``).

    Args:
        file_path: Path to the resume file.

    Returns:
        The extracted text, or ``""`` when the extension is unsupported or
        the extraction fails for any reason (best-effort behaviour).
    """
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        try:
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except Exception:
            # Best-effort: a missing binary or bad path yields no text, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return ""
        return result.stdout.decode("utf-8", errors="ignore")

    if lowered.endswith(".docx"):
        try:
            with zipfile.ZipFile(file_path) as zf:
                raw_xml = zf.read("word/document.xml")
        except Exception:
            # Best-effort: unreadable, corrupt or non-DOCX archives yield no
            # text, but KeyboardInterrupt/SystemExit are no longer swallowed.
            return ""
        xml_text = raw_xml.decode("utf-8", errors="ignore")
        xml_text = _DOCX_PARAGRAPH_RE.sub("\n", xml_text)
        stripped = _XML_TAG_RE.sub(" ", xml_text)
        # Unescape only after the tags are gone, so an escaped "&lt;b&gt;"
        # in the document text is not mistaken for markup and removed.
        return html.unescape(stripped)

    return ""

# --------------------
# Parse Resume
# --------------------
# Maps an upper-cased NER label to the output field that collects it.
_LABEL_TO_FIELD = {
    "NAME": "name",
    "SKILL": "skills",
    "EDUCATION": "education",
    "DEGREE": "education",
    "EXPERIENCE": "experience",
    "JOB": "experience",
    "ROLE": "experience",
}


def parse_resume(file_path: str, filename: str = None) -> dict:
    """Extract name, skills, education and experience from a resume file.

    The text returned by ``extract_text`` is tagged with ``ner_pipeline``;
    each entity's word is routed to an output field by its ``entity_group``
    label (compared case-insensitively). Entities with any other label are
    ignored.

    Args:
        file_path: Path to the resume file.
        filename: Accepted for caller compatibility; not used here.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Each value is a string of the de-duplicated entity
        words in first-seen order: the name is joined with spaces, the other
        fields with ``", "``. Fields without entities are empty strings.
    """
    collected = {"name": [], "skills": [], "education": [], "experience": []}

    text = extract_text(file_path)
    # extract_text returns "" on failure; with no text there is nothing to
    # tag, so skip the model call entirely.
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        field = _LABEL_TO_FIELD.get(ent["entity_group"].upper())
        word = ent["word"].strip()
        # Skip unknown labels and blank words (which would otherwise show up
        # as empty items such as ", , " in the joined output).
        if field is not None and word:
            collected[field].append(word)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return {
        "name": " ".join(dict.fromkeys(collected["name"])),
        "skills": ", ".join(dict.fromkeys(collected["skills"])),
        "education": ", ".join(dict.fromkeys(collected["education"])),
        "experience": ", ".join(dict.fromkeys(collected["experience"])),
    }

# --------------------
# Example
# --------------------
if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the resume to parse;
    # without it the original demo path "resume.pdf" is used.
    resume_path = sys.argv[1] if len(sys.argv) > 1 else "resume.pdf"
    result = parse_resume(resume_path)
    print(json.dumps(result, indent=2))