File size: 2,581 Bytes
efffc2e
6d286f1
947d727
b336194
6d286f1
 
 
947d727
864c2ae
 
6d286f1
a511250
6d286f1
 
 
af02e64
6d286f1
 
 
947d727
6d286f1
 
947d727
 
 
 
 
 
6d286f1
 
947d727
6d286f1
864c2ae
 
 
 
 
6d286f1
864c2ae
 
6d286f1
 
 
a511250
6d286f1
864c2ae
6d286f1
efffc2e
d4b2339
864c2ae
efffc2e
 
6d286f1
864c2ae
 
 
 
 
 
 
 
efffc2e
 
a511250
 
 
 
efffc2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import zipfile, re, os
from PyPDF2 import PdfReader  # Lightweight & already in Spaces

# ===============================
# Load Model & Tokenizer
# ===============================
# NOTE: these run at import time and download the model on first use —
# requires network access / a warmed HF cache.
MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Swap to Kiet model if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# aggregation_strategy="simple" merges word-piece tokens into whole-word
# entities, so downstream code sees one dict per entity span.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ===============================
# Extract Text (PDF & DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX file.

    Args:
        file_path: Path to the document. Format is selected by the
            (case-insensitive) file extension.

    Returns:
        The extracted text. Unsupported extensions yield "".
    """
    file_path_lower = file_path.lower()

    # PDF reading using PyPDF2 (no fitz, no extra installs needed)
    if file_path_lower.endswith(".pdf"):
        text = ""
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                # extract_text() can return None/"" for image-only pages
                if page_text:
                    text += page_text + "\n"
        return text

    # DOCX reading: a .docx is a zip archive; pull the main document XML
    # and strip the markup.
    elif file_path_lower.endswith(".docx"):
        with zipfile.ZipFile(file_path) as zf:
            with zf.open("word/document.xml") as docx_xml:
                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
                # BUG FIX: the previous pattern r"<w:p[^>]*>" also matched
                # non-paragraph tags such as <w:pPr>, <w:pStyle>, <w:pgSz>,
                # inserting spurious newlines. Require "/", whitespace, or an
                # immediate ">" after "w:p" so only paragraph tags match.
                xml_text = re.sub(r"<w:p(?:[\s/][^>]*)?>", "\n", xml_text, flags=re.I)
                # Replace every remaining tag with a space so adjacent runs
                # of text don't fuse together.
                return re.sub(r"<[^>]+>", " ", xml_text)

    return ""

# ===============================
# Parse Resume
# ===============================
def parse_resume(file_path: str, filename: str = None) -> dict:
    """Parse a resume file and return structured fields.

    Args:
        file_path: Path to the resume (PDF or DOCX).
        filename: Original upload name; accepted for API compatibility,
            not used in parsing.

    Returns:
        Dict with "name", "skills", "education" and "experience" keys,
        each a de-duplicated, order-preserving string of extracted entities.
    """
    text = extract_text(file_path)

    entities = ner_pipeline(text)

    # Map each NER label onto the output field it feeds.
    label_to_field = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
    }
    buckets = {"name": [], "skills": [], "education": [], "experience": []}

    for entity in entities:
        field = label_to_field.get(entity["entity_group"].upper())
        if field is not None:
            buckets[field].append(entity["word"].strip())

    # dict.fromkeys() de-duplicates while keeping first-seen order;
    # the name parts join with spaces, everything else with commas.
    return {
        field: (" " if field == "name" else ", ").join(dict.fromkeys(words))
        for field, words in buckets.items()
    }