from __future__ import annotations
import os
import re
import subprocess
import zipfile
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

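# Cap BLAS/OpenMP thread pools at a single thread to keep CPU usage predictable.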
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"

MODEL_ID = "tiiuae/falcon-7b-instruct"


print(f"Loading {MODEL_ID}... (This may take some time on first run)")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # full precision; safest default for CPU-only runs
    device_map="auto"
)


# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX resumes."""
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')

        elif lower_name.endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
                    xml_text = xml_bytes.decode('utf-8', errors='ignore')
                    xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                    text = re.sub(r'<[^>]+>', ' ', xml_text)
                    # Collapse runs of spaces/tabs but keep the paragraph
                    # newlines inserted above so extract_name can split lines.
                    return re.sub(r'[ \t]+', ' ', text).strip()
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Extract candidate's name from resume text or filename."""
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            if 1 < len(words) <= 4:
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    return base.title().strip()

# ===============================
# LLM-Based Parsing
# ===============================
def parse_with_deepseek(text: str) -> dict:
    """Use DeepSeek Janus-Pro-7B to extract resume details in JSON format."""
    prompt = f"""
    Extract the following information from the resume text below:

    - Full Name
    - Skills (comma separated)
    - Education (degrees + institutions)
    - Experience (job titles + companies)

    Return only valid JSON in the following structure:
    {{
      "name": "Full Name",
      "skills": "Skill1, Skill2, Skill3",
      "education": "Degree1 - Institution1; Degree2 - Institution2",
      "experience": "Job1 - Company1; Job2 - Company2"
    }}

    Resume:
    {text}
    """

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    # Decode only the newly generated tokens so the JSON template embedded in
    # the prompt is not picked up by the extraction regex below.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)

    # Extract the first JSON object from the model output
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    return {"name": "", "skills": "", "education": "", "experience": ""}

# ===============================
# Fallback Heading-based Parsing
# ===============================
def fallback_parse(text: str) -> dict:
    """Simple heading-based parsing as backup."""
    skills = re.findall(r"Skills\s*[:\-]?\s*(.*)", text, re.I)
    education = re.findall(r"Education\s*[:\-]?\s*(.*)", text, re.I)
    experience = re.findall(r"(Experience|Work History)\s*[:\-]?\s*(.*)", text, re.I)
    return {
        "skills": ", ".join(skills),
        "education": ", ".join(education),
        "experience": ", ".join([exp[1] for exp in experience])
    }

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main resume parsing function."""
    text = extract_text(file_path)
    name = extract_name(text, filename)

    # Try LLM-based parsing first
    ents = parse_with_deepseek(text)

    # If the LLM misses fields, fall back to heading-based parsing
    if not ents.get("skills") or not ents.get("education"):
        fb = fallback_parse(text)
        ents["skills"] = ents.get("skills") or fb["skills"]
        ents["education"] = ents.get("education") or fb["education"]
        ents["experience"] = ents.get("experience") or fb["experience"]

    return {
        "name": ents.get("name") or name,
        "skills": ents.get("skills", ""),
        "education": ents.get("education", ""),
        "experience": ents.get("experience", "")
    }