from __future__ import annotations
import os, re, subprocess, zipfile, json, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Limit threads to avoid Hugging Face Spaces threading issues
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1"
})

# Load Zephyr in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,  # keep non-quantized layers in the same dtype as the 4-bit compute dtype
    trust_remote_code=True
)
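
# Note: 4-bit loading through bitsandbytes generally assumes a CUDA-capable GPU.
# On CPU-only hardware you would typically drop quantization_config and load the
# model in a regular floating-point dtype instead.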

# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF (via pdftotext) or a DOCX file."""
    if not file_path or not os.path.isfile(file_path):
        return ""
    try:
        if file_path.lower().endswith('.pdf'):
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')
        elif file_path.lower().endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
                    xml_text = xml_bytes.decode('utf-8', errors='ignore')
                    xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                    text = re.sub(r'<[^>]+>', ' ', xml_text)
                    return re.sub(r'\s+', ' ', text)
    except Exception:
        pass
    return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Heuristically guess the candidate's name from the first lines of the text,
    falling back to a cleaned-up version of the filename."""
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if not re.match(r'(?i)resume|curriculum vitae', line):
                words = line.split()
                if 1 < len(words) <= 4 and all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    return base.title().strip()

# ===============================
# Zephyr Parsing
# ===============================
def parse_with_zephyr(text: str) -> dict:
    """Use Zephyr-7B to extract resume details in JSON format."""
    
    prompt = f"""
You are an information extraction system.

Extract the following fields from the resume text.  
⚠️ DO NOT return placeholders like "Full Name" or "Skill1".  
Return ONLY actual values from the resume. If a field is missing, leave it as an empty string or empty list.

Fields to extract:
- name
- skills (list of skills)
- education (list of degrees + institutions)
- experience (list of jobs with company, title, dates)

Resume:
{text}

Return ONLY a valid JSON in this format:
{{
  "name": "<actual name or empty string>",
  "skills": ["<actual skill>", "<actual skill>"],
  "education": ["<Degree - Institution>", "<Degree - Institution>"],
  "experience": ["<Job - Company (Dates)>", "<Job - Company (Dates)>"]
}}
"""

    # Truncate very long resumes; 4096 tokens is an assumed cap, adjust to your context budget
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # Decode only the newly generated tokens so the prompt (which also contains
    # braces) does not pollute the JSON extraction below
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    # Grab the first JSON object in the model's reply
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    
    return {"name": "", "skills": [], "education": [], "experience": []}

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Extract the raw text, parse it with Zephyr, and fall back to the
    heuristic name if the model returns none."""
    text = extract_text(file_path)
    name_fallback = extract_name(text, filename)
    data = parse_with_zephyr(text)
    if not data.get("name"):
        data["name"] = name_fallback
    return data
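
# ===============================
# Example usage (illustrative)
# ===============================
# A minimal sketch of how the parser might be invoked; the default path below is
# hypothetical and only stands in for a real PDF or DOCX resume.
if __name__ == "__main__":
    import sys
    path = sys.argv[1] if len(sys.argv) > 1 else "sample_resume.pdf"  # hypothetical sample file
    parsed = parse_resume(path, os.path.basename(path))
    print(json.dumps(parsed, indent=2, ensure_ascii=False))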