File size: 4,300 Bytes
af02e64
8d99522
af02e64
722e882
b336194
8d99522
 
 
 
 
 
 
 
 
 
722e882
 
 
 
 
 
af02e64
8d99522
d4b2339
8d99522
722e882
0e43f07
 
 
d4b2339
 
 
 
 
af02e64
 
 
 
8d99522
d4b2339
 
 
 
 
 
 
8d99522
d4b2339
 
 
 
 
 
 
af02e64
8d99522
 
af02e64
d4b2339
 
 
af02e64
 
 
 
8d99522
 
 
af02e64
 
 
 
 
d4b2339
 
 
8d99522
d4b2339
8d99522
d4b2339
8d99522
 
d4b2339
1ead253
8d99522
 
 
 
 
 
d4b2339
8d99522
1ead253
 
8d99522
1ead253
 
 
 
 
 
 
 
 
 
8d99522
1ead253
 
 
 
 
 
 
 
 
8d99522
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from __future__ import annotations
import os, re, subprocess, zipfile, json, torch
from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Cap every math-library thread pool at one thread to avoid
# oversubscription / threading issues on Hugging Face Spaces hosts.
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1"
})

# 4-bit NF4 quantization so Zephyr-7B fits in limited GPU memory;
# double quantization trims a bit more, matmul compute runs in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Module-level load: runs at import time and downloads weights on first use.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # NOTE(review): bfloat16 here vs float16 compute dtype above — confirm intended
    trust_remote_code=True
)

# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume file.

    PDFs are converted with the external ``pdftotext`` tool; DOCX files are
    read by unzipping ``word/document.xml`` and stripping the XML tags.
    A missing file, an unsupported extension, or any extraction failure
    yields "" — extraction is deliberately best-effort.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    try:
        if file_path.lower().endswith('.pdf'):
            # '-' sends output to stdout; check=False because pdftotext can
            # exit nonzero yet still emit usable partial text.
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')
        elif file_path.lower().endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_text = docx_xml.read().decode('utf-8', errors='ignore')
                    # Each <w:p ...> paragraph start becomes a newline so the
                    # returned text keeps line structure.
                    xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                    text = re.sub(r'<[^>]+>', ' ', xml_text)
                    # BUG FIX: the old r'\s+' -> ' ' collapse also destroyed
                    # the newlines just inserted, flattening the document to a
                    # single line and defeating line-based downstream
                    # heuristics (extract_name splits on lines). Collapse
                    # whitespace per line instead and drop empty lines.
                    lines = (re.sub(r'\s+', ' ', ln).strip() for ln in text.splitlines())
                    return '\n'.join(ln for ln in lines if ln)
    except (OSError, zipfile.BadZipFile, KeyError):
        # pdftotext binary missing, corrupt archive, or a docx without the
        # expected XML member: treat as "no text", same as before.
        return ""
    return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Best-effort candidate name from resume text, else from the filename.

    Scans the first 10 non-empty lines for a short line of 2-4 capitalized
    words that is not a "Resume"/"Curriculum Vitae" header. Falls back to a
    cleaned-up, title-cased version of the file name.
    """
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue  # skip document headers
            words = line.split()
            # 2-4 words, each starting with a (possibly accented) capital.
            if 1 < len(words) <= 4 and all(re.match(r'[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                return line
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    # BUG FIX: removing 'cv'/'resume' mid-name left double spaces that
    # .strip() cannot fix ("john_cv_smith" -> "John  Smith"); collapse runs
    # of whitespace before title-casing.
    base = re.sub(r'\s+', ' ', base)
    return base.title().strip()

# ===============================
# Zephyr Parsing
# ===============================
def parse_with_zephyr(text: str) -> dict:
    """Ask the Zephyr model to pull structured fields out of resume text.

    Returns a dict with keys name / email / phone / skills / education /
    experience; all-empty defaults when the model output is not valid JSON.
    """
    prompt = f"""
Extract the following information from the resume text provided below.
Return ONLY a valid JSON object (no extra commentary).

Information to extract:
- Full Name
- Email
- Phone
- Skills (list)
- Education (list of degrees + institutions)
- Experience (list of jobs with company, title, and dates)

Resume:
{text}

JSON format:
{{
  "name": "Full Name",
  "email": "[email protected]",
  "phone": "+961-xxx-xxx",
  "skills": ["Skill1", "Skill2", "Skill3"],
  "education": ["Degree1 - Institution1", "Degree2 - Institution2"],
  "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
}}
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Greedy decoding: temperature=0.0 with the default do_sample=False is
    # ignored (and emits a warning); request greedy decoding explicitly.
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    # BUG FIX: decode ONLY the newly generated tokens. Decoder-only models
    # echo the prompt in outputs[0], and the prompt itself contains a {...}
    # JSON template — so the greedy r"\{.*\}" search below used to span
    # from the template to the answer and json.loads always failed.
    gen_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(gen_tokens, skip_special_tokens=True)

    # Grab the outermost {...} span in case the model adds commentary.
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass  # fall through to the empty skeleton
    return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into structured fields, with a heuristic name fallback."""
    raw_text = extract_text(file_path)
    parsed = parse_with_zephyr(raw_text)
    if not parsed.get("name"):
        # The model came back without a name — fall back to the heuristic
        # extractor (first-lines scan, then filename cleanup).
        parsed["name"] = extract_name(raw_text, filename)
    return parsed