Spaces:
Paused
Paused
File size: 4,485 Bytes
af02e64 8d99522 af02e64 722e882 b336194 8d99522 722e882 af02e64 8d99522 d4b2339 8d99522 722e882 0e43f07 d4b2339 af02e64 8d99522 d4b2339 8d99522 d4b2339 af02e64 8d99522 af02e64 d4b2339 af02e64 8d99522 af02e64 d4b2339 8d99522 d4b2339 8d99522 4f1e97d d4b2339 4f1e97d d4b2339 4f1e97d d4b2339 8d99522 1ead253 4f1e97d 1ead253 4f1e97d 1ead253 4f1e97d 1ead253 4f1e97d 1ead253 4f1e97d 8d99522 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from __future__ import annotations
import os, re, subprocess, zipfile, json, torch
from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Pin every math/BLAS backend to a single thread; Hugging Face Spaces
# containers misbehave when these libraries spawn their own thread pools.
_THREAD_ENV_VARS = (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
)
for _var in _THREAD_ENV_VARS:
    os.environ[_var] = "1"
# -------------------------------------------------------------------
# Load Zephyr-7B with 4-bit NF4 quantization so it fits in Space RAM.
# -------------------------------------------------------------------
_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX file, best-effort.

    PDFs are converted with the ``pdftotext`` command-line tool; DOCX
    files are opened as zip archives and ``word/document.xml`` is
    stripped of markup. Returns an empty string for missing files,
    unsupported extensions, or any extraction failure.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower = file_path.lower()
    try:
        if lower.endswith('.pdf'):
            # '-' sends output to stdout; check=False because pdftotext can
            # exit non-zero yet still produce usable partial text.
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')
        elif lower.endswith('.docx'):
            # A .docx is a zip archive; the document body lives in
            # word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
            xml_text = xml_bytes.decode('utf-8', errors='ignore')
            # Paragraph openers become newlines so words from adjacent
            # paragraphs don't run together; remaining tags become spaces.
            xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
            text = re.sub(r'<[^>]+>', ' ', xml_text)
            return re.sub(r'\s+', ' ', text)
    except (OSError, zipfile.BadZipFile, KeyError, UnicodeError):
        # Narrowed from a blanket `except Exception`: OSError covers a
        # missing pdftotext binary, BadZipFile/KeyError cover corrupt or
        # non-standard DOCX archives. Still best-effort: fall through to
        # the empty-string fallback instead of crashing the caller.
        pass
    return ""
# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Best-effort extraction of the candidate's name.

    Scans the first non-empty lines of the resume text for a short,
    capitalized line that isn't a "Resume"/"CV" heading; failing that,
    derives a name from the file name.
    """
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            # Skip document headings such as "Resume" / "Curriculum Vitae".
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # Plausible name: 2-4 words, each starting with an uppercase
            # letter (including common accented Latin ranges).
            if 1 < len(words) <= 4 and all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                return line
    # Fallback: clean up the file name.
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    # Fix: removing "cv"/"resume" mid-name left interior double spaces
    # (e.g. "john_cv_doe" -> "john  doe"), which .strip() can't repair.
    base = re.sub(r'\s+', ' ', base)
    return base.title().strip()
# ===============================
# Zephyr Parsing
# ===============================
def parse_with_zephyr(text: str) -> dict:
    """Use Zephyr-7B to extract resume details in JSON format.

    Returns a dict with "name", "skills", "education" and "experience"
    keys. On any generation or parsing failure, returns those keys with
    empty values so callers never need to special-case errors.
    """
    prompt = f"""
You are an information extraction system.
Extract the following fields from the resume text.
⚠️ DO NOT return placeholders like "Full Name" or "Skill1".
Return ONLY actual values from the resume. If a field is missing, leave it as an empty string or empty list.
Fields to extract:
- name
- skills (list of skills)
- education (list of degrees + institutions)
- experience (list of jobs with company, title, dates)
Resume:
{text}
Return ONLY a valid JSON in this format:
{{
"name": "<actual name or empty string>",
"skills": ["<actual skill>", "<actual skill>"],
"education": ["<Degree - Institution>", "<Degree - Institution>"],
"experience": ["<Job - Company (Dates)>", "<Job - Company (Dates)>"]
}}
"""
    empty = {"name": "", "skills": [], "education": [], "experience": []}
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # do_sample=False is pure greedy decoding; the previous temperature=0
    # was ignored (and warned about) in that mode, so it is dropped.
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # Bug fix: decode ONLY the newly generated tokens. model.generate
    # returns prompt + completion, and the prompt itself contains a literal
    # JSON template, so decoding the full sequence made the greedy
    # `\{.*\}` search span from the template to the model output and
    # produce unparseable JSON almost every time.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return empty
        # Guard against the model emitting valid JSON that isn't an object.
        if isinstance(data, dict):
            return data
    return empty
# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into structured fields, with a name fallback."""
    resume_text = extract_text(file_path)
    parsed = parse_with_zephyr(resume_text)
    # The LLM sometimes omits the name; fall back to the heuristic
    # extraction from the text / file name.
    parsed["name"] = parsed.get("name") or extract_name(resume_text, filename)
    return parsed
|