# NOTE(review): removed Hugging Face file-viewer scrape residue (status lines,
# commit hashes, line-number gutter) that made this file invalid Python.
from __future__ import annotations
import os, re, subprocess, zipfile, json, torch
from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Cap the native BLAS/OpenMP thread pools at a single thread each;
# oversubscribed native threading causes issues on Hugging Face Spaces.
os.environ.update(dict.fromkeys(
    (
        "OMP_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "MKL_NUM_THREADS",
        "NUMEXPR_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
    ),
    "1",
))
# Load Zephyr in 4-bit
# NF4 double-quantized weights via bitsandbytes; fp16 used for the
# dequantized compute path to keep the 7B model within Space GPU memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", trust_remote_code=True)
# NOTE(review): torch_dtype=bfloat16 here vs bnb_4bit_compute_dtype=float16
# above looks inconsistent — confirm which dtype is intended; mixed dtypes
# can cost extra casts on some GPUs.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    quantization_config=bnb_config,
    device_map="auto",          # let accelerate place layers on available devices
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDFs are rendered through the external ``pdftotext`` binary in layout
    mode; DOCX files are opened as a zip archive and stripped of XML tags.
    Paragraph boundaries are preserved as newlines so line-based consumers
    (e.g. ``extract_name``) still see document structure.

    Returns "" for a missing/empty path, an unsupported extension, or any
    extraction failure (best-effort by design — never raises).
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower = file_path.lower()
    try:
        if lower.endswith('.pdf'):
            # list-form argv (shell=False); check=False because pdftotext can
            # exit non-zero on warnings yet still emit usable text on stdout.
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode('utf-8', errors='ignore')
        if lower.endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                xml_text = zf.read('word/document.xml').decode('utf-8', errors='ignore')
            # Turn paragraph openings into newlines, then drop all tags.
            xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
            text = re.sub(r'<[^>]+>', ' ', xml_text)
            # BUG FIX: the old code collapsed ALL whitespace (including the
            # newlines just inserted) into single spaces, flattening the DOCX
            # into one line and defeating extract_name's per-line scan.
            # Collapse only spaces/tabs, then squeeze blank lines.
            text = re.sub(r'[ \t]+', ' ', text)
            return re.sub(r'\n\s*\n+', '\n', text).strip()
    except (OSError, zipfile.BadZipFile, KeyError, UnicodeError):
        # Best-effort: any extraction failure falls through to "".
        pass
    return ""
# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
if text:
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
for line in lines[:10]:
if not re.match(r'(?i)resume|curriculum vitae', line):
words = line.split()
if 1 < len(words) <= 4 and all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
return line
base = os.path.basename(filename)
base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
base = re.sub(r'[\._-]+', ' ', base)
base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
return base.title().strip()
# ===============================
# Zephyr Parsing
# ===============================
def parse_with_zephyr(text: str) -> dict:
    """Ask the Zephyr model to structure the resume *text* as JSON.

    Returns a dict with keys name/email/phone/skills/education/experience;
    on any generation or JSON-parsing failure, returns the same keys with
    empty values so callers always get a uniform shape.
    """
    prompt = f"""
Extract the following information from the resume text provided below.
Return ONLY a valid JSON object (no extra commentary).
Information to extract:
- Full Name
- Email
- Phone
- Skills (list)
- Education (list of degrees + institutions)
- Experience (list of jobs with company, title, and dates)
Resume:
{text}
JSON format:
{{
"name": "Full Name",
"email": "[email protected]",
"phone": "+961-xxx-xxx",
"skills": ["Skill1", "Skill2", "Skill3"],
"education": ["Degree1 - Institution1", "Degree2 - Institution2"],
"experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
}}
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # BUG FIX: greedy decoding is requested explicitly. The old call passed
    # temperature=0.0, which is ignored without sampling and raises on newer
    # transformers versions when sampling is enabled.
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    # BUG FIX: decode only the newly generated tokens. The old code decoded
    # the full sequence (prompt included); the greedy r"\{.*\}" search then
    # spanned from the example JSON inside the prompt to the last brace of
    # the output — never valid JSON, so parsing always fell through.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            # Model emitted malformed JSON; fall through to the empty shape.
            pass
    return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}
# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """End-to-end resume parsing.

    Extracts raw text from the file, runs the LLM parser, and backfills
    the "name" field heuristically when the model leaves it blank.
    """
    raw_text = extract_text(file_path)
    parsed = parse_with_zephyr(raw_text)
    if not parsed.get("name"):
        # Model missed the name — fall back to text/filename heuristics.
        parsed["name"] = extract_name(raw_text, filename)
    return parsed