# NOTE: lines of Hugging Face Space page chrome (status text, file size,
# commit hashes, line-number gutter) were captured by the scrape here; they
# were not valid Python and have been reduced to this comment.
from __future__ import annotations
import os
import re
import subprocess
import zipfile
import json
import torch
from typing import List
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
# 4-bit NF4 quantization with double quantization, so the model fits in
# limited GPU memory. Compute dtype is bfloat16 to match torch_dtype below
# (the original mixed float16 compute with a bfloat16 load dtype, forcing
# needless casts between two half-precision formats).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# --- UPDATED: Using Deepseek-Coder-V2-Lite-Instruct for better performance ---
# NOTE(review): the canonical repo id is "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
# (different casing); the hub usually resolves this, but verify before deploying.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/Deepseek-Coder-V2-Lite-Instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct",
    quantization_config=bnb_config,
    device_map="auto",            # spread layers across available devices
    torch_dtype=torch.bfloat16,
    trust_remote_code=True        # DeepSeek-V2 ships custom modeling code
)
# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    Args:
        file_path: Path to the resume file.

    Returns:
        The extracted text. Returns "" for a missing/empty path, an
        unsupported extension, or any extraction failure (best-effort
        by design — callers treat "" as "no text").
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            # pdftotext: -layout preserves visual columns; '-' sends output
            # to stdout. check=False: a failed run simply yields empty text.
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')
        elif lower_name.endswith('.docx'):
            # A .docx is a zip archive; the document body is word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
            xml_text = xml_bytes.decode('utf-8', errors='ignore')
            # Turn each paragraph opening tag into a newline, then strip all
            # remaining tags.
            xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
            text = re.sub(r'<[^>]+>', ' ', xml_text)
            # BUGFIX: collapse only horizontal whitespace per line, KEEPING
            # the paragraph newlines. The original collapsed r'\s+' to a
            # single space, which destroyed the newlines it had just added
            # (and extract_name depends on line structure via splitlines()).
            cleaned = (re.sub(r'[ \t]+', ' ', ln).strip() for ln in text.splitlines())
            return '\n'.join(ln for ln in cleaned if ln)
        else:
            return ""
    except Exception:
        # Best-effort extraction: any failure (corrupt zip, missing XML
        # part, bad path) yields an empty string rather than a crash.
        return ""
# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Extract the candidate's name from resume text, falling back to the filename.

    Args:
        text: Extracted resume text (may be empty).
        filename: Path to the resume file, used as a fallback source.

    Returns:
        The first plausible name-like line near the top of the text, or a
        title-cased name derived from the cleaned file name.
    """
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        # Names appear in the header, so only scan the first few lines.
        for line in lines[:10]:
            # Skip generic headings like "Resume" / "Curriculum Vitae".
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # A plausible name: 2-4 words, each starting with an uppercase
            # letter (including common accented Latin initials).
            if 1 < len(words) <= 4:
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    # Fallback: derive a name from the file name.
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    # BUGFIX: dropping "cv"/"resume" tokens can leave doubled internal
    # spaces (e.g. "john  doe"); collapse them before title-casing.
    base = re.sub(r'\s+', ' ', base)
    return base.title().strip()
# ===============================
# Janus-Pro Parsing
# ===============================
def parse_with_deepseek(text: str) -> dict:
    """Use DeepSeek-Coder-V2-Lite-Instruct to extract resume details as JSON.

    Args:
        text: Raw resume text (output of ``extract_text``).

    Returns:
        dict with keys name/email/phone/skills/education/experience.
        Empty-string / empty-list defaults are returned when the model
        output contains no parseable JSON object.
    """
    prompt = f"""
Extract the following information from the resume text provided below. Your response should be a valid JSON object.
Information to extract:
- Full Name: The candidate's full name.
- Email: The candidate's email address.
- Phone: The candidate's phone number.
- Skills: A list of technical and soft skills.
- Education: A list of academic degrees and institutions.
- Experience: A list of previous jobs, including company, title, and dates.
Resume Text:
{text}
Return only valid JSON in the following format:
{{
"name": "Full Name",
"email": "[email protected]",
"phone": "+961-xxx-xxx",
"skills": ["Skill1", "Skill2", "Skill3"],
"education": ["Degree1 - Institution1", "Degree2 - Institution2"],
"experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
}}
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Greedy decode; 512 new tokens is ample for the JSON schema above.
    outputs = model.generate(**inputs, max_new_tokens=512)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Grab the widest {...} span — the model may wrap the JSON in prose or
    # code fences. (re/json are already imported at module level; the
    # original re-imported them here redundantly.)
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        # BUGFIX: was a bare `except:` that swallowed everything including
        # KeyboardInterrupt; only a JSON parse failure should fall through.
        except json.JSONDecodeError:
            pass
    return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}
# (trailing scrape artifact removed)