Spaces:
Paused
Paused
File size: 3,801 Bytes
af02e64 d4b2339 af02e64 1de8504 722e882 b336194 722e882 af02e64 0e43f07 d4b2339 0e43f07 722e882 0e43f07 d4b2339 af02e64 d4b2339 af02e64 d4b2339 af02e64 d4b2339 af02e64 d4b2339 af02e64 d4b2339 af02e64 d4b2339 0e43f07 d4b2339 0e43f07 d4b2339 0e43f07 d4b2339 0e43f07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
from __future__ import annotations

import os

# Cap the native math/threading backends at one thread each. These variables
# are exported BEFORE torch and transformers are imported so that the limits
# are already present in the environment when those libraries load; setting
# them after the import may be too late for them to take effect.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"

# The imports below intentionally follow the environment setup above.
import json  # noqa: E402
import re  # noqa: E402
import subprocess  # noqa: E402
import zipfile  # noqa: E402
from typing import List  # noqa: E402

import torch  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # noqa: E402
# Hub identifier of the checkpoint; shared by the tokenizer and the model so
# the two can never drift apart.
_MODEL_ID = "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct"

# 4-bit NF4 quantization settings (double quantization enabled, float16 as
# the configured compute dtype).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer and model are loaded once at import time; both opt in to the
# checkpoint's custom code via trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)

# NOTE(review): the quantization compute dtype above is float16 while
# torch_dtype here is bfloat16 - confirm this mix is intentional.
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)
# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDF files are converted by running the external ``pdftotext`` tool with
    ``-layout``. DOCX files are read directly from ``word/document.xml``
    inside the archive; each paragraph (and explicit line break) becomes its
    own line so that line-based consumers can find headings and names.

    Args:
        file_path: Path to the resume file. The type is decided by the
            (case-insensitive) file extension.

    Returns:
        The extracted text, or ``""`` when the path is empty, is not an
        existing file, has an unsupported extension, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # check=False: a non-zero exit still yields whatever text was
            # written to stdout; stderr is captured only to keep it quiet.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")

        if lower_name.endswith(".docx"):
            from html import unescape

            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")

            # Break lines at paragraph ends and explicit <w:br/> elements.
            # Matching the closing tag avoids also hitting <w:pPr>, <w:pStyle>
            # and other elements whose names merely start with "w:p".
            xml_text = re.sub(
                r"</w:p>|<w:br(?:\s[^>]*)?/?>", "\n", xml_text, flags=re.I
            )
            # Strip the remaining markup first, then decode entities, so an
            # escaped "&lt;" in the text is never mistaken for a tag.
            text = unescape(re.sub(r"<[^>]+>", " ", xml_text))

            # Collapse horizontal whitespace only; newlines must survive or
            # the paragraph structure restored above would be lost again.
            cleaned = (
                re.sub(r"[^\S\n]+", " ", ln).strip() for ln in text.split("\n")
            )
            return "\n".join(ln for ln in cleaned if ln)

        return ""
    except Exception:
        # Deliberate best-effort: a missing pdftotext binary, a corrupt
        # archive, or a DOCX without document.xml all yield empty text
        # rather than aborting the caller.
        return ""
# ===============================
# Name Extraction (Fallback)
# ===============================
# One token of a personal name: a leading capital letter followed by word
# characters, apostrophes, periods or hyphens, with an optional trailing
# comma (e.g. "O'Brien", "J.", "Jean-Luc", "Doe,").
_NAME_TOKEN_RE = re.compile(r"[A-ZÀ-ÖØ-Þ][\w'’.\-]*,?")

# Lines that start with a document title rather than a person's name.
_TITLE_LINE_RE = re.compile(r"(?i)resume|curriculum vitae")


def extract_name(text: str, filename: str) -> str:
    """Guess the candidate's name from resume text, falling back to the filename.

    The first ten non-blank lines of ``text`` are scanned for a line of two
    to four tokens that all look like capitalised name parts. Lines starting
    with "resume" or "curriculum vitae" are skipped. If no line qualifies,
    the name is derived from ``filename`` by dropping the extension, turning
    ``.``, ``_`` and ``-`` into spaces, removing the words "cv" / "resume",
    and title-casing the remainder.

    Args:
        text: Extracted resume text; may be empty.
        filename: Original file name or path of the resume.

    Returns:
        The matching line from ``text`` (stripped), otherwise the cleaned-up
        filename, or ``""`` when neither yields anything.
    """
    if text:
        candidates = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in candidates[:10]:
            if _TITLE_LINE_RE.match(line):
                continue
            words = line.split()
            # fullmatch: every token must be name-like in its entirety, so
            # lines such as "Email: Jane@x.com" are not mistaken for a name.
            if 1 < len(words) <= 4 and all(
                _NAME_TOKEN_RE.fullmatch(w) for w in words
            ):
                return line

    if not filename:
        return ""

    base = os.path.basename(filename)
    base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
    base = re.sub(r"[\._-]+", " ", base)
    base = re.sub(r"(?i)\b(cv|resume)\b", "", base)
    # Re-join on single spaces so a keyword removed from the middle of the
    # name does not leave a double space behind.
    return " ".join(base.split()).title()
# ===============================
# Deepseek-Coder Parsing
# ===============================
def parse_with_deepseek(text: str) -> dict:
"""Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
# --- UPDATED: Refined prompt for better JSON extraction ---
prompt = f"""
Extract the following information from the resume text provided below. Your response should be a valid JSON object.
**Information to extract:**
- **Full Name:** The candidate's full name.
- **Email:** The candidate's email address.
- **Phone:** The candidate's phone number.
- **Skills:** A list of technical and soft skills.
- **Education:** A list of academic degrees and institutions.
- **Experience:** A list of previous jobs, including company, title, and dates.
**Resume Text:** |