from __future__ import annotations
import os

# Limit BLAS/OpenMP thread pools. These variables must be set before torch
# and other numerical libraries are imported, otherwise they may be ignored.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"

import re
import subprocess
import zipfile
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization with nested (double) quantization; computation runs
# in float16 so the model fits on a single consumer GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Deepseek-Coder-V2-Lite-Instruct is used for better parsing performance.
MODEL_ID = "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX resumes."""
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')

        elif lower_name.endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
                    xml_text = xml_bytes.decode('utf-8', errors='ignore')
                    xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                    text = re.sub(r'<[^>]+>', ' ', xml_text)
                    # Collapse runs of spaces but keep line breaks; extract_name splits on lines.
                    text = re.sub(r'[ \t]+', ' ', text)
                    return re.sub(r'\s*\n\s*', '\n', text).strip()
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Extract candidate's name from resume text or filename."""
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            if 1 < len(words) <= 4:
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    return base.title().strip()

# ===============================
# Deepseek-Coder Parsing
# ===============================
def parse_with_deepseek(text: str) -> dict:
    """Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
    
    prompt = f"""
Extract the following information from the resume text provided below. Your response should be a valid JSON object.

Information to extract:
- Full Name: The candidate's full name.
- Email: The candidate's email address.
- Phone: The candidate's phone number.
- Skills: A list of technical and soft skills.
- Education: A list of academic degrees and institutions.
- Experience: A list of previous jobs, including company, title, and dates.

Resume Text:
{text}

Return only valid JSON in the following format:
{{
  "name": "Full Name",
  "email": "[email protected]",
  "phone": "+961-xxx-xxx",
  "skills": ["Skill1", "Skill2", "Skill3"],
  "education": ["Degree1 - Institution1", "Degree2 - Institution2"],
  "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
}}
"""
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens so the JSON template inside the
    # prompt is not picked up by the extraction regex below.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)

    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass

    # Fallback when the model does not return valid JSON.
    return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}