File size: 3,801 Bytes
af02e64
 
 
 
 
d4b2339
 
af02e64
 
1de8504
 
 
 
 
 
722e882
 
b336194
722e882
 
 
 
 
 
af02e64
0e43f07
 
d4b2339
0e43f07
722e882
0e43f07
 
 
d4b2339
 
 
 
 
af02e64
d4b2339
af02e64
 
 
 
 
 
d4b2339
 
 
 
 
 
 
 
af02e64
d4b2339
 
 
 
 
 
 
af02e64
 
 
 
 
d4b2339
 
 
af02e64
d4b2339
af02e64
 
 
 
 
 
 
 
 
 
 
 
 
d4b2339
 
 
 
 
 
0e43f07
 
d4b2339
0e43f07
d4b2339
0e43f07
 
 
 
 
 
 
d4b2339
0e43f07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from __future__ import annotations
import os
import re
import subprocess
import zipfile
import json
from typing import List

# Cap the native math/threading runtimes (OpenMP, OpenBLAS, MKL, numexpr,
# vecLib) at one thread each.
#
# These variables are generally read when the corresponding native library is
# first loaded, so they have to be in the environment *before* torch is
# imported. For that reason torch is deliberately not imported above this
# point; it is imported further down, together with transformers, once the
# caps are in place.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Quantization settings handed to the model loader below through its
# `quantization_config` argument: 4-bit loading, the "nf4" quantization type,
# double quantization enabled, and float16 as the compute dtype.
# NOTE(review): the model below is loaded with torch_dtype=torch.bfloat16 while
# the compute dtype here is torch.float16 — confirm the mismatch is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Tokenizer and model are both created at import time from the same
# Deepseek-Coder-V2-Lite-Instruct checkpoint, so importing this module triggers
# the load. The model uses the 4-bit `bnb_config` defined above and lets
# `device_map="auto"` decide placement.
# NOTE(review): trust_remote_code=True presumably allows Python code shipped
# with the checkpoint repository to run during loading — confirm this is
# acceptable for the deployment (and consider pinning a revision).
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/Deepseek-Coder-V2-Lite-Instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def _docx_xml_to_text(xml_text: str) -> str:
    """Convert WordprocessingML markup to plain text, one paragraph per line."""
    # Function-scope import keeps this block self-contained.
    from html import unescape

    # Only genuine paragraph (<w:p ...>) and break (<w:br/>) elements start a
    # new line. The lookahead stops look-alike tags such as <w:pPr> or
    # <w:proofErr>, which occur in the middle of a paragraph, from matching.
    marked = re.sub(r'<w:(?:p|br)(?=[\s/>])[^>]*>', '\n', xml_text)
    # Remove every remaining tag, leaving a space in its place.
    stripped = re.sub(r'<[^>]+>', ' ', marked)
    # Decode entities only after the tags are gone, so an escaped "&lt;" in
    # the document text can never be mistaken for markup.
    decoded = unescape(stripped)
    # Squeeze whitespace inside each line but keep the line structure, and
    # drop lines that end up empty.
    squeezed = (' '.join(ln.split()) for ln in decoded.splitlines())
    return '\n'.join(ln for ln in squeezed if ln)


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDF files are converted by running the external ``pdftotext -layout``
    command and decoding its standard output as UTF-8. DOCX files are read
    by pulling ``word/document.xml`` out of the archive and stripping the
    markup; each paragraph becomes its own line so that line-based consumers
    can still see the document structure.

    Args:
        file_path: Path to the resume file. The type is chosen from the
            file extension, case-insensitively.

    Returns:
        The extracted text, or ``""`` when the path is empty, is not an
        existing file, has an unsupported extension, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode('utf-8', errors='ignore')

        if lower_name.endswith('.docx'):
            with zipfile.ZipFile(file_path) as zf:
                xml_bytes = zf.read('word/document.xml')
            return _docx_xml_to_text(xml_bytes.decode('utf-8', errors='ignore'))
    except Exception:
        # Deliberate best-effort: a missing pdftotext binary, a corrupt
        # archive, or a missing document part all yield "no text" rather
        # than an error for the caller.
        return ""
    return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
# Everything that may follow the leading capital of a name token: word
# characters plus the punctuation found inside names ("O'Brien", "Jean-Luc",
# "A."), with an optional trailing comma ("Smith, PhD").
_NAME_TOKEN_TAIL = re.compile(r"[\w.'\u2019\-]*,?")


def _is_name_token(token: str) -> bool:
    """Return True if *token* is capitalised and made only of name-like characters."""
    # str.isupper() accepts any Unicode uppercase letter, not just Latin-1.
    # fullmatch() anchors the check to the whole token, so "Email:" or
    # "John@example.com" are rejected.
    return token[0].isupper() and _NAME_TOKEN_TAIL.fullmatch(token[1:]) is not None


def extract_name(text: str, filename: str) -> str:
    """Guess the candidate's name from the resume text, else from the filename.

    The first ten non-blank lines of *text* are scanned. Lines starting with
    "resume" or "curriculum vitae" (any case) are skipped. The first line made
    of two to four tokens that all look like capitalised name words is
    returned as-is.

    If no line qualifies, the name is derived from *filename*: the directory
    and a ``.pdf``/``.docx``/``.doc`` extension are removed, runs of ``.``,
    ``_`` and ``-`` become spaces, the standalone words "cv" and "resume" are
    dropped, whitespace is collapsed and the result is title-cased.

    Args:
        text: Extracted resume text; may be empty.
        filename: Original file name or path; empty/None is treated as "".

    Returns:
        The guessed name, or ``""`` when nothing usable is found.
    """
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if re.match(r'resume|curriculum vitae', line, flags=re.I):
                continue
            words = line.split()
            if 1 < len(words) <= 4 and all(_is_name_token(w) for w in words):
                return line

    base = os.path.basename(filename or "")
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'\b(cv|resume)\b', '', base, flags=re.I)
    # Collapse whitespace so a removed "cv"/"resume" in the middle of the
    # name does not leave a double space behind.
    return ' '.join(base.split()).title()

# ===============================
# Deepseek-Coder Parsing
# ===============================
def parse_with_deepseek(text: str) -> dict:
    """Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
    # --- UPDATED: Refined prompt for better JSON extraction ---
    prompt = f"""
Extract the following information from the resume text provided below. Your response should be a valid JSON object.

**Information to extract:**
- **Full Name:** The candidate's full name.
- **Email:** The candidate's email address.
- **Phone:** The candidate's phone number.
- **Skills:** A list of technical and soft skills.
- **Education:** A list of academic degrees and institutions.
- **Experience:** A list of previous jobs, including company, title, and dates.

**Resume Text:**