from __future__ import annotations
import os

# Cap BLAS/OpenMP thread pools *before* torch is imported; if these are set
# after the import, already-initialized thread pools may ignore them.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"

import re
import subprocess
import zipfile
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "tiiuae/falcon-7b-instruct"
print(f"Loading {MODEL_ID}... (This may take some time on first run)")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="auto",
)

# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def extract_text(file_path: str) -> str:
    """Extract text from PDF or DOCX resumes."""
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            # Requires the poppler-utils `pdftotext` binary on PATH.
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False
            )
            return result.stdout.decode('utf-8', errors='ignore')
        elif lower_name.endswith('.docx'):
            # A .docx file is a zip archive; the body text lives in
            # word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open('word/document.xml') as docx_xml:
                    xml_bytes = docx_xml.read()
            xml_text = xml_bytes.decode('utf-8', errors='ignore')
            # Turn paragraph tags into newlines, then strip the remaining
            # markup. Collapse only spaces/tabs so the newlines survive for
            # the line-based name extraction below.
            xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
            text = re.sub(r'<[^>]+>', ' ', xml_text)
            return re.sub(r'[ \t]+', ' ', text)
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Name Extraction (Fallback)
# ===============================
def extract_name(text: str, filename: str) -> str:
    """Extract the candidate's name from resume text, falling back to the filename."""
    if text:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # Heuristic: a short line of capitalized words near the top of
            # the document is probably the candidate's name.
            if 1 < len(words) <= 4:
                if all(re.match(r'[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    # Fall back to a cleaned-up filename.
    base = os.path.basename(filename)
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    base = re.sub(r'[\._-]+', ' ', base)
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    return base.title().strip()

# ===============================
# LLM Parsing (Falcon-7B-Instruct)
# ===============================
def parse_with_llm(text: str) -> dict:
    """Use the loaded instruct model (Falcon-7B-Instruct) to extract resume details as JSON."""
    prompt = f"""
Extract the following information from the resume text below:
- Full Name
- Skills (comma separated)
- Education (degrees + institutions)
- Experience (job titles + companies)
Return only valid JSON in the following structure:
{{
  "name": "Full Name",
  "skills": "Skill1, Skill2, Skill3",
  "education": "Degree1 - Institution1; Degree2 - Institution2",
  "experience": "Job1 - Company1; Job2 - Company2"
}}
Resume:
{text}
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    # Decode only the newly generated tokens; otherwise the prompt's own
    # JSON template gets echoed back and confuses the extraction below.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # Extract the first JSON object from the response, if any.
    match = re.search(r"\{.*\}", response, re.S)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    return {"name": "", "skills": "", "education": "", "experience": ""}

# ===============================
# Fallback Heading-based Parsing
# ===============================
def fallback_parse(text: str) -> dict:
    """Simple heading-based parsing as a backup."""
    skills = re.findall(r"Skills\s*[:\-]?\s*(.*)", text, re.I)
    education = re.findall(r"Education\s*[:\-]?\s*(.*)", text, re.I)
    experience = re.findall(r"(Experience|Work History)\s*[:\-]?\s*(.*)", text, re.I)
    return {
        "skills": ", ".join(skills),
        "education": ", ".join(education),
        "experience": ", ".join(exp[1] for exp in experience),
    }
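
# A quick illustration (hypothetical input), assuming each heading sits on
# its own line of the extracted text:
#   fallback_parse("Skills: Python, SQL\nEducation: BSc - MIT")
#   -> {"skills": "Python, SQL", "education": "BSc - MIT", "experience": ""}
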
# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main resume parsing function."""
    text = extract_text(file_path)
    name = extract_name(text, filename)
    # Try LLM-based parsing first.
    ents = parse_with_llm(text)
    # If the model misses fields, fill them from the heading-based fallback.
    if not ents.get("skills") or not ents.get("education"):
        fb = fallback_parse(text)
        ents["skills"] = ents.get("skills") or fb["skills"]
        ents["education"] = ents.get("education") or fb["education"]
        ents["experience"] = ents.get("experience") or fb["experience"]
    return {
        "name": ents.get("name") or name,
        "skills": ents.get("skills", ""),
        "education": ents.get("education", ""),
        "experience": ents.get("experience", ""),
    }
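
# ===============================
# Usage sketch
# ===============================
# A minimal smoke test, assuming a resume exists at the path below;
# "sample_resume.pdf" is a hypothetical placeholder, not a file that
# ships with this script.
if __name__ == "__main__":
    sample_path = "sample_resume.pdf"  # hypothetical input file
    if os.path.isfile(sample_path):
        parsed = parse_resume(sample_path, os.path.basename(sample_path))
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
    else:
        print(f"No file at {sample_path!r}; call parse_resume() directly.")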