husseinelsaadi committed on
Commit d4b2339 · 1 Parent(s): 50d928c

deepseek model loaded

Files changed (1)
  1. backend/services/resume_parser.py +108 -51
backend/services/resume_parser.py CHANGED
@@ -3,49 +3,61 @@ import os
 import re
 import subprocess
 import zipfile
+import json
+import torch
 from typing import List
-from transformers import pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Load the NER model for resume parsing
-ner = pipeline("ner", model="Kiet/ResumeParserBERT", aggregation_strategy="simple")
+# ===============================
+# Load DeepSeek Janus-Pro-7B Model
+# ===============================
+MODEL_ID = "deepseek-ai/Janus-Pro-7B"
+
+print(f"Loading {MODEL_ID}... (This may take some time on first run)")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
 
+# ===============================
+# Text Extraction (PDF/DOCX)
+# ===============================
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX."""
+    """Extract text from PDF or DOCX resumes."""
     if not file_path or not os.path.isfile(file_path):
         return ""
 
     lower_name = file_path.lower()
     try:
         if lower_name.endswith('.pdf'):
-            try:
-                result = subprocess.run(
-                    ['pdftotext', '-layout', file_path, '-'],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    check=False
-                )
-                return result.stdout.decode('utf-8', errors='ignore')
-            except Exception:
-                return ""
+            result = subprocess.run(
+                ['pdftotext', '-layout', file_path, '-'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=False
+            )
+            return result.stdout.decode('utf-8', errors='ignore')
+
         elif lower_name.endswith('.docx'):
-            try:
-                with zipfile.ZipFile(file_path) as zf:
-                    with zf.open('word/document.xml') as docx_xml:
-                        xml_bytes = docx_xml.read()
-                xml_text = xml_bytes.decode('utf-8', errors='ignore')
-                xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
-                text = re.sub(r'<[^>]+>', ' ', xml_text)
-                text = re.sub(r'\s+', ' ', text)
-                return text
-            except Exception:
-                return ""
+            with zipfile.ZipFile(file_path) as zf:
+                with zf.open('word/document.xml') as docx_xml:
+                    xml_bytes = docx_xml.read()
+            xml_text = xml_bytes.decode('utf-8', errors='ignore')
+            xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
+            text = re.sub(r'<[^>]+>', ' ', xml_text)
+            return re.sub(r'\s+', ' ', text)
         else:
             return ""
     except Exception:
         return ""
 
+# ===============================
+# Name Extraction (Fallback)
+# ===============================
 def extract_name(text: str, filename: str) -> str:
-    """Extract candidate's name from text or filename."""
+    """Extract candidate's name from resume text or filename."""
     if text:
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
@@ -59,36 +71,81 @@ def extract_name(text: str, filename: str) -> str:
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
     base = re.sub(r'[\._-]+', ' ', base)
     base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
-    base = re.sub(r'\s+', ' ', base).strip()
-    return base.title() if base else ''
-
-def extract_entities(text: str) -> dict:
-    """Extract structured info using NER model."""
-    entities = ner(text)
-    skills, education, experience = [], [], []
-    for ent in entities:
-        label = ent['entity_group'].upper()
-        word = ent['word'].strip()
-        if label in ["SKILL", "TECH", "TECHNOLOGY"]:
-            skills.append(word)
-        elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
-            education.append(word)
-        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
-            experience.append(word)
+    return base.title().strip()
+
+# ===============================
+# Janus-Pro Parsing
+# ===============================
+def parse_with_deepseek(text: str) -> dict:
+    """Use DeepSeek Janus-Pro-7B to extract resume details in JSON format."""
+    prompt = f"""
+    Extract the following information from the resume text below:
+
+    - Full Name
+    - Skills (comma separated)
+    - Education (degrees + institutions)
+    - Experience (job titles + companies)
+
+    Return only valid JSON in the following structure:
+    {{
+        "name": "Full Name",
+        "skills": "Skill1, Skill2, Skill3",
+        "education": "Degree1 - Institution1; Degree2 - Institution2",
+        "experience": "Job1 - Company1; Job2 - Company2"
+    }}
+
+    Resume:
+    {text}
+    """
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=512)
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Extract JSON safely
+    match = re.search(r"\{.*\}", response, re.S)
+    if match:
+        try:
+            return json.loads(match.group())
+        except:
+            pass
+    return {"name": "", "skills": "", "education": "", "experience": ""}
+
+# ===============================
+# Fallback Heading-based Parsing
+# ===============================
+def fallback_parse(text: str) -> dict:
+    """Simple heading-based parsing as backup."""
+    skills = re.findall(r"Skills\s*[:\-]?\s*(.*)", text, re.I)
+    education = re.findall(r"Education\s*[:\-]?\s*(.*)", text, re.I)
+    experience = re.findall(r"(Experience|Work History)\s*[:\-]?\s*(.*)", text, re.I)
     return {
-        "skills": list(dict.fromkeys(skills)),
-        "education": list(dict.fromkeys(education)),
-        "experience": list(dict.fromkeys(experience))
+        "skills": ", ".join(skills),
+        "education": ", ".join(education),
+        "experience": ", ".join([exp[1] for exp in experience])
     }
 
+# ===============================
+# Main Parse Function
+# ===============================
 def parse_resume(file_path: str, filename: str) -> dict:
-    """Main function to parse resume fields."""
+    """Main resume parsing function."""
    text = extract_text(file_path)
    name = extract_name(text, filename)
-    ents = extract_entities(text)
+
+    # Try Janus-Pro parsing
+    ents = parse_with_deepseek(text)
+
+    # If Janus-Pro misses fields, use fallback
+    if not ents.get("skills") or not ents.get("education"):
+        fb = fallback_parse(text)
+        ents["skills"] = ents.get("skills") or fb["skills"]
+        ents["education"] = ents.get("education") or fb["education"]
+        ents["experience"] = ents.get("experience") or fb["experience"]
+
     return {
-        'name': name or '',
-        'skills': ', '.join(ents["skills"]) if ents["skills"] else '',
-        'education': ', '.join(ents["education"]) if ents["education"] else '',
-        'experience': ', '.join(ents["experience"]) if ents["experience"] else ''
+        "name": ents.get("name") or name,
+        "skills": ents.get("skills", ""),
+        "education": ents.get("education", ""),
+        "experience": ents.get("experience", "")
    }
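
A minimal usage sketch of the new flow, for orientation only: the file path and import path below are hypothetical, and it assumes the module-level Janus-Pro-7B load at import time has completed, enough GPU/CPU memory is available, and pdftotext is installed for PDF input.

    # Hypothetical caller: parse_resume extracts raw text, asks Janus-Pro for JSON fields,
    # then falls back to heading-based regex parsing for any field that comes back empty.
    from backend.services.resume_parser import parse_resume

    result = parse_resume("uploads/jane_doe_cv.pdf", "jane_doe_cv.pdf")
    print(result["name"])
    print(result["skills"])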