husseinelsaadi committed on
Commit
8d99522
·
1 Parent(s): a5be571
backend/services/resume_parser.py CHANGED
@@ -1,21 +1,18 @@
1
  from __future__ import annotations
2
- import os
3
- import re
4
- import subprocess
5
- import zipfile
6
- import json
7
- import torch
8
  from typing import List
9
-
10
- os.environ["OMP_NUM_THREADS"] = "1"
11
- os.environ["OPENBLAS_NUM_THREADS"] = "1"
12
- os.environ["MKL_NUM_THREADS"] = "1"
13
- os.environ["NUMEXPR_NUM_THREADS"] = "1"
14
- os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
15
-
16
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
17
- import torch
18
 
 
 
 
 
 
 
 
 
 
 
19
  bnb_config = BitsAndBytesConfig(
20
  load_in_4bit=True,
21
  bnb_4bit_compute_dtype=torch.float16,
@@ -23,10 +20,9 @@ bnb_config = BitsAndBytesConfig(
23
  bnb_4bit_quant_type="nf4"
24
  )
25
 
26
- # --- UPDATED: Using Deepseek-Coder-V2-Lite-Instruct for better performance ---
27
- tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/Deepseek-Coder-V2-Lite-Instruct", trust_remote_code=True)
28
  model = AutoModelForCausalLM.from_pretrained(
29
- "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct",
30
  quantization_config=bnb_config,
31
  device_map="auto",
32
  torch_dtype=torch.bfloat16,
@@ -37,13 +33,10 @@ model = AutoModelForCausalLM.from_pretrained(
37
  # Text Extraction (PDF/DOCX)
38
  # ===============================
39
  def extract_text(file_path: str) -> str:
40
- """Extract text from PDF or DOCX resumes."""
41
  if not file_path or not os.path.isfile(file_path):
42
  return ""
43
-
44
- lower_name = file_path.lower()
45
  try:
46
- if lower_name.endswith('.pdf'):
47
  result = subprocess.run(
48
  ['pdftotext', '-layout', file_path, '-'],
49
  stdout=subprocess.PIPE,
@@ -51,8 +44,7 @@ def extract_text(file_path: str) -> str:
51
  check=False
52
  )
53
  return result.stdout.decode('utf-8', errors='ignore')
54
-
55
- elif lower_name.endswith('.docx'):
56
  with zipfile.ZipFile(file_path) as zf:
57
  with zf.open('word/document.xml') as docx_xml:
58
  xml_bytes = docx_xml.read()
@@ -60,24 +52,20 @@ def extract_text(file_path: str) -> str:
60
  xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
61
  text = re.sub(r'<[^>]+>', ' ', xml_text)
62
  return re.sub(r'\s+', ' ', text)
63
- else:
64
- return ""
65
  except Exception:
66
- return ""
 
67
 
68
  # ===============================
69
  # Name Extraction (Fallback)
70
  # ===============================
71
  def extract_name(text: str, filename: str) -> str:
72
- """Extract candidate's name from resume text or filename."""
73
  if text:
74
  lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
75
  for line in lines[:10]:
76
- if re.match(r'(?i)resume|curriculum vitae', line):
77
- continue
78
- words = line.split()
79
- if 1 < len(words) <= 4:
80
- if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
81
  return line
82
  base = os.path.basename(filename)
83
  base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
@@ -86,26 +74,25 @@ def extract_name(text: str, filename: str) -> str:
86
  return base.title().strip()
87
 
88
  # ===============================
89
- # Janus-Pro Parsing
90
  # ===============================
91
- def parse_with_deepseek(text: str) -> dict:
92
- """Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
93
-
94
  prompt = f"""
95
- Extract the following information from the resume text provided below. Your response should be a valid JSON object.
 
96
 
97
  Information to extract:
98
- - Full Name: The candidate's full name.
99
- - Email: The candidate's email address.
100
- - Phone: The candidate's phone number.
101
- - Skills: A list of technical and soft skills.
102
- - Education: A list of academic degrees and institutions.
103
- - Experience: A list of previous jobs, including company, title, and dates.
104
 
105
- Resume Text:
106
  {text}
107
 
108
- Return only valid JSON in the following format:
109
  {{
110
  "name": "Full Name",
111
  "email": "[email protected]",
@@ -115,17 +102,25 @@ Return only valid JSON in the following format:
115
  "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
116
  }}
117
  """
118
-
119
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
120
- outputs = model.generate(**inputs, max_new_tokens=512)
121
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
122
 
123
- import re, json
124
  match = re.search(r"\{.*\}", response, re.S)
125
  if match:
126
  try:
127
  return json.loads(match.group())
128
  except:
129
  pass
130
-
131
  return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
+ import os, re, subprocess, zipfile, json, torch
 
 
 
 
 
3
  from typing import List
 
 
 
 
 
 
 
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
5
 
6
# Limit threads to avoid Hugging Face Spaces threading issues.
# Every numeric backend that honours one of these variables is pinned to a
# single thread for the lifetime of the process.
# NOTE(review): torch is imported above, before these variables are set;
# OpenMP-based libraries may read them only at load time -- confirm the
# limits actually take effect for torch.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "NUMEXPR_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
        ),
        "1",
    )
)

14
+
15
+ # Load Zephyr in 4-bit
16
  bnb_config = BitsAndBytesConfig(
17
  load_in_4bit=True,
18
  bnb_4bit_compute_dtype=torch.float16,
 
20
  bnb_4bit_quant_type="nf4"
21
  )
22
 
23
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", trust_remote_code=True)
 
24
  model = AutoModelForCausalLM.from_pretrained(
25
+ "HuggingFaceH4/zephyr-7b-beta",
26
  quantization_config=bnb_config,
27
  device_map="auto",
28
  torch_dtype=torch.bfloat16,
 
33
  # Text Extraction (PDF/DOCX)
34
  # ===============================
35
  def extract_text(file_path: str) -> str:
 
36
  if not file_path or not os.path.isfile(file_path):
37
  return ""
 
 
38
  try:
39
+ if file_path.lower().endswith('.pdf'):
40
  result = subprocess.run(
41
  ['pdftotext', '-layout', file_path, '-'],
42
  stdout=subprocess.PIPE,
 
44
  check=False
45
  )
46
  return result.stdout.decode('utf-8', errors='ignore')
47
+ elif file_path.lower().endswith('.docx'):
 
48
  with zipfile.ZipFile(file_path) as zf:
49
  with zf.open('word/document.xml') as docx_xml:
50
  xml_bytes = docx_xml.read()
 
52
  xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
53
  text = re.sub(r'<[^>]+>', ' ', xml_text)
54
  return re.sub(r'\s+', ' ', text)
 
 
55
  except Exception:
56
+ pass
57
+ return ""
58
 
59
  # ===============================
60
  # Name Extraction (Fallback)
61
  # ===============================
62
  def extract_name(text: str, filename: str) -> str:
 
63
  if text:
64
  lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
65
  for line in lines[:10]:
66
+ if not re.match(r'(?i)resume|curriculum vitae', line):
67
+ words = line.split()
68
+ if 1 < len(words) <= 4 and all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
 
 
69
  return line
70
  base = os.path.basename(filename)
71
  base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
 
74
  return base.title().strip()
75
 
76
  # ===============================
77
+ # Zephyr Parsing
78
  # ===============================
79
+ def parse_with_zephyr(text: str) -> dict:
 
 
80
  prompt = f"""
81
+ Extract the following information from the resume text provided below.
82
+ Return ONLY a valid JSON object (no extra commentary).
83
 
84
  Information to extract:
85
+ - Full Name
86
+ - Email
87
+ - Phone
88
+ - Skills (list)
89
+ - Education (list of degrees + institutions)
90
+ - Experience (list of jobs with company, title, and dates)
91
 
92
+ Resume:
93
  {text}
94
 
95
+ JSON format:
96
  {{
97
  "name": "Full Name",
98
  "email": "[email protected]",
 
102
  "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
103
  }}
104
  """
 
105
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
106
+ outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0)
107
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
108
 
 
109
  match = re.search(r"\{.*\}", response, re.S)
110
  if match:
111
  try:
112
  return json.loads(match.group())
113
  except:
114
  pass
 
115
  return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}
116
+
117
+ # ===============================
118
+ # Main Parse Function
119
+ # ===============================
120
+ def parse_resume(file_path: str, filename: str) -> dict:
121
+ text = extract_text(file_path)
122
+ name_fallback = extract_name(text, filename)
123
+ data = parse_with_zephyr(text)
124
+ if not data.get("name"):
125
+ data["name"] = name_fallback
126
+ return data
requirements.txt CHANGED
@@ -62,5 +62,4 @@ requests>=2.31.0
62
  psycopg2-binary
63
  matplotlib
64
  bitsandbytes>=0.41.0
65
- flash-attn==2.3.6 --no-build-isolation
66
 
 
62
  psycopg2-binary
63
  matplotlib
64
  bitsandbytes>=0.41.0
 
65