husseinelsaadi committed on
Commit efffc2e · 1 Parent(s): f3f24e3
Files changed (1)
  1. backend/services/resume_parser.py +64 -62
backend/services/resume_parser.py CHANGED
@@ -3,100 +3,102 @@ import os
 import re
 import subprocess
 import zipfile
-from typing import List
-from transformers import pipeline
+from typing import List, Dict
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
 # ===============================
-# Load Lightweight Resume Parser
+# Load Model & Tokenizer
 # ===============================
-resume_parser_model = pipeline("text-classification", model="Kiet/autotrain-resume_parser-1159242747")
+MODEL_ID = "sravya-abburi/ResumeParserBERT"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
+
+ner_pipeline = pipeline(
+    "ner",
+    model=model,
+    tokenizer=tokenizer,
+    aggregation_strategy="simple",
+    device=0 if torch.cuda.is_available() else -1
+)
 
 # ===============================
-# PDF/DOCX Text Extraction
+# Text Extraction
 # ===============================
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX resumes."""
+    """Extract raw text from PDF or DOCX."""
     if not file_path or not os.path.isfile(file_path):
         return ""
 
     lower_name = file_path.lower()
     try:
-        if lower_name.endswith('.pdf'):
+        if lower_name.endswith(".pdf"):
             result = subprocess.run(
-                ['pdftotext', '-layout', file_path, '-'],
+                ["pdftotext", "-layout", file_path, "-"],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 check=False
             )
-            return result.stdout.decode('utf-8', errors='ignore')
+            return result.stdout.decode("utf-8", errors="ignore")
 
-        elif lower_name.endswith('.docx'):
+        elif lower_name.endswith(".docx"):
             with zipfile.ZipFile(file_path) as zf:
-                with zf.open('word/document.xml') as docx_xml:
+                with zf.open("word/document.xml") as docx_xml:
                     xml_bytes = docx_xml.read()
-                xml_text = xml_bytes.decode('utf-8', errors='ignore')
-                xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
-                text = re.sub(r'<[^>]+>', ' ', xml_text)
-                return re.sub(r'\s+', ' ', text)
+                xml_text = xml_bytes.decode("utf-8", errors="ignore")
+                xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
+                text = re.sub(r"<[^>]+>", " ", xml_text)
+                return re.sub(r"\s+", " ", text)
         else:
            return ""
     except Exception:
        return ""
 
 # ===============================
-# Fallback Name Extraction
+# Parse Resume using BERT NER
 # ===============================
-def extract_name(text: str, filename: str) -> str:
-    """Extract candidate's name from resume text or filename."""
-    if text:
-        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-        for line in lines[:10]:
-            if re.match(r'(?i)resume|curriculum vitae', line):
-                continue
-            words = line.split()
-            if 1 < len(words) <= 4:
-                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
-                    return line
-    base = os.path.basename(filename)
-    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
-    base = re.sub(r'[\._-]+', ' ', base)
-    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
-    return base.title().strip()
+def parse_with_bert(text: str) -> Dict[str, str]:
+    """Parse resume text into structured fields using BERT NER."""
+    entities = ner_pipeline(text)
 
-# ===============================
-# Model-based Resume Parsing
-# ===============================
-def parse_with_kiet_model(text: str) -> dict:
-    """Use Kiet's resume parser model to extract fields."""
-    try:
-        # The pipeline might return structured text (needs post-processing)
-        parsed_output = resume_parser_model(text)
-
-        # Since the model output may vary, we simulate structured mapping
-        return {
-            "name": parsed_output[0]['label'] if parsed_output else "",
-            "skills": "Extracted Skills Here",
-            "education": "Extracted Education Here",
-            "experience": "Extracted Experience Here"
-        }
-    except Exception:
-        return {"name": "", "skills": "", "education": "", "experience": ""}
+    name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], []
+
+    for ent in entities:
+        label = ent["entity_group"].upper()
+        word = ent["word"].strip()
+
+        if label == "NAME" and word not in name_tokens:
+            name_tokens.append(word)
+        elif label == "SKILL" and word not in skill_tokens:
+            skill_tokens.append(word)
+        elif label == "EDUCATION" and word not in edu_tokens:
+            edu_tokens.append(word)
+        elif label == "EXPERIENCE" and word not in exp_tokens:
+            exp_tokens.append(word)
+
+    return {
+        "name": " ".join(name_tokens),
+        "skills": ", ".join(skill_tokens),
+        "education": ", ".join(edu_tokens),
+        "experience": ", ".join(exp_tokens)
+    }
 
 # ===============================
 # Main Parse Function
 # ===============================
 def parse_resume(file_path: str, filename: str) -> dict:
-    """Main function to parse resumes."""
+    """Main function for resume parsing."""
     text = extract_text(file_path)
-    name = extract_name(text, filename)
-
-    ents = parse_with_kiet_model(text)
-    if not ents.get("name"):
-        ents["name"] = name
+    if not text:
+        return {"name": "", "skills": "", "education": "", "experience": ""}
 
-    return {
-        "name": ents.get("name", ""),
-        "skills": ents.get("skills", ""),
-        "education": ents.get("education", ""),
-        "experience": ents.get("experience", "")
-    }
+    ents = parse_with_bert(text)
+
+    # Fallback: use filename for name if model doesn't find one
+    if not ents["name"]:
+        base = os.path.basename(filename)
+        base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
+        ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip()
+
+    return ents
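
A caveat on the new parse_with_bert: the NAME/SKILL/EDUCATION/EXPERIENCE branches only fire if those are really the entity groups this checkpoint emits. With aggregation_strategy="simple", entity_group is the model's tag name with its B-/I- prefix stripped, so if the checkpoint uses different names (say, "PER" instead of "NAME"), every comparison silently fails and all fields come back empty. A quick way to verify the label set, a sketch rather than part of this commit:

from transformers import AutoConfig

# Print the tag set shipped with the checkpoint's config; the keys here
# (after stripping B-/I-) are what ent["entity_group"] can take.
config = AutoConfig.from_pretrained("sravya-abburi/ResumeParserBERT")
print(config.id2label)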
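
A second caveat: BERT-style encoders accept roughly 512 tokens, and ner_pipeline(text) is called on the entire resume, so content past that limit can be truncated or rejected. A minimal workaround sketch, assuming the module-level ner_pipeline above is in scope; the 200-word window is an arbitrary choice, not something in this commit:

def ner_in_chunks(text: str, words_per_chunk: int = 200) -> list:
    """Run ner_pipeline over word windows so long resumes are not cut off."""
    words = text.split()
    entities = []
    for i in range(0, len(words), words_per_chunk):
        # start/end offsets in the returned dicts are chunk-relative, but
        # parse_with_bert only reads "word" and "entity_group", so that is fine.
        entities.extend(ner_pipeline(" ".join(words[i:i + words_per_chunk])))
    return entities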
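
Finally, an end-to-end usage sketch, assuming the module is importable as backend.services.resume_parser and that pdftotext (from poppler-utils) is on PATH; the file names are hypothetical:

from backend.services.resume_parser import parse_resume

# extract_text reads the real file; the second argument is only the
# filename fallback used when the model finds no name.
fields = parse_resume("/tmp/John_Doe_CV.pdf", "John_Doe_CV.pdf")
print(fields)  # {"name": ..., "skills": ..., "education": ..., "experience": ...}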