husseinelsaadi committed on
Commit 864c2ae · 1 Parent(s): efffc2e
Files changed (1)
  1. backend/services/resume_parser.py +38 -93
backend/services/resume_parser.py CHANGED
@@ -1,104 +1,49 @@
-from __future__ import annotations
-import os
-import re
-import subprocess
-import zipfile
-from typing import List, Dict
-import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
-# ===============================
-# Load Model & Tokenizer
-# ===============================
-MODEL_ID = "sravya-abburi/ResumeParserBERT"
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
-
-ner_pipeline = pipeline(
-    "ner",
-    model=model,
-    tokenizer=tokenizer,
-    aggregation_strategy="simple",
-    device=0 if torch.cuda.is_available() else -1
-)
-
-# ===============================
-# Text Extraction
-# ===============================
 def extract_text(file_path: str) -> str:
-    """Extract raw text from PDF or DOCX."""
-    if not file_path or not os.path.isfile(file_path):
-        return ""
-
-    lower_name = file_path.lower()
-    try:
-        if lower_name.endswith(".pdf"):
-            result = subprocess.run(
-                ["pdftotext", "-layout", file_path, "-"],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                check=False
-            )
-            return result.stdout.decode("utf-8", errors="ignore")
-
-        elif lower_name.endswith(".docx"):
-            with zipfile.ZipFile(file_path) as zf:
-                with zf.open("word/document.xml") as docx_xml:
-                    xml_bytes = docx_xml.read()
-            xml_text = xml_bytes.decode("utf-8", errors="ignore")
-            xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
-            text = re.sub(r"<[^>]+>", " ", xml_text)
-            return re.sub(r"\s+", " ", text)
-        else:
-            return ""
-    except Exception:
-        return ""
-
-# ===============================
-# Parse Resume using BERT NER
-# ===============================
-def parse_with_bert(text: str) -> Dict[str, str]:
-    """Parse resume text into structured fields using BERT NER."""
     entities = ner_pipeline(text)
 
-    name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], []
-
     for ent in entities:
         label = ent["entity_group"].upper()
-        word = ent["word"].strip()
-
-        if label == "NAME" and word not in name_tokens:
-            name_tokens.append(word)
-        elif label == "SKILL" and word not in skill_tokens:
-            skill_tokens.append(word)
-        elif label == "EDUCATION" and word not in edu_tokens:
-            edu_tokens.append(word)
-        elif label == "EXPERIENCE" and word not in exp_tokens:
-            exp_tokens.append(word)
 
     return {
-        "name": " ".join(name_tokens),
-        "skills": ", ".join(skill_tokens),
-        "education": ", ".join(edu_tokens),
-        "experience": ", ".join(exp_tokens)
     }
-
-# ===============================
-# Main Parse Function
-# ===============================
-def parse_resume(file_path: str, filename: str) -> dict:
-    """Main function for resume parsing."""
-    text = extract_text(file_path)
-    if not text:
-        return {"name": "", "skills": "", "education": "", "experience": ""}
-
-    ents = parse_with_bert(text)
-
-    # Fallback: use filename for name if model doesn't find one
-    if not ents["name"]:
-        base = os.path.basename(filename)
-        base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
-        ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip()
-
-    return ents
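Both the removed and the added version key their grouping logic off the aggregated output of the transformers `ner` pipeline: with `aggregation_strategy="simple"`, each detected span is returned as a dict with `entity_group`, `score`, `word`, `start`, and `end` keys. A minimal sketch of that shape follows; the dict keys are the pipeline's standard aggregated output, but the label values are assumptions based on how this file branches on them, since the fine-tuned model's label set is not documented here.

# Illustrative ner_pipeline(text) output under aggregation_strategy="simple".
# Label strings ("NAME", "SKILL", ...) are assumed, not taken from the model card.
entities = [
    {"entity_group": "NAME", "score": 0.99, "word": "Jane Doe", "start": 0, "end": 8},
    {"entity_group": "SKILL", "score": 0.97, "word": "Python", "start": 142, "end": 148},
]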
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import subprocess, zipfile, re, os
 
+# === Load pretrained HF model instead of training ===
+MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # or Kiet/autotrain-resume_parser-1159242747
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
+# === Extract text from PDF/DOCX ===
 def extract_text(file_path: str) -> str:
+    if file_path.lower().endswith(".pdf"):
+        result = subprocess.run(
+            ["pdftotext", "-layout", file_path, "-"],
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
+        )
+        return result.stdout.decode("utf-8", errors="ignore")
+    elif file_path.lower().endswith(".docx"):
+        with zipfile.ZipFile(file_path) as zf:
+            with zf.open("word/document.xml") as docx_xml:
+                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
+        xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
+        return re.sub(r"<[^>]+>", " ", xml_text)
+    return ""
+
+# === Parse resume with NER ===
+def parse_resume(file_path: str) -> dict:
+    text = extract_text(file_path)
     entities = ner_pipeline(text)
 
+    name, skills, education, experience = [], [], [], []
     for ent in entities:
         label = ent["entity_group"].upper()
+        word = ent["word"]
+        if label == "NAME":
+            name.append(word)
+        elif label == "SKILL":
+            skills.append(word)
+        elif label in ["EDUCATION", "DEGREE"]:
+            education.append(word)
+        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
+            experience.append(word)
 
     return {
+        "name": " ".join(set(name)),
+        "skills": ", ".join(set(skills)),
+        "education": ", ".join(set(education)),
+        "experience": ", ".join(set(experience))
     }
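A minimal usage sketch for the committed single-argument `parse_resume` (the file path below is a placeholder, not a file in this repo):

# Hypothetical call site; "cv.pdf" is a placeholder path.
fields = parse_resume("cv.pdf")
print(fields["name"], "|", fields["skills"])

One caveat on the new return block: `set()` dedupes but discards token order, so a multi-token name can come back scrambled. If that matters, `dict.fromkeys` is an order-preserving alternative (insertion order is guaranteed from Python 3.7 on):

# Order-preserving dedupe; keeps the first occurrence of each token.
def dedupe(tokens):
    return list(dict.fromkeys(tokens))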