husseinelsaadi committed on
Commit
33fa314
·
1 Parent(s): 682910e
Files changed (1) hide show
  1. backend/services/resume_parser.py +32 -49
backend/services/resume_parser.py CHANGED
@@ -1,78 +1,61 @@
1
- import os
2
- import re
3
- import subprocess
4
- import zipfile
5
  import json
 
 
 
 
 
6
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
7
 
8
  # --------------------
9
- # Load Model
10
  # --------------------
# Hugging Face Hub checkpoint used for resume entity recognition.
MODEL_NAME = "sravya-abburi/ResumeParserBERT"

# NOTE: these statements run at import time, so importing this module loads
# the tokenizer and the model weights before any function is called.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" asks the pipeline for grouped entities;
# parse_resume reads the "entity_group" and "word" keys of each result.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
17
 
18
  # --------------------
19
- # Extract Text
20
  # --------------------
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDF files are converted by running the external ``pdftotext`` command.
    DOCX files are read by pulling ``word/document.xml`` out of the archive
    and stripping its markup. Extraction is best-effort: an unsupported
    extension, a ``pdftotext`` binary that cannot be started, or an
    unreadable/corrupt file yields ``""`` instead of raising.

    Args:
        file_path: Path to the resume. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, or an empty string when nothing could be read.
    """
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        try:
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except Exception:
            # Best-effort by design (e.g. pdftotext is not installed), but
            # unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return ""
        # check=False: a non-zero exit status is not treated as an error;
        # whatever was written to stdout is returned.
        return result.stdout.decode("utf-8", errors="ignore")

    if lowered.endswith(".docx"):
        try:
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")
        except Exception:
            # Missing file, not a zip archive, or no word/document.xml member.
            return ""
        # Turn each paragraph start tag into a newline. The lookahead keeps
        # the pattern from also matching longer tag names that merely begin
        # with "w:p" (such as <w:pPr>), which would add spurious newlines.
        xml_text = re.sub(r"<w:p(?=[\s/>])[^>]*>", "\n", xml_text, flags=re.I)
        # Every remaining tag becomes a single space.
        return re.sub(r"<[^>]+>", " ", xml_text)

    return ""
42
 
43
  # --------------------
44
  # Parse Resume
45
  # --------------------
def parse_resume(file_path: str, filename: "str | None" = None) -> dict:
    """Extract name, skills, education and experience from a resume.

    The resume text is obtained with ``extract_text`` and tagged by the
    module-level ``ner_pipeline``. Each recognised entity is routed to one of
    four fields according to its (upper-cased) ``entity_group`` label.

    Args:
        file_path: Path to the resume file to parse.
        filename: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Every value is a string: name parts are joined with
        spaces, the other fields with ``", "``. Duplicates are dropped while
        keeping first-seen order; a field with no entities is ``""``.
    """
    label_to_field = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
    }
    found = {"name": [], "skills": [], "education": [], "experience": []}

    text = extract_text(file_path)
    # extract_text returns "" when nothing could be read; there is nothing to
    # tag in that case, so the model call is skipped.
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        field = label_to_field.get(ent["entity_group"].upper())
        word = ent["word"].strip()
        # Ignore labels outside the mapping and entities that are blank after
        # stripping (they would otherwise leave stray separators).
        if field is not None and word:
            found[field].append(word)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return {
        "name": " ".join(dict.fromkeys(found["name"])),
        "skills": ", ".join(dict.fromkeys(found["skills"])),
        "education": ", ".join(dict.fromkeys(found["education"])),
        "experience": ", ".join(dict.fromkeys(found["experience"])),
    }
71
-
72
- # --------------------
73
- # Example
74
- # --------------------
if __name__ == "__main__":
    import sys

    # Manual smoke test: parse the resume given as the first command-line
    # argument, falling back to "resume.pdf" when no argument is supplied.
    resume_path = sys.argv[1] if len(sys.argv) > 1 else "resume.pdf"
    result = parse_resume(resume_path)
    print(json.dumps(result, indent=2))
 
 
 
 
 
1
  import json
2
+ from pathlib import Path
3
+ from typing import Dict
4
+
5
+ from pdfminer.high_level import extract_text as pdf_extract_text
6
+ from docx import Document
7
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
8
 
9
  # --------------------
10
+ # Load Resume NER Model
11
  # --------------------
# Hugging Face Hub checkpoint used for resume entity recognition.
MODEL_NAME = "Ioana23/bert-finetuned-resumes-ner"

# NOTE: these statements run at import time, so importing this module loads
# the tokenizer and the model weights before any function is called.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" asks the pipeline for grouped entities;
# parse_resume reads the "entity_group" and "word" keys of each result.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
17
 
18
  # --------------------
19
+ # Extract Text from PDF/DOCX
20
  # --------------------
def extract_text(file_path: str) -> str:
    """Return the text content of a PDF or DOCX resume.

    The format is chosen from the file extension, compared
    case-insensitively. PDFs are handed to pdfminer's ``extract_text``;
    DOCX files are opened with python-docx and their paragraph texts are
    joined with newlines.

    Args:
        file_path: Path to the resume file.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return pdf_extract_text(file_path)

    if suffix == ".docx":
        doc = Document(file_path)
        # NOTE(review): only doc.paragraphs is read; text held elsewhere in
        # the document (e.g. tables) is presumably not included - confirm
        # against python-docx before relying on full coverage.
        return "\n".join(p.text for p in doc.paragraphs)

    raise ValueError("Unsupported file format")
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # --------------------
32
  # Parse Resume
33
  # --------------------
def parse_resume(file_path: str) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume.

    The resume text is obtained with ``extract_text`` and tagged by the
    module-level ``ner_pipeline``. Each recognised entity is routed to one of
    four fields according to its (upper-cased) ``entity_group`` label.

    Args:
        file_path: Path to the resume file to parse.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Name parts are joined with spaces, the other fields
        with ``", "``. Duplicates are dropped while keeping first-seen order.
        A field for which nothing was recognised is ``"Not Found"``.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported file
            extensions.
    """
    label_to_field = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    found = {"name": [], "skills": [], "education": [], "experience": []}

    text = extract_text(file_path)
    # Nothing to tag in an empty or whitespace-only document, so the model
    # call is skipped and every field falls through to "Not Found".
    entities = ner_pipeline(text) if text.strip() else []

    for ent in entities:
        field = label_to_field.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Ignore labels outside the mapping and entities that are blank after
        # stripping (they would otherwise leave stray separators).
        if field is not None and value:
            found[field].append(value)

    # dict.fromkeys de-duplicates while preserving insertion order; the
    # ``or`` supplies the placeholder when the joined string is empty.
    return {
        "name": " ".join(dict.fromkeys(found["name"])) or "Not Found",
        "skills": ", ".join(dict.fromkeys(found["skills"])) or "Not Found",
        "education": ", ".join(dict.fromkeys(found["education"])) or "Not Found",
        "experience": ", ".join(dict.fromkeys(found["experience"])) or "Not Found",
    }