husseinelsaadi committed on
Commit
682910e
·
1 Parent(s): 947d727
Files changed (1) hide show
  1. backend/services/resume_parser.py +45 -35
backend/services/resume_parser.py CHANGED
@@ -1,56 +1,58 @@
 
 
 
 
 
1
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
2
- import zipfile, re, os
3
- from PyPDF2 import PdfReader # Lightweight & already in Spaces
4
 
5
- # ===============================
6
- # Load Model & Tokenizer
7
- # ===============================
8
- MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Swap to Kiet model if needed
 
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 
11
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
12
 
13
- # ===============================
14
- # Extract Text (PDF & DOCX)
15
- # ===============================
16
  def extract_text(file_path: str) -> str:
17
- """Extract text from PDF or DOCX without external dependencies."""
18
- file_path_lower = file_path.lower()
19
-
20
- # PDF reading using PyPDF2 (no fitz, no installs needed)
21
- if file_path_lower.endswith(".pdf"):
22
- text = ""
23
- with open(file_path, "rb") as f:
24
- reader = PdfReader(f)
25
- for page in reader.pages:
26
- page_text = page.extract_text()
27
- if page_text:
28
- text += page_text + "\n"
29
- return text
30
-
31
- # DOCX reading by extracting XML content
32
- elif file_path_lower.endswith(".docx"):
33
- with zipfile.ZipFile(file_path) as zf:
34
- with zf.open("word/document.xml") as docx_xml:
35
- xml_text = docx_xml.read().decode("utf-8", errors="ignore")
36
- xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
37
- return re.sub(r"<[^>]+>", " ", xml_text)
38
-
39
  return ""
40
 
41
- # ===============================
42
  # Parse Resume
43
- # ===============================
44
  def parse_resume(file_path: str, filename: str = None) -> dict:
45
- """Parse resume and extract structured information."""
46
  text = extract_text(file_path)
47
-
48
  entities = ner_pipeline(text)
49
 
50
  name, skills, education, experience = [], [], [], []
51
  for ent in entities:
52
  label = ent["entity_group"].upper()
53
  word = ent["word"].strip()
 
54
  if label == "NAME":
55
  name.append(word)
56
  elif label == "SKILL":
@@ -66,3 +68,11 @@ def parse_resume(file_path: str, filename: str = None) -> dict:
66
  "education": ", ".join(dict.fromkeys(education)),
67
  "experience": ", ".join(dict.fromkeys(experience))
68
  }
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import subprocess
4
+ import zipfile
5
+ import json
6
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
 
7
 
8
+ # --------------------
9
+ # Load Model
10
+ # --------------------
11
+ MODEL_NAME = "sravya-abburi/ResumeParserBERT"
12
+
13
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
14
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
15
+
16
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
17
 
18
+ # --------------------
19
+ # Extract Text
20
+ # --------------------
21
  def extract_text(file_path: str) -> str:
22
+ """Extract text from PDF/DOCX resumes."""
23
+ if file_path.lower().endswith(".pdf"):
24
+ try:
25
+ result = subprocess.run(
26
+ ["pdftotext", "-layout", file_path, "-"],
27
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
28
+ )
29
+ return result.stdout.decode("utf-8", errors="ignore")
30
+ except:
31
+ return ""
32
+ elif file_path.lower().endswith(".docx"):
33
+ try:
34
+ with zipfile.ZipFile(file_path) as zf:
35
+ with zf.open("word/document.xml") as docx_xml:
36
+ xml_text = docx_xml.read().decode("utf-8", errors="ignore")
37
+ xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
38
+ return re.sub(r"<[^>]+>", " ", xml_text)
39
+ except:
40
+ return ""
 
 
 
41
  return ""
42
 
43
+ # --------------------
44
  # Parse Resume
45
+ # --------------------
46
  def parse_resume(file_path: str, filename: str = None) -> dict:
47
+ """Extract Name, Skills, Education, Experience from resume."""
48
  text = extract_text(file_path)
 
49
  entities = ner_pipeline(text)
50
 
51
  name, skills, education, experience = [], [], [], []
52
  for ent in entities:
53
  label = ent["entity_group"].upper()
54
  word = ent["word"].strip()
55
+
56
  if label == "NAME":
57
  name.append(word)
58
  elif label == "SKILL":
 
68
  "education": ", ".join(dict.fromkeys(education)),
69
  "experience": ", ".join(dict.fromkeys(experience))
70
  }
71
+
72
+ # --------------------
73
+ # Example
74
+ # --------------------
75
if __name__ == "__main__":
    # Manual smoke test: parse one resume and pretty-print the result as JSON.
    import sys

    # Take the resume path from the first command-line argument when given;
    # otherwise fall back to the previous hard-coded default.
    resume_path = sys.argv[1] if len(sys.argv) > 1 else "resume.pdf"
    result = parse_resume(resume_path)
    print(json.dumps(result, indent=2))