husseinelsaadi committed on
Commit
b8deff5
·
1 Parent(s): 288175b
Files changed (1) hide show
  1. backend/services/resume_parser.py +17 -19
backend/services/resume_parser.py CHANGED
@@ -6,49 +6,47 @@ from pdfminer.high_level import extract_text as pdf_extract_text
6
  from docx import Document
7
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
8
 
9
- # --------------------
10
- # Load PyTorch Resume NER Model
11
- # --------------------
12
- MODEL_NAME = "manishiitg/resume-ner" # Works with PyTorch on Hugging Face Spaces
13
 
14
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
16
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
17
 
18
- # --------------------
19
- # Extract Text from PDF/DOCX
20
- # --------------------
21
  def extract_text(file_path: str) -> str:
22
  path = Path(file_path)
23
  if path.suffix.lower() == ".pdf":
24
- return pdf_extract_text(file_path)
25
  elif path.suffix.lower() == ".docx":
26
  doc = Document(file_path)
27
- return "\n".join([p.text for p in doc.paragraphs])
28
  else:
29
  raise ValueError("Unsupported file format")
 
 
 
 
30
 
31
- # --------------------
32
- # Parse Resume (returns only: full name, skills, education, experience)
33
- # --------------------
34
  def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
35
  text = extract_text(file_path)
36
  entities = ner_pipeline(text)
37
 
38
- name_parts = []
39
- skills = []
40
- education = []
41
- experience = []
 
 
 
42
 
43
  for ent in entities:
44
  label = ent["entity_group"].upper()
45
  value = ent["word"].strip()
46
 
47
- if label == "NAME":
48
  name_parts.append(value)
49
- elif label == "SKILL":
50
  skills.append(value)
51
- elif label in ["EDUCATION", "DEGREE"]:
52
  education.append(value)
53
  elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
54
  experience.append(value)
 
6
  from docx import Document
7
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
8
 
9
+ MODEL_NAME = "manishiitg/resume-ner"
 
 
 
10
 
11
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
12
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
13
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
14
 
 
 
 
15
  def extract_text(file_path: str) -> str:
16
  path = Path(file_path)
17
  if path.suffix.lower() == ".pdf":
18
+ text = pdf_extract_text(file_path)
19
  elif path.suffix.lower() == ".docx":
20
  doc = Document(file_path)
21
+ text = "\n".join([p.text for p in doc.paragraphs])
22
  else:
23
  raise ValueError("Unsupported file format")
24
+
25
+ # Clean text
26
+ text = text.replace("\n", " ").replace("\r", " ").strip()
27
+ return text
28
 
 
 
 
29
  def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
30
  text = extract_text(file_path)
31
  entities = ner_pipeline(text)
32
 
33
+ # Debug: Print actual detected entities
34
+ print("\n=== DEBUG: Entities Detected ===")
35
+ for ent in entities:
36
+ print(f"{ent['entity_group']} => {ent['word']}")
37
+ print("==============================\n")
38
+
39
+ name_parts, skills, education, experience = [], [], [], []
40
 
41
  for ent in entities:
42
  label = ent["entity_group"].upper()
43
  value = ent["word"].strip()
44
 
45
+ if label in ["NAME", "PERSON"]:
46
  name_parts.append(value)
47
+ elif label in ["SKILL", "SKILLS"]:
48
  skills.append(value)
49
+ elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
50
  education.append(value)
51
  elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
52
  experience.append(value)