husseinelsaadi committed on
Commit
288175b
·
1 Parent(s): c0dac84

resume parser updated

Browse files
Files changed (1) hide show
  1. backend/services/resume_parser.py +54 -103
backend/services/resume_parser.py CHANGED
@@ -1,112 +1,63 @@
1
- import re
2
  from pathlib import Path
 
 
3
  from pdfminer.high_level import extract_text as pdf_extract_text
4
  from docx import Document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
class ResumeParser:
    """Rule-based resume parser for PDF and DOCX files.

    Text is pulled out of the document, then a candidate name and the
    skills / education / experience sections are located with regular
    expressions.
    """

    # Keyword alternations that mark the start of each section.
    _SECTION_KEYWORDS = {
        "skills": r"skills|technologies|expertise",
        "education": r"education|degrees?",
        "experience": r"experience|work history|employment",
    }

    def __init__(self):
        pass

    @staticmethod
    def _empty_result(name: str) -> dict:
        """Return a result dict with the given name and empty sections."""
        return {"name": name, "skills": [], "education": [], "experience": []}

    def _section_body(self, text: str, keywords: str) -> str:
        """Return the text following the first matching section keyword.

        The body runs from the keyword (plus any colon/whitespace) up to the
        next blank line or the end of the text; "" when no keyword is found.
        """
        # \b keeps keywords from matching inside longer words, and DOTALL lets
        # the body span several lines before the terminating blank line.
        pattern = rf'\b(?:{keywords})\b[:\s]*(.*?)(?:\n\s*\n|$)'
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        return match.group(1) if match else ""

    def extract_text(self, file_path: str) -> str:
        """Extract text from PDF or DOCX files.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` file (suffix is
                matched case-insensitively).

        Returns:
            The document text with line breaks preserved.

        Raises:
            ValueError: If the suffix is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()

        if suffix == ".pdf":
            text = pdf_extract_text(file_path)
            # Collapse whitespace within each line only: the name and section
            # extractors rely on newlines, so they must survive cleanup.
            cleaned = [re.sub(r'\s+', ' ', line).strip() for line in text.splitlines()]
            return "\n".join(cleaned).strip()
        if suffix == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        raise ValueError("Unsupported file format")

    def extract_name(self, text: str) -> str:
        """Extract name from resume text.

        Only the first ten lines are inspected. Returns "Not Found" when no
        line looks like a name.
        """
        first_lines = [line.strip() for line in text.split('\n')[:10] if line.strip()]

        # Strict pass: 2-4 capitalised alphabetic words that are not a
        # document title such as "Curriculum Vitae".
        for line in first_lines:
            if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$', line):
                if not any(word.lower() in ('resume', 'cv', 'curriculum') for word in line.split()):
                    return line

        # Fallback: first short line (2-4 words) starting with a capital.
        for line in first_lines:
            if 2 <= len(line.split()) <= 4 and line[0].isupper():
                return line

        return "Not Found"

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education, and experience using regex.

        Returns:
            A dict with keys "skills", "education" and "experience", each a
            list of stripped, non-empty strings (empty list when the section
            is not found).
        """
        skills_text = self._section_body(text, self._SECTION_KEYWORDS["skills"])
        education_text = self._section_body(text, self._SECTION_KEYWORDS["education"])
        experience_text = self._section_body(text, self._SECTION_KEYWORDS["experience"])

        return {
            # Skills may be separated by commas, semicolons or line breaks.
            "skills": [s.strip() for s in re.split(r'[,;\n]', skills_text) if s.strip()],
            "education": [e.strip() for e in education_text.split('\n') if e.strip()],
            "experience": [x.strip() for x in experience_text.split('\n') if x.strip()],
        }

    def parse_resume(self, file_path: str) -> dict:
        """Main parsing function.

        Never raises: any failure is reported through the "name" field as
        "Error: <message>" with empty section lists.

        Returns:
            A dict with "name" plus at most 10 skills, 3 education entries
            and 3 experience entries.
        """
        try:
            text = self.extract_text(file_path)

            if not text or len(text.strip()) < 10:
                return self._empty_result("Error: Empty file")

            name = self.extract_name(text)
            sections = self.extract_sections(text)

            return {
                "name": name,
                "skills": sections["skills"][:10],  # Limit to 10 skills
                "education": sections["education"][:3],  # Limit to 3 items
                "experience": sections["experience"][:3],  # Limit to 3 items
            }
        except Exception as e:
            # Deliberate best-effort: callers get an error marker, not a crash.
            return self._empty_result(f"Error: {str(e)}")
106
 
107
# Shared parser object reused by every call to the module-level helper below.
resume_parser = ResumeParser()


def parse_resume(file_path: str) -> dict:
    """Parse the resume at *file_path* using the shared ``resume_parser``.

    Thin module-level entry point: the work is delegated to
    ``ResumeParser.parse_resume`` and its result is returned unchanged.
    """
    parsed = resume_parser.parse_resume(file_path)
    return parsed
 
 
 
 
1
+ import json
2
  from pathlib import Path
3
+ from typing import Dict
4
+
5
  from pdfminer.high_level import extract_text as pdf_extract_text
6
  from docx import Document
7
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
8
+
9
# --------------------
# Load PyTorch Resume NER Model
# --------------------
# Model identifier passed to both from_pretrained() calls below.
MODEL_NAME = "manishiitg/resume-ner"  # Works with PyTorch on Hugging Face Spaces

# Loaded once at import time so every parse call reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# parse_resume reads the "entity_group" and "word" keys of each result.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
17
+
18
+ # --------------------
19
+ # Extract Text from PDF/DOCX
20
+ # --------------------
21
def extract_text(file_path: str) -> str:
    """Return the raw text of a PDF or DOCX resume.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file; the suffix is
            matched case-insensitively.

    Returns:
        For PDFs, whatever ``pdf_extract_text`` yields; for DOCX, the
        paragraph texts joined with newlines.

    Raises:
        ValueError: If the suffix is neither ``.pdf`` nor ``.docx``. The
            message names the rejected suffix to make bad uploads easy to
            diagnose.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return pdf_extract_text(file_path)
    if suffix == ".docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    raise ValueError(f"Unsupported file format: {suffix or 'no extension'}")
30
 
31
+ # --------------------
32
+ # Parse Resume (returns only: full name, skills, education, experience)
33
+ # --------------------
34
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Parse a resume into name, skills, education and experience strings.

    The file's text is run through ``ner_pipeline`` and each recognised
    entity is routed to one of the four output fields by its label.

    Args:
        file_path: Path to the PDF or DOCX resume, passed to ``extract_text``.
        filename: Accepted for caller compatibility; not used here.

    Returns:
        A dict with keys "name", "skills", "education" and "experience".
        The name parts are joined with spaces and the other fields with
        ", ". Duplicates are dropped keeping first-seen order, and a field
        with no entities is "Not Found".

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported formats.
    """
    text = extract_text(file_path)

    # Entity label (upper-cased) -> output field it contributes to.
    field_for_label = {
        "NAME": "name",
        "SKILL": "skills",
        "EDUCATION": "education",
        "DEGREE": "education",
        "EXPERIENCE": "experience",
        "JOB": "experience",
        "ROLE": "experience",
        "POSITION": "experience",
    }
    collected = {"name": [], "skills": [], "education": [], "experience": []}

    # Nothing to tag in a blank document, so skip the model call entirely.
    entities = ner_pipeline(text) if text and text.strip() else []

    for ent in entities:
        field = field_for_label.get(ent["entity_group"].upper())
        value = ent["word"].strip()
        # Ignore unmapped labels and whitespace-only words; the latter would
        # otherwise leave stray separators in the joined output.
        if field is None or not value:
            continue
        collected[field].append(value)

    def _joined(values, separator):
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return separator.join(dict.fromkeys(values)) or "Not Found"

    return {
        "name": _joined(collected["name"], " "),
        "skills": _joined(collected["skills"], ", "),
        "education": _joined(collected["education"], ", "),
        "experience": _joined(collected["experience"], ", "),
    }