husseinelsaadi committed on
Commit
775c09c
·
1 Parent(s): ff62567
Files changed (1) hide show
  1. backend/services/resume_parser.py +30 -277
backend/services/resume_parser.py CHANGED
@@ -1,95 +1,21 @@
1
- """
2
- resume_parser.py
3
- =================
4
-
5
- This module provides lightweight functions to extract useful information
6
- from a candidate's resume. The design avoids heavy dependencies such
7
- as spaCy or pdfminer because Hugging Face Spaces environments are
8
- resource‑constrained and installing additional packages at runtime is
9
- often not feasible. Instead, built‑in Python libraries and a
10
- few simple heuristics are used to extract text from both PDF and DOCX
11
- files and to infer the candidate's name, skills, education and
12
- experience from that text.
13
-
14
- The parser operates on the assumption that most resumes follow a
15
- relatively consistent structure: the candidate's name appears near the
16
- top of the document, headings such as "Education" and "Experience"
17
- demarcate sections, and common skill keywords are scattered
18
- throughout. These assumptions will not hold for every CV, but they
19
- provide a reasonable baseline for auto‑filling form fields. Users can
20
- always edit the populated fields before submitting their application.
21
-
22
- Functions
23
- ---------
24
-
25
- * ``extract_text(file_path: str) -> str``
26
- Read a resume file (PDF or DOCX) and return its plain text. PDFs
27
- are processed using the ``pdftotext`` command line tool, which is
28
- available in the Hugging Face Spaces container. DOCX files are
29
- treated as zip archives; the ``word/document.xml`` component is
30
- parsed and stripped of XML tags.
31
-
32
- * ``extract_name(text: str, filename: str) -> str``
33
- Attempt to infer the candidate's full name from the document text.
34
- If no plausible name is found in the first few lines of the text,
35
- fall back to deriving a name from the file name itself.
36
-
37
- * ``extract_skills(text: str) -> list[str]``
38
- Search for a predefined list of common technical and soft skills
39
- within the resume text. Matches are case‑insensitive and unique
40
- values are returned in their original capitalisation.
41
-
42
- * ``extract_education(text: str) -> list[str]``
43
- Identify lines mentioning educational qualifications. Heuristics
44
- include the presence of keywords like "University", "Bachelor",
45
- "Master", "PhD", etc.
46
-
47
- * ``extract_experience(text: str) -> list[str]``
48
- Extract statements describing work experience. Lines containing
49
- keywords such as "experience", "Developer", "Engineer" or those
50
- matching patterns with years of service are considered.
51
-
52
- * ``parse_resume(file_path: str, filename: str) -> dict``
53
- High‑level wrapper that orchestrates the text extraction and
54
- information extraction functions. Returns a dictionary with keys
55
- ``name``, ``skills``, ``education``, and ``experience``.
56
-
57
- The main Flask route can import ``parse_resume`` from this module and
58
- return its result as JSON. Because the heuristics are conservative and
59
- string‑based, the parser runs quickly on both CPU and GPU hosts.
60
- """
61
-
62
  from __future__ import annotations
63
-
64
  import os
65
  import re
66
  import subprocess
67
  import zipfile
68
  from typing import List
 
69
 
 
 
70
 
71
  def extract_text(file_path: str) -> str:
72
- """Extract raw text from a PDF or DOCX resume.
73
-
74
- Parameters
75
- ----------
76
- file_path : str
77
- Absolute path to the uploaded resume.
78
-
79
- Returns
80
- -------
81
- str
82
- The textual content of the resume. If extraction fails,
83
- returns an empty string.
84
- """
85
  if not file_path or not os.path.isfile(file_path):
86
  return ""
87
 
88
  lower_name = file_path.lower()
89
  try:
90
- # If the file ends with .pdf use pdftotext. The '-layout'
91
- # flag preserves relative positioning which helps preserve
92
- # line breaks in the output. Output is sent to stdout.
93
  if lower_name.endswith('.pdf'):
94
  try:
95
  result = subprocess.run(
@@ -98,244 +24,71 @@ def extract_text(file_path: str) -> str:
98
  stderr=subprocess.PIPE,
99
  check=False
100
  )
101
- raw_text = result.stdout.decode('utf-8', errors='ignore')
102
- # Normalize whitespace and ensure section keywords are on separate lines
103
- raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
104
- raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
105
- raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
106
- # Replace multiple spaces/tabs but keep newlines
107
- raw_text = re.sub(r'[ \t]+', ' ', raw_text)
108
- # Ensure section keywords are isolated
109
- raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
110
- raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
111
- raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
112
- return raw_text
113
-
114
-
115
  except Exception:
116
  return ""
117
- # If it's a .docx treat it as a zip archive and pull the main
118
- # document XML. Note that .doc files are not supported since
119
- # they use a binary format.
120
  elif lower_name.endswith('.docx'):
121
  try:
122
  with zipfile.ZipFile(file_path) as zf:
123
  with zf.open('word/document.xml') as docx_xml:
124
  xml_bytes = docx_xml.read()
125
- # Remove XML tags to leave plain text. Replace
126
- # tags with spaces to avoid accidental word
127
- # concatenation.
128
  xml_text = xml_bytes.decode('utf-8', errors='ignore')
129
- # Replace common markup elements with newlines to
130
- # preserve paragraph structure. Some tags like
131
- # ``<w:p>`` represent paragraphs in Word.
132
  xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
133
- # Remove remaining tags
134
  text = re.sub(r'<[^>]+>', ' ', xml_text)
135
- # Collapse multiple whitespace
136
  text = re.sub(r'\s+', ' ', text)
137
  return text
138
  except Exception:
139
  return ""
140
  else:
141
- # Unsupported file type
142
  return ""
143
  except Exception:
144
  return ""
145
 
146
-
147
  def extract_name(text: str, filename: str) -> str:
148
- """Attempt to extract the candidate's full name from the resume.
149
-
150
- This function first inspects the first few lines of the resume
151
- text. It looks for lines containing between two and four words
152
- where each word starts with an uppercase letter. If such a line
153
- isn't found, it falls back to deriving a name from the file name.
154
-
155
- Parameters
156
- ----------
157
- text : str
158
- The full resume text.
159
- filename : str
160
- The original filename of the uploaded resume.
161
-
162
- Returns
163
- -------
164
- str
165
- Inferred full name or an empty string if not found.
166
- """
167
  if text:
168
- # Consider the first 10 lines for a potential name. Strip
169
- # whitespace and ignore empty lines.
170
  lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
171
  for line in lines[:10]:
172
- # Remove common headings like "Resume" or "Curriculum Vitae"
173
  if re.match(r'(?i)resume|curriculum vitae', line):
174
  continue
175
  words = line.split()
176
- # A plausible name typically has 2–4 words
177
  if 1 < len(words) <= 4:
178
- # All words must start with an uppercase letter (allow
179
- # accented characters) and contain at least one letter.
180
  if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
181
  return line
182
- # Fallback: derive a name from the filename
183
  base = os.path.basename(filename)
184
- # Remove extension
185
  base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
186
- # Replace underscores, dashes and dots with spaces
187
  base = re.sub(r'[\._-]+', ' ', base)
188
- # Remove common tokens like 'cv' or 'resume'
189
  base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
190
  base = re.sub(r'\s+', ' ', base).strip()
191
- # Title case the remaining string
192
  return base.title() if base else ''
193
 
194
-
195
- def extract_skills(text: str) -> List[str]:
196
- """Identify common skills mentioned in the resume.
197
-
198
- A predefined set of skills is checked against the resume text in a
199
- case‑insensitive manner. If a skill phrase appears anywhere in the
200
- text, it is added to the result list. Multi‑word skills must match
201
- the full phrase to count as a hit.
202
-
203
- Parameters
204
- ----------
205
- text : str
206
- The resume's full text.
207
-
208
- Returns
209
- -------
210
- list[str]
211
- Unique skills found in the resume, preserving their original
212
- capitalisation where possible.
213
- """
214
- if not text:
215
- return []
216
- lower_text = text.lower()
217
- # Define a set of common technical and soft skills. This list can
218
- # be extended in future iterations without modifying the parser
219
- SKILLS = [
220
- 'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
221
- 'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
222
- 'machine learning', 'deep learning', 'nlp', 'data analysis',
223
- 'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
224
- 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
225
- 'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
226
- 'matplotlib', 'excel', 'powerpoint', 'project management',
227
- 'communication', 'teamwork', 'leadership', 'problem solving',
228
- 'public speaking', 'writing', 'analysis', 'time management'
229
- ]
230
- found = []
231
- for skill in SKILLS:
232
- pattern = re.escape(skill.lower())
233
- if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
234
- # Preserve the original capitalisation of the skill phrase
235
- found.append(skill.title() if skill.islower() else skill)
236
- return list(dict.fromkeys(found)) # Remove duplicates, preserve order
237
-
238
-
239
- def extract_education(text: str) -> List[str]:
240
- """Gather educational qualifications from the resume text.
241
-
242
- The function searches for lines containing keywords related to
243
- education. Only distinct lines with meaningful content are
244
- included.
245
-
246
- Parameters
247
- ----------
248
- text : str
249
-
250
- Returns
251
- -------
252
- list[str]
253
- Lines representing educational qualifications.
254
- """
255
- if not text:
256
- return []
257
- lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
258
- education_keywords = [
259
- 'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
260
- 'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
261
- 'diploma', 'engineering', 'work history'
262
- ]
263
-
264
- results = []
265
- for line in lines:
266
- lower = line.lower()
267
- if any(kw in lower for kw in education_keywords):
268
- # Avoid capturing the same line twice
269
- if line not in results:
270
- results.append(line)
271
- # If nothing found, return an empty list
272
- return results
273
-
274
-
275
- def extract_experience(text: str) -> List[str]:
276
- """Extract snippets of work experience from resume text.
277
-
278
- Heuristics are used to detect sentences or lines that likely
279
- describe professional experience. Indicators include the presence
280
- of keywords like "experience", job titles, or explicit durations.
281
-
282
- Parameters
283
- ----------
284
- text : str
285
-
286
- Returns
287
- -------
288
- list[str]
289
- A list of lines summarising work experience.
290
- """
291
- if not text:
292
- return []
293
- lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
294
- # Keywords signalling experience entries
295
- exp_keywords = [
296
- 'experience', 'worked', 'employment', 'internship', 'developer',
297
- 'engineer', 'manager', 'analyst', 'consultant', 'assistant',
298
- 'years', 'year', 'months', 'month', 'present'
299
- ]
300
- results = []
301
- for line in lines:
302
- lower = line.lower()
303
- if any(kw in lower for kw in exp_keywords):
304
- # Filter out lines that are just section headings
305
- if len(lower.split()) > 2:
306
- if line not in results:
307
- results.append(line)
308
- return results
309
-
310
 
311
  def parse_resume(file_path: str, filename: str) -> dict:
312
- """High‑level helper to parse a resume into structured fields.
313
-
314
- Parameters
315
- ----------
316
- file_path : str
317
- Location of the uploaded file on disk.
318
- filename : str
319
- The original filename as provided by the user. Used as a
320
- fallback for name extraction if the document text does not
321
- reveal a plausible name.
322
-
323
- Returns
324
- -------
325
- dict
326
- Dictionary with keys ``name``, ``skills``, ``education`` and
327
- ``experience``. Each value is a string, except for the name
328
- which is a single string. Lists are joined into a comma or
329
- newline separated string suitable for form fields.
330
- """
331
  text = extract_text(file_path)
332
  name = extract_name(text, filename)
333
- skills_list = extract_skills(text)
334
- education_list = extract_education(text)
335
- experience_list = extract_experience(text)
336
  return {
337
  'name': name or '',
338
- 'skills': ', '.join(skills_list) if skills_list else '',
339
- 'education': '\n'.join(education_list) if education_list else '',
340
- 'experience': '\n'.join(experience_list) if experience_list else ''
341
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
 
2
  import os
3
  import re
4
  import subprocess
5
  import zipfile
6
  from typing import List
7
from transformers import pipeline

# NER model used for resume parsing. The pipeline is created lazily on
# first call instead of at import time: loading the model is slow and may
# fail (e.g. no network), and eager loading would make importing this
# module fail with it.
_NER_MODEL = "AI-Sweden-Models/distilbert-resume-ner"
_ner_pipeline = None


def ner(text):
    """Run the resume NER pipeline on ``text``, loading the model on first use.

    Keeps the same call signature as the previous module-level pipeline
    object, so ``ner(text)`` behaves identically for callers.
    """
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline(
            "ner", model=_NER_MODEL, aggregation_strategy="simple"
        )
    return _ner_pipeline(text)
11
 
12
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    Parameters
    ----------
    file_path : str
        Path to the uploaded resume on disk.

    Returns
    -------
    str
        The textual content of the resume, or an empty string if the
        file is missing, of an unsupported type, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            try:
                # NOTE(review): the exact pdftotext arguments were not fully
                # visible; reconstructed from the documented behaviour
                # ("-layout" flag, text written to stdout) — confirm.
                result = subprocess.run(
                    ['pdftotext', '-layout', file_path, '-'],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    check=False
                )
                return result.stdout.decode('utf-8', errors='ignore')
            except Exception:
                return ""
        elif lower_name.endswith('.docx'):
            try:
                # A .docx file is a zip archive; the document body lives
                # in word/document.xml.
                with zipfile.ZipFile(file_path) as zf:
                    with zf.open('word/document.xml') as docx_xml:
                        xml_bytes = docx_xml.read()
                xml_text = xml_bytes.decode('utf-8', errors='ignore')
                # <w:p ...> marks a Word paragraph: convert to newlines so
                # downstream line-based heuristics (e.g. name detection)
                # still see the document's structure.
                xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                # Strip remaining tags; use a space so adjacent words are
                # not accidentally glued together.
                text = re.sub(r'<[^>]+>', ' ', xml_text)
                # Collapse runs of spaces/tabs but PRESERVE newlines —
                # collapsing all whitespace flattened the resume onto one
                # line and broke line-based name extraction.
                text = re.sub(r'[ \t]+', ' ', text)
                text = re.sub(r'\n+', '\n', text)
                return text
            except Exception:
                return ""
        else:
            # Unsupported extension (e.g. legacy binary .doc).
            return ""
    except Exception:
        return ""
46
 
 
47
def extract_name(text: str, filename: str) -> str:
    """Infer the candidate's full name.

    Scans the first ten non-empty lines of ``text`` for a plausible name
    (two to four capitalised words, skipping "Resume"/"Curriculum Vitae"
    headings). If none is found, falls back to a cleaned-up version of
    ``filename``.
    """
    heading_re = re.compile(r'(?i)resume|curriculum vitae')
    word_re = re.compile(r'^[A-ZÀ-ÖØ-Þ][\w\-]*')

    def looks_like_name(candidate: str) -> bool:
        # A plausible name: 2-4 words, each starting with an uppercase letter.
        tokens = candidate.split()
        return 1 < len(tokens) <= 4 and all(word_re.match(t) for t in tokens)

    if text:
        candidates = [ln.strip() for ln in text.splitlines() if ln.strip()][:10]
        for line in candidates:
            if heading_re.match(line):
                continue
            if looks_like_name(line):
                return line

    # Fall back to the file name: drop the extension, turn separators into
    # spaces, remove filler tokens like "cv"/"resume", then title-case.
    stem = os.path.basename(filename)
    stem = re.sub(r'\.(pdf|docx|doc)$', '', stem, flags=re.I)
    stem = re.sub(r'[\._-]+', ' ', stem)
    stem = re.sub(r'(?i)\b(cv|resume)\b', '', stem)
    stem = re.sub(r'\s+', ' ', stem).strip()
    return stem.title() if stem else ''
64
 
65
def extract_entities(text: str) -> dict:
    """Extract structured resume info using the NER model.

    Parameters
    ----------
    text : str
        Resume plain text; may be empty.

    Returns
    -------
    dict
        Keys ``skills``, ``education`` and ``experience``, each mapping to
        a list of unique entity strings in order of first appearance.
    """
    skills: List[str] = []
    education: List[str] = []
    experience: List[str] = []
    # Guard: don't run the pipeline on empty/blank text — it is pointless
    # and some tokenizers raise on empty input.
    if text and text.strip():
        for ent in ner(text):
            label = ent['entity_group'].upper()
            word = ent['word'].strip()
            if label in ("SKILL", "TECH", "TECHNOLOGY"):
                skills.append(word)
            elif label in ("EDUCATION", "DEGREE", "QUALIFICATION"):
                education.append(word)
            elif label in ("EXPERIENCE", "JOB", "ROLE"):
                experience.append(word)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return {
        "skills": list(dict.fromkeys(skills)),
        "education": list(dict.fromkeys(education)),
        "experience": list(dict.fromkeys(experience))
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume into form-ready string fields.

    Runs text extraction, name inference and NER entity extraction, then
    flattens each entity list into a comma-separated string.

    Returns a dict with string values under ``name``, ``skills``,
    ``education`` and ``experience``.
    """
    resume_text = extract_text(file_path)
    entities = extract_entities(resume_text)

    def joined(field: str) -> str:
        # Empty lists become the empty string expected by the form fields.
        values = entities[field]
        return ', '.join(values) if values else ''

    return {
        'name': extract_name(resume_text, filename) or '',
        'skills': joined('skills'),
        'education': joined('education'),
        'experience': joined('experience')
    }