Commit 775c09c
Parent(s): ff62567
updated

backend/services/resume_parser.py  (CHANGED, +30 -277)

@@ -1,95 +1,21 @@
-"""
-resume_parser.py
-=================
-
-This module provides lightweight functions to extract useful information
-from a candidate's resume. The design avoids heavy dependencies such
-as spaCy or pdfminer because Hugging Face Spaces environments are
-resource-constrained and installing additional packages at runtime is
-often not feasible. Instead, built-in Python libraries and a
-few simple heuristics are used to extract text from both PDF and DOCX
-files and to infer the candidate's name, skills, education and
-experience from that text.
-
-The parser operates on the assumption that most resumes follow a
-relatively consistent structure: the candidate's name appears near the
-top of the document, headings such as "Education" and "Experience"
-demarcate sections, and common skill keywords are scattered
-throughout. These assumptions will not hold for every CV, but they
-provide a reasonable baseline for auto-filling form fields. Users can
-always edit the populated fields before submitting their application.
-
-Functions
----------
-
-* ``extract_text(file_path: str) -> str``
-    Read a resume file (PDF or DOCX) and return its plain text. PDFs
-    are processed using the ``pdftotext`` command line tool, which is
-    available in the Hugging Face Spaces container. DOCX files are
-    treated as zip archives; the ``word/document.xml`` component is
-    parsed and stripped of XML tags.
-
-* ``extract_name(text: str, filename: str) -> str``
-    Attempt to infer the candidate's full name from the document text.
-    If no plausible name is found in the first few lines of the text,
-    fall back to deriving a name from the file name itself.
-
-* ``extract_skills(text: str) -> list[str]``
-    Search for a predefined list of common technical and soft skills
-    within the resume text. Matches are case-insensitive and unique
-    values are returned in their original capitalisation.
-
-* ``extract_education(text: str) -> list[str]``
-    Identify lines mentioning educational qualifications. Heuristics
-    include the presence of keywords like "University", "Bachelor",
-    "Master", "PhD", etc.
-
-* ``extract_experience(text: str) -> list[str]``
-    Extract statements describing work experience. Lines containing
-    keywords such as "experience", "Developer", "Engineer" or those
-    matching patterns with years of service are considered.
-
-* ``parse_resume(file_path: str, filename: str) -> dict``
-    High-level wrapper that orchestrates the text extraction and
-    information extraction functions. Returns a dictionary with keys
-    ``name``, ``skills``, ``education``, and ``experience``.
-
-The main Flask route can import ``parse_resume`` from this module and
-return its result as JSON. Because the heuristics are conservative and
-string-based, the parser runs quickly on both CPU and GPU hosts.
-"""
-
 from __future__ import annotations
-
 import os
 import re
 import subprocess
 import zipfile
 from typing import List
+from transformers import pipeline
 
+# Load the NER model for resume parsing
+ner = pipeline("ner", model="AI-Sweden-Models/distilbert-resume-ner", aggregation_strategy="simple")
 
 def extract_text(file_path: str) -> str:
-    """Extract the textual content of the resume.
-
-    Parameters
-    ----------
-    file_path : str
-        Absolute path to the uploaded resume.
-
-    Returns
-    -------
-    str
-        The textual content of the resume. If extraction fails,
-        returns an empty string.
-    """
+    """Extract text from PDF or DOCX."""
     if not file_path or not os.path.isfile(file_path):
         return ""
 
     lower_name = file_path.lower()
     try:
-        # If the file ends with .pdf use pdftotext. The '-layout'
-        # flag preserves relative positioning which helps preserve
-        # line breaks in the output. Output is sent to stdout.
         if lower_name.endswith('.pdf'):
             try:
                 result = subprocess.run(
@@ -98,244 +24,71 @@ def extract_text(file_path: str) -> str:
                    stderr=subprocess.PIPE,
                    check=False
                )
-                raw_text = result.stdout.decode('utf-8', errors='ignore')
-                # Normalize whitespace and ensure section keywords are on separate lines
-                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
-                # Replace multiple spaces/tabs but keep newlines
-                raw_text = re.sub(r'[ \t]+', ' ', raw_text)
-                # Ensure section keywords are isolated
-                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
-                return raw_text
-
-
+                return result.stdout.decode('utf-8', errors='ignore')
            except Exception:
                return ""
-        # If it's a .docx treat it as a zip archive and pull the main
-        # document XML. Note that .doc files are not supported since
-        # they use a binary format.
         elif lower_name.endswith('.docx'):
             try:
                 with zipfile.ZipFile(file_path) as zf:
                     with zf.open('word/document.xml') as docx_xml:
                         xml_bytes = docx_xml.read()
-                # Remove XML tags to leave plain text. Replace
-                # tags with spaces to avoid accidental word
-                # concatenation.
                 xml_text = xml_bytes.decode('utf-8', errors='ignore')
-                # Replace common markup elements with newlines to
-                # preserve paragraph structure. Some tags like
-                # ``<w:p>`` represent paragraphs in Word.
                 xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
-                # Remove remaining tags
                 text = re.sub(r'<[^>]+>', ' ', xml_text)
-                # Collapse multiple whitespace
                 text = re.sub(r'\s+', ' ', text)
                 return text
             except Exception:
                 return ""
         else:
-            # Unsupported file type
             return ""
     except Exception:
         return ""
 
-
 def extract_name(text: str, filename: str) -> str:
-    """
-
-    This function first inspects the first few lines of the resume
-    text. It looks for lines containing between two and four words
-    where each word starts with an uppercase letter. If such a line
-    isn't found, it falls back to deriving a name from the file name.
-
-    Parameters
-    ----------
-    text : str
-        The full resume text.
-    filename : str
-        The original filename of the uploaded resume.
-
-    Returns
-    -------
-    str
-        Inferred full name or an empty string if not found.
-    """
+    """Extract candidate's name from text or filename."""
     if text:
-        # Consider the first 10 lines for a potential name. Strip
-        # whitespace and ignore empty lines.
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
-            # Remove common headings like "Resume" or "Curriculum Vitae"
             if re.match(r'(?i)resume|curriculum vitae', line):
                 continue
             words = line.split()
-            # A plausible name typically has 2–4 words
            if 1 < len(words) <= 4:
-                # All words must start with an uppercase letter (allow
-                # accented characters) and contain at least one letter.
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
-    # Fallback: derive a name from the filename
     base = os.path.basename(filename)
-    # Remove extension
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
-    # Replace underscores, dashes and dots with spaces
     base = re.sub(r'[\._-]+', ' ', base)
-    # Remove common tokens like 'cv' or 'resume'
     base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
     base = re.sub(r'\s+', ' ', base).strip()
-    # Title case the remaining string
     return base.title() if base else ''
 
-
-def extract_skills(text: str) -> List[str]:
-    """Search the resume text for a predefined list of common skills,
-    returning unique matches in their original
-    capitalisation where possible.
-    """
-    if not text:
-        return []
-    lower_text = text.lower()
-    # Define a set of common technical and soft skills. This list can
-    # be extended in future iterations without modifying the parser
-    SKILLS = [
-        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
-        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
-        'machine learning', 'deep learning', 'nlp', 'data analysis',
-        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
-        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
-        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
-        'matplotlib', 'excel', 'powerpoint', 'project management',
-        'communication', 'teamwork', 'leadership', 'problem solving',
-        'public speaking', 'writing', 'analysis', 'time management'
-    ]
-    found = []
-    for skill in SKILLS:
-        pattern = re.escape(skill.lower())
-        if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
-            # Preserve the original capitalisation of the skill phrase
-            found.append(skill.title() if skill.islower() else skill)
-    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
-
-
-def extract_education(text: str) -> List[str]:
-    """Gather educational qualifications from the resume text.
-
-    The function searches for lines containing keywords related to
-    education. Only distinct lines with meaningful content are
-    included.
-
-    Parameters
-    ----------
-    text : str
-
-    Returns
-    -------
-    list[str]
-        Lines representing educational qualifications.
-    """
-    if not text:
-        return []
-    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-    education_keywords = [
-        'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
-        'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
-        'diploma', 'engineering', 'work history'
-    ]
-
-    results = []
-    for line in lines:
-        lower = line.lower()
-        if any(kw in lower for kw in education_keywords):
-            # Avoid capturing the same line twice
-            if line not in results:
-                results.append(line)
-    # If nothing found, return an empty list
-    return results
-
-
-def extract_experience(text: str) -> List[str]:
-    """Extract snippets of work experience from resume text.
-
-    Heuristics are used to detect sentences or lines that likely
-    describe professional experience. Indicators include the presence
-    of keywords like "experience", job titles, or explicit durations.
-
-    Parameters
-    ----------
-    text : str
-
-    Returns
-    -------
-    list[str]
-        A list of lines summarising work experience.
-    """
-    if not text:
-        return []
-    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-    # Keywords signalling experience entries
-    exp_keywords = [
-        'experience', 'worked', 'employment', 'internship', 'developer',
-        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
-        'years', 'year', 'months', 'month', 'present'
-    ]
-    results = []
-    for line in lines:
-        lower = line.lower()
-        if any(kw in lower for kw in exp_keywords):
-            # Filter out lines that are just section headings
-            if len(lower.split()) > 2:
-                if line not in results:
-                    results.append(line)
-    return results
-
+def extract_entities(text: str) -> dict:
+    """Extract structured info using NER model."""
+    entities = ner(text)
+    skills, education, experience = [], [], []
+    for ent in entities:
+        label = ent['entity_group'].upper()
+        word = ent['word'].strip()
+        if label in ["SKILL", "TECH", "TECHNOLOGY"]:
+            skills.append(word)
+        elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
+            education.append(word)
+        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
+            experience.append(word)
+    return {
+        "skills": list(dict.fromkeys(skills)),
+        "education": list(dict.fromkeys(education)),
+        "experience": list(dict.fromkeys(experience))
+    }
 
 def parse_resume(file_path: str, filename: str) -> dict:
-    """
-
-    Parameters
-    ----------
-    file_path : str
-        Location of the uploaded file on disk.
-    filename : str
-        The original filename as provided by the user. Used as a
-        fallback for name extraction if the document text does not
-        reveal a plausible name.
-
-    Returns
-    -------
-    dict
-        Dictionary with keys ``name``, ``skills``, ``education`` and
-        ``experience``. Each value is a string, except for the name
-        which is a single string. Lists are joined into a comma or
-        newline separated string suitable for form fields.
-    """
+    """Main function to parse resume fields."""
     text = extract_text(file_path)
     name = extract_name(text, filename)
-    skills_list = extract_skills(text)
-    education_list = extract_education(text)
-    experience_list = extract_experience(text)
+    ents = extract_entities(text)
    return {
        'name': name or '',
-        'skills': ', '.join(skills_list),
-        'education': '\n'.join(education_list),
-        'experience': '\n'.join(experience_list)
-    }
+        'skills': ', '.join(ents["skills"]) if ents["skills"] else '',
+        'education': ', '.join(ents["education"]) if ents["education"] else '',
+        'experience': ', '.join(ents["experience"]) if ents["experience"] else ''
+    }
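A few notes on the new NER path. With aggregation_strategy="simple", a transformers token-classification pipeline returns one dict per merged entity, with the keys entity_group, score, word, start and end, which is the shape extract_entities iterates over. The label names it branches on ("SKILL", "DEGREE", "ROLE", and so on) are whatever label set the checkpoint was trained with, so they are an assumption to verify against the model card. A minimal probe, assuming the checkpoint named in the commit actually loads:

from transformers import pipeline

ner = pipeline(
    "ner",
    model="AI-Sweden-Models/distilbert-resume-ner",  # checkpoint name taken from the commit
    aggregation_strategy="simple",
)

sample = "MSc in Computer Science. Three years as a Python developer using Docker."
for ent in ner(sample):
    # Each entry looks roughly like:
    # {'entity_group': 'SKILL', 'score': 0.97, 'word': 'Python', 'start': 55, 'end': 61}
    print(ent["entity_group"], repr(ent["word"]), round(float(ent["score"]), 2))

If the printed entity_group values never match the SKILL/EDUCATION/EXPERIENCE variants checked in extract_entities, the parser quietly returns empty fields, which is worth guarding against in the calling route.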
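DistilBERT-style checkpoints accept at most 512 tokens, and extract_entities hands the whole resume to ner(text) in a single call, so a multi-page resume can raise a length error or be silently truncated. One workaround is to run the pipeline over overlapping character windows; the window and overlap sizes below are rough heuristics, not values from this commit:

def ner_chunked(ner_pipe, text: str, window: int = 1500, overlap: int = 200) -> list:
    """Run a token-classification pipeline over overlapping character windows."""
    entities = []
    start = 0
    while start < len(text):
        # Each chunk stays small enough to fit a typical 512-token limit.
        entities.extend(ner_pipe(text[start:start + window]))
        start += window - overlap
    return entities

Overlapping windows can report the same entity twice, but the dict.fromkeys de-duplication already in extract_entities absorbs that.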
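Creating the pipeline at module import also means the checkpoint is downloaded and held in memory before the Flask app can serve its first request, which is costly on a cold or paused Space. A lazy singleton is one alternative; this is a sketch, not part of the commit:

from functools import lru_cache

from transformers import pipeline

@lru_cache(maxsize=1)
def get_ner():
    # The first call downloads and loads the model; later calls reuse it.
    return pipeline(
        "ner",
        model="AI-Sweden-Models/distilbert-resume-ner",
        aggregation_strategy="simple",
    )

extract_entities would then call get_ner()(text) instead of the module-level ner.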
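The removed module docstring mentions that the main Flask route imports parse_resume and returns its result as JSON. A sketch of such a route, where the endpoint path, upload field name and temp-file handling are illustrative rather than taken from this repository:

import os
import tempfile

from flask import Flask, jsonify, request

from backend.services.resume_parser import parse_resume

app = Flask(__name__)

@app.route('/api/parse-resume', methods=['POST'])
def parse_resume_endpoint():
    upload = request.files['resume']
    # Keep the original extension so extract_text can dispatch on .pdf/.docx.
    fd, tmp_path = tempfile.mkstemp(suffix=os.path.splitext(upload.filename)[1])
    os.close(fd)
    try:
        upload.save(tmp_path)
        return jsonify(parse_resume(tmp_path, upload.filename))
    finally:
        os.unlink(tmp_path)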
|