Codingo / backend /services /resume_parser.py
husseinelsaadi's picture
resume parser implemented
af02e64
raw
history blame
12.4 kB
"""
resume_parser.py
=================
This module provides lightweight functions to extract useful information
from a candidate's resume. The design avoids heavy dependencies such
as spaCy or pdfminer because Hugging Face Spaces environments are
resource‑constrained and installing additional packages at runtime is
often not feasible. Instead, built‑in Python libraries and a
few simple heuristics are used to extract text from both PDF and DOCX
files and to infer the candidate's name, skills, education and
experience from that text.
The parser operates on the assumption that most resumes follow a
relatively consistent structure: the candidate's name appears near the
top of the document, headings such as "Education" and "Experience"
demarcate sections, and common skill keywords are scattered
throughout. These assumptions will not hold for every CV, but they
provide a reasonable baseline for auto‑filling form fields. Users can
always edit the populated fields before submitting their application.
Functions
---------
* ``extract_text(file_path: str) -> str``
Read a resume file (PDF or DOCX) and return its plain text. PDFs
are processed using the ``pdftotext`` command line tool, which is
available in the Hugging Face Spaces container. DOCX files are
treated as zip archives; the ``word/document.xml`` component is
parsed and stripped of XML tags.
* ``extract_name(text: str, filename: str) -> str``
Attempt to infer the candidate's full name from the document text.
If no plausible name is found in the first few lines of the text,
fall back to deriving a name from the file name itself.
* ``extract_skills(text: str) -> list[str]``
Search for a predefined list of common technical and soft skills
within the resume text. Matches are case‑insensitive; each skill is
reported at most once, title‑cased (e.g. ``python`` becomes ``Python``).
* ``extract_education(text: str) -> list[str]``
Identify lines mentioning educational qualifications. Heuristics
include the presence of keywords like "University", "Bachelor",
"Master", "PhD", etc.
* ``extract_experience(text: str) -> list[str]``
Extract statements describing work experience. Lines containing
keywords such as "experience", "Developer", "Engineer" or those
matching patterns with years of service are considered.
* ``parse_resume(file_path: str, filename: str) -> dict``
High‑level wrapper that orchestrates the text extraction and
information extraction functions. Returns a dictionary with keys
``name``, ``skills``, ``education``, and ``experience``.
The main Flask route can import ``parse_resume`` from this module and
return its result as JSON. Because the heuristics are conservative and
string‑based, the parser runs quickly on both CPU and GPU hosts.
"""
from __future__ import annotations

import html
import os
import re
import subprocess
import zipfile
from typing import List
# Matches, in document order, the only pieces of WordprocessingML that
# contribute visible text: run text (``<w:t>``), paragraph ends / explicit
# line breaks, and tab elements. ``<w:t(?:\s[^>]*)?>`` deliberately does
# not match longer element names such as ``<w:tbl>`` or ``<w:tab>``.
_DOCX_TOKEN_RE = re.compile(
    r'<w:t(?:\s[^>]*)?>(?P<text>[^<]*)</w:t>'
    r'|(?P<newline></w:p>|<w:(?:br|cr)\b[^>]*>)'
    r'|(?P<tab><w:tab\b[^>]*>)'
)


def _docx_xml_to_text(xml_text: str) -> str:
    """Convert the XML of ``word/document.xml`` to newline-separated text."""
    pieces = []
    for token in _DOCX_TOKEN_RE.finditer(xml_text):
        if token.group('text') is not None:
            # Run text is XML-escaped (e.g. ``&amp;``); decode it. Adjacent
            # runs are concatenated without a separator because Word stores
            # inter-word spaces inside the runs themselves.
            pieces.append(html.unescape(token.group('text')))
        elif token.group('newline') is not None:
            pieces.append('\n')
        else:
            pieces.append(' ')
    # Collapse whitespace *within* each line only, so that paragraph
    # boundaries survive for the line-based extractors.
    lines = (
        re.sub(r'\s+', ' ', ln).strip()
        for ln in ''.join(pieces).splitlines()
    )
    return '\n'.join(ln for ln in lines if ln)


def _read_pdf_text(file_path: str) -> str:
    """Run ``pdftotext -layout`` on *file_path* and return its stdout."""
    try:
        # '-layout' preserves relative positioning, which helps keep line
        # breaks; the trailing '-' sends the output to stdout.
        result = subprocess.run(
            ['pdftotext', '-layout', file_path, '-'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except Exception:
        # Best effort: a missing binary or OS error means "no text".
        return ""
    return result.stdout.decode('utf-8', errors='ignore')


def _read_docx_text(file_path: str) -> str:
    """Read ``word/document.xml`` from a DOCX archive and return its text."""
    try:
        with zipfile.ZipFile(file_path) as archive:
            xml_bytes = archive.read('word/document.xml')
    except Exception:
        # Best effort: corrupt archive or missing member means "no text".
        return ""
    return _docx_xml_to_text(xml_bytes.decode('utf-8', errors='ignore'))


def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    PDF files are converted with the ``pdftotext`` command line tool.
    DOCX files are opened as zip archives and the text runs of
    ``word/document.xml`` are returned with one line per paragraph (or
    explicit line break). Legacy ``.doc`` files are not supported since
    they use a binary format.

    Parameters
    ----------
    file_path : str
        Path to the uploaded resume. The file type is decided from the
        (case-insensitive) file extension.

    Returns
    -------
    str
        The textual content of the resume. If the path is empty, does
        not point to a file, has an unsupported extension, or extraction
        fails, an empty string is returned.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    if lower_name.endswith('.pdf'):
        return _read_pdf_text(file_path)
    if lower_name.endswith('.docx'):
        return _read_docx_text(file_path)
    # Unsupported file type
    return ""
# A single name token: starts with an uppercase (optionally accented)
# letter and continues with letters, apostrophes, periods or hyphens
# ("O'Neil", "Jean-Luc", "J."). A trailing comma is tolerated so that
# lines such as "Jane Doe, PhD" still qualify. Digits, colons, '@' etc.
# disqualify the token, which keeps contact lines from being picked.
_NAME_WORD_RE = re.compile(r"[A-ZÀ-ÖØ-Þ](?:[^\W\d_]|['’.\-])*,?")


def extract_name(text: str, filename: str) -> str:
    """Attempt to extract the candidate's full name from the resume.

    The first ten non-empty lines of the resume text are inspected for
    a line of two to four words in which every word looks like a name
    token (see ``_NAME_WORD_RE``). Lines starting with "Resume" or
    "Curriculum Vitae" are skipped. If no such line is found, the name
    is derived from the file name instead.

    Parameters
    ----------
    text : str
        The full resume text.
    filename : str
        The original filename of the uploaded resume. May be empty or
        ``None``, in which case no filename fallback is attempted.

    Returns
    -------
    str
        Inferred full name or an empty string if not found.
    """
    if text:
        candidates = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in candidates[:10]:
            # Skip common headings like "Resume" or "Curriculum Vitae"
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # A plausible name typically has 2–4 words, each of which must
            # match the name-token pattern in full (not merely as a prefix).
            if 1 < len(words) <= 4 and all(
                _NAME_WORD_RE.fullmatch(word) for word in words
            ):
                return line

    # Fallback: derive a name from the filename
    if not filename:
        return ''
    base = os.path.basename(filename)
    # Remove extension
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    # Replace underscores, dashes and dots with spaces
    base = re.sub(r'[\._-]+', ' ', base)
    # Remove common tokens like 'cv' or 'resume'
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    base = re.sub(r'\s+', ' ', base).strip()
    # Title case the remaining string
    return base.title() if base else ''
def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    A predefined list of skills is checked against the resume text in a
    case‑insensitive manner. A skill counts as present when it appears
    as a whole token: it must not be preceded by a word character, and
    must not be followed by a word character, ``++`` or ``#``. This lets
    symbol-terminated skills such as "C++" match and prevents a bare
    "C" from being reported for "C++" or "C#". Multi‑word skills must
    match the full phrase to count as a hit.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Unique skills found in the resume, in the order of the
        predefined list. All-lowercase entries of that list are
        returned title-cased (e.g. ``'sql'`` -> ``'Sql'``).
    """
    if not text:
        return []
    lower_text = text.lower()
    # Common technical and soft skills. Extend this list to recognise
    # more skills; the matching logic below needs no changes.
    known_skills = [
        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
        'machine learning', 'deep learning', 'nlp', 'data analysis',
        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
        'matplotlib', 'excel', 'powerpoint', 'project management',
        'communication', 'teamwork', 'leadership', 'problem solving',
        'public speaking', 'writing', 'analysis', 'time management',
    ]
    found = []
    for skill in known_skills:
        # ``\b`` cannot be used here: it never matches after the final
        # '+' of "c++" when followed by a space or punctuation.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w|\+\+|#)'
        if re.search(pattern, lower_text):
            # Entries with custom capitalisation are kept verbatim
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
# Education keywords matched as whole words (case-insensitive), with an
# optional plural / possessive suffix ("Masters", "Bachelor's"). Whole-word
# matching avoids hits inside unrelated words, e.g. "mba" in "Mumbai".
_EDUCATION_KEYWORD_RE = re.compile(
    r"\b(?:university|college|bachelor|master|phd|b\.sc|m\.sc|mba|school"
    r"|degree|diploma|engineering)(?:['’]?s)?\b",
    re.IGNORECASE,
)


def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    Every non-empty line that contains an education-related keyword
    (such as "University", "Bachelor", "MBA" or "Diploma") as a whole
    word is collected. Each distinct line is reported once, in the
    order it first appears.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Lines representing educational qualifications; empty if the
        text is empty or no line matches.
    """
    if not text:
        return []
    matches = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line and _EDUCATION_KEYWORD_RE.search(line):
            # dict keys give de-duplication while preserving first-seen order
            matches.setdefault(line, None)
    return list(matches)
def extract_experience(text: str) -> List[str]:
    """Collect lines of the resume that look like work experience.

    A line qualifies when its lowercase form contains any of the
    experience markers below as a substring (job-title words, duration
    words, "experience", "present", ...) and it consists of more than
    two whitespace-separated words, which filters out bare section
    headings.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Distinct matching lines, stripped of surrounding whitespace, in
        order of first appearance. Empty when *text* is empty.
    """
    if not text:
        return []

    markers = (
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present',
    )

    collected = {}
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry:
            continue
        lowered = entry.lower()
        if not any(marker in lowered for marker in markers):
            continue
        # Two words or fewer is most likely just a heading
        if len(lowered.split()) <= 2:
            continue
        # Keys keep insertion order, so duplicates are dropped in place
        collected.setdefault(entry, None)
    return list(collected)
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into flat, form-friendly string fields.

    The file's text is extracted once and then passed to each of the
    field extractors in this module.

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user. Used as a
        fallback for name extraction if the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with keys ``name``, ``skills``, ``education`` and
        ``experience``. Every value is a string: ``name`` is the
        inferred name, ``skills`` is a comma-separated list, and
        ``education`` / ``experience`` hold one entry per line. Fields
        for which nothing was found are empty strings.
    """
    resume_text = extract_text(file_path)
    candidate_name = extract_name(resume_text, filename)
    skills = extract_skills(resume_text)
    education = extract_education(resume_text)
    experience = extract_experience(resume_text)

    # str.join on an empty list already yields '', so no guard is needed
    parsed = {}
    parsed['name'] = candidate_name or ''
    parsed['skills'] = ', '.join(skills)
    parsed['education'] = '\n'.join(education)
    parsed['experience'] = '\n'.join(experience)
    return parsed