# -*- coding: utf-8 -*-
""" | |
resume_parser.py | |
================= | |
This module provides lightweight functions to extract useful information | |
from a candidate's resume. The design avoids heavy dependencies such | |
as spaCy or pdfminer because Hugging Face Spaces environments are | |
resource‑constrained and installing additional packages at runtime is | |
often not feasible. Instead, built‑in Python libraries and a | |
few simple heuristics are used to extract text from both PDF and DOCX | |
files and to infer the candidate's name, skills, education and | |
experience from that text. | |
The parser operates on the assumption that most resumes follow a | |
relatively consistent structure: the candidate's name appears near the | |
top of the document, headings such as "Education" and "Experience" | |
demarcate sections, and common skill keywords are scattered | |
throughout. These assumptions will not hold for every CV, but they | |
provide a reasonable baseline for auto‑filling form fields. Users can | |
always edit the populated fields before submitting their application. | |
Functions | |
--------- | |
* ``extract_text(file_path: str) -> str`` | |
Read a resume file (PDF or DOCX) and return its plain text. PDFs | |
are processed using the ``pdftotext`` command line tool, which is | |
available in the Hugging Face Spaces container. DOCX files are | |
treated as zip archives; the ``word/document.xml`` component is | |
parsed and stripped of XML tags. | |
* ``extract_name(text: str, filename: str) -> str`` | |
Attempt to infer the candidate's full name from the document text. | |
If no plausible name is found in the first few lines of the text, | |
fall back to deriving a name from the file name itself. | |
* ``extract_skills(text: str) -> list[str]`` | |
Search for a predefined list of common technical and soft skills | |
within the resume text. Matches are case‑insensitive and unique | |
values are returned in their original capitalisation. | |
* ``extract_education(text: str) -> list[str]`` | |
Identify lines mentioning educational qualifications. Heuristics | |
include the presence of keywords like "University", "Bachelor", | |
"Master", "PhD", etc. | |
* ``extract_experience(text: str) -> list[str]`` | |
Extract statements describing work experience. Lines containing | |
keywords such as "experience", "Developer", "Engineer" or those | |
matching patterns with years of service are considered. | |
* ``parse_resume(file_path: str, filename: str) -> dict`` | |
High‑level wrapper that orchestrates the text extraction and | |
information extraction functions. Returns a dictionary with keys | |
``name``, ``skills``, ``education``, and ``experience``. | |
The main Flask route can import ``parse_resume`` from this module and | |
return its result as JSON. Because the heuristics are conservative and | |
string‑based, the parser runs quickly on both CPU and GPU hosts. | |
""" | |
from __future__ import annotations

import os
import re
import subprocess
import zipfile
from typing import List
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    PDFs are converted with the ``pdftotext`` command line tool; DOCX
    files are opened as zip archives and ``word/document.xml`` is
    stripped of markup.  Paragraph breaks are preserved as newlines so
    the line-based heuristics downstream (name, education, experience
    extraction) keep working.

    Parameters
    ----------
    file_path : str
        Absolute path to the uploaded resume.

    Returns
    -------
    str
        The textual content of the resume.  Returns an empty string if
        the file is missing, of an unsupported type, or extraction
        fails for any reason.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    if lower_name.endswith('.pdf'):
        # '-layout' preserves relative positioning, which helps keep
        # line breaks intact; '-' sends the text to stdout.
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode('utf-8', errors='ignore')
        except Exception:
            # pdftotext missing or crashed: degrade to empty text.
            return ""
    if lower_name.endswith('.docx'):
        # A .docx file is a zip archive; the main content lives in
        # word/document.xml.  Binary .doc files are not supported.
        try:
            with zipfile.ZipFile(file_path) as zf:
                xml_text = zf.read('word/document.xml').decode('utf-8', errors='ignore')
        except Exception:
            # Corrupt archive or missing member: degrade to empty text.
            return ""
        # ``<w:p>`` marks a Word paragraph; turn each one into a
        # newline so paragraph structure survives tag stripping.
        xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
        # Replace remaining tags with spaces to avoid accidental word
        # concatenation.
        text = re.sub(r'<[^>]+>', ' ', xml_text)
        # BUG FIX: collapse runs of spaces/tabs within each line only.
        # The previous version collapsed *all* whitespace — including
        # the newlines inserted above — flattening DOCX resumes onto a
        # single line and breaking the line-based extractors.
        lines = (re.sub(r'[ \t]+', ' ', ln).strip() for ln in text.splitlines())
        return '\n'.join(ln for ln in lines if ln)
    # Unsupported file type.
    return ""
def extract_name(text: str, filename: str) -> str:
    """Infer the candidate's full name from the resume.

    Scans the first ten non-blank lines of *text* for one that looks
    like a personal name — two to four words, each starting with an
    uppercase (possibly accented) letter — while skipping headings such
    as "Resume" or "Curriculum Vitae".  When nothing plausible is
    found, a name is reconstructed from the uploaded file's basename.

    Parameters
    ----------
    text : str
        The full resume text.
    filename : str
        The original filename of the uploaded resume (fallback source).

    Returns
    -------
    str
        The inferred full name, or an empty string if none could be
        derived.
    """
    heading_re = re.compile(r'(?i)resume|curriculum vitae')
    name_word_re = re.compile(r'^[A-ZÀ-ÖØ-Þ][\w\-]*')

    if text:
        nonblank = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in nonblank[:10]:
            # Document headings are never candidate names.
            if heading_re.match(line):
                continue
            words = line.split()
            # A plausible name typically has 2-4 words.
            if not 2 <= len(words) <= 4:
                continue
            if all(name_word_re.match(w) for w in words):
                return line

    # Fallback: reconstruct a name from the filename.
    stem = os.path.basename(filename)
    stem = re.sub(r'\.(pdf|docx|doc)$', '', stem, flags=re.I)  # drop extension
    stem = re.sub(r'[\._-]+', ' ', stem)                       # separators -> spaces
    stem = re.sub(r'(?i)\b(cv|resume)\b', '', stem)            # drop generic tokens
    stem = re.sub(r'\s+', ' ', stem).strip()
    return stem.title() if stem else ''
def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    A predefined set of skills is checked against the resume text in a
    case-insensitive manner.  Multi-word skills must match the full
    phrase, and a match must not be embedded inside a longer word.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Unique skills found in the resume, title-cased, in the order
        they appear in the internal skill list.
    """
    if not text:
        return []
    lower_text = text.lower()
    # Common technical and soft skills.  This list can be extended in
    # future iterations without modifying the parser.
    SKILLS = [
        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
        'machine learning', 'deep learning', 'nlp', 'data analysis',
        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
        'matplotlib', 'excel', 'powerpoint', 'project management',
        'communication', 'teamwork', 'leadership', 'problem solving',
        'public speaking', 'writing', 'analysis', 'time management'
    ]
    found = []
    for skill in SKILLS:
        # BUG FIX: ``\b`` is a *word* boundary, so the old pattern
        # r'\b' + escape('c++') + r'\b' could never match "c++ " —
        # there is no word boundary between '+' and a space.  Explicit
        # lookarounds instead require that the phrase is not embedded
        # in a longer word on either side.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, lower_text):
            # Preserve the original capitalisation of the skill phrase.
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    A line is considered an education entry when it contains any of a
    small set of keywords ("university", "bachelor", "degree", ...).
    Duplicate lines are dropped while first-seen order is kept.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Distinct lines representing educational qualifications; empty
        when *text* is empty or no line matches.
    """
    if not text:
        return []
    KEYWORDS = (
        'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
        'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering',
    )
    hits = [
        line
        for line in (raw.strip() for raw in text.splitlines())
        if line and any(kw in line.lower() for kw in KEYWORDS)
    ]
    # dict.fromkeys removes duplicates while preserving order.
    return list(dict.fromkeys(hits))
def extract_experience(text: str) -> List[str]:
    """Extract snippets of work experience from resume text.

    A line qualifies when it contains an experience-related keyword
    (job titles, durations, "experience", ...) AND has more than two
    words, which filters out bare section headings like "Experience".
    Duplicate lines are dropped while first-seen order is kept.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Distinct lines summarising work experience; empty when *text*
        is empty or no line matches.
    """
    if not text:
        return []
    KEYWORDS = (
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present',
    )
    collected: List[str] = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        lowered = line.lower()
        # Keyword hit + more than two words + not already recorded.
        if (any(kw in lowered for kw in KEYWORDS)
                and len(lowered.split()) > 2
                and line not in collected):
            collected.append(line)
    return collected
def parse_resume(file_path: str, filename: str) -> dict:
    """High-level helper to parse a resume into structured fields.

    Orchestrates ``extract_text`` and the per-field extractors, then
    flattens the list results into strings suitable for form fields
    (skills comma-separated, education/experience newline-separated).

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user; used as a
        fallback for name extraction when the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with string values under the keys ``name``,
        ``skills``, ``education`` and ``experience``.  Fields with no
        matches are empty strings.
    """
    text = extract_text(file_path)
    # str.join on an empty list already yields '', so no conditional
    # fallbacks are needed for the list-valued fields.
    return {
        'name': extract_name(text, filename) or '',
        'skills': ', '.join(extract_skills(text)),
        'education': '\n'.join(extract_education(text)),
        'experience': '\n'.join(extract_experience(text)),
    }