# -*- coding: utf-8 -*-
""" | |
resume_parser.py | |
================= | |
This module provides lightweight functions to extract useful information | |
from a candidate's resume. The design avoids heavy dependencies such | |
as spaCy or pdfminer because Hugging Face Spaces environments are | |
resource‑constrained and installing additional packages at runtime is | |
often not feasible. Instead, built‑in Python libraries and a | |
few simple heuristics are used to extract text from both PDF and DOCX | |
files and to infer the candidate's name, skills, education and | |
experience from that text. | |
The parser operates on the assumption that most resumes follow a | |
relatively consistent structure: the candidate's name appears near the | |
top of the document, headings such as "Education" and "Experience" | |
demarcate sections, and common skill keywords are scattered | |
throughout. These assumptions will not hold for every CV, but they | |
provide a reasonable baseline for auto‑filling form fields. Users can | |
always edit the populated fields before submitting their application. | |
Functions | |
--------- | |
* ``extract_text(file_path: str) -> str`` | |
Read a resume file (PDF or DOCX) and return its plain text. PDFs | |
are processed using the ``pdftotext`` command line tool, which is | |
available in the Hugging Face Spaces container. DOCX files are | |
treated as zip archives; the ``word/document.xml`` component is | |
parsed and stripped of XML tags. | |
* ``extract_name(text: str, filename: str) -> str`` | |
Attempt to infer the candidate's full name from the document text. | |
If no plausible name is found in the first few lines of the text, | |
fall back to deriving a name from the file name itself. | |
* ``extract_skills(text: str) -> list[str]`` | |
Search for a predefined list of common technical and soft skills | |
within the resume text. Matches are case‑insensitive and unique | |
values are returned in their original capitalisation. | |
* ``extract_education(text: str) -> list[str]`` | |
Identify lines mentioning educational qualifications. Heuristics | |
include the presence of keywords like "University", "Bachelor", | |
"Master", "PhD", etc. | |
* ``extract_experience(text: str) -> list[str]`` | |
Extract statements describing work experience. Lines containing | |
keywords such as "experience", "Developer", "Engineer" or those | |
matching patterns with years of service are considered. | |
* ``parse_resume(file_path: str, filename: str) -> dict`` | |
High‑level wrapper that orchestrates the text extraction and | |
information extraction functions. Returns a dictionary with keys | |
``name``, ``skills``, ``education``, and ``experience``. | |
The main Flask route can import ``parse_resume`` from this module and | |
return its result as JSON. Because the heuristics are conservative and | |
string‑based, the parser runs quickly on both CPU and GPU hosts. | |
""" | |
from __future__ import annotations

import os
import re
import subprocess
import zipfile
from typing import List
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    PDFs are converted with the ``pdftotext`` command line tool; DOCX
    files are opened as zip archives and ``word/document.xml`` is
    stripped of markup.  Paragraph breaks are preserved as newlines so
    the line-based heuristics downstream (name, education, experience
    extraction) keep working.

    Parameters
    ----------
    file_path : str
        Absolute path to the uploaded resume.

    Returns
    -------
    str
        The textual content of the resume.  Returns an empty string if
        the file is missing, of an unsupported type, or extraction
        fails for any reason.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    if lower_name.endswith('.pdf'):
        # '-layout' preserves relative positioning, which helps keep
        # line breaks intact; '-' sends the text to stdout.
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode('utf-8', errors='ignore')
        except Exception:
            # pdftotext missing or crashed: degrade to empty text.
            return ""
    if lower_name.endswith('.docx'):
        # A .docx file is a zip archive; the main content lives in
        # word/document.xml.  Binary .doc files are not supported.
        try:
            with zipfile.ZipFile(file_path) as zf:
                xml_text = zf.read('word/document.xml').decode('utf-8', errors='ignore')
        except Exception:
            # Corrupt archive or missing member: degrade to empty text.
            return ""
        # ``<w:p>`` marks a Word paragraph; turn each one into a
        # newline so paragraph structure survives tag stripping.
        xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
        # Replace remaining tags with spaces to avoid accidental word
        # concatenation.
        text = re.sub(r'<[^>]+>', ' ', xml_text)
        # BUG FIX: collapse runs of spaces/tabs within each line only.
        # The previous version collapsed *all* whitespace — including
        # the newlines inserted above — flattening DOCX resumes onto a
        # single line and breaking the line-based extractors.
        lines = (re.sub(r'[ \t]+', ' ', ln).strip() for ln in text.splitlines())
        return '\n'.join(ln for ln in lines if ln)
    # Unsupported file type.
    return ""
def extract_name(text: str, filename: str) -> str:
    """Infer the candidate's full name from the resume.

    Scans the first ten non-blank lines of *text* for one that looks
    like a personal name — two to four words, each starting with an
    uppercase (possibly accented) letter — while skipping headings such
    as "Resume" or "Curriculum Vitae".  When nothing plausible is
    found, a name is reconstructed from the uploaded file's basename.

    Parameters
    ----------
    text : str
        The full resume text.
    filename : str
        The original filename of the uploaded resume (fallback source).

    Returns
    -------
    str
        The inferred full name, or an empty string if none could be
        derived.
    """
    heading_re = re.compile(r'(?i)resume|curriculum vitae')
    name_word_re = re.compile(r'^[A-ZÀ-ÖØ-Þ][\w\-]*')

    if text:
        nonblank = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in nonblank[:10]:
            # Document headings are never candidate names.
            if heading_re.match(line):
                continue
            words = line.split()
            # A plausible name typically has 2-4 words.
            if not 2 <= len(words) <= 4:
                continue
            if all(name_word_re.match(w) for w in words):
                return line

    # Fallback: reconstruct a name from the filename.
    stem = os.path.basename(filename)
    stem = re.sub(r'\.(pdf|docx|doc)$', '', stem, flags=re.I)  # drop extension
    stem = re.sub(r'[\._-]+', ' ', stem)                       # separators -> spaces
    stem = re.sub(r'(?i)\b(cv|resume)\b', '', stem)            # drop generic tokens
    stem = re.sub(r'\s+', ' ', stem).strip()
    return stem.title() if stem else ''
def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    A predefined set of skills is checked against the resume text in a
    case-insensitive manner.  Multi-word skills must match the full
    phrase, and a match must not be embedded inside a longer word.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Unique skills found in the resume, title-cased, in the order
        they appear in the internal skill list.
    """
    if not text:
        return []
    lower_text = text.lower()
    # Common technical and soft skills.  This list can be extended in
    # future iterations without modifying the parser.
    SKILLS = [
        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
        'machine learning', 'deep learning', 'nlp', 'data analysis',
        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
        'matplotlib', 'excel', 'powerpoint', 'project management',
        'communication', 'teamwork', 'leadership', 'problem solving',
        'public speaking', 'writing', 'analysis', 'time management'
    ]
    found = []
    for skill in SKILLS:
        # BUG FIX: ``\b`` is a *word* boundary, so the old pattern
        # r'\b' + escape('c++') + r'\b' could never match "c++ " —
        # there is no word boundary between '+' and a space.  Explicit
        # lookarounds instead require that the phrase is not embedded
        # in a longer word on either side.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, lower_text):
            # Preserve the original capitalisation of the skill phrase.
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    A line is considered an education entry when it contains any of a
    small set of keywords ("university", "bachelor", "degree", ...).
    Duplicate lines are dropped while first-seen order is kept.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Distinct lines representing educational qualifications; empty
        when *text* is empty or no line matches.
    """
    if not text:
        return []
    KEYWORDS = (
        'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
        'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering',
    )
    hits = [
        line
        for line in (raw.strip() for raw in text.splitlines())
        if line and any(kw in line.lower() for kw in KEYWORDS)
    ]
    # dict.fromkeys removes duplicates while preserving order.
    return list(dict.fromkeys(hits))
def extract_experience(text: str) -> List[str]:
    """Extract snippets of work experience from resume text.

    A line qualifies when it contains an experience-related keyword
    (job titles, durations, "experience", ...) AND has more than two
    words, which filters out bare section headings like "Experience".
    Duplicate lines are dropped while first-seen order is kept.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Distinct lines summarising work experience; empty when *text*
        is empty or no line matches.
    """
    if not text:
        return []
    KEYWORDS = (
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present',
    )
    collected: List[str] = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        lowered = line.lower()
        # Keyword hit + more than two words + not already recorded.
        if (any(kw in lowered for kw in KEYWORDS)
                and len(lowered.split()) > 2
                and line not in collected):
            collected.append(line)
    return collected
def parse_resume(file_path: str, filename: str) -> dict:
    """High-level helper to parse a resume into structured fields.

    Orchestrates ``extract_text`` and the per-field extractors, then
    flattens the list results into strings suitable for form fields
    (skills comma-separated, education/experience newline-separated).

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user; used as a
        fallback for name extraction when the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with string values under the keys ``name``,
        ``skills``, ``education`` and ``experience``.  Fields with no
        matches are empty strings.
    """
    text = extract_text(file_path)
    # str.join on an empty list already yields '', so no conditional
    # fallbacks are needed for the list-valued fields.
    return {
        'name': extract_name(text, filename) or '',
        'skills': ', '.join(extract_skills(text)),
        'education': '\n'.join(extract_education(text)),
        'experience': '\n'.join(extract_experience(text)),
    }