""" resume_parser.py ================= This module provides lightweight functions to extract useful information from a candidate's resume. The design avoids heavy dependencies such as spaCy or pdfminer because Hugging Face Spaces environments are resource‑constrained and installing additional packages at runtime is often not feasible. Instead, built‑in Python libraries and a few simple heuristics are used to extract text from both PDF and DOCX files and to infer the candidate's name, skills, education and experience from that text. The parser operates on the assumption that most resumes follow a relatively consistent structure: the candidate's name appears near the top of the document, headings such as "Education" and "Experience" demarcate sections, and common skill keywords are scattered throughout. These assumptions will not hold for every CV, but they provide a reasonable baseline for auto‑filling form fields. Users can always edit the populated fields before submitting their application. Functions --------- * ``extract_text(file_path: str) -> str`` Read a resume file (PDF or DOCX) and return its plain text. PDFs are processed using the ``pdftotext`` command line tool, which is available in the Hugging Face Spaces container. DOCX files are treated as zip archives; the ``word/document.xml`` component is parsed and stripped of XML tags. * ``extract_name(text: str, filename: str) -> str`` Attempt to infer the candidate's full name from the document text. If no plausible name is found in the first few lines of the text, fall back to deriving a name from the file name itself. * ``extract_skills(text: str) -> list[str]`` Search for a predefined list of common technical and soft skills within the resume text. Matches are case‑insensitive and unique values are returned in their original capitalisation. * ``extract_education(text: str) -> list[str]`` Identify lines mentioning educational qualifications. Heuristics include the presence of keywords like "University", "Bachelor", "Master", "PhD", etc. * ``extract_experience(text: str) -> list[str]`` Extract statements describing work experience. Lines containing keywords such as "experience", "Developer", "Engineer" or those matching patterns with years of service are considered. * ``parse_resume(file_path: str, filename: str) -> dict`` High‑level wrapper that orchestrates the text extraction and information extraction functions. Returns a dictionary with keys ``name``, ``skills``, ``education``, and ``experience``. The main Flask route can import ``parse_resume`` from this module and return its result as JSON. Because the heuristics are conservative and string‑based, the parser runs quickly on both CPU and GPU hosts. """ from __future__ import annotations import os import re import subprocess import zipfile from typing import List def extract_text(file_path: str) -> str: """Extract raw text from a PDF or DOCX resume. Parameters ---------- file_path : str Absolute path to the uploaded resume. Returns ------- str The textual content of the resume. If extraction fails, returns an empty string. """ if not file_path or not os.path.isfile(file_path): return "" lower_name = file_path.lower() try: # If the file ends with .pdf use pdftotext. The '-layout' # flag preserves relative positioning which helps preserve # line breaks in the output. Output is sent to stdout. if lower_name.endswith('.pdf'): try: result = subprocess.run( ['pdftotext', '-layout', file_path, '-'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False ) return result.stdout.decode('utf-8', errors='ignore') except Exception: return "" # If it's a .docx treat it as a zip archive and pull the main # document XML. Note that .doc files are not supported since # they use a binary format. elif lower_name.endswith('.docx'): try: with zipfile.ZipFile(file_path) as zf: with zf.open('word/document.xml') as docx_xml: xml_bytes = docx_xml.read() # Remove XML tags to leave plain text. Replace # tags with spaces to avoid accidental word # concatenation. xml_text = xml_bytes.decode('utf-8', errors='ignore') # Replace common markup elements with newlines to # preserve paragraph structure. Some tags like # ```` represent paragraphs in Word. xml_text = re.sub(r']*>', '\n', xml_text, flags=re.I) # Remove remaining tags text = re.sub(r'<[^>]+>', ' ', xml_text) # Collapse multiple whitespace text = re.sub(r'\s+', ' ', text) return text except Exception: return "" else: # Unsupported file type return "" except Exception: return "" def extract_name(text: str, filename: str) -> str: """Attempt to extract the candidate's full name from the resume. This function first inspects the first few lines of the resume text. It looks for lines containing between two and four words where each word starts with an uppercase letter. If such a line isn't found, it falls back to deriving a name from the file name. Parameters ---------- text : str The full resume text. filename : str The original filename of the uploaded resume. Returns ------- str Inferred full name or an empty string if not found. """ if text: # Consider the first 10 lines for a potential name. Strip # whitespace and ignore empty lines. lines = [ln.strip() for ln in text.splitlines() if ln.strip()] for line in lines[:10]: # Remove common headings like "Resume" or "Curriculum Vitae" if re.match(r'(?i)resume|curriculum vitae', line): continue words = line.split() # A plausible name typically has 2–4 words if 1 < len(words) <= 4: # All words must start with an uppercase letter (allow # accented characters) and contain at least one letter. if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words): return line # Fallback: derive a name from the filename base = os.path.basename(filename) # Remove extension base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I) # Replace underscores, dashes and dots with spaces base = re.sub(r'[\._-]+', ' ', base) # Remove common tokens like 'cv' or 'resume' base = re.sub(r'(?i)\b(cv|resume)\b', '', base) base = re.sub(r'\s+', ' ', base).strip() # Title case the remaining string return base.title() if base else '' def extract_skills(text: str) -> List[str]: """Identify common skills mentioned in the resume. A predefined set of skills is checked against the resume text in a case‑insensitive manner. If a skill phrase appears anywhere in the text, it is added to the result list. Multi‑word skills must match the full phrase to count as a hit. Parameters ---------- text : str The resume's full text. Returns ------- list[str] Unique skills found in the resume, preserving their original capitalisation where possible. """ if not text: return [] lower_text = text.lower() # Define a set of common technical and soft skills. This list can # be extended in future iterations without modifying the parser SKILLS = [ 'python', 'java', 'c++', 'c', 'javascript', 'html', 'css', 'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring', 'machine learning', 'deep learning', 'nlp', 'data analysis', 'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git', 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux', 'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy', 'matplotlib', 'excel', 'powerpoint', 'project management', 'communication', 'teamwork', 'leadership', 'problem solving', 'public speaking', 'writing', 'analysis', 'time management' ] found = [] for skill in SKILLS: pattern = re.escape(skill.lower()) if re.search(r'\b' + pattern + r'\b', lower_text): # Preserve the original capitalisation of the skill phrase found.append(skill.title() if skill.islower() else skill) return list(dict.fromkeys(found)) # Remove duplicates, preserve order def extract_education(text: str) -> List[str]: """Gather educational qualifications from the resume text. The function searches for lines containing keywords related to education. Only distinct lines with meaningful content are included. Parameters ---------- text : str Returns ------- list[str] Lines representing educational qualifications. """ if not text: return [] lines = [ln.strip() for ln in text.splitlines() if ln.strip()] education_keywords = [ 'university', 'college', 'bachelor', 'master', 'phd', 'b.sc', 'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering' ] results = [] for line in lines: lower = line.lower() if any(kw in lower for kw in education_keywords): # Avoid capturing the same line twice if line not in results: results.append(line) # If nothing found, return an empty list return results def extract_experience(text: str) -> List[str]: """Extract snippets of work experience from resume text. Heuristics are used to detect sentences or lines that likely describe professional experience. Indicators include the presence of keywords like "experience", job titles, or explicit durations. Parameters ---------- text : str Returns ------- list[str] A list of lines summarising work experience. """ if not text: return [] lines = [ln.strip() for ln in text.splitlines() if ln.strip()] # Keywords signalling experience entries exp_keywords = [ 'experience', 'worked', 'employment', 'internship', 'developer', 'engineer', 'manager', 'analyst', 'consultant', 'assistant', 'years', 'year', 'months', 'month', 'present' ] results = [] for line in lines: lower = line.lower() if any(kw in lower for kw in exp_keywords): # Filter out lines that are just section headings if len(lower.split()) > 2: if line not in results: results.append(line) return results def parse_resume(file_path: str, filename: str) -> dict: """High‑level helper to parse a resume into structured fields. Parameters ---------- file_path : str Location of the uploaded file on disk. filename : str The original filename as provided by the user. Used as a fallback for name extraction if the document text does not reveal a plausible name. Returns ------- dict Dictionary with keys ``name``, ``skills``, ``education`` and ``experience``. Each value is a string, except for the name which is a single string. Lists are joined into a comma or newline separated string suitable for form fields. """ text = extract_text(file_path) name = extract_name(text, filename) skills_list = extract_skills(text) education_list = extract_education(text) experience_list = extract_experience(text) return { 'name': name or '', 'skills': ', '.join(skills_list) if skills_list else '', 'education': '\n'.join(education_list) if education_list else '', 'experience': '\n'.join(experience_list) if experience_list else '' }