"""
resume_parser.py
=================

This module provides lightweight functions to extract useful information
from a candidate's resume.  The design avoids heavy dependencies such
as spaCy or pdfminer because Hugging Face Spaces environments are
resource‑constrained and installing additional packages at runtime is
often not feasible.  Instead, built‑in Python libraries and a
few simple heuristics are used to extract text from both PDF and DOCX
files and to infer the candidate's name, skills, education and
experience from that text.

The parser operates on the assumption that most resumes follow a
relatively consistent structure: the candidate's name appears near the
top of the document, headings such as "Education" and "Experience"
demarcate sections, and common skill keywords are scattered
throughout.  These assumptions will not hold for every CV, but they
provide a reasonable baseline for auto‑filling form fields.  Users can
always edit the populated fields before submitting their application.

Functions
---------

* ``extract_text(file_path: str) -> str``
    Read a resume file (PDF or DOCX) and return its plain text.  PDFs
    are processed using the ``pdftotext`` command line tool, which is
    available in the Hugging Face Spaces container.  DOCX files are
    treated as zip archives; the ``word/document.xml`` component is
    parsed and stripped of XML tags.

* ``extract_name(text: str, filename: str) -> str``
    Attempt to infer the candidate's full name from the document text.
    If no plausible name is found in the first few lines of the text,
    fall back to deriving a name from the file name itself.

* ``extract_skills(text: str) -> list[str]``
    Search for a predefined list of common technical and soft skills
    within the resume text.  Matching is case‑insensitive; unique
    matches are returned, title-cased for display.

* ``extract_education(text: str) -> list[str]``
    Identify lines mentioning educational qualifications.  Heuristics
    include the presence of keywords like "University", "Bachelor",
    "Master", "PhD", etc.

* ``extract_experience(text: str) -> list[str]``
    Extract statements describing work experience.  Lines containing
    keywords such as "experience", "Developer", "Engineer" or those
    matching patterns with years of service are considered.

* ``parse_resume(file_path: str, filename: str) -> dict``
    High‑level wrapper that orchestrates the text extraction and
    information extraction functions.  Returns a dictionary with keys
    ``name``, ``skills``, ``education``, and ``experience``.

The main Flask route can import ``parse_resume`` from this module and
return its result as JSON.  Because the heuristics are conservative and
string‑based, the parser runs quickly on both CPU and GPU hosts.
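
For illustration, a minimal Flask route along those lines is sketched
below.  The route path, form field name and temporary-file handling are
assumptions about the surrounding application, not something this
module prescribes::

    import os
    import tempfile

    from flask import Flask, request, jsonify

    from resume_parser import parse_resume

    app = Flask(__name__)

    @app.route('/parse-resume', methods=['POST'])
    def parse_resume_route():
        uploaded = request.files.get('resume')
        if uploaded is None or not uploaded.filename:
            return jsonify({'error': 'no file uploaded'}), 400
        # Persist the upload so extract_text can hand a real path to
        # pdftotext / zipfile.
        suffix = os.path.splitext(uploaded.filename)[1]
        fd, tmp_path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        uploaded.save(tmp_path)
        try:
            return jsonify(parse_resume(tmp_path, uploaded.filename))
        finally:
            os.remove(tmp_path)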
"""

from __future__ import annotations

import os
import re
import subprocess
import zipfile
from typing import List


def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    Parameters
    ----------
    file_path : str
        Absolute path to the uploaded resume.

    Returns
    -------
    str
        The textual content of the resume.  If extraction fails,
        returns an empty string.
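
    Examples
    --------
    A hypothetical call (the upload path is illustrative only)::

        text = extract_text('/tmp/uploads/resume_1234.pdf')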
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    try:
        # If the file ends with .pdf use pdftotext.  The '-layout'
        # flag preserves the relative positioning of text, which helps
        # keep line breaks intact in the output.  Output goes to stdout.
        if lower_name.endswith('.pdf'):
            try:
                result = subprocess.run(
                    ['pdftotext', '-layout', file_path, '-'],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    check=False
                )
                return result.stdout.decode('utf-8', errors='ignore')
            except Exception:
                return ""
        # If it's a .docx treat it as a zip archive and pull the main
        # document XML.  Note that .doc files are not supported since
        # they use a binary format.
        elif lower_name.endswith('.docx'):
            try:
                with zipfile.ZipFile(file_path) as zf:
                    with zf.open('word/document.xml') as docx_xml:
                        xml_bytes = docx_xml.read()
                        # Remove XML tags to leave plain text.  Replace
                        # tags with spaces to avoid accidental word
                        # concatenation.
                        xml_text = xml_bytes.decode('utf-8', errors='ignore')
                        # Replace common markup elements with newlines to
                        # preserve paragraph structure.  Some tags like
                        # ``<w:p>`` represent paragraphs in Word.
                        xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                        # Remove remaining tags
                        text = re.sub(r'<[^>]+>', ' ', xml_text)
                        # Collapse runs of spaces and tabs only; keep the
                        # newlines inserted above so the line-based
                        # heuristics downstream still see paragraph breaks.
                        text = re.sub(r'[ \t]+', ' ', text)
                        return text
            except Exception:
                return ""
        else:
            # Unsupported file type
            return ""
    except Exception:
        return ""


def extract_name(text: str, filename: str) -> str:
    """Attempt to extract the candidate's full name from the resume.

    This function first inspects the first few lines of the resume
    text.  It looks for lines containing between two and four words
    where each word starts with an uppercase letter.  If such a line
    isn't found, it falls back to deriving a name from the file name.

    Parameters
    ----------
    text : str
        The full resume text.
    filename : str
        The original filename of the uploaded resume.

    Returns
    -------
    str
        Inferred full name or an empty string if not found.
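
    Examples
    --------
    Illustrative inputs (the candidate name and filenames are made up):

    >>> extract_name("Jane Doe", "upload.pdf")
    'Jane Doe'
    >>> extract_name("", "john_smith_resume.docx")
    'John Smith'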
    """
    if text:
        # Consider the first 10 lines for a potential name.  Strip
        # whitespace and ignore empty lines.
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in lines[:10]:
            # Remove common headings like "Resume" or "Curriculum Vitae"
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # A plausible name typically has 2–4 words
            if 1 < len(words) <= 4:
                # Every word must start with an uppercase letter
                # (common accented uppercase characters are allowed).
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
    # Fallback: derive a name from the filename
    base = os.path.basename(filename)
    # Remove extension
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    # Replace underscores, dashes and dots with spaces
    base = re.sub(r'[\._-]+', ' ', base)
    # Remove common tokens like 'cv' or 'resume'
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    base = re.sub(r'\s+', ' ', base).strip()
    # Title case the remaining string
    return base.title() if base else ''


def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    A predefined set of skills is checked against the resume text in a
    case‑insensitive manner.  If a skill phrase appears anywhere in the
    text, it is added to the result list.  Multi‑word skills must match
    the full phrase to count as a hit.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Unique skills found in the resume, title-cased for display
        (e.g. "python" in the text is reported as "Python").
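
    Examples
    --------
    An illustrative snippet of resume text (using the default skill list):

    >>> extract_skills("Used Python and Flask; deployed with Docker on AWS.")
    ['Python', 'Flask', 'Docker', 'Aws']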
    """
    if not text:
        return []
    lower_text = text.lower()
    # Define a set of common technical and soft skills.  This list can
    # be extended in future iterations without modifying the parser
    SKILLS = [
        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
        'machine learning', 'deep learning', 'nlp', 'data analysis',
        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
        'matplotlib', 'excel', 'powerpoint', 'project management',
        'communication', 'teamwork', 'leadership', 'problem solving',
        'public speaking', 'writing', 'analysis', 'time management'
    ]
    found = []
    for skill in SKILLS:
        pattern = re.escape(skill.lower())
        # Use lookarounds instead of \b so skills containing non-word
        # characters (e.g. "c++") still match at word edges.
        if re.search(r'(?<!\w)' + pattern + r'(?!\w)', lower_text):
            # Title-case the skill name for display in the form field
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order


def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    The function searches for lines containing keywords related to
    education.  Only distinct lines with meaningful content are
    included.

    Parameters
    ----------
    text : str

    Returns
    -------
    list[str]
        Lines representing educational qualifications.
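
    Examples
    --------
    An illustrative line (institution and year are made up):

    >>> extract_education("B.Sc in Computer Science, Example University, 2019")
    ['B.Sc in Computer Science, Example University, 2019']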
    """
    if not text:
        return []
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    education_keywords = [
        'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
        'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering'
    ]
    results = []
    for line in lines:
        lower = line.lower()
        if any(kw in lower for kw in education_keywords):
            # Avoid capturing the same line twice
            if line not in results:
                results.append(line)
    # If nothing found, return an empty list
    return results


def extract_experience(text: str) -> List[str]:
    """Extract snippets of work experience from resume text.

    Heuristics are used to detect sentences or lines that likely
    describe professional experience.  Indicators include the presence
    of keywords like "experience", job titles, or explicit durations.

    Parameters
    ----------
    text : str

    Returns
    -------
    list[str]
        A list of lines summarising work experience.
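
    Examples
    --------
    An illustrative line (employer and dates are made up):

    >>> extract_experience("Software Engineer at ExampleCorp, 2019 - present")
    ['Software Engineer at ExampleCorp, 2019 - present']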
    """
    if not text:
        return []
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # Keywords signalling experience entries
    exp_keywords = [
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present'
    ]
    results = []
    for line in lines:
        lower = line.lower()
        if any(kw in lower for kw in exp_keywords):
            # Filter out lines that are just section headings
            if len(lower.split()) > 2:
                if line not in results:
                    results.append(line)
    return results


def parse_resume(file_path: str, filename: str) -> dict:
    """High‑level helper to parse a resume into structured fields.

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user.  Used as a
        fallback for name extraction if the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with keys ``name``, ``skills``, ``education`` and
        ``experience``.  All values are strings: ``name`` is returned
        as-is, while the skill, education and experience lists are
        joined into comma- or newline-separated strings suitable for
        form fields.
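
    Examples
    --------
    A hypothetical call; the temporary path, filename and returned
    values below are illustrative only::

        parse_resume('/tmp/upload_ab12.pdf', 'jane_doe_cv.pdf')
        # -> {'name': 'Jane Doe',
        #     'skills': 'Python, Sql',
        #     'education': 'B.Sc Computer Science, Example University',
        #     'experience': 'Software Engineer at ExampleCorp, 2021 - present'}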
    """
    text = extract_text(file_path)
    name = extract_name(text, filename)
    skills_list = extract_skills(text)
    education_list = extract_education(text)
    experience_list = extract_experience(text)
    return {
        'name': name or '',
        'skills': ', '.join(skills_list) if skills_list else '',
        'education': '\n'.join(education_list) if education_list else '',
        'experience': '\n'.join(experience_list) if experience_list else ''
    }