Commit 775c09c
Parent(s): ff62567
updated

backend/services/resume_parser.py  (CHANGED, +30 -277)

@@ -1,95 +1,21 @@
-"""
-resume_parser.py
-=================
-
-This module provides lightweight functions to extract useful information
-from a candidate's resume. The design avoids heavy dependencies such
-as spaCy or pdfminer because Hugging Face Spaces environments are
-resource-constrained and installing additional packages at runtime is
-often not feasible. Instead, built-in Python libraries and a
-few simple heuristics are used to extract text from both PDF and DOCX
-files and to infer the candidate's name, skills, education and
-experience from that text.
-
-The parser operates on the assumption that most resumes follow a
-relatively consistent structure: the candidate's name appears near the
-top of the document, headings such as "Education" and "Experience"
-demarcate sections, and common skill keywords are scattered
-throughout. These assumptions will not hold for every CV, but they
-provide a reasonable baseline for auto-filling form fields. Users can
-always edit the populated fields before submitting their application.
-
-Functions
----------
-
-* ``extract_text(file_path: str) -> str``
-    Read a resume file (PDF or DOCX) and return its plain text. PDFs
-    are processed using the ``pdftotext`` command line tool, which is
-    available in the Hugging Face Spaces container. DOCX files are
-    treated as zip archives; the ``word/document.xml`` component is
-    parsed and stripped of XML tags.
-
-* ``extract_name(text: str, filename: str) -> str``
-    Attempt to infer the candidate's full name from the document text.
-    If no plausible name is found in the first few lines of the text,
-    fall back to deriving a name from the file name itself.
-
-* ``extract_skills(text: str) -> list[str]``
-    Search for a predefined list of common technical and soft skills
-    within the resume text. Matches are case-insensitive and unique
-    values are returned in their original capitalisation.
-
-* ``extract_education(text: str) -> list[str]``
-    Identify lines mentioning educational qualifications. Heuristics
-    include the presence of keywords like "University", "Bachelor",
-    "Master", "PhD", etc.
-
-* ``extract_experience(text: str) -> list[str]``
-    Extract statements describing work experience. Lines containing
-    keywords such as "experience", "Developer", "Engineer" or those
-    matching patterns with years of service are considered.
-
-* ``parse_resume(file_path: str, filename: str) -> dict``
-    High-level wrapper that orchestrates the text extraction and
-    information extraction functions. Returns a dictionary with keys
-    ``name``, ``skills``, ``education``, and ``experience``.
-
-The main Flask route can import ``parse_resume`` from this module and
-return its result as JSON. Because the heuristics are conservative and
-string-based, the parser runs quickly on both CPU and GPU hosts.
-"""
-
 from __future__ import annotations
-
 import os
 import re
 import subprocess
 import zipfile
 from typing import List
+from transformers import pipeline
 
+# Load the NER model for resume parsing
+ner = pipeline("ner", model="AI-Sweden-Models/distilbert-resume-ner", aggregation_strategy="simple")
 
 def extract_text(file_path: str) -> str:
-    """Extract the textual content of the resume.
-
-    Parameters
-    ----------
-    file_path : str
-        Absolute path to the uploaded resume.
-
-    Returns
-    -------
-    str
-        The textual content of the resume. If extraction fails,
-        returns an empty string.
-    """
+    """Extract text from PDF or DOCX."""
     if not file_path or not os.path.isfile(file_path):
         return ""
 
     lower_name = file_path.lower()
     try:
-        # If the file ends with .pdf use pdftotext. The '-layout'
-        # flag preserves relative positioning which helps preserve
-        # line breaks in the output. Output is sent to stdout.
         if lower_name.endswith('.pdf'):
             try:
                 result = subprocess.run(
@@ -98,244 +24,71 @@ def extract_text(file_path: str) -> str:
                    stderr=subprocess.PIPE,
                    check=False
                )
-                raw_text = result.stdout.decode('utf-8', errors='ignore')
-                # Normalize whitespace and ensure section keywords are on separate lines
-                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
-                # Replace multiple spaces/tabs but keep newlines
-                raw_text = re.sub(r'[ \t]+', ' ', raw_text)
-                # Ensure section keywords are isolated
-                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
-                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
-                return raw_text
-
-
+                return result.stdout.decode('utf-8', errors='ignore')
            except Exception:
                return ""
-        # If it's a .docx treat it as a zip archive and pull the main
-        # document XML. Note that .doc files are not supported since
-        # they use a binary format.
         elif lower_name.endswith('.docx'):
             try:
                 with zipfile.ZipFile(file_path) as zf:
                     with zf.open('word/document.xml') as docx_xml:
                         xml_bytes = docx_xml.read()
-                # Remove XML tags to leave plain text. Replace
-                # tags with spaces to avoid accidental word
-                # concatenation.
                 xml_text = xml_bytes.decode('utf-8', errors='ignore')
-                # Replace common markup elements with newlines to
-                # preserve paragraph structure. Some tags like
-                # ``<w:p>`` represent paragraphs in Word.
                 xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
-                # Remove remaining tags
                 text = re.sub(r'<[^>]+>', ' ', xml_text)
-                # Collapse multiple whitespace
                 text = re.sub(r'\s+', ' ', text)
                 return text
             except Exception:
                 return ""
         else:
-            # Unsupported file type
             return ""
     except Exception:
         return ""
 
-
 def extract_name(text: str, filename: str) -> str:
-    """
-
-    This function first inspects the first few lines of the resume
-    text. It looks for lines containing between two and four words
-    where each word starts with an uppercase letter. If such a line
-    isn't found, it falls back to deriving a name from the file name.
-
-    Parameters
-    ----------
-    text : str
-        The full resume text.
-    filename : str
-        The original filename of the uploaded resume.
-
-    Returns
-    -------
-    str
-        Inferred full name or an empty string if not found.
-    """
+    """Extract candidate's name from text or filename."""
     if text:
-        # Consider the first 10 lines for a potential name. Strip
-        # whitespace and ignore empty lines.
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
-            # Remove common headings like "Resume" or "Curriculum Vitae"
             if re.match(r'(?i)resume|curriculum vitae', line):
                 continue
             words = line.split()
-            # A plausible name typically has 2–4 words
            if 1 < len(words) <= 4:
-                # All words must start with an uppercase letter (allow
-                # accented characters) and contain at least one letter.
                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                    return line
-    # Fallback: derive a name from the filename
     base = os.path.basename(filename)
-    # Remove extension
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
-    # Replace underscores, dashes and dots with spaces
     base = re.sub(r'[\._-]+', ' ', base)
-    # Remove common tokens like 'cv' or 'resume'
     base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
     base = re.sub(r'\s+', ' ', base).strip()
-    # Title case the remaining string
     return base.title() if base else ''
 
-
-def extract_skills(text: str) -> List[str]:
-    """Search the resume text for a predefined list of common skills,
-    returning unique matches in their original
-    capitalisation where possible.
-    """
-    if not text:
-        return []
-    lower_text = text.lower()
-    # Define a set of common technical and soft skills. This list can
-    # be extended in future iterations without modifying the parser
-    SKILLS = [
-        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
-        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
-        'machine learning', 'deep learning', 'nlp', 'data analysis',
-        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
-        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
-        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
-        'matplotlib', 'excel', 'powerpoint', 'project management',
-        'communication', 'teamwork', 'leadership', 'problem solving',
-        'public speaking', 'writing', 'analysis', 'time management'
-    ]
-    found = []
-    for skill in SKILLS:
-        pattern = re.escape(skill.lower())
-        if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
-            # Preserve the original capitalisation of the skill phrase
-            found.append(skill.title() if skill.islower() else skill)
-    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
-
-
-def extract_education(text: str) -> List[str]:
-    """Gather educational qualifications from the resume text.
-
-    The function searches for lines containing keywords related to
-    education. Only distinct lines with meaningful content are
-    included.
-
-    Parameters
-    ----------
-    text : str
-
-    Returns
-    -------
-    list[str]
-        Lines representing educational qualifications.
-    """
-    if not text:
-        return []
-    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-    education_keywords = [
-        'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
-        'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
-        'diploma', 'engineering', 'work history'
-    ]
-
-    results = []
-    for line in lines:
-        lower = line.lower()
-        if any(kw in lower for kw in education_keywords):
-            # Avoid capturing the same line twice
-            if line not in results:
-                results.append(line)
-    # If nothing found, return an empty list
-    return results
-
-
-def extract_experience(text: str) -> List[str]:
-    """Extract snippets of work experience from resume text.
-
-    Heuristics are used to detect sentences or lines that likely
-    describe professional experience. Indicators include the presence
-    of keywords like "experience", job titles, or explicit durations.
-
-    Parameters
-    ----------
-    text : str
-
-    Returns
-    -------
-    list[str]
-        A list of lines summarising work experience.
-    """
-    if not text:
-        return []
-    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
-    # Keywords signalling experience entries
-    exp_keywords = [
-        'experience', 'worked', 'employment', 'internship', 'developer',
-        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
-        'years', 'year', 'months', 'month', 'present'
-    ]
-    results = []
-    for line in lines:
-        lower = line.lower()
-        if any(kw in lower for kw in exp_keywords):
-            # Filter out lines that are just section headings
-            if len(lower.split()) > 2:
-                if line not in results:
-                    results.append(line)
-    return results
-
+def extract_entities(text: str) -> dict:
+    """Extract structured info using NER model."""
+    entities = ner(text)
+    skills, education, experience = [], [], []
+    for ent in entities:
+        label = ent['entity_group'].upper()
+        word = ent['word'].strip()
+        if label in ["SKILL", "TECH", "TECHNOLOGY"]:
+            skills.append(word)
+        elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
+            education.append(word)
+        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
+            experience.append(word)
+    return {
+        "skills": list(dict.fromkeys(skills)),
+        "education": list(dict.fromkeys(education)),
+        "experience": list(dict.fromkeys(experience))
+    }
 
 def parse_resume(file_path: str, filename: str) -> dict:
-    """
-
-    Parameters
-    ----------
-    file_path : str
-        Location of the uploaded file on disk.
-    filename : str
-        The original filename as provided by the user. Used as a
-        fallback for name extraction if the document text does not
-        reveal a plausible name.
-
-    Returns
-    -------
-    dict
-        Dictionary with keys ``name``, ``skills``, ``education`` and
-        ``experience``. Each value is a string, except for the name
-        which is a single string. Lists are joined into a comma or
-        newline separated string suitable for form fields.
-    """
+    """Main function to parse resume fields."""
     text = extract_text(file_path)
     name = extract_name(text, filename)
-    skills_list = extract_skills(text)
-    education_list = extract_education(text)
-    experience_list = extract_experience(text)
+    ents = extract_entities(text)
    return {
        'name': name or '',
-        'skills': ', '.join(skills_list),
-        'education': '\n'.join(education_list),
-        'experience': '\n'.join(experience_list)
-    }
+        'skills': ', '.join(ents["skills"]) if ents["skills"] else '',
+        'education': ', '.join(ents["education"]) if ents["education"] else '',
+        'experience': ', '.join(ents["experience"]) if ents["experience"] else ''
+    }
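A few notes on the new NER path. With aggregation_strategy="simple", a transformers token-classification pipeline returns one dict per merged entity, with the keys entity_group, score, word, start and end, which is the shape extract_entities iterates over. The label names it branches on ("SKILL", "DEGREE", "ROLE", and so on) are whatever label set the checkpoint was trained with, so they are an assumption to verify against the model card. A minimal probe, assuming the checkpoint named in the commit actually loads:

from transformers import pipeline

ner = pipeline(
    "ner",
    model="AI-Sweden-Models/distilbert-resume-ner",  # checkpoint name taken from the commit
    aggregation_strategy="simple",
)

sample = "MSc in Computer Science. Three years as a Python developer using Docker."
for ent in ner(sample):
    # Each entry looks roughly like:
    # {'entity_group': 'SKILL', 'score': 0.97, 'word': 'Python', 'start': 55, 'end': 61}
    print(ent["entity_group"], repr(ent["word"]), round(float(ent["score"]), 2))

If the printed entity_group values never match the SKILL/EDUCATION/EXPERIENCE variants checked in extract_entities, the parser quietly returns empty fields, which is worth guarding against in the calling route.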
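DistilBERT-style checkpoints accept at most 512 tokens, and extract_entities hands the whole resume to ner(text) in a single call, so a multi-page resume can raise a length error or be silently truncated. One workaround is to run the pipeline over overlapping character windows; the window and overlap sizes below are rough heuristics, not values from this commit:

def ner_chunked(ner_pipe, text: str, window: int = 1500, overlap: int = 200) -> list:
    """Run a token-classification pipeline over overlapping character windows."""
    entities = []
    start = 0
    while start < len(text):
        # Each chunk stays small enough to fit a typical 512-token limit.
        entities.extend(ner_pipe(text[start:start + window]))
        start += window - overlap
    return entities

Overlapping windows can report the same entity twice, but the dict.fromkeys de-duplication already in extract_entities absorbs that.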
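Creating the pipeline at module import also means the checkpoint is downloaded and held in memory before the Flask app can serve its first request, which is costly on a cold or paused Space. A lazy singleton is one alternative; this is a sketch, not part of the commit:

from functools import lru_cache

from transformers import pipeline

@lru_cache(maxsize=1)
def get_ner():
    # The first call downloads and loads the model; later calls reuse it.
    return pipeline(
        "ner",
        model="AI-Sweden-Models/distilbert-resume-ner",
        aggregation_strategy="simple",
    )

extract_entities would then call get_ner()(text) instead of the module-level ner.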
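The removed module docstring mentions that the main Flask route imports parse_resume and returns its result as JSON. A sketch of such a route, where the endpoint path, upload field name and temp-file handling are illustrative rather than taken from this repository:

import os
import tempfile

from flask import Flask, jsonify, request

from backend.services.resume_parser import parse_resume

app = Flask(__name__)

@app.route('/api/parse-resume', methods=['POST'])
def parse_resume_endpoint():
    upload = request.files['resume']
    # Keep the original extension so extract_text can dispatch on .pdf/.docx.
    fd, tmp_path = tempfile.mkstemp(suffix=os.path.splitext(upload.filename)[1])
    os.close(fd)
    try:
        upload.save(tmp_path)
        return jsonify(parse_resume(tmp_path, upload.filename))
    finally:
        os.unlink(tmp_path)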
|