Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files

xet

Community

husseinelsaadi committed on Aug 3

Commit

af02e64

1 Parent(s): c11e18e

resume parser implemented

Browse files

Files changed (3) hide show

app.py +41 -21
backend/services/resume_parser.py +326 -0
backend/templates/apply.html +2 -2

app.py CHANGED Viewed

@@ -26,6 +26,12 @@ sys.path.append(current_dir)
 from backend.models.database import db, Job, Application, init_db
 from backend.models.user import User
 from backend.routes.auth import auth_bp, handle_resume_upload
 from backend.routes.interview_api import interview_api
 # Import additional utilities
 import re
@@ -175,33 +181,47 @@ def chatbot_endpoint():
 @app.route('/parse_resume', methods=['POST'])
 def parse_resume():
     file = request.files.get('resume')
-    features, error, filepath = handle_resume_upload(file)
-    if error:
-        return {"error": "Error processing resume. Please try again."}, 400
-    if not features:
-        return {
-            "name": "",
-            "email": "",
-            "mobile_number": "",
-            "skills": [],
-            "experience": [],
-            "education": [],
-            "summary": ""
-        }, 200
     response = {
-        "name": features.get('name', ''),
-        "email": features.get('email', ''),
-        "mobile_number": features.get('mobile_number', ''),
-        "skills": features.get('skills', []),
-        "experience": features.get('experience', []),
-        "education": features.get('education', []),
-        "summary": features.get('summary', '')
     }
-    return response, 200
 @app.route("/interview/<int:job_id>")
 @login_required

 from backend.models.database import db, Job, Application, init_db
 from backend.models.user import User
 from backend.routes.auth import auth_bp, handle_resume_upload
+# Import the resume parsing helper.  This module contains lightweight
+# heuristics for extracting information from PDF and DOCX files without
+# relying on heavy external libraries.  See
+# ``codingo/backend/services/resume_parser.py`` for details.
+from backend.services.resume_parser import parse_resume as _parse_resume_helper
 from backend.routes.interview_api import interview_api
 # Import additional utilities
 import re
 @app.route('/parse_resume', methods=['POST'])
 def parse_resume():
+    """
+    Parse an uploaded resume (PDF or DOCX) and return extracted
+    information in JSON format.
+    This endpoint is separate from the main application flow.  It saves
+    the uploaded file to a temporary location (via ``handle_resume_upload``)
+    so that recruiters can review the original document later, then
+    invokes a lightweight parser to extract the candidate's name,
+    skills, education and experience.  Errors during upload or
+    parsing are reported back to the client.
+    """
     file = request.files.get('resume')
+    if not file or file.filename == '':
+        return jsonify({"error": "No file uploaded"}), 400
+    # Save the file using the existing helper.  We ignore the
+    # ``features`` return value because ``handle_resume_upload`` no
+    # longer parses resumes itself; it simply stores the file and
+    # returns the path on disk.
+    features, error, filepath = handle_resume_upload(file)
+    if error or not filepath:
+        return jsonify({"error": "Error processing resume. Please try again."}), 400
+    try:
+        # Parse the stored file.  Pass both the path and the original
+        # filename so that the parser can fall back to the filename
+        # when inferring the candidate's name.
+        parsed = _parse_resume_helper(filepath, file.filename)
+    except Exception as exc:
+        # Log to stderr for debugging
+        print(f"Resume parsing error: {exc}", file=sys.stderr)
+        return jsonify({"error": "Failed to parse resume"}), 500
+    # Normalise the response to ensure string values for the form
     response = {
+        'name': parsed.get('name', ''),
+        'skills': parsed.get('skills', ''),
+        'education': parsed.get('education', ''),
+        'experience': parsed.get('experience', '')
     }
+    return jsonify(response), 200
 @app.route("/interview/<int:job_id>")
 @login_required

backend/services/resume_parser.py ADDED Viewed

	@@ -0,0 +1,326 @@

+"""
+resume_parser.py
+=================
+This module provides lightweight functions to extract useful information
+from a candidate's resume.  The design avoids heavy dependencies such
+as spaCy or pdfminer because Hugging Face Spaces environments are
+resource‑constrained and installing additional packages at runtime is
+often not feasible.  Instead, built‑in Python libraries and a
+few simple heuristics are used to extract text from both PDF and DOCX
+files and to infer the candidate's name, skills, education and
+experience from that text.
+The parser operates on the assumption that most resumes follow a
+relatively consistent structure: the candidate's name appears near the
+top of the document, headings such as "Education" and "Experience"
+demarcate sections, and common skill keywords are scattered
+throughout.  These assumptions will not hold for every CV, but they
+provide a reasonable baseline for auto‑filling form fields.  Users can
+always edit the populated fields before submitting their application.
+Functions
+---------
+* ``extract_text(file_path: str) -> str``
+    Read a resume file (PDF or DOCX) and return its plain text.  PDFs
+    are processed using the ``pdftotext`` command line tool, which is
+    available in the Hugging Face Spaces container.  DOCX files are
+    treated as zip archives; the ``word/document.xml`` component is
+    parsed and stripped of XML tags.
+* ``extract_name(text: str, filename: str) -> str``
+    Attempt to infer the candidate's full name from the document text.
+    If no plausible name is found in the first few lines of the text,
+    fall back to deriving a name from the file name itself.
+* ``extract_skills(text: str) -> list[str]``
+    Search for a predefined list of common technical and soft skills
+    within the resume text.  Matches are case‑insensitive and unique
+    values are returned title‑cased (for example ``Machine Learning``).
+* ``extract_education(text: str) -> list[str]``
+    Identify lines mentioning educational qualifications.  Heuristics
+    include the presence of keywords like "University", "Bachelor",
+    "Master", "PhD", etc.
+* ``extract_experience(text: str) -> list[str]``
+    Extract statements describing work experience.  Lines containing
+    keywords such as "experience", "Developer", "Engineer" or those
+    matching patterns with years of service are considered.
+* ``parse_resume(file_path: str, filename: str) -> dict``
+    High‑level wrapper that orchestrates the text extraction and
+    information extraction functions.  Returns a dictionary with keys
+    ``name``, ``skills``, ``education``, and ``experience``.
+The main Flask route can import ``parse_resume`` from this module and
+return its result as JSON.  Because the heuristics are conservative and
+string‑based, the parser runs quickly on both CPU and GPU hosts.
+"""
+from __future__ import annotations
+import os
+import re
+import subprocess
+import zipfile
+from typing import List
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    PDFs are converted with the ``pdftotext`` command line tool.  DOCX
    files are opened as zip archives and the text of
    ``word/document.xml`` is recovered with one output line per Word
    paragraph, so that line-oriented heuristics can work on the result.

    Parameters
    ----------
    file_path : str
        Path to the uploaded resume.

    Returns
    -------
    str
        The textual content of the resume.  An empty string is returned
        when the path is missing, the extension is not ``.pdf`` or
        ``.docx``, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()

    if lower_name.endswith('.pdf'):
        # '-layout' keeps the relative positioning (and therefore the
        # line breaks); '-' sends the output to stdout.
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                # Never let a pathological upload block the request.
                timeout=30,
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # Tool missing, invalid path, or conversion timed out.
            return ""
        return result.stdout.decode('utf-8', errors='ignore')

    if lower_name.endswith('.docx'):
        # Local import keeps this block self-contained.
        from html import unescape

        try:
            with zipfile.ZipFile(file_path) as zf:
                xml_bytes = zf.read('word/document.xml')
        except Exception:
            # Best effort on untrusted uploads: malformed archives raise
            # a wide range of errors (BadZipFile, KeyError, zlib errors,
            # RuntimeError for encrypted members, ...).
            return ""

        xml_text = xml_bytes.decode('utf-8', errors='ignore')
        # Paragraph ends and explicit breaks become newlines.  Matching
        # the closing ``</w:p>`` avoids hitting ``<w:pPr>`` and friends.
        xml_text = re.sub(r'</w:p\s*>|<w:(?:br|cr)\b[^>]*>', '\n', xml_text)
        # Tab elements become a plain space.
        xml_text = re.sub(r'<w:tab\b[^>]*>', ' ', xml_text)
        # Drop the remaining tags without inserting spaces: Word may
        # split one word across several runs, and real spaces live
        # inside the text runs themselves.
        stripped = re.sub(r'<[^>]+>', '', xml_text)
        # Decode entities only after the tags are gone so that an
        # escaped '<' is never mistaken for markup.
        plain = unescape(stripped)
        # Collapse whitespace inside each line but keep the line breaks.
        cleaned = (' '.join(ln.split()) for ln in plain.splitlines())
        return '\n'.join(ln for ln in cleaned if ln)

    # Unsupported file type (legacy binary .doc included).
    return ""
def extract_name(text: str, filename: str) -> str:
    """Attempt to extract the candidate's full name from the resume.

    The first ten non-empty lines of the text are inspected.  A line is
    accepted when it has between two and four words and every word, as
    a whole, looks like a capitalised name part: an uppercase initial
    followed by letters only, optionally joined by hyphens or
    apostrophes and optionally ending in ``.`` or ``,``.  Lines such as
    ``Email: John@x.com`` are therefore rejected.  If no line
    qualifies, a name is derived from the file name.

    Parameters
    ----------
    text : str
        The full resume text.
    filename : str
        The original filename of the uploaded resume.  A falsy value is
        treated as an empty filename.

    Returns
    -------
    str
        Inferred full name or an empty string if not found.
    """
    # The whole word must match, not just its first characters.
    name_word = re.compile(r"[A-ZÀ-ÖØ-Þ][^\W\d_]*(?:[-'’][^\W\d_]+)*[.,]?")

    if text:
        non_empty = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for line in non_empty[:10]:
            # Skip common headings like "Resume" or "Curriculum Vitae".
            if re.match(r'(?i)resume|curriculum vitae', line):
                continue
            words = line.split()
            # A plausible name typically has 2-4 words.
            if 1 < len(words) <= 4 and all(name_word.fullmatch(w) for w in words):
                return line

    # Fallback: derive a name from the filename.
    base = os.path.basename(filename or '')
    # Remove extension
    base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
    # Replace underscores, dashes and dots with spaces
    base = re.sub(r'[\._-]+', ' ', base)
    # Remove common tokens like 'cv' or 'resume'
    base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
    base = re.sub(r'\s+', ' ', base).strip()
    # Title case the remaining string
    return base.title() if base else ''
def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    A predefined list of skills is checked against the resume text in a
    case-insensitive manner.  A skill counts as a hit only when it is
    not glued to other word characters, so ``java`` does not match
    inside ``JavaScript``.  Multi-word skills must match the full
    phrase.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Unique skills found in the resume, in the order of the
        predefined list.  Lowercase entries of that list are returned
        title-cased (for example ``Machine Learning``).
    """
    if not text:
        return []
    lower_text = text.lower()
    # Common technical and soft skills.  Extend this list to recognise
    # more skills.
    SKILLS = [
        'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
        'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
        'machine learning', 'deep learning', 'nlp', 'data analysis',
        'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
        'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
        'matplotlib', 'excel', 'powerpoint', 'project management',
        'communication', 'teamwork', 'leadership', 'problem solving',
        'public speaking', 'writing', 'analysis', 'time management'
    ]
    found = []
    for skill in SKILLS:
        # ``\b`` cannot be used: after a skill ending in punctuation
        # (``c++``) it would demand a following word character.  The
        # lookarounds only require that no word character touches the
        # match; the extra ``++`` / ``#`` guard keeps plain ``c`` from
        # matching inside ``c++`` or ``c#``.
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w|\+\+|#)'
        if re.search(pattern, lower_text):
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    Each non-empty line is checked for education-related keywords.  A
    keyword has to begin a word, so ``Master's`` and ``Engineering``
    still count while ``mba`` no longer matches inside ``Mumbai``.
    Each distinct line is reported once, in document order.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        Lines representing educational qualifications; empty when
        nothing matches.
    """
    if not text:
        return []
    education_keywords = [
        'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
        'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering'
    ]
    # The lookbehind rejects a keyword that is preceded by a letter,
    # which means it sits in the middle of another word.
    keyword_re = re.compile(
        r'(?<![^\W\d_])(?:'
        + '|'.join(re.escape(kw) for kw in education_keywords)
        + r')'
    )
    results = []
    seen = set()
    for raw in text.splitlines():
        line = raw.strip()
        # Skip blanks and lines that were already captured.
        if not line or line in seen:
            continue
        if keyword_re.search(line.lower()):
            seen.add(line)
            results.append(line)
    return results
def extract_experience(text: str) -> List[str]:
    """Collect lines that look like work-experience statements.

    A line qualifies when, compared case-insensitively, it contains at
    least one of the experience markers below as a substring and is
    more than two words long (shorter lines are taken to be section
    headings).  Every distinct line is reported once, in document
    order.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        A list of lines summarising work experience.
    """
    if not text:
        return []
    # Substrings signalling an experience entry.
    markers = (
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present'
    )
    # A dict keeps insertion order and discards repeated lines.
    picked = {}
    for raw in text.splitlines():
        entry = raw.strip()
        if not entry:
            continue
        folded = entry.lower()
        # One- and two-word lines are treated as bare headings.
        if len(folded.split()) <= 2:
            continue
        if any(marker in folded for marker in markers):
            picked.setdefault(entry, None)
    return list(picked)
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into form-ready string fields.

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user.  Used as a
        fallback for name extraction if the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with keys ``name``, ``skills``, ``education`` and
        ``experience``.  Every value is a string: skills are joined
        with ``', '``, education and experience lines with newlines,
        and a field with nothing found is ``''``.
    """
    text = extract_text(file_path)
    # Run the extractors in a fixed order, then flatten their lists.
    candidate = extract_name(text, filename)
    skills = extract_skills(text)
    education = extract_education(text)
    experience = extract_experience(text)
    return {
        'name': candidate or '',
        'skills': ', '.join(skills or []),
        'education': '\n'.join(education or []),
        'experience': '\n'.join(experience or [])
    }

backend/templates/apply.html CHANGED Viewed

@@ -15,12 +15,12 @@
 {% block content %}
 <section class="content-section">
-    <ul class="breadcrumbs">
         <li><a href="{{ url_for('index') }}">Home</a></li>
         <li><a href="{{ url_for('jobs') }}">Jobs</a></li>
         <li><a href="{{ url_for('job_detail', job_id=job.id) }}">{{ job.role }}</a></li>
         <li>Apply</li>
-    </ul>
     <div class="card">
         <div class="card-header">

 {% block content %}
 <section class="content-section">
+    <!-- <ul class="breadcrumbs">
         <li><a href="{{ url_for('index') }}">Home</a></li>
         <li><a href="{{ url_for('jobs') }}">Jobs</a></li>
         <li><a href="{{ url_for('job_detail', job_id=job.id) }}">{{ job.role }}</a></li>
         <li>Apply</li>
+    </ul> -->
     <div class="card">
         <div class="card-header">