husseinelsaadi committed on
Commit
af02e64
·
1 Parent(s): c11e18e

resume parser implemented

Browse files
app.py CHANGED
@@ -26,6 +26,12 @@ sys.path.append(current_dir)
26
  from backend.models.database import db, Job, Application, init_db
27
  from backend.models.user import User
28
  from backend.routes.auth import auth_bp, handle_resume_upload
 
 
 
 
 
 
29
  from backend.routes.interview_api import interview_api
30
  # Import additional utilities
31
  import re
@@ -175,33 +181,47 @@ def chatbot_endpoint():
175
 
176
  @app.route('/parse_resume', methods=['POST'])
177
  def parse_resume():
 
 
 
 
 
 
 
 
 
 
 
178
  file = request.files.get('resume')
179
- features, error, filepath = handle_resume_upload(file)
 
180
 
181
- if error:
182
- return {"error": "Error processing resume. Please try again."}, 400
 
 
 
 
 
183
 
184
- if not features:
185
- return {
186
- "name": "",
187
- "email": "",
188
- "mobile_number": "",
189
- "skills": [],
190
- "experience": [],
191
- "education": [],
192
- "summary": ""
193
- }, 200
194
 
 
195
  response = {
196
- "name": features.get('name', ''),
197
- "email": features.get('email', ''),
198
- "mobile_number": features.get('mobile_number', ''),
199
- "skills": features.get('skills', []),
200
- "experience": features.get('experience', []),
201
- "education": features.get('education', []),
202
- "summary": features.get('summary', '')
203
  }
204
- return response, 200
205
 
206
  @app.route("/interview/<int:job_id>")
207
  @login_required
 
26
  from backend.models.database import db, Job, Application, init_db
27
  from backend.models.user import User
28
  from backend.routes.auth import auth_bp, handle_resume_upload
29
+
30
+ # Import the resume parsing helper. This module contains lightweight
31
+ # heuristics for extracting information from PDF and DOCX files without
32
+ # relying on heavy external libraries. See
33
+ # ``codingo/backend/services/resume_parser.py`` for details.
34
+ from backend.services.resume_parser import parse_resume as _parse_resume_helper
35
  from backend.routes.interview_api import interview_api
36
  # Import additional utilities
37
  import re
 
181
 
182
  @app.route('/parse_resume', methods=['POST'])
183
  def parse_resume():
184
+ """
185
+ Parse an uploaded resume (PDF or DOCX) and return extracted
186
+ information in JSON format.
187
+
188
+ This endpoint is separate from the main application flow. It saves
189
+ the uploaded file to a temporary location (via ``handle_resume_upload``)
190
+ so that recruiters can review the original document later, then
191
+ invokes a lightweight parser to extract the candidate's name,
192
+ skills, education and experience. Errors during upload or
193
+ parsing are reported back to the client.
194
+ """
195
  file = request.files.get('resume')
196
+ if not file or file.filename == '':
197
+ return jsonify({"error": "No file uploaded"}), 400
198
 
199
+ # Save the file using the existing helper. We ignore the
200
+ # ``features`` return value because ``handle_resume_upload`` no
201
+ # longer parses resumes itself; it simply stores the file and
202
+ # returns the path on disk.
203
+ features, error, filepath = handle_resume_upload(file)
204
+ if error or not filepath:
205
+ return jsonify({"error": "Error processing resume. Please try again."}), 400
206
 
207
+ try:
208
+ # Parse the stored file. Pass both the path and the original
209
+ # filename so that the parser can fall back to the filename
210
+ # when inferring the candidate's name.
211
+ parsed = _parse_resume_helper(filepath, file.filename)
212
+ except Exception as exc:
213
+ # Log to stderr for debugging
214
+ print(f"Resume parsing error: {exc}", file=sys.stderr)
215
+ return jsonify({"error": "Failed to parse resume"}), 500
 
216
 
217
+ # Normalise the response to ensure string values for the form
218
  response = {
219
+ 'name': parsed.get('name', ''),
220
+ 'skills': parsed.get('skills', ''),
221
+ 'education': parsed.get('education', ''),
222
+ 'experience': parsed.get('experience', '')
 
 
 
223
  }
224
+ return jsonify(response), 200
225
 
226
  @app.route("/interview/<int:job_id>")
227
  @login_required
backend/services/resume_parser.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ resume_parser.py
3
+ =================
4
+
5
+ This module provides lightweight functions to extract useful information
6
+ from a candidate's resume. The design avoids heavy dependencies such
7
+ as spaCy or pdfminer because Hugging Face Spaces environments are
8
+ resource‑constrained and installing additional packages at runtime is
9
+ often not feasible. Instead, built‑in Python libraries and a
10
+ few simple heuristics are used to extract text from both PDF and DOCX
11
+ files and to infer the candidate's name, skills, education and
12
+ experience from that text.
13
+
14
+ The parser operates on the assumption that most resumes follow a
15
+ relatively consistent structure: the candidate's name appears near the
16
+ top of the document, headings such as "Education" and "Experience"
17
+ demarcate sections, and common skill keywords are scattered
18
+ throughout. These assumptions will not hold for every CV, but they
19
+ provide a reasonable baseline for auto‑filling form fields. Users can
20
+ always edit the populated fields before submitting their application.
21
+
22
+ Functions
23
+ ---------
24
+
25
+ * ``extract_text(file_path: str) -> str``
26
+ Read a resume file (PDF or DOCX) and return its plain text. PDFs
27
+ are processed using the ``pdftotext`` command line tool, which is
28
+ available in the Hugging Face Spaces container. DOCX files are
29
+ treated as zip archives; the ``word/document.xml`` component is
30
+ parsed and stripped of XML tags.
31
+
32
+ * ``extract_name(text: str, filename: str) -> str``
33
+ Attempt to infer the candidate's full name from the document text.
34
+ If no plausible name is found in the first few lines of the text,
35
+ fall back to deriving a name from the file name itself.
36
+
37
+ * ``extract_skills(text: str) -> list[str]``
38
+ Search for a predefined list of common technical and soft skills
39
+ within the resume text. Matches are case‑insensitive and unique
40
+ values are returned in title case (e.g. ``Python``, ``Machine Learning``).
41
+
42
+ * ``extract_education(text: str) -> list[str]``
43
+ Identify lines mentioning educational qualifications. Heuristics
44
+ include the presence of keywords like "University", "Bachelor",
45
+ "Master", "PhD", etc.
46
+
47
+ * ``extract_experience(text: str) -> list[str]``
48
+ Extract statements describing work experience. Lines containing
49
+ keywords such as "experience", "Developer", "Engineer" or those
50
+ matching patterns with years of service are considered.
51
+
52
+ * ``parse_resume(file_path: str, filename: str) -> dict``
53
+ High‑level wrapper that orchestrates the text extraction and
54
+ information extraction functions. Returns a dictionary with keys
55
+ ``name``, ``skills``, ``education``, and ``experience``.
56
+
57
+ The main Flask route can import ``parse_resume`` from this module and
58
+ return its result as JSON. Because the heuristics are conservative and
59
+ string‑based, the parser runs quickly on both CPU and GPU hosts.
60
+ """
61
+
62
+ from __future__ import annotations
63
+
64
+ import os
65
+ import re
66
+ import subprocess
67
+ import zipfile
68
+ from typing import List
69
+
70
+
71
# Upper bound on how long ``pdftotext`` may run before extraction is abandoned,
# so a malformed upload cannot block the calling request indefinitely.
_PDFTOTEXT_TIMEOUT_SECONDS = 30


def _pdf_to_text(file_path: str) -> str:
    """Return the text of a PDF by running the ``pdftotext`` command line tool."""
    # '-layout' keeps the relative positioning of the text, which helps
    # preserve line breaks; the trailing '-' sends the output to stdout.
    result = subprocess.run(
        ['pdftotext', '-layout', file_path, '-'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
        timeout=_PDFTOTEXT_TIMEOUT_SECONDS,
    )
    return result.stdout.decode('utf-8', errors='ignore')


def _docx_to_text(file_path: str) -> str:
    """Return the text of a DOCX file with one output line per paragraph."""
    from html import unescape

    # A .docx file is a zip archive; the body lives in word/document.xml.
    with zipfile.ZipFile(file_path) as archive:
        xml_text = archive.read('word/document.xml').decode('utf-8', errors='ignore')

    # Paragraph ends and explicit breaks become newlines. Matching the
    # closing ``</w:p>`` (rather than ``<w:p...>``) avoids treating
    # ``<w:pPr>``, ``<w:pStyle>`` and similar property tags as paragraphs.
    xml_text = re.sub(r'</w:p>|<w:(?:br|cr)\b[^>]*>', '\n', xml_text)
    xml_text = re.sub(r'<w:tab\b[^>]*>', '\t', xml_text)
    # Drop the remaining tags without inserting anything: Word frequently
    # splits a single word across several runs, so padding tags with spaces
    # would break words apart ("Py" + "thon" -> "Py thon").
    plain = re.sub(r'<[^>]+>', '', xml_text)
    # Decode entities only after the tags are gone so that an escaped
    # "&lt;" in the document text is not mistaken for markup.
    plain = unescape(plain)

    # Collapse whitespace inside each line but keep the line structure; the
    # name/education/experience heuristics all operate line by line.
    cleaned = (re.sub(r'\s+', ' ', line).strip() for line in plain.split('\n'))
    return '\n'.join(line for line in cleaned if line)


def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX resume.

    PDF files are converted with the external ``pdftotext`` executable,
    which must be available on ``PATH``. DOCX files are opened as zip
    archives and ``word/document.xml`` is stripped of its markup, keeping
    one line of text per Word paragraph. Legacy ``.doc`` files and any
    other extension are not supported.

    Parameters
    ----------
    file_path : str
        Path to the uploaded resume. The file type is decided from the
        (case-insensitive) extension.

    Returns
    -------
    str
        The textual content of the resume. An empty string is returned
        when the path is empty or not a file, the extension is
        unsupported, or extraction fails for any reason (missing
        ``pdftotext``, timeout, corrupt archive, ...).
    """
    if not file_path or not os.path.isfile(file_path):
        return ""

    lower_name = file_path.lower()
    if lower_name.endswith('.pdf'):
        extractor = _pdf_to_text
    elif lower_name.endswith('.docx'):
        extractor = _docx_to_text
    else:
        # Unsupported file type
        return ""

    try:
        return extractor(file_path)
    except Exception:
        # Deliberate best effort: the caller uses the text only to pre-fill
        # an editable form, so any failure degrades to "nothing extracted".
        return ""
132
+
133
+
134
def extract_name(text: str, filename: str) -> str:
    """Guess the candidate's full name from the resume text or file name.

    The first ten non-empty lines of ``text`` are scanned in order. A line
    is accepted as the name when it does not start with "resume" or
    "curriculum vitae" (case-insensitive), consists of two to four
    whitespace-separated tokens, and every token begins with an uppercase
    letter (accented capitals included). When no line qualifies, the name
    is derived from ``filename`` instead.

    Parameters
    ----------
    text : str
        The full resume text; may be empty.
    filename : str
        The original filename of the uploaded resume. Only consulted when
        the text yields no candidate line.

    Returns
    -------
    str
        The matching line as written in the document, or the title-cased
        file name stem, or an empty string when neither produces a name.
    """
    head_lines = []
    if text:
        head_lines = [ln.strip() for ln in text.splitlines() if ln.strip()][:10]

    for candidate in head_lines:
        # Skip document titles such as "Resume" or "Curriculum Vitae".
        if re.match(r'(?i)resume|curriculum vitae', candidate):
            continue
        tokens = candidate.split()
        if not 1 < len(tokens) <= 4:
            continue
        if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', token) for token in tokens):
            return candidate

    # Fallback: build a name out of the file name stem.
    stem = os.path.basename(filename)
    stem = re.sub(r'\.(pdf|docx|doc)$', '', stem, flags=re.I)  # drop extension
    stem = re.sub(r'[\._-]+', ' ', stem)                       # separators -> spaces
    stem = re.sub(r'(?i)\b(cv|resume)\b', '', stem)            # drop "cv"/"resume"
    stem = re.sub(r'\s+', ' ', stem).strip()
    if not stem:
        return ''
    return stem.title()
180
+
181
+
182
# Common technical and soft skills looked for in a resume, in the order in
# which they are reported. Extend this tuple to recognise more skills.
_COMMON_SKILLS = (
    'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
    'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
    'machine learning', 'deep learning', 'nlp', 'data analysis',
    'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
    'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
    'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
    'matplotlib', 'excel', 'powerpoint', 'project management',
    'communication', 'teamwork', 'leadership', 'problem solving',
    'public speaking', 'writing', 'analysis', 'time management',
)


def extract_skills(text: str) -> List[str]:
    """Identify common skills mentioned in the resume.

    Every entry of a predefined skill list is searched for in the text,
    case-insensitively and as a whole term: it must not be glued to a
    letter, digit or underscore on either side. Multi-word skills must
    match the full phrase.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        The skills found, without duplicates, in the order of the
        predefined list. All-lowercase list entries are returned in title
        case (``'python'`` -> ``'Python'``, ``'sql'`` -> ``'Sql'``).
    """
    if not text:
        return []
    lower_text = text.lower()
    found = []
    for skill in _COMMON_SKILLS:
        # ``\b`` cannot be used here: it never matches after the trailing
        # '+' of "c++", and it does match between the "c" and "+" of "c++",
        # which reports plain "C". Explicit look-arounds handle both: no
        # word character directly before, and no word character, '#' or
        # '++' directly after (so "c" is not taken from "c++" or "c#").
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?![\w#]|\+\+)'
        if re.search(pattern, lower_text):
            found.append(skill.title() if skill.islower() else skill)
    return list(dict.fromkeys(found))  # Remove duplicates, preserve order
224
+
225
+
226
# Keywords that mark a line as describing an educational qualification.
_EDUCATION_KEYWORDS = (
    'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
    'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering',
)

# A keyword only counts at the start of a word, so "mba" is not found inside
# "Mumbai" and "master" is not found inside "postmaster". Suffixes are still
# allowed ("masters", "bachelor's", "degrees").
_EDUCATION_RE = re.compile(
    r'(?<!\w)(?:' + '|'.join(re.escape(kw) for kw in _EDUCATION_KEYWORDS) + r')'
)


def extract_education(text: str) -> List[str]:
    """Gather educational qualifications from the resume text.

    Each non-empty line is stripped and kept when it contains one of the
    education keywords ("university", "bachelor", "mba", ...) at the start
    of a word, compared case-insensitively.

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        The matching lines in document order, each listed once. Empty when
        the text is empty or no line matches.
    """
    if not text:
        return []
    stripped = (ln.strip() for ln in text.splitlines())
    matches = (ln for ln in stripped if ln and _EDUCATION_RE.search(ln.lower()))
    # dict.fromkeys drops repeated lines while keeping first-seen order.
    return list(dict.fromkeys(matches))
258
+
259
+
260
def extract_experience(text: str) -> List[str]:
    """Collect lines of the resume that look like work experience.

    A stripped, non-empty line is kept when its lowercase form contains
    any of the experience keywords as a substring (for example
    "experience", "engineer", "years", "present") and the line has more
    than two whitespace-separated words, which filters out bare section
    headings such as "Work Experience".

    Parameters
    ----------
    text : str
        The resume's full text.

    Returns
    -------
    list[str]
        The matching lines in document order, each listed once. Empty when
        the text is empty or no line matches.
    """
    if not text:
        return []

    markers = (
        'experience', 'worked', 'employment', 'internship', 'developer',
        'engineer', 'manager', 'analyst', 'consultant', 'assistant',
        'years', 'year', 'months', 'month', 'present',
    )

    def looks_like_experience(entry: str) -> bool:
        lowered = entry.lower()
        if len(lowered.split()) <= 2:
            return False
        return any(marker in lowered for marker in markers)

    stripped = (raw.strip() for raw in text.splitlines())
    hits = [entry for entry in stripped if entry and looks_like_experience(entry)]
    # dict.fromkeys drops repeated lines while keeping first-seen order.
    return list(dict.fromkeys(hits))
294
+
295
+
296
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse a resume file into form-ready string fields.

    The text is extracted once with ``extract_text`` and then passed to
    the name, skills, education and experience extractors.

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        The original filename as provided by the user. Used as a
        fallback for name extraction if the document text does not
        reveal a plausible name.

    Returns
    -------
    dict
        Dictionary with the keys ``name``, ``skills``, ``education`` and
        ``experience``. Every value is a string: ``skills`` is joined with
        ``", "``, ``education`` and ``experience`` are joined with
        newlines, and a field with nothing extracted is an empty string.
    """
    text = extract_text(file_path)
    name = extract_name(text, filename)
    skills = extract_skills(text)
    education = extract_education(text)
    experience = extract_experience(text)
    # Joining an empty list already yields '', so no special case is needed.
    return {
        'name': name or '',
        'skills': ', '.join(skills),
        'education': '\n'.join(education),
        'experience': '\n'.join(experience),
    }
backend/templates/apply.html CHANGED
@@ -15,12 +15,12 @@
15
 
16
  {% block content %}
17
  <section class="content-section">
18
- <ul class="breadcrumbs">
19
  <li><a href="{{ url_for('index') }}">Home</a></li>
20
  <li><a href="{{ url_for('jobs') }}">Jobs</a></li>
21
  <li><a href="{{ url_for('job_detail', job_id=job.id) }}">{{ job.role }}</a></li>
22
  <li>Apply</li>
23
- </ul>
24
 
25
  <div class="card">
26
  <div class="card-header">
 
15
 
16
  {% block content %}
17
  <section class="content-section">
18
+ <!-- <ul class="breadcrumbs">
19
  <li><a href="{{ url_for('index') }}">Home</a></li>
20
  <li><a href="{{ url_for('jobs') }}">Jobs</a></li>
21
  <li><a href="{{ url_for('job_detail', job_id=job.id) }}">{{ job.role }}</a></li>
22
  <li>Apply</li>
23
+ </ul> -->
24
 
25
  <div class="card">
26
  <div class="card-header">