husseinelsaadi committed on
Commit
ff62567
·
1 Parent(s): af02e64

parse resume added

Browse files
Files changed (1) hide show
  1. backend/services/resume_parser.py +19 -4
backend/services/resume_parser.py CHANGED
@@ -98,7 +98,20 @@ def extract_text(file_path: str) -> str:
98
  stderr=subprocess.PIPE,
99
  check=False
100
  )
101
- return result.stdout.decode('utf-8', errors='ignore')
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  except Exception:
103
  return ""
104
  # If it's a .docx treat it as a zip archive and pull the main
@@ -217,7 +230,7 @@ def extract_skills(text: str) -> List[str]:
217
  found = []
218
  for skill in SKILLS:
219
  pattern = re.escape(skill.lower())
220
- if re.search(r'\b' + pattern + r'\b', lower_text):
221
  # Preserve the original capitalisation of the skill phrase
222
  found.append(skill.title() if skill.islower() else skill)
223
  return list(dict.fromkeys(found)) # Remove duplicates, preserve order
@@ -243,9 +256,11 @@ def extract_education(text: str) -> List[str]:
243
  return []
244
  lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
245
  education_keywords = [
246
- 'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
247
- 'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering'
 
248
  ]
 
249
  results = []
250
  for line in lines:
251
  lower = line.lower()
 
98
  stderr=subprocess.PIPE,
99
  check=False
100
  )
101
+ raw_text = result.stdout.decode('utf-8', errors='ignore')
102
+ # Normalize whitespace and ensure section keywords are on separate lines
103
+ raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
104
+ raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
105
+ raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
106
+ # Replace multiple spaces/tabs but keep newlines
107
+ raw_text = re.sub(r'[ \t]+', ' ', raw_text)
108
+ # Ensure section keywords are isolated (re-applies the same three substitutions as above, now after spaces/tabs have been collapsed)
109
+ raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
110
+ raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
111
+ raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
112
+ return raw_text
113
+
114
+
115
  except Exception:
116
  return ""
117
  # If it's a .docx treat it as a zip archive and pull the main
 
230
  found = []
231
  for skill in SKILLS:
232
  pattern = re.escape(skill.lower())
233
+ if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
234
  # Preserve the original capitalisation of the skill phrase
235
  found.append(skill.title() if skill.islower() else skill)
236
  return list(dict.fromkeys(found)) # Remove duplicates, preserve order
 
256
  return []
257
  lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
258
  education_keywords = [
259
+ 'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
260
+ 'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
261
+ 'diploma', 'engineering', 'work history'
262
  ]
263
+
264
  results = []
265
  for line in lines:
266
  lower = line.lower()