Spaces:
Paused
Paused
Commit
·
ff62567
1
Parent(s):
af02e64
parse resume added
Browse files
backend/services/resume_parser.py
CHANGED
@@ -98,7 +98,20 @@ def extract_text(file_path: str) -> str:
|
|
98 |
stderr=subprocess.PIPE,
|
99 |
check=False
|
100 |
)
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
except Exception:
|
103 |
return ""
|
104 |
# If it's a .docx treat it as a zip archive and pull the main
|
@@ -217,7 +230,7 @@ def extract_skills(text: str) -> List[str]:
|
|
217 |
found = []
|
218 |
for skill in SKILLS:
|
219 |
pattern = re.escape(skill.lower())
|
220 |
-
if re.search(r'\b' + pattern + r'\b', lower_text):
|
221 |
# Preserve the original capitalisation of the skill phrase
|
222 |
found.append(skill.title() if skill.islower() else skill)
|
223 |
return list(dict.fromkeys(found)) # Remove duplicates, preserve order
|
@@ -243,9 +256,11 @@ def extract_education(text: str) -> List[str]:
|
|
243 |
return []
|
244 |
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
245 |
education_keywords = [
|
246 |
-
'university', 'college', 'bachelor', '
|
247 |
-
'
|
|
|
248 |
]
|
|
|
249 |
results = []
|
250 |
for line in lines:
|
251 |
lower = line.lower()
|
|
|
98 |
stderr=subprocess.PIPE,
|
99 |
check=False
|
100 |
)
|
101 |
+
raw_text = result.stdout.decode('utf-8', errors='ignore')
|
102 |
+
# Normalize whitespace and ensure section keywords are on separate lines
|
103 |
+
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
104 |
+
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
105 |
+
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
106 |
+
# Replace multiple spaces/tabs but keep newlines
|
107 |
+
raw_text = re.sub(r'[ \t]+', ' ', raw_text)
|
108 |
+
# Ensure section keywords are isolated
|
109 |
+
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
110 |
+
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
111 |
+
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
112 |
+
return raw_text
|
113 |
+
|
114 |
+
|
115 |
except Exception:
|
116 |
return ""
|
117 |
# If it's a .docx treat it as a zip archive and pull the main
|
|
|
230 |
found = []
|
231 |
for skill in SKILLS:
|
232 |
pattern = re.escape(skill.lower())
|
233 |
+
if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
|
234 |
# Preserve the original capitalisation of the skill phrase
|
235 |
found.append(skill.title() if skill.islower() else skill)
|
236 |
return list(dict.fromkeys(found)) # Remove duplicates, preserve order
|
|
|
256 |
return []
|
257 |
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
258 |
education_keywords = [
|
259 |
+
'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
|
260 |
+
'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
|
261 |
+
'diploma', 'engineering', 'work history'
|
262 |
]
|
263 |
+
|
264 |
results = []
|
265 |
for line in lines:
|
266 |
lower = line.lower()
|