Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 17 days ago

Commit

ff62567

1 Parent(s): af02e64

parse resume added

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +19 -4

backend/services/resume_parser.py CHANGED Viewed

@@ -98,7 +98,20 @@ def extract_text(file_path: str) -> str:
                     stderr=subprocess.PIPE,
                     check=False
                 )
-                return result.stdout.decode('utf-8', errors='ignore')
             except Exception:
                 return ""
         # If it's a .docx treat it as a zip archive and pull the main
@@ -217,7 +230,7 @@ def extract_skills(text: str) -> List[str]:
     found = []
     for skill in SKILLS:
         pattern = re.escape(skill.lower())
-        if re.search(r'\b' + pattern + r'\b', lower_text):
             # Preserve the original capitalisation of the skill phrase
             found.append(skill.title() if skill.islower() else skill)
     return list(dict.fromkeys(found))  # Remove duplicates, preserve order
@@ -243,9 +256,11 @@ def extract_education(text: str) -> List[str]:
         return []
     lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
     education_keywords = [
-        'university', 'college', 'bachelor', 'master', 'phd', 'b.sc',
-        'm.sc', 'mba', 'school', 'degree', 'diploma', 'engineering'
     ]
     results = []
     for line in lines:
         lower = line.lower()

                     stderr=subprocess.PIPE,
                     check=False
                 )
+                raw_text = result.stdout.decode('utf-8', errors='ignore')
+                # Normalize whitespace and ensure section keywords are on separate lines
+                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
+                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
+                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
+                # Replace multiple spaces/tabs but keep newlines
+                raw_text = re.sub(r'[ \t]+', ' ', raw_text)
+                # Ensure section keywords are isolated
+                raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
+                raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
+                raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
+                return raw_text
             except Exception:
                 return ""
         # If it's a .docx treat it as a zip archive and pull the main
     found = []
     for skill in SKILLS:
         pattern = re.escape(skill.lower())
+        if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
             # Preserve the original capitalisation of the skill phrase
             found.append(skill.title() if skill.islower() else skill)
     return list(dict.fromkeys(found))  # Remove duplicates, preserve order
         return []
     lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
     education_keywords = [
+        'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
+        'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
+        'diploma', 'engineering', 'work history'
     ]
     results = []
     for line in lines:
         lower = line.lower()