Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed on Jul 23

Commit

9e72d2c

1 Parent(s): 22b00f2

updated

Browse files

Files changed (1) hide show

backend/models/resume_parser/resume_to_features.py +235 -23

backend/models/resume_parser/resume_to_features.py CHANGED Viewed

@@ -1,39 +1,251 @@
 import os
-from pyresparser import ResumeParser
-def extract_resume_features(resume_path):
-    """
-    Extract features from a resume file.
-    Args:
-        resume_path (str): Path to the resume file
-    Returns:
-        dict: Dictionary containing extracted features from resume
     """
     try:
-        data = ResumeParser(resume_path).get_extracted_data()
-        return data
     except Exception as e:
-        print(f"Error parsing resume: {e}")
         return {
             'name': '',
             'email': '',
             'mobile_number': '',
             'skills': [],
             'experience': [],
-            'no_of_pages': 0,
-            'total_experience': 0
         }
-# Example usage (will run if script is executed directly)
-if __name__ == "__main__":
-    # Build absolute path to the resume file
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    resume_path = os.path.join(current_dir, '../../../data/resumes/Hussein El Saadi - CV.pdf')
-    # Parse and print the extracted data
-    data = extract_resume_features(resume_path)
-    print(data)

 import os
+import re
+import json
+from pathlib import Path
+import PyPDF2
+from docx import Document
+import textract
+class SimpleResumeParser:
+    def __init__(self):
+        # Common skills keywords
+        self.skills_keywords = [
+            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
+            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
+            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
+            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
+            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
+            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
+            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
+            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
+        ]
+        # Education keywords
+        self.education_keywords = [
+            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
+            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
+            'business', 'mba', 'certification', 'diploma'
+        ]
+        # Experience keywords
+        self.experience_keywords = [
+            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
+            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
+        ]
+    def extract_text_from_pdf(self, file_path):
+        """Extract text from PDF file"""
+        try:
+            with open(file_path, 'rb') as file:
+                reader = PyPDF2.PdfReader(file)
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text() + "\n"
+                return text
+        except Exception as e:
+            print(f"Error reading PDF: {e}")
+            return ""
+    def extract_text_from_docx(self, file_path):
+        """Extract text from DOCX file"""
+        try:
+            doc = Document(file_path)
+            text = ""
+            for paragraph in doc.paragraphs:
+                text += paragraph.text + "\n"
+            return text
+        except Exception as e:
+            print(f"Error reading DOCX: {e}")
+            return ""
+    def extract_text_from_doc(self, file_path):
+        """Extract text from DOC file using textract"""
+        try:
+            text = textract.process(file_path).decode('utf-8')
+            return text
+        except Exception as e:
+            print(f"Error reading DOC: {e}")
+            return ""
+    def extract_text(self, file_path):
+        """Extract text based on file extension"""
+        file_extension = Path(file_path).suffix.lower()
+        if file_extension == '.pdf':
+            return self.extract_text_from_pdf(file_path)
+        elif file_extension == '.docx':
+            return self.extract_text_from_docx(file_path)
+        elif file_extension == '.doc':
+            return self.extract_text_from_doc(file_path)
+        else:
+            return ""
+    def extract_email(self, text):
+        """Extract email addresses from text"""
+        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
+        emails = re.findall(email_pattern, text)
+        return emails[0] if emails else ""
+    def extract_phone(self, text):
+        """Extract phone numbers from text"""
+        phone_patterns = [
+            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
+            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
+            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
+            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
+        ]
+        for pattern in phone_patterns:
+            matches = re.findall(pattern, text)
+            if matches:
+                if isinstance(matches[0], tuple):
+                    return ''.join(matches[0])
+                return matches[0]
+        return ""
+    def extract_name(self, text):
+        """Extract name from text (simple heuristic)"""
+        lines = text.split('\n')
+        for line in lines[:5]:  # Check first 5 lines
+            line = line.strip()
+            if len(line.split()) == 2 and line.replace(' ', '').isalpha():
+                # Simple check: two words, all alphabetic
+                if not any(keyword in line.lower() for keyword in ['resume', 'cv', 'curriculum']):
+                    return line.title()
+        return ""
+    def extract_skills(self, text):
+        """Extract skills from text"""
+        text_lower = text.lower()
+        found_skills = []
+        for skill in self.skills_keywords:
+            if skill.lower() in text_lower:
+                found_skills.append(skill.title())
+        # Remove duplicates and return
+        return list(set(found_skills))
+    def extract_education(self, text):
+        """Extract education information"""
+        text_lower = text.lower()
+        education = []
+        # Look for education section
+        education_section = ""
+        lines = text.split('\n')
+        in_education_section = False
+        for line in lines:
+            line_lower = line.lower()
+            if any(keyword in line_lower for keyword in ['education', 'academic', 'qualification']):
+                in_education_section = True
+                continue
+            elif in_education_section and any(keyword in line_lower for keyword in ['experience', 'work', 'employment', 'project']):
+                break
+            elif in_education_section:
+                education_section += line + " "
+        # Extract degrees and institutions
+        for keyword in self.education_keywords:
+            if keyword in text_lower:
+                # Find context around the keyword
+                pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
+                matches = re.findall(pattern, text, re.IGNORECASE)
+                education.extend(matches)
+        return education[:3]  # Return top 3 education entries
+    def extract_experience(self, text):
+        """Extract work experience"""
+        experience = []
+        lines = text.split('\n')
+        # Look for experience section
+        in_experience_section = False
+        current_experience = ""
+        for line in lines:
+            line_lower = line.lower()
+            if any(keyword in line_lower for keyword in ['experience', 'work', 'employment', 'career']):
+                in_experience_section = True
+                continue
+            elif in_experience_section and any(keyword in line_lower for keyword in ['education', 'skill', 'project']):
+                if current_experience:
+                    experience.append(current_experience.strip())
+                break
+            elif in_experience_section:
+                if line.strip():
+                    current_experience += line + " "
+                elif current_experience:
+                    experience.append(current_experience.strip())
+                    current_experience = ""
+        if current_experience:
+            experience.append(current_experience.strip())
+        return experience[:3]  # Return top 3 experience entries
+    def extract_summary(self, text):
+        """Extract summary/objective"""
+        lines = text.split('\n')
+        summary = ""
+        for i, line in enumerate(lines):
+            line_lower = line.lower()
+            if any(keyword in line_lower for keyword in ['summary', 'objective', 'profile', 'about']):
+                # Get next few lines as summary
+                summary_lines = lines[i+1:i+4]
+                summary = ' '.join([l.strip() for l in summary_lines if l.strip()])
+                break
+        return summary[:200]  # Limit to 200 characters
+def extract_resume_features(file_path):
+    """
+    Main function to extract features from resume
+    Returns a dictionary with extracted information
     """
     try:
+        parser = SimpleResumeParser()
+        text = parser.extract_text(file_path)
+        if not text:
+            return {
+                'name': '',
+                'email': '',
+                'mobile_number': '',
+                'skills': [],
+                'experience': [],
+                'education': [],
+                'summary': ''
+            }
+        # Extract all features
+        features = {
+            'name': parser.extract_name(text),
+            'email': parser.extract_email(text),
+            'mobile_number': parser.extract_phone(text),
+            'skills': parser.extract_skills(text),
+            'experience': parser.extract_experience(text),
+            'education': parser.extract_education(text),
+            'summary': parser.extract_summary(text)
+        }
+        return features
     except Exception as e:
+        print(f"Error extracting resume features: {e}")
         return {
             'name': '',
             'email': '',
             'mobile_number': '',
             'skills': [],
             'experience': [],
+            'education': [],
+            'summary': ''
         }
+# For backward compatibility
+def parse_resume(file_path):
+    return extract_resume_features(file_path)