Spaces:

Dhahlan2000
/

AppyJob

Sleeping

App Files Files Community

Dhahlan2000 committed on Jan 6

Commit

7b2b80a

1 Parent(s): 50fac90

Enhance CV parsing functionality in app.py by expanding section header patterns to include more variations and improving line cleaning. Implement a more robust categorization process for unstructured content, extracting contact information, education, experience, and skills based on keywords. Additionally, update text extraction from DOCX files to include table content, ensuring comprehensive CV data retrieval. This change improves the accuracy and completeness of CV parsing, contributing to better email generation based on user inputs.

Browse files

Files changed (1) hide show

app.py +67 -18

app.py CHANGED Viewed

@@ -18,32 +18,71 @@ def parse_cv_sections(text: str) -> Dict[str, str]:
         'projects': '',
     }
-    # Common section headers in CVs
     section_patterns = {
-        'contact': r'(?i)(contact|personal\s+information|profile)',
-        'education': r'(?i)(education|academic|qualification)',
-        'experience': r'(?i)(experience|work|employment|professional)',
-        'skills': r'(?i)(skills|technical skills|competencies)',
-        'projects': r'(?i)(projects|personal projects)',
     }
-    # Split text into lines
-    lines = text.split('\n')
     current_section = None
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
         # Check if line is a section header
         for section, pattern in section_patterns.items():
             if re.search(pattern, line, re.IGNORECASE):
                 current_section = section
                 break
-        if current_section and line:
-            sections[current_section] += line + '\n'
     return sections
@@ -59,17 +98,27 @@ def extract_cv_text(file):
         if file_ext == '.pdf':
             reader = PdfReader(file)
             for page in reader.pages:
-                text += page.extract_text()
         elif file_ext == '.docx':
             doc = docx.Document(file)
             for paragraph in doc.paragraphs:
-                text += paragraph.text + '\n'
         else:
             return "Unsupported file format. Please upload PDF or DOCX files."
         # Parse the CV into sections
         sections = parse_cv_sections(text)
         return sections
     except Exception as e:

         'projects': '',
     }
+    # Common section headers in CVs with more variations
     section_patterns = {
+        'contact': r'(?i)(contact|personal\s+information|profile|contact\s+details|about\s+me)',
+        'education': r'(?i)(education|academic|qualification|academic\s+background|educational\s+background)',
+        'experience': r'(?i)(experience|work|employment|professional|work\s+history|career|professional\s+experience)',
+        'skills': r'(?i)(skills|technical\s+skills|competencies|expertise|technologies|tools|programming|languages)',
+        'projects': r'(?i)(projects|personal\s+projects|portfolio|work\s+samples)',
     }
+    # Split text into lines and clean
+    lines = [line.strip() for line in text.split('\n') if line.strip()]
     current_section = None
+    section_content = []
+    # First pass: identify sections
+    for i, line in enumerate(lines):
         # Check if line is a section header
         for section, pattern in section_patterns.items():
             if re.search(pattern, line, re.IGNORECASE):
+                # If we found a section header
+                if current_section:
+                    # Save previous section content
+                    sections[current_section] = '\n'.join(section_content)
                 current_section = section
+                section_content = []
                 break
+        else:
+            # If line is not a header and we have a current section, add to content
+            if current_section:
+                section_content.append(line)
+    # Save the last section
+    if current_section and section_content:
+        sections[current_section] = '\n'.join(section_content)
+    # If no sections were found, try to categorize the content
+    if all(not content for content in sections.values()):
+        lines_text = '\n'.join(lines)
+        # Look for email addresses and phone numbers for contact
+        email_pattern = r'[\w\.-]+@[\w\.-]+'
+        phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]?){2}\d{4}'
+        emails = re.findall(email_pattern, lines_text)
+        phones = re.findall(phone_pattern, lines_text)
+        if emails or phones:
+            sections['contact'] = '\n'.join(emails + phones)
+        # Look for education keywords
+        edu_keywords = r'(?i)(university|college|school|degree|bachelor|master|phd|diploma)'
+        edu_lines = [l for l in lines if re.search(edu_keywords, l)]
+        if edu_lines:
+            sections['education'] = '\n'.join(edu_lines)
+        # Look for experience keywords
+        exp_keywords = r'(?i)(worked|developer|engineer|manager|consultant|analyst)'
+        exp_lines = [l for l in lines if re.search(exp_keywords, l)]
+        if exp_lines:
+            sections['experience'] = '\n'.join(exp_lines)
+        # Look for skills
+        skill_keywords = r'(?i)(python|java|javascript|react|node|sql|aws|docker|kubernetes|git)'
+        skill_lines = [l for l in lines if re.search(skill_keywords, l)]
+        if skill_lines:
+            sections['skills'] = '\n'.join(skill_lines)
     return sections
         if file_ext == '.pdf':
             reader = PdfReader(file)
             for page in reader.pages:
+                text += page.extract_text() + "\n"
         elif file_ext == '.docx':
             doc = docx.Document(file)
             for paragraph in doc.paragraphs:
+                text += paragraph.text + "\n"
+            # Also check tables in docx
+            for table in doc.tables:
+                for row in table.rows:
+                    for cell in row.cells:
+                        text += cell.text + "\n"
         else:
             return "Unsupported file format. Please upload PDF or DOCX files."
         # Parse the CV into sections
         sections = parse_cv_sections(text)
+        # Verify that we have content
+        if all(not content.strip() for content in sections.values()):
+            return f"Could not parse CV sections. Raw text:\n{text}"
         return sections
     except Exception as e: