Dhahlan2000 committed on
Commit
7b2b80a
·
1 Parent(s): 50fac90

Enhance CV parsing functionality in app.py by expanding section header patterns to include more variations and improving line cleaning. Implement a more robust categorization process for unstructured content, extracting contact information, education, experience, and skills based on keywords. Additionally, update text extraction from DOCX files to include table content, ensuring comprehensive CV data retrieval. This change improves the accuracy and completeness of CV parsing, contributing to better email generation based on user inputs.

Browse files
Files changed (1) hide show
  1. app.py +67 -18
app.py CHANGED
@@ -18,32 +18,71 @@ def parse_cv_sections(text: str) -> Dict[str, str]:
18
  'projects': '',
19
  }
20
 
21
- # Common section headers in CVs
22
  section_patterns = {
23
- 'contact': r'(?i)(contact|personal\s+information|profile)',
24
- 'education': r'(?i)(education|academic|qualification)',
25
- 'experience': r'(?i)(experience|work|employment|professional)',
26
- 'skills': r'(?i)(skills|technical skills|competencies)',
27
- 'projects': r'(?i)(projects|personal projects)',
28
  }
29
 
30
- # Split text into lines
31
- lines = text.split('\n')
32
  current_section = None
 
33
 
34
- for line in lines:
35
- line = line.strip()
36
- if not line:
37
- continue
38
-
39
  # Check if line is a section header
40
  for section, pattern in section_patterns.items():
41
  if re.search(pattern, line, re.IGNORECASE):
 
 
 
 
42
  current_section = section
 
43
  break
44
-
45
- if current_section and line:
46
- sections[current_section] += line + '\n'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  return sections
49
 
@@ -59,17 +98,27 @@ def extract_cv_text(file):
59
  if file_ext == '.pdf':
60
  reader = PdfReader(file)
61
  for page in reader.pages:
62
- text += page.extract_text()
63
 
64
  elif file_ext == '.docx':
65
  doc = docx.Document(file)
66
  for paragraph in doc.paragraphs:
67
- text += paragraph.text + '\n'
 
 
 
 
 
68
  else:
69
  return "Unsupported file format. Please upload PDF or DOCX files."
70
 
71
  # Parse the CV into sections
72
  sections = parse_cv_sections(text)
 
 
 
 
 
73
  return sections
74
 
75
  except Exception as e:
 
18
  'projects': '',
19
  }
20
 
21
+ # Common section headers in CVs with more variations
22
  section_patterns = {
23
+ 'contact': r'(?i)(contact|personal\s+information|profile|contact\s+details|about\s+me)',
24
+ 'education': r'(?i)(education|academic|qualification|academic\s+background|educational\s+background)',
25
+ 'experience': r'(?i)(experience|work|employment|professional|work\s+history|career|professional\s+experience)',
26
+ 'skills': r'(?i)(skills|technical\s+skills|competencies|expertise|technologies|tools|programming|languages)',
27
+ 'projects': r'(?i)(projects|personal\s+projects|portfolio|work\s+samples)',
28
  }
29
 
30
+ # Split text into lines and clean
31
+ lines = [line.strip() for line in text.split('\n') if line.strip()]
32
  current_section = None
33
+ section_content = []
34
 
35
+ # First pass: identify sections
36
+ for i, line in enumerate(lines):
 
 
 
37
  # Check if line is a section header
38
  for section, pattern in section_patterns.items():
39
  if re.search(pattern, line, re.IGNORECASE):
40
+ # If we found a section header
41
+ if current_section:
42
+ # Save previous section content
43
+ sections[current_section] = '\n'.join(section_content)
44
  current_section = section
45
+ section_content = []
46
  break
47
+ else:
48
+ # If line is not a header and we have a current section, add to content
49
+ if current_section:
50
+ section_content.append(line)
51
+
52
+ # Save the last section
53
+ if current_section and section_content:
54
+ sections[current_section] = '\n'.join(section_content)
55
+
56
+ # If no sections were found, try to categorize the content
57
+ if all(not content for content in sections.values()):
58
+ lines_text = '\n'.join(lines)
59
+
60
+ # Look for email addresses and phone numbers for contact
61
+ email_pattern = r'[\w\.-]+@[\w\.-]+'
62
+ phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]?){2}\d{4}'
63
+
64
+ emails = re.findall(email_pattern, lines_text)
65
+ phones = re.findall(phone_pattern, lines_text)
66
+ if emails or phones:
67
+ sections['contact'] = '\n'.join(emails + phones)
68
+
69
+ # Look for education keywords
70
+ edu_keywords = r'(?i)(university|college|school|degree|bachelor|master|phd|diploma)'
71
+ edu_lines = [l for l in lines if re.search(edu_keywords, l)]
72
+ if edu_lines:
73
+ sections['education'] = '\n'.join(edu_lines)
74
+
75
+ # Look for experience keywords
76
+ exp_keywords = r'(?i)(worked|developer|engineer|manager|consultant|analyst)'
77
+ exp_lines = [l for l in lines if re.search(exp_keywords, l)]
78
+ if exp_lines:
79
+ sections['experience'] = '\n'.join(exp_lines)
80
+
81
+ # Look for skills
82
+ skill_keywords = r'(?i)(python|java|javascript|react|node|sql|aws|docker|kubernetes|git)'
83
+ skill_lines = [l for l in lines if re.search(skill_keywords, l)]
84
+ if skill_lines:
85
+ sections['skills'] = '\n'.join(skill_lines)
86
 
87
  return sections
88
 
 
98
  if file_ext == '.pdf':
99
  reader = PdfReader(file)
100
  for page in reader.pages:
101
+ text += page.extract_text() + "\n"
102
 
103
  elif file_ext == '.docx':
104
  doc = docx.Document(file)
105
  for paragraph in doc.paragraphs:
106
+ text += paragraph.text + "\n"
107
+ # Also check tables in docx
108
+ for table in doc.tables:
109
+ for row in table.rows:
110
+ for cell in row.cells:
111
+ text += cell.text + "\n"
112
  else:
113
  return "Unsupported file format. Please upload PDF or DOCX files."
114
 
115
  # Parse the CV into sections
116
  sections = parse_cv_sections(text)
117
+
118
+ # Verify that we have content
119
+ if all(not content.strip() for content in sections.values()):
120
+ return f"Could not parse CV sections. Raw text:\n{text}"
121
+
122
  return sections
123
 
124
  except Exception as e: