Spaces:
Sleeping
Sleeping
Commit
·
2d88e43
1
Parent(s):
7b2b80a
Refactor app.py to simplify CV text extraction and enhance email generation. Remove the parse_cv_sections function; extract_cv_text now returns the full CV text as a single string instead of a dictionary of structured sections. Update the related functions (create_email_prompt, conversation_predict, update_ui) and the Streamlit tabs to accept the raw CV text, improving the overall flow and user experience in the Streamlit interface. Note that this commit also drops the extraction of text from DOCX tables. This refactor streamlines the handling of CV uploads and job descriptions, giving a more efficient email generation workflow.
Browse files
app.py
CHANGED
@@ -8,84 +8,6 @@ import docx
|
|
8 |
import re
|
9 |
from typing import Dict
|
10 |
|
11 |
-
def parse_cv_sections(text: str) -> Dict[str, str]:
|
12 |
-
"""Parse CV text into structured sections."""
|
13 |
-
sections = {
|
14 |
-
'contact': '',
|
15 |
-
'education': '',
|
16 |
-
'experience': '',
|
17 |
-
'skills': '',
|
18 |
-
'projects': '',
|
19 |
-
}
|
20 |
-
|
21 |
-
# Common section headers in CVs with more variations
|
22 |
-
section_patterns = {
|
23 |
-
'contact': r'(?i)(contact|personal\s+information|profile|contact\s+details|about\s+me)',
|
24 |
-
'education': r'(?i)(education|academic|qualification|academic\s+background|educational\s+background)',
|
25 |
-
'experience': r'(?i)(experience|work|employment|professional|work\s+history|career|professional\s+experience)',
|
26 |
-
'skills': r'(?i)(skills|technical\s+skills|competencies|expertise|technologies|tools|programming|languages)',
|
27 |
-
'projects': r'(?i)(projects|personal\s+projects|portfolio|work\s+samples)',
|
28 |
-
}
|
29 |
-
|
30 |
-
# Split text into lines and clean
|
31 |
-
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
32 |
-
current_section = None
|
33 |
-
section_content = []
|
34 |
-
|
35 |
-
# First pass: identify sections
|
36 |
-
for i, line in enumerate(lines):
|
37 |
-
# Check if line is a section header
|
38 |
-
for section, pattern in section_patterns.items():
|
39 |
-
if re.search(pattern, line, re.IGNORECASE):
|
40 |
-
# If we found a section header
|
41 |
-
if current_section:
|
42 |
-
# Save previous section content
|
43 |
-
sections[current_section] = '\n'.join(section_content)
|
44 |
-
current_section = section
|
45 |
-
section_content = []
|
46 |
-
break
|
47 |
-
else:
|
48 |
-
# If line is not a header and we have a current section, add to content
|
49 |
-
if current_section:
|
50 |
-
section_content.append(line)
|
51 |
-
|
52 |
-
# Save the last section
|
53 |
-
if current_section and section_content:
|
54 |
-
sections[current_section] = '\n'.join(section_content)
|
55 |
-
|
56 |
-
# If no sections were found, try to categorize the content
|
57 |
-
if all(not content for content in sections.values()):
|
58 |
-
lines_text = '\n'.join(lines)
|
59 |
-
|
60 |
-
# Look for email addresses and phone numbers for contact
|
61 |
-
email_pattern = r'[\w\.-]+@[\w\.-]+'
|
62 |
-
phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]?){2}\d{4}'
|
63 |
-
|
64 |
-
emails = re.findall(email_pattern, lines_text)
|
65 |
-
phones = re.findall(phone_pattern, lines_text)
|
66 |
-
if emails or phones:
|
67 |
-
sections['contact'] = '\n'.join(emails + phones)
|
68 |
-
|
69 |
-
# Look for education keywords
|
70 |
-
edu_keywords = r'(?i)(university|college|school|degree|bachelor|master|phd|diploma)'
|
71 |
-
edu_lines = [l for l in lines if re.search(edu_keywords, l)]
|
72 |
-
if edu_lines:
|
73 |
-
sections['education'] = '\n'.join(edu_lines)
|
74 |
-
|
75 |
-
# Look for experience keywords
|
76 |
-
exp_keywords = r'(?i)(worked|developer|engineer|manager|consultant|analyst)'
|
77 |
-
exp_lines = [l for l in lines if re.search(exp_keywords, l)]
|
78 |
-
if exp_lines:
|
79 |
-
sections['experience'] = '\n'.join(exp_lines)
|
80 |
-
|
81 |
-
# Look for skills
|
82 |
-
skill_keywords = r'(?i)(python|java|javascript|react|node|sql|aws|docker|kubernetes|git)'
|
83 |
-
skill_lines = [l for l in lines if re.search(skill_keywords, l)]
|
84 |
-
if skill_lines:
|
85 |
-
sections['skills'] = '\n'.join(skill_lines)
|
86 |
-
|
87 |
-
return sections
|
88 |
-
|
89 |
def extract_cv_text(file):
|
90 |
"""Extract text from PDF or DOCX CV files."""
|
91 |
if file is None:
|
@@ -98,28 +20,16 @@ def extract_cv_text(file):
|
|
98 |
if file_ext == '.pdf':
|
99 |
reader = PdfReader(file)
|
100 |
for page in reader.pages:
|
101 |
-
text += page.extract_text()
|
102 |
|
103 |
elif file_ext == '.docx':
|
104 |
doc = docx.Document(file)
|
105 |
for paragraph in doc.paragraphs:
|
106 |
-
text += paragraph.text +
|
107 |
-
# Also check tables in docx
|
108 |
-
for table in doc.tables:
|
109 |
-
for row in table.rows:
|
110 |
-
for cell in row.cells:
|
111 |
-
text += cell.text + "\n"
|
112 |
else:
|
113 |
return "Unsupported file format. Please upload PDF or DOCX files."
|
114 |
|
115 |
-
#
|
116 |
-
sections = parse_cv_sections(text)
|
117 |
-
|
118 |
-
# Verify that we have content
|
119 |
-
if all(not content.strip() for content in sections.values()):
|
120 |
-
return f"Could not parse CV sections. Raw text:\n{text}"
|
121 |
-
|
122 |
-
return sections
|
123 |
|
124 |
except Exception as e:
|
125 |
return f"Error processing file: {str(e)}"
|
@@ -130,20 +40,13 @@ access_token = os.getenv('API_KEY')
|
|
130 |
# Initialize the inference client (if needed for other API-based tasks)
|
131 |
client = InferenceClient(token=access_token)
|
132 |
|
133 |
-
def create_email_prompt(job_description: str,
|
134 |
"""Create a detailed prompt for email generation."""
|
135 |
return f"""Job Description:
|
136 |
{job_description}
|
137 |
|
138 |
Your CV Details:
|
139 |
-
|
140 |
-
{cv_sections['experience']}
|
141 |
-
|
142 |
-
Skills:
|
143 |
-
{cv_sections['skills']}
|
144 |
-
|
145 |
-
Education:
|
146 |
-
{cv_sections['education']}
|
147 |
|
148 |
Instructions: Write a professional job application email following these guidelines:
|
149 |
1. Start with a proper greeting
|
@@ -157,9 +60,9 @@ Keep the tone professional, confident, and enthusiastic. Be concise but impactfu
|
|
157 |
|
158 |
Email:"""
|
159 |
|
160 |
-
def conversation_predict(input_text: str,
|
161 |
"""Generate a response using the model with streaming output."""
|
162 |
-
prompt = create_email_prompt(input_text,
|
163 |
|
164 |
# Use the streaming API
|
165 |
try:
|
@@ -223,18 +126,18 @@ CV Summary:
|
|
223 |
# Streamlit UI section
|
224 |
st.title("AI Job Application Email Generator")
|
225 |
|
226 |
-
def update_ui(message, cv_file,
|
227 |
"""Handle the UI updates for email generation."""
|
228 |
# Create placeholder for the generated email
|
229 |
email_placeholder = st.empty()
|
230 |
|
231 |
# Generate button
|
232 |
if st.button("Generate Email", key="generate_button"):
|
233 |
-
if message and cv_file and isinstance(
|
234 |
email_text = ""
|
235 |
# Stream the response
|
236 |
try:
|
237 |
-
for chunk in conversation_predict(message,
|
238 |
if chunk:
|
239 |
email_text += chunk
|
240 |
# Update the text area with each chunk, using timestamp in key
|
@@ -256,27 +159,25 @@ with tab1:
|
|
256 |
cv_file = st.file_uploader("Upload CV (PDF or DOCX)", type=["pdf", "docx"])
|
257 |
|
258 |
if cv_file:
|
259 |
-
|
260 |
-
if isinstance(
|
261 |
-
st.success("CV uploaded
|
262 |
else:
|
263 |
-
st.error(
|
264 |
-
|
265 |
else:
|
266 |
-
|
267 |
|
268 |
# Job description input
|
269 |
st.markdown("### Job Description")
|
270 |
message = st.text_area("Paste the job description here:", height=200)
|
271 |
|
272 |
# Call the updated UI function with parameters
|
273 |
-
update_ui(message, cv_file,
|
274 |
|
275 |
with tab2:
|
276 |
-
if cv_file and isinstance(
|
277 |
-
st.markdown("###
|
278 |
-
|
279 |
-
with st.expander(f"{section.title()}"):
|
280 |
-
st.text(content)
|
281 |
else:
|
282 |
-
st.info("Upload a CV to view
|
|
|
8 |
import re
|
9 |
from typing import Dict
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def extract_cv_text(file):
|
12 |
"""Extract text from PDF or DOCX CV files."""
|
13 |
if file is None:
|
|
|
20 |
if file_ext == '.pdf':
|
21 |
reader = PdfReader(file)
|
22 |
for page in reader.pages:
|
23 |
+
text += page.extract_text()
|
24 |
|
25 |
elif file_ext == '.docx':
|
26 |
doc = docx.Document(file)
|
27 |
for paragraph in doc.paragraphs:
|
28 |
+
text += paragraph.text + '\n'
|
|
|
|
|
|
|
|
|
|
|
29 |
else:
|
30 |
return "Unsupported file format. Please upload PDF or DOCX files."
|
31 |
|
32 |
+
return text # Return the full text instead of parsed sections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
except Exception as e:
|
35 |
return f"Error processing file: {str(e)}"
|
|
|
40 |
# Initialize the inference client (if needed for other API-based tasks)
|
41 |
client = InferenceClient(token=access_token)
|
42 |
|
43 |
+
def create_email_prompt(job_description: str, cv_text: str) -> str:
|
44 |
"""Create a detailed prompt for email generation."""
|
45 |
return f"""Job Description:
|
46 |
{job_description}
|
47 |
|
48 |
Your CV Details:
|
49 |
+
{cv_text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
Instructions: Write a professional job application email following these guidelines:
|
52 |
1. Start with a proper greeting
|
|
|
60 |
|
61 |
Email:"""
|
62 |
|
63 |
+
def conversation_predict(input_text: str, cv_text: str):
|
64 |
"""Generate a response using the model with streaming output."""
|
65 |
+
prompt = create_email_prompt(input_text, cv_text)
|
66 |
|
67 |
# Use the streaming API
|
68 |
try:
|
|
|
126 |
# Streamlit UI section
|
127 |
st.title("AI Job Application Email Generator")
|
128 |
|
129 |
+
def update_ui(message, cv_file, cv_text):
|
130 |
"""Handle the UI updates for email generation."""
|
131 |
# Create placeholder for the generated email
|
132 |
email_placeholder = st.empty()
|
133 |
|
134 |
# Generate button
|
135 |
if st.button("Generate Email", key="generate_button"):
|
136 |
+
if message and cv_file and isinstance(cv_text, str) and not cv_text.startswith("Error"):
|
137 |
email_text = ""
|
138 |
# Stream the response
|
139 |
try:
|
140 |
+
for chunk in conversation_predict(message, cv_text):
|
141 |
if chunk:
|
142 |
email_text += chunk
|
143 |
# Update the text area with each chunk, using timestamp in key
|
|
|
159 |
cv_file = st.file_uploader("Upload CV (PDF or DOCX)", type=["pdf", "docx"])
|
160 |
|
161 |
if cv_file:
|
162 |
+
cv_text = extract_cv_text(cv_file)
|
163 |
+
if isinstance(cv_text, str) and not cv_text.startswith("Error"):
|
164 |
+
st.success("CV uploaded successfully!")
|
165 |
else:
|
166 |
+
st.error(cv_text)
|
167 |
+
cv_text = None
|
168 |
else:
|
169 |
+
cv_text = None
|
170 |
|
171 |
# Job description input
|
172 |
st.markdown("### Job Description")
|
173 |
message = st.text_area("Paste the job description here:", height=200)
|
174 |
|
175 |
# Call the updated UI function with parameters
|
176 |
+
update_ui(message, cv_file, cv_text)
|
177 |
|
178 |
with tab2:
|
179 |
+
if cv_file and isinstance(cv_text, str) and not cv_text.startswith("Error"):
|
180 |
+
st.markdown("### CV Content")
|
181 |
+
st.text_area("Full CV Text", value=cv_text, height=400)
|
|
|
|
|
182 |
else:
|
183 |
+
st.info("Upload a CV to view content")
|