from flask import Flask, request, jsonify, render_template, send_from_directory
import pdfplumber
import io
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import logging
from collections import Counter
import os
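
# Assumed third-party dependency set for this app (versions are not pinned in the source):
#   pip install flask pdfplumber nltk spacy transformers torch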

# Cache downloaded Hugging Face models under ./models instead of the default user
# cache directory (newer versions of transformers prefer the HF_HOME variable)
os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.getcwd(), "models")

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

app = Flask(__name__, static_folder='static')

# Fetch NLTK tokenizer/stopword data; fall back to an empty stopword set if offline
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
except Exception as e:
    logger.warning(f"NLTK download error: {str(e)}")
    stop_words = set()

# Load spaCy's small English model; if it is missing, try downloading it once and retry
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    logger.warning(f"SpaCy model loading error: {str(e)}")

    def download_spacy_model():
        import subprocess
        import sys
        # Use the running interpreter so the model installs into the active environment
        subprocess.call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

    try:
        download_spacy_model()
        nlp = spacy.load("en_core_web_sm")
    except Exception:
        logger.error("Failed to load spaCy model")
        nlp = None

# Load a small GPT-2 variant for question generation; routes fall back to canned
# questions if either the model or the tokenizer is unavailable.
# Define the device outside the try block so it exists even if loading fails.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
    model_name = "distilgpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
except Exception as e:
    logger.error(f"Model loading error: {str(e)}")
    model = None
    tokenizer = None
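# Note: distilgpt2 is a small general-purpose language model, so its output can be
# noisy; generate_interview_questions() below keeps a hand-written fallback list.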


def extract_skills(text):
    """Extract technical skills from resume text"""
    common_skills = {
        'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php', 'swift', 'kotlin', 'go', 'rust', 'typescript', 'scala', 'perl', 'shell', 'bash', 'sql', 'html', 'css'],
        'frameworks': ['react', 'angular', 'vue', 'django', 'flask', 'spring', 'express', 'rails', 'asp.net', 'laravel', 'node.js', 'bootstrap', 'jquery', 'tensorflow', 'pytorch', 'numpy', 'pandas'],
        'databases': ['mysql', 'postgresql', 'mongodb', 'oracle', 'sql server', 'sqlite', 'redis', 'cassandra', 'dynamodb', 'firebase', 'elasticsearch'],
        'tools': ['git', 'docker', 'kubernetes', 'jenkins', 'aws', 'azure', 'gcp', 'terraform', 'ansible', 'jira', 'confluence', 'notion', 'figma', 'photoshop', 'illustrator'],
        'methodologies': ['agile', 'scrum', 'kanban', 'devops', 'ci/cd', 'test driven development', 'tdd', 'behavior driven development', 'bdd', 'rest', 'soap', 'microservices', 'serverless']
    }

    all_skills = [skill for category in common_skills.values() for skill in category]

    found_skills = []
    text_lower = text.lower()

    for skill in all_skills:
        # Lookarounds instead of \b so skills ending in non-word characters
        # (e.g. 'c++', 'c#') still match as whole tokens
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, text_lower):
            found_skills.append(skill)

    # Let spaCy NER pick up technology-sounding organizations and products
    if nlp:
        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ in ["ORG", "PRODUCT"] and len(ent.text) > 2:
                entity = ent.text.lower()
                if any(tech_word in entity for tech_word in ["tech", "software", "platform", "system", "framework", "api", "cloud"]):
                    found_skills.append(ent.text)

    # Keep the ten most frequently mentioned skills
    skill_counter = Counter(found_skills)
    top_skills = [skill for skill, _ in skill_counter.most_common(10)]

    return top_skills
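
# Hypothetical usage sketch: for a resume mentioning "Python", "Flask" and "Docker",
# extract_skills() should return something like ['python', 'flask', 'docker'].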


def extract_experience(text):
    """Extract work experience information from resume"""
    experience_data = []

    experience_headers = ["experience", "work experience", "employment history", "professional experience"]

    job_title_pattern = r"(?:^|\n)(?:Senior |Lead |Junior |Staff |Principal )?\b(?:Developer|Engineer|Designer|Manager|Director|Analyst|Consultant|Administrator|Architect|Specialist)\b"
    date_pattern = r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\s*(?:-|–|to)\s*(?:(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}|Present|Current|Now)"

    # Locate the start of the experience section
    text_lower = text.lower()
    section_start = None
    for header in experience_headers:
        if header in text_lower:
            section_start = text_lower.find(header)
            break

    if section_start is not None:
        # The section ends where the next common resume section begins
        next_section_start = len(text)
        for next_header in ["education", "skills", "projects", "certifications", "references"]:
            pos = text_lower.find(next_header, section_start + 1)
            if section_start < pos < next_section_start:
                next_section_start = pos

        experience_section = text[section_start:next_section_start]

        job_titles = re.findall(job_title_pattern, experience_section, re.IGNORECASE)
        date_ranges = re.findall(date_pattern, experience_section)

        # Pair up to three titles with their date ranges, position by position
        for i, title in enumerate(job_titles[:3]):
            date = date_ranges[i] if i < len(date_ranges) else "Unknown date"
            experience_data.append({"title": title.strip(), "date": date})

    return experience_data
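
# Hypothetical usage sketch: a section such as "Experience\nSenior Engineer\nJan 2020 - Present"
# should yield [{"title": "Senior Engineer", "date": "Jan 2020 - Present"}].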


def extract_education(text):
    """Extract education information from resume"""
    education_data = []

    degree_pattern = r"\b(?:Bachelor|Master|PhD|Doctorate|BSc|MSc|BA|MA|MBA|MD|JD|BS|MS|B\.S\.|M\.S\.|B\.A\.|M\.A\.)['\s\w]*\b"
    institution_pattern = r"\b(?:University|College|Institute|School) of [\w\s]+\b"

    degrees = re.findall(degree_pattern, text)
    institutions = re.findall(institution_pattern, text)

    # Pair up to two degrees with institutions, position by position
    for i, degree in enumerate(degrees[:2]):
        institution = institutions[i] if i < len(institutions) else "Unknown institution"
        education_data.append({"degree": degree.strip(), "institution": institution})

    return education_data


def preprocess_resume(text):
    """Extract structured information from resume text"""
    # Collapse runs of whitespace while preserving paragraph breaks; note that single
    # newlines are lost, so extract_experience() relies on blank lines between entries
    text = text.replace('\n\n', ' [BREAK] ')
    text = re.sub(r'\s+', ' ', text)
    text = text.replace(' [BREAK] ', '\n\n')

    skills = extract_skills(text)
    experience = extract_experience(text)
    education = extract_education(text)

    resume_data = {
        "skills": skills,
        "experience": experience,
        "education": education,
        "full_text": text
    }

    return resume_data


def generate_interview_questions(resume_data):
    """Generate interview questions based on resume data"""
    # Hand-written fallback: five technical questions, then three behavioral ones
    default_questions = [
        "What challenges have you faced when working with databases, and how did you overcome them?",
        "Describe a project where you had to optimize code for performance. What approach did you take?",
        "How do you ensure your code is maintainable and follows best practices?",
        "What software development methodologies are you familiar with, and which do you prefer?",
        "How do you approach testing your code?",
        "Tell me about a challenging project you worked on and how you approached it.",
        "Describe a situation where you had to learn a new technology quickly.",
        "How do you handle tight deadlines and pressure?"
    ]

    if model is None or tokenizer is None:
        return default_questions

    skills_str = ", ".join(resume_data["skills"])

    experience_str = ""
    for exp in resume_data["experience"]:
        experience_str += f"{exp['title']} ({exp['date']}), "

    prompt = f"""Generate 8 interview questions based on this resume information:
Skills: {skills_str}
Experience: {experience_str}

Include 5 technical questions specific to the candidate's skills and 3 behavioral questions.
Format each question on a new line and make them realistic interview questions.
"""

    try:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=1024,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_p=0.92,
            top_k=50,
            temperature=0.85,
            pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; reuse EOS
        )

        # Decode only the newly generated tokens, not the echoed prompt
        generated_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)

        text_split = generated_text.strip().split("\n")

        questions = []
        for line in text_split:
            # Strip leading list numbering such as "1." or "2)"
            line = re.sub(r'^\d+[\.\)]\s*', '', line.strip())

            # Keep lines that look like questions
            if line and ('?' in line or any(q_word in line.lower() for q_word in ["how", "what", "why", "when", "where", "describe", "tell", "explain"])):
                if not line.endswith('?') and any(q_word in line.lower() for q_word in ["how", "what", "why", "when", "where"]):
                    line += '?'
                questions.append(line)

        if len(questions) >= 5:
            return questions[:8]
        else:
            logger.warning("Generated questions insufficient, using defaults")
            return default_questions

    except Exception as e:
        logger.error(f"Question generation error: {str(e)}")
        return default_questions
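
# Hypothetical usage sketch:
#   generate_interview_questions({"skills": ["python"], "experience": [], "education": [], "full_text": ""})
# returns up to 8 model-generated questions, or the defaults when generation is too weak.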


@app.route('/')
def index():
    return render_template('index.html')


# Explicit static passthrough (Flask already serves static_folder='static' automatically)
@app.route('/static/<path:path>')
def serve_static(path):
    return send_from_directory('static', path)


@app.route('/upload-resume', methods=['POST'])
def upload_resume():
    try:
        if 'resume' not in request.files:
            return jsonify({"error": "No file part"}), 400

        file = request.files['resume']

        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400

        # Case-insensitive extension check so "RESUME.PDF" is accepted too
        if not file.filename.lower().endswith('.pdf'):
            return jsonify({"error": "Only PDF files are supported"}), 400

        # Extract text from every page of the uploaded PDF
        text = ""
        try:
            with pdfplumber.open(io.BytesIO(file.read())) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
        except Exception as e:
            logger.error(f"PDF extraction error: {str(e)}")
            return jsonify({"error": "Unable to extract text from PDF. Is the file corrupt?"}), 500

        if not text.strip():
            return jsonify({"error": "No text could be extracted from the PDF"}), 400

        resume_data = preprocess_resume(text)
        questions = generate_interview_questions(resume_data)

        # Add one question targeted at the candidate's most prominent skill
        if resume_data["skills"]:
            top_skill = resume_data["skills"][0]
            skill_question = f"Tell me about your experience with {top_skill} and how you've applied it in your projects."
            questions.append(skill_question)

        return jsonify(questions)

    except Exception as e:
        logger.error(f"General error: {str(e)}")
        return jsonify({"error": "An error occurred processing your request. Please try again."}), 500
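
# Example request against a local dev server (hypothetical file name):
#   curl -F "resume=@resume.pdf" http://localhost:5000/upload-resume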


if __name__ == '__main__':
    # Resolve directories relative to this file so the app works from any working directory
    base_dir = os.path.dirname(os.path.abspath(__file__))
    for directory in ['templates', 'static']:
        os.makedirs(os.path.join(base_dir, directory), exist_ok=True)

    # Write a minimal placeholder page if no template has been provided yet.
    # (The original fallback meta-refreshed to '/', which would redirect to itself forever.)
    templates_dir = os.path.join(base_dir, 'templates')
    if not os.path.exists(os.path.join(templates_dir, 'index.html')):
        with open(os.path.join(templates_dir, 'index.html'), 'w') as f:
            f.write('''<!DOCTYPE html>
<html>
<head>
    <title>Resume Question Generator</title>
</head>
<body>
    <p>Upload a resume via POST /upload-resume to generate interview questions.</p>
</body>
</html>''')

    app.run(debug=True)