Spaces:

Dhahlan2000
/

AppyJob

Sleeping

AppyJob / app.py

Dhahlan2000

Reduce max_new_tokens in model generation from 2048 to 512 in app.py to optimize response length and improve performance. This change aims to enhance the efficiency of the conversation prediction function.

02386cb 8 months ago

raw

history blame

7.5 kB

	import streamlit as st
	from huggingface_hub import InferenceClient
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import torch
	import os
	from PyPDF2 import PdfReader
	import docx
	import re
	from typing import Dict

	def parse_cv_sections(text: str) -> Dict[str, str]:
	    """Parse CV text into structured sections.

	    The text is scanned line by line. A line that matches one of the known
	    header patterns (case-insensitively, anywhere in the line) switches the
	    current section; the header line itself is kept as part of that section.
	    Lines that appear before any header is recognised are stored under
	    ``'other'``. Blank lines are skipped and every kept line is stripped.

	    Args:
	        text: Raw text extracted from a CV.

	    Returns:
	        A dict with the keys ``contact``, ``education``, ``experience``,
	        ``skills``, ``projects`` and ``other``. Each value is the section's
	        lines joined together, one ``'\\n'`` after every line, or ``''``
	        when nothing was assigned to that section.
	    """
	    # Common section headers in CVs. Plain ``|`` is regex alternation; an
	    # escaped ``\|`` would only ever match a literal pipe character.
	    section_patterns = {
	        'contact': r'contact|personal\s+information|profile',
	        'education': r'education|academic|qualification',
	        'experience': r'experience|work|employment|professional',
	        'skills': r'skills|technical skills|competencies',
	        'projects': r'projects|personal projects',
	    }
	    # Compile once, outside the per-line loop.
	    compiled_patterns = {
	        name: re.compile(pattern, re.IGNORECASE)
	        for name, pattern in section_patterns.items()
	    }

	    section_names = ('contact', 'education', 'experience', 'skills', 'projects', 'other')
	    collected = {name: [] for name in section_names}

	    # Content that precedes the first recognised header belongs to 'other'.
	    current_section = 'other'

	    for raw_line in (text or '').split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        # The first matching pattern wins, in declaration order.
	        for section, pattern in compiled_patterns.items():
	            if pattern.search(line):
	                current_section = section
	                break

	        collected[current_section].append(line)

	    return {
	        name: ''.join(entry + '\n' for entry in entries)
	        for name, entries in collected.items()
	    }

	def extract_cv_text(file):
	    """Extract text from a PDF or DOCX CV file and parse it into sections.

	    Args:
	        file: An uploaded file object exposing a ``name`` attribute and
	            readable by ``PdfReader`` / ``docx.Document``, or ``None``.

	    Returns:
	        On success, the dict produced by ``parse_cv_sections``. Otherwise a
	        human-readable message string: ``"No CV uploaded"`` when ``file`` is
	        ``None``, an unsupported-format message for other extensions, or
	        ``"Error processing file: ..."`` when reading or parsing fails.
	        Callers distinguish the two cases with ``isinstance(result, dict)``.
	    """
	    if file is None:
	        return "No CV uploaded"

	    file_ext = os.path.splitext(file.name)[1].lower()

	    try:
	        if file_ext == '.pdf':
	            reader = PdfReader(file)
	            # NOTE(review): extract_text() may yield None or '' for pages
	            # without a text layer in some PyPDF2 versions -- treat that as
	            # empty. The trailing newline keeps the last line of one page
	            # from fusing with the first line of the next.
	            text = ''.join(
	                (page.extract_text() or '') + '\n' for page in reader.pages
	            )
	        elif file_ext == '.docx':
	            doc = docx.Document(file)
	            text = ''.join(paragraph.text + '\n' for paragraph in doc.paragraphs)
	        else:
	            return "Unsupported file format. Please upload PDF or DOCX files."

	        # Parse the CV into sections
	        return parse_cv_sections(text)

	    except Exception as e:
	        # Deliberate best-effort boundary: the UI shows this message instead
	        # of crashing on a malformed upload.
	        return f"Error processing file: {str(e)}"

	# The Hugging Face access token is read from the API_KEY environment variable
	access_token = os.getenv('API_KEY')


	@st.cache_resource
	def _load_tokenizer_and_model(token):
	    """Load the Gemma tokenizer and model once and reuse them across reruns.

	    Streamlit re-executes the whole script on every widget interaction;
	    without caching, the model weights would be reloaded each time.

	    Args:
	        token: Hugging Face access token (may be ``None``).

	    Returns:
	        A ``(tokenizer, model)`` tuple with the model in evaluation mode.
	    """
	    # NOTE(review): recent transformers releases prefer `token=` over
	    # `use_auth_token=`; kept as-is to match the installed version -- confirm.
	    loaded_tokenizer = AutoTokenizer.from_pretrained(
	        "google/gemma-2b-it", use_auth_token=token
	    )
	    loaded_model = AutoModelForCausalLM.from_pretrained(
	        "google/gemma-2b-it",
	        torch_dtype=torch.bfloat16,
	        use_auth_token=token,
	    )
	    loaded_model.eval()  # Set the model to evaluation mode
	    return loaded_tokenizer, loaded_model


	# Initialize the tokenizer and model with the Hugging Face access token
	tokenizer, model = _load_tokenizer_and_model(access_token)

	# Initialize the inference client (if needed for other API-based tasks)
	client = InferenceClient(token=access_token)

	def create_email_prompt(job_description: str, cv_sections: Dict[str, str]) -> str:
	    """Build the instruction prompt used to generate the application email.

	    Args:
	        job_description: The job posting text supplied by the user.
	        cv_sections: Parsed CV sections; the keys ``experience``, ``skills``,
	            ``education``, ``other`` and ``contact`` are required.

	    Returns:
	        The full prompt: the job description, the CV sections and the
	        writing guidelines.

	    Raises:
	        KeyError: If one of the required section keys is missing.
	    """
	    # Look the sections up in the same order they appear in the prompt.
	    experience = cv_sections['experience']
	    skills = cv_sections['skills']
	    education = cv_sections['education']
	    additional_info = cv_sections['other']
	    contact = cv_sections['contact']

	    prompt = f"""Based on the following information, generate only a professional job application email.

	Job Description:
	{job_description}

	CV Details:
	Experience:
	{experience}

	Skills:
	{skills}

	Education:
	{education}

	Additional Information:
	{additional_info}

	Contact Information:
	{contact}

	Guidelines:
	1. Start with a proper greeting
	2. First paragraph: Express interest in the position and mention how you found it
	3. Second paragraph: Highlight 2-3 most relevant experiences that match the job requirements
	4. Third paragraph: Mention specific skills that align with the role
	5. Closing paragraph: Express enthusiasm for an interview and provide contact information
	6. End with a professional closing

	Generate only the email, without any additional text or explanations."""
	    return prompt

	def conversation_predict(input_text: str, cv_sections: Dict[str, str], max_new_tokens: int = 512):
	    """Generate an application email with the locally loaded model.

	    Args:
	        input_text: The job description to answer.
	        cv_sections: Parsed CV sections, passed to ``create_email_prompt``.
	        max_new_tokens: Upper bound on the number of generated tokens.

	    Returns:
	        The decoded text of the newly generated tokens only; the prompt is
	        not echoed back.
	    """
	    prompt = create_email_prompt(input_text, cv_sections)

	    # Keep the full encoding so the attention mask is passed to generate()
	    # together with the input ids.
	    inputs = tokenizer(prompt, return_tensors="pt")
	    prompt_length = inputs["input_ids"].shape[-1]

	    # Generate a response with the model
	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=max_new_tokens,
	            temperature=0.7,
	            top_p=0.95,
	            do_sample=True,
	        )

	    # For a causal LM the output sequence begins with the prompt tokens;
	    # drop them so only the generated email is decoded and returned.
	    generated_ids = outputs[0][prompt_length:]
	    return tokenizer.decode(generated_ids, skip_special_tokens=True)

	def respond(
	    message: str,
	    history: list[tuple[str, str]],
	    system_message: str,
	    cv_file,
	    max_tokens: int,
	    temperature: float,
	    top_p: float,
	):
	    """Generate a streamed response for a multi-turn chat conversation.

	    Args:
	        message: The latest user message.
	        history: Earlier ``(user_input, assistant_reply)`` pairs; empty
	            entries are skipped.
	        system_message: Extra instructions appended to the system prompt.
	        cv_file: Uploaded CV file, or a falsy value when none was provided.
	        max_tokens: Passed to ``client.chat_completion`` as ``max_tokens``.
	        temperature: Sampling temperature passed to the client.
	        top_p: Nucleus-sampling value passed to the client.

	    Yields:
	        The response accumulated so far, once per streamed chunk.
	    """
	    # Extract CV text and update system message
	    cv_text = extract_cv_text(cv_file) if cv_file else "No CV provided"

	    # A successful extraction returns a dict of sections; render it as
	    # readable text instead of interpolating the dict's repr.
	    if isinstance(cv_text, dict):
	        cv_text = "\n\n".join(
	            f"{name.title()}:\n{content.strip()}"
	            for name, content in cv_text.items()
	            if content.strip()
	        )

	    updated_system_message = f"""Task: Write a professional job application email.

	CV Summary:
	{cv_text}

	{system_message}"""

	    messages = [{"role": "system", "content": updated_system_message}]

	    for user_input, assistant_reply in history:
	        if user_input:
	            messages.append({"role": "user", "content": user_input})
	        if assistant_reply:
	            messages.append({"role": "assistant", "content": assistant_reply})

	    messages.append({"role": "user", "content": message})

	    response = ""

	    for message_chunk in client.chat_completion(
	        messages=messages,
	        max_tokens=max_tokens,
	        stream=True,
	        temperature=temperature,
	        top_p=top_p,
	    ):
	        # `content` can be present but None; `or ""` avoids concatenating
	        # None onto the response string.
	        token = message_chunk["choices"][0]["delta"].get("content") or ""
	        response += token
	        yield response

	# Streamlit UI section
	st.title("AI Job Application Email Generator")

	# Add tabs for different sections
	tab1, tab2 = st.tabs(["Generate Email", "View CV Details"])

	# Stays None until a CV has been uploaded; becomes a dict on successful
	# parsing or an error-message string on failure. Binding it up front means
	# the checks below never touch an undefined name.
	cv_sections = None

	with tab1:
	    # CV file upload
	    cv_file = st.file_uploader("Upload CV (PDF or DOCX)", type=["pdf", "docx"])

	    if cv_file:
	        cv_sections = extract_cv_text(cv_file)
	        if isinstance(cv_sections, dict):
	            st.success("CV uploaded and parsed successfully!")
	        else:
	            st.error(cv_sections)  # Show error message if parsing failed

	    # Job description input
	    st.markdown("### Job Description")
	    message = st.text_area("Paste the job description here:", height=200)

	    # Generate button
	    if st.button("Generate Email"):
	        if message and isinstance(cv_sections, dict):
	            with st.spinner("Generating email..."):
	                response = conversation_predict(message, cv_sections)
	            # Strip the echoed prompt only when the response really starts
	            # with it. Splitting on "Email:" would cut off any email whose
	            # body contains an "Email: ..." contact line.
	            prompt = create_email_prompt(message, cv_sections)
	            email_text = response.removeprefix(prompt).strip()
	            st.text_area("Generated Email", email_text, height=400)
	        else:
	            st.warning("Please upload a CV and enter a job description.")

	with tab2:
	    if isinstance(cv_sections, dict):
	        st.markdown("### Parsed CV Details")
	        for section, content in cv_sections.items():
	            with st.expander(f"{section.title()}"):
	                st.text(content)
	    else:
	        st.info("Upload a CV to view parsed details")