Dhahlan2000 committed
Commit 7c0b46d · 1 Parent(s): 3ce6b08

Refactor app.py to move from Streamlit to Gradio for a more interactive user interface. Add CV text extraction from both PDF and DOCX uploads. Rework email generation to call the Hugging Face model directly, streaming responses via InferenceClient. Add 'gradio', 'huggingface_hub', and 'python-docx' to requirements.txt as new dependencies.

Files changed (2)
  1. app.py +123 -131
  2. requirements.txt +3 -0
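
The headline change is the move from Streamlit's rerun-per-interaction model to a Gradio ChatInterface whose handler is a generator, so partial output streams into the chat as it is produced. A minimal sketch of that pattern, using a hypothetical echo_stream handler that is not part of this commit:

import gradio as gr

def echo_stream(message, history):
    """Toy handler: Gradio re-renders the assistant reply on each yield,
    so the response appears to stream word by word."""
    response = ""
    for word in message.split():
        response += word + " "
        yield response

demo = gr.ChatInterface(fn=echo_stream)

if __name__ == "__main__":
    demo.launch()

The respond function in the diff below follows the same contract; Gradio passes each additional_inputs widget's value as an extra positional argument after message and history.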
app.py CHANGED
@@ -1,137 +1,129 @@
- import streamlit as st
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- import PyPDF2
- from dotenv import load_dotenv
  import os
-
- # Load environment variables from .env
- load_dotenv()
-
- # API Key
- access_token = os.getenv("API_KEY")
-
- # Streamlit App Title
- st.title("Job Description and CV-Based Email Generator")
- st.write("""
- This app uses Hugging Face's Gemma model to generate a professional email based on a pre-parsed CV and a job description.
- Upload your CV once in the sidebar, and the system will reuse the parsed details for generating emails.
- """)
-
- # Sidebar for Settings and CV Upload
- st.sidebar.title("Settings and CV Upload")
-
- # File Upload for CV in Sidebar
- uploaded_file = st.sidebar.file_uploader("Upload your CV (PDF format):", type=["pdf"])
-
- if "parsed_cv" not in st.session_state:
-     st.session_state.parsed_cv = None
-
- if "email_history" not in st.session_state:
-     st.session_state.email_history = []
-
- if uploaded_file is not None:
-     try:
-         # Extract text from PDF
-         pdf_reader = PyPDF2.PdfReader(uploaded_file)
-         cv_text = "".join([page.extract_text() for page in pdf_reader.pages])
-         st.sidebar.success("CV uploaded and text extracted successfully!")
-
-         # Parse CV details and save to session state
-         def parse_cv(cv_text):
-             return f"""
- Name: [Extracted Name]
- Contact Information: [Extracted Contact Info]
- Skills: [Extracted Skills]
- Experience: [Extracted Experience]
- Education: [Extracted Education]
- Summary: {cv_text[:500]}... # Truncated summary of the CV
- """
-
-         st.session_state.parsed_cv = parse_cv(cv_text)
-         st.sidebar.success("CV parsed successfully!")
-     except Exception as e:
-         st.sidebar.error(f"Failed to extract text from CV: {e}")
-
- if st.session_state.parsed_cv:
-     st.sidebar.write("### Parsed CV Details:")
-     st.sidebar.text(st.session_state.parsed_cv)
-
- # Ensure Access Token is Provided
- if access_token:
-     @st.cache_resource
-     def initialize_pipeline(access_token):
-         try:
-             tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=access_token)
-             model = AutoModelForCausalLM.from_pretrained(
-                 "google/gemma-2b-it",
-                 token=access_token,
-             )
-             return pipeline(
-                 "text-generation",
-                 model=model,
-                 tokenizer=tokenizer,
-                 max_new_tokens=2048,
-                 temperature=0.7,
-                 top_p=0.95
-             )
-         except Exception as e:
-             st.error(f"Failed to initialize the model: {str(e)}")
-             return None
-
-     text_gen_pipeline = initialize_pipeline(access_token)
-
-     # Input job description
-     job_description = st.text_area("Enter the job description:", "")
-
-     # Display generated email
-     if st.button("Generate Email"):
-         if st.session_state.parsed_cv and job_description.strip():
-             try:
-                 # Improved prompt template
-                 prompt = f"""Task: Write a professional job application email.
-
- CV Summary:
- {st.session_state.parsed_cv}
-
- Job Description:
- {job_description}

  Instructions: Write a concise and professional email expressing interest in the position.
  Highlight relevant experience and skills from the CV that match the job requirements.
  Keep the tone professional and enthusiastic.

- Email:
- """
-                 # Generate email using the pipeline
-                 if text_gen_pipeline:
-                     response = text_gen_pipeline(
-                         prompt,
-                         clean_up_tokenization_spaces=True,
-                         return_full_text=False
-                     )[0]['generated_text']
-
-                     # Save response in history
-                     st.session_state.email_history.append({
-                         "job_description": job_description,
-                         "email": response
-                     })
-
-                     # Display response
-                     st.subheader("Generated Email:")
-                     st.write(response)
-
-                     # Display conversation history
-                     if st.session_state.email_history:
-                         st.subheader("Previous Generations:")
-                         for idx, entry in enumerate(st.session_state.email_history, 1):
-                             st.write(f"### Email {idx}")
-                             st.write(f"**Job Description:** {entry['job_description']}")
-                             st.write(f"**Generated Email:** {entry['email']}")
-                 else:
-                     st.error("Text generation pipeline not properly initialized.")
-             except Exception as e:
-                 st.error(f"Error generating email: {str(e)}")
-         else:
-             st.warning("Please upload your CV in the sidebar and enter a job description.")
- else:
-     st.warning("Please enter your Hugging Face access token in the sidebar to use the app.")
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
  import os
+ from PyPDF2 import PdfReader
+ import docx
+
+ def extract_cv_text(file):
+     """Extract text from PDF or DOCX CV files."""
+     if file is None:
+         return "No CV uploaded"
+
+     file_ext = os.path.splitext(file.name)[1].lower()
+
+     if file_ext == '.pdf':
+         reader = PdfReader(file)
+         text = ""
+         for page in reader.pages:
+             text += page.extract_text()
+         return text
+
+     elif file_ext == '.docx':
+         doc = docx.Document(file)
+         text = ""
+         for paragraph in doc.paragraphs:
+             text += paragraph.text + "\n"
+         return text
+
+     else:
+         return "Unsupported file format. Please upload PDF or DOCX files."
+
+ # Read the Hugging Face access token from the 'token' environment variable
+ access_token = os.getenv('token')
+
+ # Initialize the tokenizer and model with the Hugging Face access token
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", use_auth_token=access_token)
+ model = AutoModelForCausalLM.from_pretrained(
+     "google/gemma-2b-it",
+     torch_dtype=torch.bfloat16,
+     use_auth_token=access_token
+ )
+ model.eval()  # Set the model to evaluation mode
+
+ # Initialize the inference client (if needed for other API-based tasks)
+ client = InferenceClient(token=access_token)
+
+ def conversation_predict(input_text):
+     """Generate a response for single-turn input using the model."""
+     # Tokenize the input text (f-string so the job description is interpolated)
+     input_ids = tokenizer(f"""Job Description:
+ {input_text}

  Instructions: Write a concise and professional email expressing interest in the position.
  Highlight relevant experience and skills from the CV that match the job requirements.
  Keep the tone professional and enthusiastic.

+ Email:""", return_tensors="pt").input_ids
+
+     # Generate a response with the model
+     outputs = model.generate(input_ids, max_new_tokens=2048)
+
+     # Decode and return the generated response
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ def respond(
+     message: str,
+     history: list[tuple[str, str]],
+     system_message: str,
+     cv_file,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ):
+     """Generate a response for a multi-turn chat conversation."""
+     # Extract CV text and update system message
+     cv_text = extract_cv_text(cv_file) if cv_file else "No CV provided"
+
+     updated_system_message = f"""Task: Write a professional job application email.
+
+ CV Summary:
+ {cv_text}
+
+ {system_message}"""
+
+     messages = [{"role": "system", "content": updated_system_message}]
+
+     for user_input, assistant_reply in history:
+         if user_input:
+             messages.append({"role": "user", "content": user_input})
+         if assistant_reply:
+             messages.append({"role": "assistant", "content": assistant_reply})
+
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+
+     for message_chunk in client.chat_completion(
+         messages=messages,
+         max_tokens=max_tokens,
+         stream=True,
+         temperature=temperature,
+         top_p=top_p,
+     ):
+         token = message_chunk["choices"][0]["delta"].get("content", "")
+         response += token
+         yield response
+
+ # Create a Gradio ChatInterface demo
+ demo = gr.ChatInterface(
+     fn=respond,
+     additional_inputs=[
+         gr.Textbox(value="Instructions: Write a concise and professional email expressing interest in the position.",
+                    label="System message"),
+         gr.File(label="Upload CV (PDF or DOCX)", file_types=[".pdf", ".docx"]),
+         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p (nucleus sampling)",
+         ),
+     ],
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
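
The InferenceClient path can also be exercised on its own, outside the UI. A sketch (not part of the commit), assuming the same 'token' environment variable; note that respond never passes model= to chat_completion, so the client falls back to its default model resolution:

import os

from huggingface_hub import InferenceClient

client = InferenceClient(token=os.getenv("token"))

messages = [
    {"role": "system", "content": "Write a concise, professional job application email."},
    {"role": "user", "content": "Job description: backend engineer, Python and Django."},
]

# Non-streaming call: the full completion comes back in one object.
result = client.chat_completion(messages=messages, max_tokens=256, temperature=0.7)
print(result.choices[0].message.content)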
requirements.txt CHANGED
@@ -5,4 +5,7 @@ langchain
  transformers
  torch
  PyPDF2
+ gradio
+ huggingface_hub
+ python-docx
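
A quick way to confirm the three new dependencies resolve after installation (a sketch; note that python-docx is imported as docx):

import gradio
import huggingface_hub
import docx  # provided by the python-docx package

print("gradio", gradio.__version__)
print("huggingface_hub", huggingface_hub.__version__)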