Commit
·
0426e30
1
Parent(s):
61568f9
Added enhanced error handling for web search step.
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
|
|
4 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
5 |
import torch
|
6 |
from duckduckgo_search import DDGS
|
|
|
7 |
|
8 |
# Load the SmolLM model and tokenizer
|
9 |
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
|
@@ -13,6 +14,62 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
13 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
14 |
model.to(device)
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def get_gap_assessment_prompt(job_and_company_info, resume):
|
17 |
gap_assessment_prompt = f"""
|
18 |
|
@@ -280,7 +337,7 @@ def process_job_description(company_name, company_url, job_description, resume):
|
|
280 |
clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
|
281 |
|
282 |
system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
|
283 |
-
extracted_job_title = smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n","")
|
284 |
|
285 |
role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
|
286 |
|
@@ -292,22 +349,12 @@ def process_job_description(company_name, company_url, job_description, resume):
|
|
292 |
"recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
|
293 |
"competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
|
294 |
}
|
295 |
-
search_client =
|
296 |
search_results = {}
|
297 |
for key, query in searches.items():
|
298 |
print(f"searching {query}")
|
299 |
-
|
300 |
-
|
301 |
-
max_results=3)
|
302 |
-
print(f"searching {query} successful")
|
303 |
-
search_results[key] = results
|
304 |
-
except Exception as exc:
|
305 |
-
print(f"Rate limit error, will wait and retry searching {query}.")
|
306 |
-
sleep(5)
|
307 |
-
results = search_client.text(query,
|
308 |
-
max_results=3)
|
309 |
-
print(f"searching {query} successful")
|
310 |
-
search_results[key] = results
|
311 |
sleep(3)
|
312 |
|
313 |
|
|
|
4 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
5 |
import torch
|
6 |
from duckduckgo_search import DDGS
|
7 |
+
import re
|
8 |
|
9 |
# Load the SmolLM model and tokenizer
|
10 |
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
|
|
|
14 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
15 |
model.to(device)
|
16 |
|
17 |
+
class DDGSSearchClient:
    """Wrapper around a ``DDGS`` client that retries failed text searches with exponential backoff."""

    def __init__(self, max_retries=4, timeout=35, backoff_factor=1):
        """
        Initialize a DDGSSearchClient instance.

        Args:
        - max_retries (int): The maximum number of retries after the first attempt. Defaults to 4.
        - timeout (int): The timeout passed to the DDGS client. Defaults to 35.
        - backoff_factor (int): Multiplier applied to the exponential backoff delay. Defaults to 1.
        """
        self.max_retries = max_retries
        self.timeout = timeout
        self.backoff_factor = backoff_factor
        self.search_client = DDGS(timeout=timeout)

    def search(self, query, max_results=None):
        """
        Perform a DDGS text search, retrying on failure.

        Args:
        - query (str): The search query.
        - max_results (int | None): Optional cap forwarded to ``DDGS.text``.
          When None (the default) the argument is not passed at all, so the
          underlying client's own default applies.

        Returns:
        - search_results: Whatever ``DDGS.text`` returns for the query.

        Raises:
        - Exception: When every attempt fails. The last underlying error is
          attached as ``__cause__``.
        """
        # Always make at least one attempt, even if max_retries is 0 or negative;
        # otherwise the loop would not run and the method would return None.
        total_attempts = max(self.max_retries, 0) + 1
        extra_kwargs = {} if max_results is None else {"max_results": max_results}

        for attempt in range(total_attempts):
            try:
                return self.search_client.text(query, **extra_kwargs)
            except Exception as e:
                # Broad catch is deliberate: any failure from the search backend is retried.
                if attempt + 1 >= total_attempts:
                    raise Exception(
                        f"Search failed after {total_attempts} attempts. Giving up. Error from Search API: {e}."
                    ) from e
                # Exponential backoff: backoff_factor * 1, 2, 4, ...
                delay = self.backoff_factor * (2 ** attempt)
                print(f"Search failed (attempt {attempt + 1}/{total_attempts}). Retrying query: {query} in {delay} seconds...")
                # NOTE(review): assumes `sleep` is time.sleep imported elsewhere in the file - confirm.
                sleep(delay)
|
54 |
+
|
55 |
+
|
56 |
+
def clean_string(s):
    """
    Reduce a string to ASCII letters and spaces.

    Hyphens and em dashes are treated as word separators and become spaces;
    every other character that is not A-Z, a-z, or a space (digits,
    punctuation, newlines, tabs, ...) is removed.

    Args:
    - s (str): The string to clean.

    Returns:
    - str: The cleaned string. Runs of spaces are not collapsed.
    """
    # Convert dash separators to spaces BEFORE filtering. The filter below
    # would otherwise delete an em dash outright and glue the words together.
    s = s.replace('-', ' ').replace('—', ' ')

    # Remove anything other than A-Z, a-z, and " "
    return re.sub(r'[^A-Za-z ]+', '', s)
|
65 |
+
|
66 |
+
|
67 |
+
def replace_em_dashes(s):
    """Return a copy of ``s`` in which each em dash is replaced by a period and a space."""
    return s.replace('—', '. ')
|
71 |
+
|
72 |
+
|
73 |
def get_gap_assessment_prompt(job_and_company_info, resume):
|
74 |
gap_assessment_prompt = f"""
|
75 |
|
|
|
337 |
clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
|
338 |
|
339 |
system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
|
340 |
+
extracted_job_title = clean_string(smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n",""))
|
341 |
|
342 |
role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
|
343 |
|
|
|
349 |
"recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
|
350 |
"competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
|
351 |
}
|
352 |
+
search_client = DDGSSearchClient()
|
353 |
search_results = {}
|
354 |
for key, query in searches.items():
|
355 |
print(f"searching {query}")
|
356 |
+
results = search_client.search(query)
|
357 |
+
search_results[key] = results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
sleep(3)
|
359 |
|
360 |
|