david-thrower committed on
Commit
0426e30
·
1 Parent(s): 61568f9

Added enhanced error handling for web search step.

Browse files
Files changed (1) hide show
  1. app.py +61 -14
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
  import torch
6
  from duckduckgo_search import DDGS
 
7
 
8
  # Load the SmolLM model and tokenizer
9
  model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
@@ -13,6 +14,62 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  model.to(device)
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def get_gap_assessment_prompt(job_and_company_info, resume):
17
  gap_assessment_prompt = f"""
18
 
@@ -280,7 +337,7 @@ def process_job_description(company_name, company_url, job_description, resume):
280
  clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
281
 
282
  system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
283
- extracted_job_title = smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n","")
284
 
285
  role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
286
 
@@ -292,22 +349,12 @@ def process_job_description(company_name, company_url, job_description, resume):
292
  "recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
293
  "competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
294
  }
295
- search_client = DDGS(timeout=35)
296
  search_results = {}
297
  for key, query in searches.items():
298
  print(f"searching {query}")
299
- try:
300
- results = search_client.text(query,
301
- max_results=3)
302
- print(f"searching {query} successful")
303
- search_results[key] = results
304
- except Exception as exc:
305
- print(f"Rate limit error, will wait and retry searching {query}.")
306
- sleep(5)
307
- results = search_client.text(query,
308
- max_results=3)
309
- print(f"searching {query} successful")
310
- search_results[key] = results
311
  sleep(3)
312
 
313
 
 
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
  import torch
6
  from duckduckgo_search import DDGS
7
+ import re
8
 
9
  # Load the SmolLM model and tokenizer
10
  model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
 
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  model.to(device)
16
 
17
class DDGSSearchClient:
    """Wrapper around a ``DDGS`` client that retries failed text searches.

    A failed search is retried up to ``max_retries`` times, sleeping
    ``backoff_factor * 2 ** attempt`` seconds between attempts.
    """

    def __init__(self, max_retries=4, timeout=35, backoff_factor=1):
        """
        Initialize a DDGSSearchClient instance.

        Args:
        - max_retries (int): The maximum number of retries. Defaults to 4.
        - timeout (int): The timeout for the DDGS client. Defaults to 35.
        - backoff_factor (int): The backoff factor for exponential backoff. Defaults to 1.
        """
        self.max_retries = max_retries
        self.timeout = timeout
        self.backoff_factor = backoff_factor
        self.search_client = DDGS(timeout=timeout)

    def search(self, query, max_results=None):
        """
        Perform a DDGS search with retry mechanism.

        Args:
        - query (str): The search query.
        - max_results (int | None): Optional cap forwarded to the client's
          ``text`` call. When None (the default) the argument is not passed
          at all, so the client's own default applies.

        Returns:
        - search_results: The results of the DDGS search.

        Raises:
        - Exception: When every attempt fails. The last underlying error is
          attached as ``__cause__`` so its traceback is preserved.
        """
        # Only forward max_results when the caller asked for it, so the
        # default call stays identical to ``text(query)``.
        extra_kwargs = {} if max_results is None else {"max_results": max_results}

        for attempt in range(self.max_retries + 1):
            try:
                return self.search_client.text(query, **extra_kwargs)
            except Exception as e:
                if attempt >= self.max_retries:
                    # Out of retries: surface the failure, chained to its cause.
                    raise Exception(f"Search failed after {self.max_retries + 1} attempts. Giving up. Error from Search API: {e}.") from e
                # Exponential backoff
                delay = self.backoff_factor * (2 ** attempt)
                print(f"Search failed (attempt {attempt + 1}/{self.max_retries + 1}). Retrying query: {query} in {delay} seconds...")
                sleep(delay)
54
+
55
+
56
def clean_string(s):
    """Reduce a string to ASCII letters and spaces.

    Hyphens and em dashes are treated as word separators and become a single
    space each; every other character that is not A-Z, a-z or a space
    (digits, punctuation, tabs, newlines, ...) is removed.

    Args:
    - s (str): The text to clean.

    Returns:
    - str: The cleaned text. Surrounding or repeated spaces are not collapsed.
    """
    # Separators must be turned into spaces BEFORE filtering; otherwise the
    # regex below deletes them and fuses the neighbouring words together.
    s = s.replace('-', ' ').replace('—', ' ')

    # Remove anything other than A-Z, a-z, and " "
    return re.sub(r'[^A-Za-z ]+', '', s)
65
+
66
+
67
def replace_em_dashes(s):
    """Return ``s`` with every em dash swapped for a period and a space."""
    return s.replace('—', '. ')
71
+
72
+
73
  def get_gap_assessment_prompt(job_and_company_info, resume):
74
  gap_assessment_prompt = f"""
75
 
 
337
  clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
338
 
339
  system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
340
+ extracted_job_title = clean_string(smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n",""))
341
 
342
  role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
343
 
 
349
  "recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
350
  "competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
351
  }
352
+ search_client = DDGSSearchClient()
353
  search_results = {}
354
  for key, query in searches.items():
355
  print(f"searching {query}")
356
+ results = search_client.search(query)
357
+ search_results[key] = results
 
 
 
 
 
 
 
 
 
 
358
  sleep(3)
359
 
360