Spaces:

david-thrower
/

job-application-optimizer

Sleeping

App Files Files Community

david-thrower committed on Jun 27

Commit

0426e30

1 Parent(s): 61568f9

Added enhanced error handling for web search step.

Browse files

Files changed (1) hide show

app.py +61 -14

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from duckduckgo_search import DDGS
 # Load the SmolLM model and tokenizer
 model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
@@ -13,6 +14,62 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 def get_gap_assessment_prompt(job_and_company_info, resume):
     gap_assessment_prompt = f"""
@@ -280,7 +337,7 @@ def process_job_description(company_name, company_url, job_description, resume):
     clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
     system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
-    extracted_job_title = smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n","")
     role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
@@ -292,22 +349,12 @@ def process_job_description(company_name, company_url, job_description, resume):
         "recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
         "competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
     }
-    search_client = DDGS(timeout=35)
     search_results = {}
     for key, query in searches.items():
         print(f"searching {query}")
-        try:
-            results = search_client.text(query,
-                                         max_results=3)
-            print(f"searching {query} successful")
-            search_results[key] = results
-        except Exception as exc:
-            print(f"Rate limit error, will wait and retry searching {query}.")
-            sleep(5)
-            results = search_client.text(query,
-                                         max_results=3)
-            print(f"searching {query} successful")
-            search_results[key] = results
         sleep(3)

 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from duckduckgo_search import DDGS
+import re
 # Load the SmolLM model and tokenizer
 model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
class DDGSSearchClient:
    """
    Wrapper around a DDGS client that retries failed text searches.
    After a failed attempt the client waits backoff_factor * 2 ** attempt
    seconds (attempt counted from 0) before trying the same query again.
    """

    def __init__(self, max_retries=4, timeout=35, backoff_factor=1):
        """
        Initialize a DDGSSearchClient instance.
        Args:
        - max_retries (int): The maximum number of retries. Defaults to 4.
        - timeout (int): The timeout for the DDGS client. Defaults to 35.
        - backoff_factor (int): The backoff factor for exponential backoff. Defaults to 1.
        """
        self.max_retries = max_retries
        self.timeout = timeout
        self.backoff_factor = backoff_factor
        self.search_client = DDGS(timeout=timeout)

    def search(self, query, max_results=None):
        """
        Perform a DDGS search with retry mechanism.
        Args:
        - query (str): The search query.
        - max_results (int | None): Optional cap on the number of results. It is
          forwarded to the DDGS client only when given, so calling search(query)
          issues exactly the same request as before this parameter existed.
        Returns:
        - search_results: The results of the DDGS search.
        Raises:
        - Exception: When every attempt failed. The last error raised by the
          search client is attached as __cause__.
        """
        # Forward the cap only on request; the client's own default stays in
        # charge otherwise.
        extra_args = {} if max_results is None else {"max_results": max_results}
        # Always make at least one attempt, even if max_retries is negative;
        # otherwise the loop body would never run and None would be returned
        # without any search having been performed.
        total_attempts = max(self.max_retries, 0) + 1
        for attempt in range(total_attempts):
            try:
                return self.search_client.text(query, **extra_args)
            except Exception as e:
                # Deliberately broad: this is the retry boundary, and the final
                # failure is re-raised below rather than swallowed.
                if attempt >= total_attempts - 1:
                    raise Exception(
                        f"Search failed after {total_attempts} attempts. Giving up. Error from Search API: {e}."
                    ) from e
                # Exponential backoff
                delay = self.backoff_factor * (2 ** attempt)
                print(f"Search failed (attempt {attempt + 1}/{total_attempts}). Retrying query: {query} in {delay} seconds...")
                sleep(delay)
def clean_string(s):
    """
    Reduce a string to ASCII letters and spaces.
    Hyphens, en dashes and em dashes are treated as word separators and are
    each replaced by a space. Every remaining character that is not A-Z, a-z
    or " " (digits and punctuation included) is removed.
    Args:
    - s (str): The text to clean.
    Returns:
    - str: The cleaned text.
    """
    # Replace every dash variant with a space BEFORE stripping, so that
    # "Engineer—Backend" becomes "Engineer Backend" instead of the two words
    # being fused together when the dash is deleted.
    s = re.sub(r"[-\u2013\u2014]", " ", s)
    # Remove anything other than A-Z, a-z, and " "
    s = re.sub(r"[^A-Za-z ]+", "", s)
    return s
def replace_em_dashes(s):
    """
    Swap every em dash in the text for a period followed by a space.
    Args:
    - s (str): The text to process.
    Returns:
    - str: The text with each em dash replaced by ". ".
    """
    # Splitting on the em dash and re-joining with ". " substitutes every
    # occurrence, including leading, trailing and consecutive ones.
    pieces = s.split('—')
    return '. '.join(pieces)
 def get_gap_assessment_prompt(job_and_company_info, resume):
     gap_assessment_prompt = f"""
     clean_job_description = smol_lm_jd_process(job_description, system_prompt_summary)
     system_prompt_get_job_title = "Extract only the job title from the following job description. Respond with nothing but the job title—no labels, no comments, no summaries, no locations, or extra text. If the title is unusually long or nonstandard, replace it with the most common, concise, and widely recognized job title for the role in plain text with only letters and numbers, remove any special characters nor punctuation no '\n' and no tabs, because any of these cause problems with downstream automated steps. Your answer must be 7 words or fewer. Acceptable examples may look like: 'Systems Analyst', 'marketing director', 'patient advocate III', ...\n\nThis is an excerpt of the job description:\n"
+    extracted_job_title = clean_string(smol_lm_jd_process(job_description[:350], system_prompt_get_job_title, max_new_tokens=150)[:50].lower().replace("job","").replace("title","").replace("\n",""))
     role = Role(company_name, company_url, job_description, clean_job_description, extracted_job_title)
         "recent_news": f"{role.company_name} recent news relevant to the role {role.job_title} role",
         "competitive_advantages": f"{role.company_name} competitive advantages relevant to the role {role.job_title}"
     }
+    search_client = DDGSSearchClient()
     search_results = {}
     for key, query in searches.items():
         print(f"searching {query}")
+        results = search_client.search(query)
+        search_results[key] = results
         sleep(3)