File size: 3,173 Bytes
adbf293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# imports
import json
import time

import gradio as gr
from transformers import AutoTokenizer, AutoModel
import openai
# pytorch library
import torch
import torch.nn.functional as f

from fuzzywuzzy import process

from roles_list import roles
from openai import OpenAI
# Load the sentence encoder and its tokenizer. `model` is a Hugging Face model
# id, so `from_pretrained` resolves it via the hub / local cache.
embed_store = {}  # role -> L2-normalised embedding tensor, filled in below
model = 'sentence-transformers/all-MiniLM-L12-v2'
sbert_model = AutoModel.from_pretrained(model)
sbert_tokenizer = AutoTokenizer.from_pretrained(model)

# SECURITY: API keys must never be committed to source control. With no
# `api_key` argument the client reads the key from the OPENAI_API_KEY
# environment variable. The key that was previously hard-coded here should be
# treated as leaked and revoked.
client = OpenAI()


# Pre-compute one unit-length embedding per known role so that a query only
# has to encode the incoming title.
# NOTE(review): the all-MiniLM model card demonstrates mean pooling over token
# embeddings; `pooler_output` is used here instead -- confirm this is intended.
with torch.no_grad():
    for role in roles:
        role_tokens = sbert_tokenizer(
            role,
            max_length=10,  # pad every role up to 10 tokens
            padding="max_length",
            return_tensors='pt',  # PyTorch tensors rather than Python lists
        )
        role_vector = sbert_model(**role_tokens).pooler_output
        embed_store[role] = f.normalize(role_vector, p=2, dim=1)
print("Model is ready for inference")


def get_role_from_sbert(title, top_k=None):
    """Rank the known roles by cosine similarity to a job title.

    The title is encoded with the same tokenizer settings and pooling that
    were used to fill ``embed_store``, then compared against every stored
    role embedding.

    Parameters:
    - title: The job title to classify.
    - top_k: Optional non-negative number of best-scoring roles to report.
      ``None`` (the default) reports every role, which is what this function
      has always returned.

    Returns:
    A string with one ``"<role>: <score>"`` line per reported role, ordered
    from highest to lowest similarity (scores rounded to 3 decimals), followed
    by a line with the execution time in seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    encoding = sbert_tokenizer(title,
                               max_length=10,
                               padding="max_length",
                               return_tensors='pt'
                               )
    with torch.no_grad():
        pooled = sbert_model(**encoding).pooler_output
    # Normalise the query once instead of once per role.
    query = f.normalize(pooled, p=2, dim=1)

    store_cos = {
        role: round(f.cosine_similarity(query, role_embed).item(), 3)
        for role, role_embed in embed_store.items()
    }
    ranked = sorted(store_cos.items(), key=lambda item: item[1], reverse=True)
    if top_k is not None:
        ranked = ranked[:top_k]
    job_scores_str = '\n'.join(f"{job}: {score}" for job, score in ranked)

    execution_time = time.perf_counter() - started
    return job_scores_str + f" \nExecution time: {execution_time}"


def fuzzy_match(title, limit=3):
    """
    Find the roles that best match a job title using fuzzy string matching.

    Parameters:
    - title: The job title to search for.
    - limit: The maximum number of matches to return (defaults to 3).

    Returns:
    The result of ``process.extract`` run against the module-level ``roles``:
    the best matches with their scores, where a higher score means a closer
    match.
    """
    return process.extract(title, roles, limit=limit)


def fuzzy_match_sbert(title):
    """Run both matchers on a job title.

    Returns:
    A two-element list: the fuzzy-matching result first, then the
    SBERT similarity report string.
    """
    return [fuzzy_match(title), get_role_from_sbert(title)]


# Web UI: a single text box in (job title), a single text box out (the
# similarity report produced by get_role_from_sbert).
title_input = gr.Textbox(label="Job Title")
role_output = gr.Textbox(label="Role")
demo = gr.Interface(
    fn=get_role_from_sbert,
    inputs=title_input,
    outputs=role_output,
    title="HackerRank Role Classifier",
)


# Close any Gradio apps that are already running before launching this one.
gr.close_all()
# NOTE(review): server_name='0.0.0.0' together with share=True makes the app
# reachable beyond localhost -- confirm that this exposure is intended.
demo.launch(server_name='0.0.0.0', server_port=8081, share=True)