File size: 4,227 Bytes
adbf293
 
 
 
 
4fa2adf
adbf293
 
 
 
 
4fa2adf
 
 
 
 
 
 
 
 
adbf293
4fa2adf
adbf293
 
 
 
 
 
 
 
 
 
 
4fa2adf
adbf293
 
 
 
 
4fa2adf
 
 
 
adbf293
 
 
 
 
 
 
 
 
 
 
4fa2adf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adbf293
 
 
4fa2adf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adbf293
 
4fa2adf
 
 
 
 
 
 
 
adbf293
4fa2adf
adbf293
 
4fa2adf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# imports
import json
import time

import gradio as gr
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import torch.nn.functional as f

from roles_list import roles

# Sentence-embedding (SBERT) checkpoint: model first, then its tokenizer.
sbert_model_name = 'sentence-transformers/all-MiniLM-L12-v2'
sbert_model = AutoModel.from_pretrained(sbert_model_name)
sbert_tokenizer = AutoTokenizer.from_pretrained(sbert_model_name)

# Sequence-classification model (labelled "LLM" in the UI) and its tokenizer.
# NOTE(review): 'bert-base-uncased' is a base checkpoint; the classification
# head is presumably not fine-tuned for this task - confirm the scores it
# produces are meaningful.
llm_model_name = 'bert-base-uncased'
llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_name)
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

# Pre-compute one L2-normalised embedding per known role so that a query only
# has to embed the incoming title. Keys are the role strings from roles_list.
embed_store = {}
for role in roles:
    # Pad every role to 10 tokens and get PyTorch tensors back.
    role_tokens = sbert_tokenizer(
        role,
        max_length=10,
        padding="max_length",
        return_tensors='pt',
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        role_vector = sbert_model(**role_tokens).pooler_output
    embed_store[role] = f.normalize(role_vector, p=2, dim=1)
print("SBERT model is ready for inference")


def get_role_from_sbert(title):
    """Score every known role against a job title using SBERT embeddings.

    The title is tokenized and embedded with the module-level SBERT model,
    then compared by cosine similarity with each pre-computed role embedding
    held in ``embed_store``.

    Args:
        title: Job title text to classify.

    Returns:
        A ``(job_scores, execution_time)`` tuple. ``job_scores`` is a list of
        ``{"Role": ..., "SBERT Score": ...}`` dicts, one per role in
        ``embed_store``, ordered from highest to lowest score; scores are
        rounded to three decimals. ``execution_time`` is the elapsed time in
        seconds as a float.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Same tokenizer settings as the ones used to build embed_store.
    encoding = sbert_tokenizer(title,
                               max_length=10,
                               padding="max_length",
                               return_tensors='pt'
                               )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        embed = sbert_model(**encoding).pooler_output

    # The title embedding does not depend on the role being compared, so it
    # is normalised once here instead of once per loop iteration.
    title_embed = f.normalize(embed, p=2, dim=1)

    store_cos = {
        role: round(f.cosine_similarity(title_embed, role_embed).item(), 3)
        for role, role_embed in embed_store.items()
    }

    # Every role is returned (not just a top-k subset), best match first.
    # Sorting happens on the rounded scores; ties keep embed_store order.
    ranked_roles = sorted(store_cos.items(), key=lambda item: item[1], reverse=True)
    job_scores = [{"Role": job, "SBERT Score": score} for job, score in ranked_roles]

    execution_time = time.perf_counter() - start_time

    return job_scores, execution_time


def get_role_from_llm(title):
    """Score every role against a job title with the sequence classifier.

    Each ``(title, role)`` pair is encoded as a two-segment input and passed
    through the module-level ``llm_model``. The reported score is the softmax
    probability at class index 1 for that pair.

    NOTE(review): the meaning of class index 1 depends on how ``llm_model``
    was trained; confirm that the loaded checkpoint has a classification head
    suited to title/role matching.

    Args:
        title: Job title text to classify.

    Returns:
        A ``(llm_scores, execution_time)`` tuple. ``llm_scores`` is a list of
        ``{"Role": ..., "LLM Score": ...}`` dicts in the same order as
        ``roles``, with scores rounded to three decimals. ``execution_time``
        is the elapsed time in seconds as a float.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    llm_scores = []
    # Inference only: a single no_grad context covers the whole loop.
    with torch.no_grad():
        for role in roles:
            # Call the tokenizer directly (encode_plus is deprecated in favour
            # of __call__); this also matches how the SBERT tokenizer is used.
            inputs = llm_tokenizer(title, role, return_tensors='pt', max_length=512, truncation=True)
            outputs = llm_model(**inputs)
            # First (only) example in the batch, probability of class 1.
            score = torch.softmax(outputs.logits, dim=1)[0][1].item()
            llm_scores.append({"Role": role, "LLM Score": round(score, 3)})

    execution_time = time.perf_counter() - start_time

    return llm_scores, execution_time


def classify_role(title):
    """Run both scorers on a job title and merge their output into one table.

    Args:
        title: Job title text to classify.

    Returns:
        A ``(results, execution_time_info)`` tuple. ``results`` is a list of
        ``[role, sbert_score, llm_score]`` rows: roles reported by the SBERT
        scorer come first, in the order it returned them, followed by any
        role that only the LLM scorer reported. A score that is missing for a
        role is an empty string. ``execution_time_info`` is a one-line
        summary of both scorers' run times.
    """
    sbert_scores, sbert_execution_time = get_role_from_sbert(title)
    llm_scores, llm_execution_time = get_role_from_llm(title)

    # Index the SBERT rows by role name, then fold the LLM scores into them.
    merged = {entry["Role"]: entry for entry in sbert_scores}
    for entry in llm_scores:
        role_name = entry["Role"]
        if role_name not in merged:
            # Role unknown to the SBERT scorer: leave its SBERT column blank.
            merged[role_name] = {"Role": role_name, "SBERT Score": "", "LLM Score": entry["LLM Score"]}
            continue
        merged[role_name]["LLM Score"] = entry["LLM Score"]

    # Flatten to rows in the column order the output table expects.
    results = [
        [role_name, row.get("SBERT Score", ""), row.get("LLM Score", "")]
        for role_name, row in merged.items()
    ]

    execution_time_info = f"SBERT Execution Time: {sbert_execution_time:.4f} seconds, LLM Execution Time: {llm_execution_time:.4f} seconds"

    return results, execution_time_info


# Gradio Blocks interface: a job-title input, a button, a score table and a
# timing read-out, stacked in a single column.
with gr.Blocks() as demo:
    gr.Markdown("# HackerRank Role Classifier")
    with gr.Column():
        input_text = gr.Textbox(label="Job Title")
        classify_button = gr.Button("Classify")
        # Column headers match the [role, SBERT score, LLM score] rows that
        # classify_role returns.
        output_table = gr.Dataframe(headers=["Role", "SBERT Score", "LLM Score"], label="Role Scores")
        # Read-only box showing how long each scorer took.
        execution_time_text = gr.Textbox(label="Execution Time", interactive=False)

    # Clicking the button passes the textbox value to classify_role; its two
    # return values fill the table and the timing box, in that order.
    classify_button.click(fn=classify_role, inputs=input_text, outputs=[output_table, execution_time_text])

# NOTE(review): presumably closes any Gradio apps still running in this
# process before the new one is launched - confirm against the Gradio docs.
gr.close_all()
demo.launch()