# File size: 4,113 Bytes
# 84f0cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from sentence_transformers import SentenceTransformer, util
from collections import Counter
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from google.cloud import secretmanager


class SecondaryModelDependencies:
    def __init__(self):
        self.text_similarity_model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2')
        api_key = self.access_openai_api_key()
        self.llm_gpt35 = ChatOpenAI(
            api_key=api_key, model="gpt-3.5-turbo")
        self.llm_gpt4 = ChatOpenAI(
            api_key=api_key, model="gpt-4-turbo")

    def access_openai_api_key(self):
        client = secretmanager.SecretManagerServiceClient()
        name = "projects/steady-climate-416810/secrets/OPENAI_API_KEY/versions/1"
        response = client.access_secret_version(request={"name": name})
        return response.payload.data.decode('UTF-8')

    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
        backspace_count_normalized = backspace_count / len(answer)
        typing_duration_normalized = typing_duration / len(answer)
        letter_discrepancy = self.calculate_letter_discrepancy(
            answer, letter_click_counts)

        gpt35_answer = self.generate_gpt35_answer(question)
        gpt4_answer = self.generate_gpt4_answer(question)

        cosine_sim_gpt35 = self.calculate_similarity_gpt35(
            answer, gpt35_answer)
        cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)

        return [
            probability, backspace_count_normalized, typing_duration_normalized,
            letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
        ]

    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
        # Calculate letter frequencies in the text
        text_letter_counts = Counter(text.lower())

        # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
                  for letter in "abcdefghijklmnopqrstuvwxyz"]

        # Average the ratios and normalize by the length of the text
        average_ratio = sum(ratios) / len(ratios)
        discrepancy_ratio_normalized = average_ratio / \
            (len(text) if len(text) > 0 else 1)

        return discrepancy_ratio_normalized

    def generate_gpt35_answer(self, question: str):
        messages = [
            SystemMessage(
                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
            HumanMessage(question)
        ]

        gpt35_answer = self.llm_gpt35.invoke(messages)
        return gpt35_answer.content

    def generate_gpt4_answer(self, question: str):
        messages = [
            SystemMessage(
                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
            HumanMessage(question)
        ]

        gpt4_answer = self.llm_gpt4.invoke(messages)
        return gpt4_answer.content

    def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
        embedding1 = self.text_similarity_model.encode(
            [answer], convert_to_tensor=True)
        embedding2 = self.text_similarity_model.encode(
            [gpt35_answer], convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding1, embedding2)
        return cosine_scores.item()

    def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
        embedding1 = self.text_similarity_model.encode(
            [answer], convert_to_tensor=True)
        embedding2 = self.text_similarity_model.encode(
            [gpt4_answer], convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding1, embedding2)
        return cosine_scores.item()