## One-shot misinformation narrative classification with OpenAI embeddings
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

## TODO: Base Functions 
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on a threshold
# 3. Create a function to get the distance to the closest counterclaim, with no threshold cut
# 4. Create a function that compares the distances to the closest claim and counterclaim and assigns the label only when the claim distance is smaller (see the hedged sketch right after this list)
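
## Hedged sketch of TODO items 2-4 (hypothetical helper, not part of the pipeline below):
## `claim_matrix` and `counterclaim_matrix` are assumed to hold one embedding per column,
## the same layout produced by `hcat(df[:, "Embeddings"]...)` in the functions further down.
function claim_vs_counterclaim(target_matrix, claim_matrix, counterclaim_matrix; threshold=0.2)
    # distance from each target text (column) to its closest claim / counterclaim
    d_claim = vec(minimum(pairwise(CosineDist(), target_matrix, claim_matrix, dims=2), dims=2))
    d_counter = vec(minimum(pairwise(CosineDist(), target_matrix, counterclaim_matrix, dims=2), dims=2))
    # label as misinfo only when the claim is both under the threshold and closer than any counterclaim
    return (d_claim .< threshold) .& (d_claim .< d_counter)
end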


## Analysis: 
# What is the distribution of distances by assigned narrative and label? 

### UTILITIES ####
# Define count function 
function table(df::DataFrame, cols::Vector{Symbol})
    combine(groupby(df, cols), nrow)
end
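# Usage example (runs once the pipeline below has populated `target_embeddings`):
# table(target_embeddings, [:label])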
#########

"""
## Embeddings to recover narratives 
narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false)
    if !regenerate && isfile("data/narrative_embeddings.jld2")
        return load_object("data/narrative_embeddings.jld2")
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
    ## Narrative embeddings 
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    ## Add vector of embeddings to dataset 
    narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/narrative_embeddings.jld2", narratives)
    return narratives
end

"""
# This is the testing data
target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false)
    if !regenerate && isfile("data/test_embeddings.jld2")
        return load_object("data/test_embeddings.jld2")
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
    ## Test text embeddings 
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    ## Add vector of embeddings to dataset 
    df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/test_embeddings.jld2", df_test)
    return df_test
end

"""
### The embeddings for each example are along the rows, so they can be compared column-wise (fast)
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
## Show the results - text, closest narrative
target_embeddings[:, ["text", "Closest Narrative", "label"]] |> first(5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function search(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # get the index of the column with the smallest distance
        narrative_index = argmin(distances, dims=2)
        return narrative_index
    end
    # Search for the closest narrative for each test text
    narrative_assignment = search(narrative_matrix, target_matrix)
    # `!` is required to create the new column
    target_embeddings[!, "Closest Narrative"] = [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]]
    return target_embeddings
end

function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function embedding_distances(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # get the index of the column with the smallest distance
        return distances[argmin(distances, dims=2)][:, 1]
    end
    # Distance to the closest narrative for each test text (`!` creates the new column)
    target_embeddings[!, "Dist"] = embedding_distances(narrative_matrix, target_matrix)
    return target_embeddings
end


## Remaining TODO (the embedding vectors are already added to the test dataset above)
# 3. Generate embeddings of the narratives in multiple languages 
# 4. Create a LangChain-style search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification (see the hedged sketch after this list)
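
## Hedged sketch for step 5 (hypothetical: assumes the test CSV also carries a gold
## narrative column, here called "True Narrative"; swap in the real column name).
# recovery_rate = mean(target_embeddings[!, "Closest Narrative"] .== target_embeddings[!, "True Narrative"])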

## STEPS: Models 
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches 
# 2. Train a model on the embeddings to predict the misinfo (hedged sketches for both steps follow below)
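
## Hedged sketches for the two modelling steps above (not part of the original pipeline).
# Step 1: within each assigned narrative, keep the K closest test texts as candidate misinfo
# (relies on the "Closest Narrative" and "Dist" columns added by the functions above).
function top_k_matches(target_embeddings; k=10)
    per_narrative = [first(sort(DataFrame(sub), :Dist), min(k, nrow(sub)))
                     for sub in groupby(target_embeddings, "Closest Narrative")]
    return reduce(vcat, per_narrative)
end

# Step 2 (assumes MLJLinearModels is installed alongside MLJ; kept commented as a sketch):
# LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
# X = MLJ.table(permutedims(hcat(target_embeddings[!, "Embeddings"]...)))  # one row per text
# y = coerce(target_embeddings[!, "label"] .== 1.0, OrderedFactor)
# mach = machine(LogisticClassifier(), X, y)
# train, test = partition(eachindex(y), 0.7, shuffle=true)
# fit!(mach, rows=train)
# accuracy(predict_mode(mach, rows=test), y[test])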

# Get the embeddings for the narratives
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label (TidierPlots is already loaded above)

## By Label 
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative 
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if distance is less than .2 

target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2

## Precision and Recall
using MLJ

# MLJ measures take predictions first and expect categorical targets,
# so coerce both vectors to two-level ordered factors (false < true).
y_true = coerce(target_embeddings[!, "label"] .== 1.0, OrderedFactor)
y_pred = coerce(target_embeddings[!, "MisinfoPred"], OrderedFactor)
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)   # recall
false_positive_rate(y_pred, y_true)
ppv(y_pred, y_true)                  # precision
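
## Hedged sketch (not in the original analysis): sweep nearby cutoffs to see how
## the 0.2 threshold above trades off true- and false-positive rates.
let y = target_embeddings[!, "label"] .== 1.0, d = target_embeddings[!, "Dist"]
    for t in 0.10:0.05:0.40
        pred = d .< t
        tpr = sum(pred .& y) / max(sum(y), 1)
        fpr = sum(pred .& .!y) / max(sum(.!y), 1)
        @info "threshold = $t" tpr fpr
    end
end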

## Ten closest matches among true-misinfo texts (label == 1)
target_embeddings |> 
    (data -> filter(:label => x -> x == 1.0, data)) |> 
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |> 
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))