## NOTE(review): removed extraction artifacts (file size, commit hash, line-number index)
##
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots
## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assign label only if the distance to the closest claim is less than the distance to the closest counterclaim
## Analysis:
# What is the distribution of distances by assigned narrative and label?
### UTILITIES ####
# Define count function
"""
    table(df::DataFrame, cols::Vector{Symbol})

Return a frequency table: the number of rows (`nrow`) for every unique
combination of values in `cols`.
"""
function table(df::DataFrame, cols::Vector{Symbol})
    grouped = groupby(df, cols)
    return combine(grouped, nrow)
end
#########
"""
    create_narrative_embeddings(regenerate=false; csv_path, cache_path)

Return a `DataFrame` of misinformation narratives with an `"Embeddings"` column
holding one OpenAI embedding vector per narrative.

Results are cached to `cache_path` (JLD2); pass `regenerate=true` to force a
fresh API call. `csv_path` must contain a `"Misinformation Narrative"` column.
Reads the API key from `ENV["OPENAI_API_KEY"]`.

# Example
    narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false;
    csv_path="data/Modified Misinformation Library.csv",
    cache_path="data/narrative_embeddings.jld2")
    # Serve the cached result unless the caller explicitly asks for a refresh.
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read(csv_path, DataFrame)
    # One embedding per narrative, fetched in a single batched API request.
    result = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    narratives[!, "Embeddings"] = [x["embedding"] for x in result.response["data"]]
    # Cache to disk so later runs skip the (paid) API round-trip.
    mkpath(dirname(cache_path))
    save_object(cache_path, narratives)
    return narratives
end
"""
    create_test_embeddings(regenerate=false; csv_path, cache_path)

Return the test `DataFrame` with an `"Embeddings"` column holding one OpenAI
embedding vector per row of the `"text"` column.

Results are cached to `cache_path` (JLD2); pass `regenerate=true` to force a
fresh API call. Reads the API key from `ENV["OPENAI_API_KEY"]`.

# Example
    target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false;
    csv_path="data/Indicator_Test.csv",
    cache_path="data/test_embeddings.jld2")
    # Serve the cached result unless the caller explicitly asks for a refresh.
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read(csv_path, DataFrame)
    # One embedding per test document, fetched in a single batched API request.
    result = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    df_test[!, "Embeddings"] = [x["embedding"] for x in result.response["data"]]
    # Cache to disk so later runs skip the (paid) API round-trip.
    mkpath(dirname(cache_path))
    save_object(cache_path, df_test)
    return df_test
end
"""
    one_shot_classification!(narrative_embeddings, target_embeddings)

Assign each row of `target_embeddings` its nearest narrative by cosine
distance between embedding vectors, stored in a new `"Closest Narrative"`
column. Mutates and returns `target_embeddings`.

Embeddings are stacked as matrix columns so all pairwise distances are
computed in one vectorized call.

# Example
    narrative_embeddings = create_narrative_embeddings()
    target_embeddings = create_test_embeddings()
    one_shot_classification!(narrative_embeddings, target_embeddings)
    first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one embedding per
    # column. reduce(hcat, v) avoids splatting an arbitrarily long vector.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Pairwise cosine distances: rows index targets, columns index narratives.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # CartesianIndex of the nearest narrative for each target row.
    nearest = argmin(distances, dims=2)
    target_embeddings[:, "Closest Narrative"] =
        [narrative_embeddings[idx[2], "Misinformation Narrative"] for idx in nearest[:, 1]]
    return target_embeddings
end
"""
    get_distances!(narrative_embeddings, target_embeddings)

Add a `"Dist"` column to `target_embeddings` holding each row's cosine
distance to its nearest narrative embedding. Mutates and returns
`target_embeddings`.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one embedding per
    # column. reduce(hcat, v) avoids splatting an arbitrarily long vector.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Pairwise cosine distances: rows index targets, columns index narratives.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Row-wise minimum is the distance to the nearest narrative — equivalent
    # to the former `distances[argmin(distances, dims=2)]` but direct.
    target_embeddings[:, "Dist"] = vec(minimum(distances, dims=2))
    return target_embeddings
end
## Add vector of embeddings to the test dataset
# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification
## STEPS::::: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo
# Get the embeddings for the narratives and the test set (cached on disk).
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
# Label each test row with its nearest narrative ("Closest Narrative" column).
one_shot_classification!(narrative_embeddings, target_embeddings)
# Record each row's cosine distance to that nearest narrative ("Dist" column).
get_distances!(narrative_embeddings, target_embeddings)
# Plot the distribution of nearest-narrative distances by narrative and label.
using TidierPlots  # NOTE(review): redundant — TidierPlots is already loaded at the top of the file
## By Label: violin of distance split by the ground-truth misinfo label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative (kept for reference; disabled)
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
# geom_violin()
### Assign MisinfoPred = true if distance is less than .2
# Hard threshold on cosine distance; 0.2 appears hand-picked — TODO confirm via ROC
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2
## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
# MLJ measures are called as measure(ŷ, y): predictions first, ground truth
# second. The original mixed the two orders; accuracy is symmetric but
# TPR/FPR are not, so the calls below use the documented (ŷ, y) order.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)
## Top 10 closest narratives
target_embeddings |>
(data -> filter(:label => x -> x .== 1.0, data)) |>
(data -> sort(data, :Dist)) |>
(data -> first(data, 10)) |>
(data -> select(data, ["text", "Closest Narrative", "Dist"])) |