# Script by stefanjwojcik — "add scripts" (commit 143b0d4, verified)
##
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots
## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assign label only if the distance to the closest claim is less than the distance to the closest counterclaim
## Analysis:
# What is the distribution of distances by assigned narrative and label?
### UTILITIES ####
# Define count function
"""
    table(df::DataFrame, cols::Vector{Symbol})

Return the row counts of `df` grouped by the columns in `cols` —
a quick cross-tabulation, analogous to R's `table`.
"""
table(df::DataFrame, cols::Vector{Symbol}) = combine(groupby(df, cols), nrow)
#########
"""
    create_narrative_embeddings(regenerate=false)

Return a `DataFrame` of misinformation narratives with an `"Embeddings"`
column holding one embedding vector per narrative.

Loads the cached JLD2 object when it exists; otherwise reads the narrative
library CSV, embeds the `"Misinformation Narrative"` column via the OpenAI
API, caches the result, and returns it. Pass `regenerate=true` to force a
rebuild even when the cache file is present.

    narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false)
    cache_path = "data/narrative_embeddings.jld2"
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
    # Embed every narrative string with the OpenAI embeddings endpoint.
    api_result = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    # One embedding vector per row, in the same order as the input column.
    narratives[!, "Embeddings"] = [item["embedding"] for item in api_result.response["data"]]
    # Cache so subsequent calls skip the (paid) API round-trip.
    save_object(cache_path, narratives)
    return narratives
end
"""
    create_test_embeddings(regenerate=false)

Return the labelled test `DataFrame` with an `"Embeddings"` column holding
one embedding vector per row of its `"text"` column.

Loads the cached JLD2 object when it exists; otherwise reads the test CSV,
embeds the texts via the OpenAI API, caches the result, and returns it.
Pass `regenerate=true` to force a rebuild even when the cache file exists.

    target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false)
    cache_path = "data/test_embeddings.jld2"
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
    # Embed each test text with the OpenAI embeddings endpoint.
    api_result = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    # One embedding vector per row, aligned with the "text" column.
    df_test[!, "Embeddings"] = [item["embedding"] for item in api_result.response["data"]]
    # Cache so subsequent calls skip the (paid) API round-trip.
    save_object(cache_path, df_test)
    return df_test
end
"""
    one_shot_classification!(narrative_embeddings, target_embeddings)

Assign each row of `target_embeddings` its nearest misinformation narrative.

The embedding vectors of each example are stacked as matrix columns so the
whole comparison is a single pairwise cosine-distance computation. Adds a
`"Closest Narrative"` column to `target_embeddings` (mutating it) and
returns it.

# Example
    narrative_embeddings = create_narrative_embeddings()
    target_embeddings = create_test_embeddings()
    one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Show the results - text, closest narrative
    first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one example per
    # column. reduce(hcat, v) avoids splatting a potentially large vector
    # into varargs (hcat(v...)), which is slow for many rows.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Cosine distance of every target (row index) to every narrative (col index).
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # For each target row, the CartesianIndex of the nearest narrative column.
    narrative_assignment = argmin(distances, dims=2)
    # idx[2] is the column — i.e. narrative — index of the nearest match.
    target_embeddings[!, "Closest Narrative"] =
        [narrative_embeddings[idx[2], "Misinformation Narrative"] for idx in narrative_assignment[:, 1]]
    return target_embeddings
end
"""
    get_distances!(narrative_embeddings, target_embeddings)

Add a `"Dist"` column to `target_embeddings` (mutating it) holding each
row's cosine distance to its nearest narrative embedding, and return it.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one example per
    # column. reduce(hcat, v) avoids splatting a potentially large vector
    # into varargs (hcat(v...)), which is slow for many rows.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Cosine distance of every target (row index) to every narrative (col index).
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Distance to the closest narrative, one value per target row.
    target_embeddings[!, "Dist"] = distances[argmin(distances, dims=2)][:, 1]
    return target_embeddings
end
## Add vector of embeddings to the test dataset
# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification
## STEPS::::: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo
# ── Script: classify, threshold, and evaluate ──────────────────────────
# Get the (cached or freshly generated) embeddings for narratives and test set.
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)
# Plot the distribution of distances by narrative and label
using TidierPlots
## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()
### Assign MisinfoPred = true if distance is less than .2
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2
## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
# MLJ measures take predictions first: measure(ŷ, y). The original code
# passed (y_true, y_pred) to the rate measures below, which swaps the roles
# of truth and prediction — harmless for accuracy (symmetric) but wrong for
# TPR/FPR. All calls now use the (ŷ, y) order consistently.
# NOTE(review): y_true is read straight from the CSV (presumably 0/1) and
# y_pred is Bool — confirm MLJ coerces both to a common binary scitype.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)
## Top 10 closest narratives
target_embeddings |>
    (data -> filter(:label => x -> x .== 1.0, data)) |>
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |>
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))