## Misinformation-narrative classification via OpenAI embeddings
## (nearest-narrative assignment by cosine distance, threshold-based labeling)
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assign
#    label only if the distance to the closest claim is less than the distance to the closest
#    counterclaim

## Analysis:
# What is the distribution of distances by assigned narrative and label?
######### UTILITIES #########

"""
    table(df::DataFrame, cols::Vector{Symbol})

Return a frequency table: the number of rows in `df` (`nrow` column) for each
unique combination of values of `cols` — analogous to R's `table`.
"""
function table(df::DataFrame, cols::Vector{Symbol})
    return combine(groupby(df, cols), nrow)
end
""" | |
## Embeddings to recover narratives | |
narrative_embeddings = create_narrative_embeddings() | |
""" | |
function create_narrative_embeddings(regenerate=false) | |
if !regenerate && isfile("data/narrative_embeddings.jld2") | |
return load_object("data/narrative_embeddings.jld2") | |
end | |
narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame) | |
## narrative Embeddings | |
n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"]) | |
## Add vector of embeddings to dataset | |
narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]] | |
# Save the embeddings | |
save_object("data/narrative_embeddings.jld2", narratives) | |
return narratives | |
end | |
""" | |
# This is the testing data | |
target_embeddings = create_test_embeddings() | |
""" | |
function create_test_embeddings(regenerate=false) | |
if !regenerate && isfile("data/test_embeddings.jld2") | |
return load_object("data/test_embeddings.jld2") | |
end | |
df_test = CSV.read("data/Indicator_Test.csv", DataFrame) | |
## narrative Embeddings | |
n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"]) | |
## Add vector of embeddings to dataset | |
df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]] | |
# Save the embeddings | |
save_object("data/test_embeddings.jld2", df_test) | |
return df_test | |
end | |
""" | |
### The embeddings for each example are along the rows, so they can be compared column-wise (fast) | |
narrative_embeddings = create_narrative_embeddings() | |
target_embeddings = create_test_embeddings() | |
one_shot_classification!(narrative_embeddings, target_embeddings) | |
## Show the results - text, closest narrative | |
target_embeddings[:, ["text", "Closest Narrative", "label"]] |> first(5) | |
""" | |
function one_shot_classification!(narrative_embeddings, target_embeddings) | |
## Matrix of embeddings | |
narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...) | |
target_matrix = hcat(target_embeddings[:, "Embeddings"]...) | |
# Create a search function | |
function search(narrative_matrix, target_matrix) | |
distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2) | |
# get the index of the column with the smallest distance | |
narrative_index = argmin(distances, dims=2) | |
return narrative_index | |
end | |
# Search for the closest narrative for each test data | |
narrative_assignment = search(narrative_matrix, target_matrix) | |
target_embeddings[:, "Closest Narrative"] = [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]] | |
return target_embeddings | |
end | |
"""
    get_distances!(narrative_embeddings, target_embeddings)

Compute, for each row of `target_embeddings`, the cosine distance to its
closest narrative in `narrative_embeddings`, storing it in a new `"Dist"`
column. Mutates and returns `target_embeddings`.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings: column j holds the embedding of example j
    # (reduce(hcat, ...) avoids splatting an arbitrarily long column)
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Create a search function: smallest per-target distance to any narrative
    function embedding_distances(narrative_matrix, target_matrix)
        # distances[i, j] = cosine distance between target i and narrative j
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # Index with the per-row argmin (an (n, 1) matrix of CartesianIndex),
        # then take the first column to flatten to a vector of minima.
        return distances[argmin(distances, dims=2)][:, 1]
    end
    # Distance to the closest narrative for each test data point
    target_embeddings[:, "Dist"] = embedding_distances(narrative_matrix, target_matrix)
    return target_embeddings
end
## Add vector of embeddings to the test dataset
# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification
## STEPS::::: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo

# Get the embeddings for the narratives and the test set (cached after first run)
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label
using TidierPlots
## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if distance is less than .2
# NOTE(review): 0.2 is a hand-picked threshold — revisit against the violin plot above.
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2

## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
# MLJ measures take predictions first: measure(ŷ, y). The original called
# accuracy/TPR/FPR as (y_true, y_pred), which silently swaps TPR and FPR.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)

## Top 10 closest narratives among true-misinfo examples
target_embeddings |>
    (data -> filter(:label => x -> x .== 1.0, data)) |>
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |>
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))