## using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assign label
#    only if the distance to the closest claim is less than the distance to the closest counterclaim

## Analysis:
# What is the distribution of distances by assigned narrative and label?

### UTILITIES ####

"""
    table(df::DataFrame, cols::Vector{Symbol})

Frequency table: count rows of `df` for each unique combination of `cols`
(the count appears in the `nrow` column of the returned `DataFrame`).
"""
function table(df::DataFrame, cols::Vector{Symbol})
    return combine(groupby(df, cols), nrow)
end

# Stack the per-row embedding vectors of `df` into a matrix with one embedding
# per COLUMN, so examples can be compared column-wise (column-major friendly).
# `reduce(hcat, v)` avoids splatting a large collection into `hcat(v...)`.
embedding_matrix(df) = reduce(hcat, df[!, "Embeddings"])

#########

"""
    create_narrative_embeddings(regenerate=false)

## Embeddings to recover narratives

    narrative_embeddings = create_narrative_embeddings()

Return the misinformation-narrative library with one OpenAI embedding per
narrative in an `"Embeddings"` column. Results are cached at
`data/narrative_embeddings.jld2`; pass `regenerate=true` to force a fresh
API call (requires `ENV["OPENAI_API_KEY"]`).
"""
function create_narrative_embeddings(regenerate=false)
    # Serve the cached copy unless the caller explicitly asks to regenerate.
    if !regenerate && isfile("data/narrative_embeddings.jld2")
        return load_object("data/narrative_embeddings.jld2")
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
    ## narrative Embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    ## Add vector of embeddings to dataset
    narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/narrative_embeddings.jld2", narratives)
    return narratives
end

"""
    create_test_embeddings(regenerate=false)

# This is the testing data

    target_embeddings = create_test_embeddings()

Return the test set (`data/Indicator_Test.csv`) with one OpenAI embedding per
`"text"` row in an `"Embeddings"` column. Results are cached at
`data/test_embeddings.jld2`; pass `regenerate=true` to force a fresh API call.
"""
function create_test_embeddings(regenerate=false)
    # Serve the cached copy unless the caller explicitly asks to regenerate.
    if !regenerate && isfile("data/test_embeddings.jld2")
        return load_object("data/test_embeddings.jld2")
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
    ## narrative Embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    ## Add vector of embeddings to dataset
    df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/test_embeddings.jld2", df_test)
    return df_test
end

"""
    one_shot_classification!(narrative_embeddings, target_embeddings)

### The embeddings for each example are along the rows, so they can be compared column-wise (fast)

    narrative_embeddings = create_narrative_embeddings()
    target_embeddings = create_test_embeddings()
    one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Show the results - text, closest narrative
    target_embeddings[:, ["text", "Closest Narrative", "label"]] |> (data -> first(data, 5))

Assign each test row the nearest library narrative by cosine distance,
adding a `"Closest Narrative"` column to `target_embeddings` in place.
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings (one embedding per column)
    narrative_matrix = embedding_matrix(narrative_embeddings)
    target_matrix = embedding_matrix(target_embeddings)
    # distances[i, j] = cosine distance between target i and narrative j
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # CartesianIndex of the nearest narrative (column) for each target (row)
    narrative_assignment = argmin(distances, dims=2)
    target_embeddings[!, "Closest Narrative"] =
        [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]]
    return target_embeddings
end

"""
    get_distances!(narrative_embeddings, target_embeddings)

Add a `"Dist"` column to `target_embeddings` in place: the cosine distance
from each test row to its nearest library narrative.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings (one embedding per column)
    narrative_matrix = embedding_matrix(narrative_embeddings)
    target_matrix = embedding_matrix(target_embeddings)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Row-wise minimum = distance to the closest narrative for each test row
    target_embeddings[!, "Dist"] = vec(minimum(distances, dims=2))
    return target_embeddings
end

## Add vector of embeddings to the test dataset
# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification

## STEPS::::: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo

# Get the embeddings for the narratives
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label
using TidierPlots

## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() +
    labs(x="Misinfo Label", y="Distance") #+ geom_hline()

## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if distance is less than .2
# Cosine-distance cutoff below which a text is flagged as misinformation
# (chosen by eye from the violin plot above — TODO tune on held-out data).
const MISINFO_THRESHOLD = 0.2
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< MISINFO_THRESHOLD

## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
# MLJ measures take predictions first: measure(ŷ, y). TPR/FPR are not
# symmetric in their arguments, so the order matters.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)

## Top 10 closest narratives
# Inspect the ten labelled-misinfo rows that sit closest to any library narrative.
let positives = filter(:label => x -> x .== 1.0, target_embeddings)
    ranked = sort(positives, :Dist)
    select(first(ranked, 10), ["text", "Closest Narrative", "Dist"])
end