## NOTE(review): removed extraction artifacts (file size, commit hash, line-number index)
##
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots
## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assign label only if the distance to the closest claim is less than the distance to the closest counterclaim
## Analysis:
# What is the distribution of distances by assigned narrative and label?
### UTILITIES ####
# Define count function
"""
    table(df::DataFrame, cols::Vector{Symbol})

Return a frequency table: the number of rows (`nrow`) for every unique
combination of values in `cols`.
"""
function table(df::DataFrame, cols::Vector{Symbol})
    grouped = groupby(df, cols)
    return combine(grouped, nrow)
end
#########
"""
    create_narrative_embeddings(regenerate=false; csv_path, cache_path)

Return a `DataFrame` of misinformation narratives with an `"Embeddings"` column
holding one OpenAI embedding vector per narrative.

Results are cached to `cache_path` (JLD2); pass `regenerate=true` to force a
fresh API call. `csv_path` must contain a `"Misinformation Narrative"` column.
Reads the API key from `ENV["OPENAI_API_KEY"]`.

# Example
    narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false;
    csv_path="data/Modified Misinformation Library.csv",
    cache_path="data/narrative_embeddings.jld2")
    # Serve the cached result unless the caller explicitly asks for a refresh.
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read(csv_path, DataFrame)
    # One embedding per narrative, fetched in a single batched API request.
    result = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    narratives[!, "Embeddings"] = [x["embedding"] for x in result.response["data"]]
    # Cache to disk so later runs skip the (paid) API round-trip.
    mkpath(dirname(cache_path))
    save_object(cache_path, narratives)
    return narratives
end
"""
    create_test_embeddings(regenerate=false; csv_path, cache_path)

Return the test `DataFrame` with an `"Embeddings"` column holding one OpenAI
embedding vector per row of the `"text"` column.

Results are cached to `cache_path` (JLD2); pass `regenerate=true` to force a
fresh API call. Reads the API key from `ENV["OPENAI_API_KEY"]`.

# Example
    target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false;
    csv_path="data/Indicator_Test.csv",
    cache_path="data/test_embeddings.jld2")
    # Serve the cached result unless the caller explicitly asks for a refresh.
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read(csv_path, DataFrame)
    # One embedding per test document, fetched in a single batched API request.
    result = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    df_test[!, "Embeddings"] = [x["embedding"] for x in result.response["data"]]
    # Cache to disk so later runs skip the (paid) API round-trip.
    mkpath(dirname(cache_path))
    save_object(cache_path, df_test)
    return df_test
end
"""
    one_shot_classification!(narrative_embeddings, target_embeddings)

Assign each row of `target_embeddings` its nearest narrative by cosine
distance between embedding vectors, stored in a new `"Closest Narrative"`
column. Mutates and returns `target_embeddings`.

Embeddings are stacked as matrix columns so all pairwise distances are
computed in one vectorized call.

# Example
    narrative_embeddings = create_narrative_embeddings()
    target_embeddings = create_test_embeddings()
    one_shot_classification!(narrative_embeddings, target_embeddings)
    first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one embedding per
    # column. reduce(hcat, v) avoids splatting an arbitrarily long vector.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Pairwise cosine distances: rows index targets, columns index narratives.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # CartesianIndex of the nearest narrative for each target row.
    nearest = argmin(distances, dims=2)
    target_embeddings[:, "Closest Narrative"] =
        [narrative_embeddings[idx[2], "Misinformation Narrative"] for idx in nearest[:, 1]]
    return target_embeddings
end
"""
    get_distances!(narrative_embeddings, target_embeddings)

Add a `"Dist"` column to `target_embeddings` holding each row's cosine
distance to its nearest narrative embedding. Mutates and returns
`target_embeddings`.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    # Stack the per-row embedding vectors into matrices, one embedding per
    # column. reduce(hcat, v) avoids splatting an arbitrarily long vector.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])
    # Pairwise cosine distances: rows index targets, columns index narratives.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Row-wise minimum is the distance to the nearest narrative — equivalent
    # to the former `distances[argmin(distances, dims=2)]` but direct.
    target_embeddings[:, "Dist"] = vec(minimum(distances, dims=2))
    return target_embeddings
end
## Add vector of embeddings to the test dataset
# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification
## STEPS::::: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo
# Get the embeddings for the narratives and the test set (cached on disk).
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
# Label each test row with its nearest narrative ("Closest Narrative" column).
one_shot_classification!(narrative_embeddings, target_embeddings)
# Record each row's cosine distance to that nearest narrative ("Dist" column).
get_distances!(narrative_embeddings, target_embeddings)
# Plot the distribution of nearest-narrative distances by narrative and label.
using TidierPlots  # NOTE(review): redundant — TidierPlots is already loaded at the top of the file
## By Label: violin of distance split by the ground-truth misinfo label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative (kept for reference; disabled)
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
# geom_violin()
### Assign MisinfoPred = true if distance is less than .2
# Hard threshold on cosine distance; 0.2 appears hand-picked — TODO confirm via ROC
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2
## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
# MLJ measures are called as measure(ŷ, y): predictions first, ground truth
# second. The original mixed the two orders; accuracy is symmetric but
# TPR/FPR are not, so the calls below use the documented (ŷ, y) order.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)
## Top 10 closest narratives
target_embeddings |>
(data -> filter(:label => x -> x .== 1.0, data)) |>
(data -> sort(data, :Dist)) |>
(data -> first(data, 10)) |>
(data -> select(data, ["text", "Closest Narrative", "Dist"])) |