Spaces:

stefanjwojcik
/

misinfo_detection_app

Sleeping

App Files Files Community

misinfo_detection_app / scripts /misinfo.jl

stefanjwojcik

add scripts

143b0d4 verified 5 months ago

raw

history blame

6.01 kB

	##
	using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

	## TODO: Base Functions
	# 1. Create a function to generate embeddings
	# 2. Create a function to get the distance to the closest claim, cut based on threshold
	# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
	# 4. Create a function to compare the distance to the closest claim and counterclaim, assign label only if the distance to the closest claim is less than the distance to the closest counterclaim


	## Analysis:
	# What is the distribution of distances by assigned narrative and label?

	### UTILITIES ####
	# Define count function
	function table(df::DataFrame, cols::Vector{Symbol})
	combine(groupby(df, cols), nrow)
	end
	#########

	"""
	## Embeddings to recover narratives
	narrative_embeddings = create_narrative_embeddings()
	"""
	function create_narrative_embeddings(regenerate=false)
	if !regenerate && isfile("data/narrative_embeddings.jld2")
	return load_object("data/narrative_embeddings.jld2")
	end
	@info "Regenerating narrative embeddings..."
	narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
	## narrative Embeddings
	n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
	## Add vector of embeddings to dataset
	narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
	# Save the embeddings
	save_object("data/narrative_embeddings.jld2", narratives)
	return narratives
	end

	"""
	# This is the testing data
	target_embeddings = create_test_embeddings()
	"""
	function create_test_embeddings(regenerate=false)
	if !regenerate && isfile("data/test_embeddings.jld2")
	return load_object("data/test_embeddings.jld2")
	end
	@info "Regenerating test embeddings..."
	df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
	## narrative Embeddings
	n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
	## Add vector of embeddings to dataset
	df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
	# Save the embeddings
	save_object("data/test_embeddings.jld2", df_test)
	return df_test
	end

	"""
	### The embeddings for each example are along the rows, so they can be compared column-wise (fast)
	narrative_embeddings = create_narrative_embeddings()
	target_embeddings = create_test_embeddings()
	one_shot_classification!(narrative_embeddings, target_embeddings)
	## Show the results - text, closest narrative
	target_embeddings[:, ["text", "Closest Narrative", "label"]] \|> first(5)
	"""
	function one_shot_classification!(narrative_embeddings, target_embeddings)
	## Matrix of embeddings
	narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
	target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
	# Create a search function
	function search(narrative_matrix, target_matrix)
	distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
	# get the index of the column with the smallest distance
	narrative_index = argmin(distances, dims=2)
	return narrative_index
	end
	# Search for the closest narrative for each test data
	narrative_assignment = search(narrative_matrix, target_matrix)
	target_embeddings[:, "Closest Narrative"] = [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]]
	return target_embeddings
	end

	function get_distances!(narrative_embeddings, target_embeddings)
	## Matrix of embeddings
	narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
	target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
	# Create a search function
	function embedding_distances(narrative_matrix, target_matrix)
	distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
	# get the index of the column with the smallest distance
	return distances[argmin(distances, dims=2)][:, 1]
	end
	# Search for the closest narrative for each test data
	target_embeddings[:, "Dist"] = embedding_distances(narrative_matrix, target_matrix)
	return target_embeddings
	end


	## Add vector of embeddings to the test dataset

	# 3. Generate embeddings of the narratives in multiple languages
	# 4. Create a langchain search function to check which narrative is closest to the input narrative
	# 5. Figure out how effective the embeddings are in recovering the narrative classification

	## STEPS::::: Models
	# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
	# 2. Train a model on the embeddings to predict the misinfo

	# Get the embeddings for the narratives
	narrative_embeddings = create_narrative_embeddings()
	target_embeddings = create_test_embeddings()
	one_shot_classification!(narrative_embeddings, target_embeddings)
	get_distances!(narrative_embeddings, target_embeddings)

	# Plot the distribution of distances by narrative and label
	using TidierPlots

	## By Label
	ggplot(target_embeddings, @aes(x = label, y = Dist)) +
	geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
	## By Narrative
	#ggplot(target_embeddings \|> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
	# geom_violin()

	### Assign MisinfoPred = true if distance is less than .2

	target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2

	## Precision and Recall
	using MLJ

	y_true = target_embeddings[!, "label"]
	y_pred = target_embeddings[!, "MisinfoPred"]
	confusion_matrix(y_pred, y_true)
	accuracy(y_true, y_pred)
	true_positive_rate(y_true, y_pred)
	false_positive_rate(y_true, y_pred)

	## Top 10 closest narratives
	target_embeddings \|>
	(data -> filter(:label => x -> x .== 1.0, data)) \|>
	(data -> sort(data, :Dist)) \|>
	(data -> first(data, 10)) \|>
	(data -> select(data, ["text", "Closest Narrative", "Dist"]))