## CARDS climate-misinformation classification script:
## embed the CARDS test set into Pinecone, query it with misinformation claims,
## and evaluate precision/recall at several similarity thresholds.
## Embed all of the CARDS Data
# Load the CARDS test split, drop placeholder rows, embed the `text` column
# with multilingual-e5-large, and upsert the vectors into a Pinecone namespace.
df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan" (NaN serialized to CSV upstream)
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# Embed in batches of 96 rows from the `text` column
out = OC.multi_embeddings(model, df, 96, "text")
# Reuse the bound `indexname` (was a repeated string literal) so index/namespace stay in sync
OC.upsert_data(out, indexname, namespace, chunk_size=96)
# Now, query cards data using Climate Misinformation Claims
import OstreaCultura as OC
using DataFrames
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# First claim/counterclaim pair — used for the one-off timing check below
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 400 # top_k for the initial query = to total number of claims
# Warm-up / timing run for a single claim before the full loop
@time OC.query(claim, indexname, namespace, top_k=top_k)
# Query every claim against the index and tag each returned row with the claim
# that retrieved it; accumulate everything into `classified`.
classified = DataFrame()
@time for i in 1:size(claims, 1)
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
    else
        result.assigned_claim .= claims.Claims[i]
        # `global` is required: under Julia's soft-scope rules a bare assignment
        # inside a top-level loop would not update the global binding when this
        # file is run non-interactively (include/script mode).
        global classified = vcat(classified, result)
    end
end
# Checkpoint: write the raw query results to disk, then reload them so the
# analysis below can be re-run without repeating the (slow) query loop.
using CSV
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)
## Assign labels
# Derive binary predictions at three similarity cutoffs (0.80 / 0.85 / 0.90).
# FIX: predlabel80 was unconditionally 1 (every retrieved row counted positive);
# it now mirrors the 85/90 pattern using the 0.80 threshold defined earlier.
classified.predlabel80 .= 0
classified.predlabel80[classified.score .>= 0.80] .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= 0.85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= 0.90] .= 1
# NOTE(review): this uses occursin("1", …) while the later ground truth on the
# full data uses !occursin("0", …) — confirm which encoding of the CARDS claim
# codes is intended. (This column is not selected below, so it is unused.)
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|score|assigned_claim")
# Group by id and keep only the single best-scoring match per document id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do grp
    grp[argmax(grp.score), :]
end
# climate full - get all ids present in the Pinecone namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
# index.list yields batches (pages) of ids; collect then flatten
idbatches = [x for x in index.list(namespace=namespace)]
ids = vcat(idbatches...)
# Fetch the stored records for those ids.
# NOTE(review): this fetch is immediately overwritten by the CSV read below, so
# the network round-trip is dead work — keep whichever source is authoritative.
cardsfull = OC.fetch_data(ids, indexname, namespace)
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)
## left join the full data with the per-id best predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
# Ground truth: a claim code containing "0" appears to mean "no misinfo claim"
# — TODO confirm against the CARDS label scheme.
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Rows with no query hit are `missing` after the left join — count them as
# predicted negatives at every threshold.
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)
# Evaluate each threshold's predictions against the ground truth.
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]
# All MLJ measure calls use (ŷ, y) argument order; the true_positive_rate calls
# previously passed (y, ŷ), inconsistent with the other measures — made uniform.
## Now predlabel 90
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)
## Now predlabel 85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)
## Now predlabel 80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)
# Persist the fully-joined, labeled table (overwrites the earlier checkpoint)
CSV.write("data/cards_test_query_top400_results.csv", cardsfull)
using TidierPlots
## By Label: violin of similarity score split by true label.
# Rows that got no query hit have a missing score and must be dropped for plotting.
ggplot(filter(:score => !ismissing, cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() + labs(x="Misinfo Label", y="Score") #+ geom_hline()