## CARDS climate-misinformation classification script:
## embed the CARDS test set into Pinecone, query it with misinformation claims,
## and evaluate precision/recall at several similarity thresholds.
## Embed all of the CARDS Data
# Load the CARDS test split, drop placeholder rows, embed the `text` column
# with multilingual-e5-large, and upsert the vectors into a Pinecone namespace.
df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan" (NaN serialized to CSV upstream)
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# Embed in batches of 96 rows from the `text` column
out = OC.multi_embeddings(model, df, 96, "text")
# Reuse the bound `indexname` (was a repeated string literal) so index/namespace stay in sync
OC.upsert_data(out, indexname, namespace, chunk_size=96)
# Now, query cards data using Climate Misinformation Claims
import OstreaCultura as OC
using DataFrames
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# First claim/counterclaim pair — used for the one-off timing check below
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 400 # top_k for the initial query = to total number of claims
# Warm-up / timing run for a single claim before the full loop
@time OC.query(claim, indexname, namespace, top_k=top_k)
# Query every claim against the index and tag each returned row with the claim
# that retrieved it; accumulate everything into `classified`.
classified = DataFrame()
@time for i in 1:size(claims, 1)
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
    else
        result.assigned_claim .= claims.Claims[i]
        # `global` is required: under Julia's soft-scope rules a bare assignment
        # inside a top-level loop would not update the global binding when this
        # file is run non-interactively (include/script mode).
        global classified = vcat(classified, result)
    end
end
# Checkpoint: write the raw query results to disk, then reload them so the
# analysis below can be re-run without repeating the (slow) query loop.
using CSV
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)
## Assign labels
# Derive binary predictions at three similarity cutoffs (0.80 / 0.85 / 0.90).
# FIX: predlabel80 was unconditionally 1 (every retrieved row counted positive);
# it now mirrors the 85/90 pattern using the 0.80 threshold defined earlier.
classified.predlabel80 .= 0
classified.predlabel80[classified.score .>= 0.80] .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= 0.85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= 0.90] .= 1
# NOTE(review): this uses occursin("1", …) while the later ground truth on the
# full data uses !occursin("0", …) — confirm which encoding of the CARDS claim
# codes is intended. (This column is not selected below, so it is unused.)
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|score|assigned_claim")
# Group by id and keep only the single best-scoring match per document id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do grp
    grp[argmax(grp.score), :]
end
# climate full - get all ids present in the Pinecone namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
# index.list yields batches (pages) of ids; collect then flatten
idbatches = [x for x in index.list(namespace=namespace)]
ids = vcat(idbatches...)
# Fetch the stored records for those ids.
# NOTE(review): this fetch is immediately overwritten by the CSV read below, so
# the network round-trip is dead work — keep whichever source is authoritative.
cardsfull = OC.fetch_data(ids, indexname, namespace)
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)
## left join the full data with the per-id best predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
# Ground truth: a claim code containing "0" appears to mean "no misinfo claim"
# — TODO confirm against the CARDS label scheme.
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Rows with no query hit are `missing` after the left join — count them as
# predicted negatives at every threshold.
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)
# Evaluate each threshold's predictions against the ground truth.
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]
# All MLJ measure calls use (ŷ, y) argument order; the true_positive_rate calls
# previously passed (y, ŷ), inconsistent with the other measures — made uniform.
## Now predlabel 90
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)
## Now predlabel 85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)
## Now predlabel 80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)
# Persist the fully-joined, labeled table (overwrites the earlier checkpoint)
CSV.write("data/cards_test_query_top400_results.csv", cardsfull)
using TidierPlots
## By Label: violin of similarity score split by true label.
# Rows that got no query hit have a missing score and must be dropped for plotting.
ggplot(filter(:score => !ismissing, cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() + labs(x="Misinfo Label", y="Score") #+ geom_hline()