|
## Embed all of the CARDS Data |
|
df = OC.DataLoader.pd.read_csv("data/climate_training.csv") |
|
model = "multilingual-e5-large" |
|
indexname = "ostreacultura-v1" |
|
namespace = "cards-data" |
|
# Embed the "text" column in batches of 96 with the multilingual-e5-large model
out = OC.multi_embeddings(model, df, 96, "text")

# Upsert the embeddings into the Pinecone index in chunks of 96
OC.upsert_data(out, indexname, namespace, chunk_size=96)
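
# Optional sanity check on the upsert (a sketch, assuming OC.create_pinecone_context
# wraps the official Pinecone Python client, as the listing code below suggests;
# the per-namespace vector count should match nrow(df)):
# pc = OC.create_pinecone_context()
# pc.Index(indexname).describe_index_stats()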
|
|
|
# Now, query the CARDS data using climate misinformation claims
|
import OstreaCultura as OC |
|
using DataFrames |
|
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv") |
|
indexname = "ostreacultura-v1" |
|
namespace = "cards-data" |
|
claim = claims.Claims[1] |
|
counterclaim = claims.Counterclaims[1] |
|
threshold = .8  # score threshold; assumed to be the default applied inside OC.query_claims

top_k = 10_000  # top_k for the initial query; large enough to cover every record in the index
|
# Smoke-test a single query before looping over every claim
@time result = OC.query_claims(claim, counterclaim, indexname, namespace)
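
# Inspect the top matches from the smoke test (the claim_score column name is
# assumed from the thresholding code further down):
first(sort(result, :claim_score, rev=true), 5)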
|
|
|
# Loop over all claims: query each one and assign the claim to its top-k matches
|
classified = DataFrame() |
|
@time for i in 1:size(claims)[1]
    result = OC.query_claims(string(claims.Claims[i]), string(claims.Counterclaims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    end
    # Tag each match with the claim that retrieved it, then accumulate
    result.assigned_claim .= claims.Claims[i]
    global classified = vcat(classified, result)  # global needed when run as a script file
end
|
|
|
# Write the classified data to a csv file |
|
using CSV |
|
CSV.write("data/cards_top10000_results.csv", classified) |
|
classified = CSV.read("data/cards_top10000_results.csv", DataFrame)  # reload the checkpoint
|
## Assign labels |
|
classified.predlabel80 .= 1  # every returned match is assumed to already clear the 0.80 query threshold
|
classified.predlabel85 .= 0 |
|
classified.predlabel85[classified.claim_score .>= .85] .= 1 |
|
classified.predlabel90 .= 0 |
|
classified.predlabel90[classified.claim_score .>= .90] .= 1 |
|
classified.ytrue .= [!occursin("0", x) for x in classified.claim]  # "0" appears only in the no-claim label; matches the rule used on the full data below
|
sclassified = select(classified, r"id|predlabel|claim_score|assigned_claim") |
|
|
|
|
|
# Group by id and keep only the highest-scoring row per id
|
sclassified_grouped = groupby(sclassified, :id) |
|
sdf = combine(sclassified_grouped) do g
    g[argmax(g.claim_score), :]  # row with the maximum claim_score in this group
end
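
# Sanity check: exactly one row per unique id should remain
@assert nrow(sdf) == length(unique(sclassified.id))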
|
|
|
# Climate full dataset: list every vector id in the index namespace

pc = OC.create_pinecone_context()

index = pc.Index(indexname)

id_batches = [x for x in index.list(namespace=namespace)]  # list() yields ids in batches

## Concatenate the batches into a single vector of ids

ids = vcat(id_batches...)
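
length(ids)  # total ids retrieved; should match the namespace's vector count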
|
# Fetch the full data once, then cache it to CSV (uncomment to re-fetch from the index)

#cardsfull = OC.fetch_data(ids, indexname, namespace)

#CSV.write("data/cards_full.csv", cardsfull)
|
cardsfull = CSV.read("data/cards_full.csv", DataFrame) |
|
## Left join the full data with the predicted labels
|
cardsfull = leftjoin(cardsfull, sdf, on=:id) |
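
# How many documents actually received a predicted match from the join?
count(!ismissing, cardsfull.claim_score)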
|
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim] |
|
# Replace missing predlabels (ids with no match) with 0
|
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0) |
|
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0) |
|
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0) |
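
# After coalescing, no missing predictions should remain
@assert all(!ismissing, cardsfull.predlabel90)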
|
|
|
# Get precision and recall |
|
using MLJ |
|
ytrue = Int.(cardsfull.ytrue)  # reuse the labels computed above; Int to match the predlabel columns
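
using Statistics

mean(ytrue)  # base rate of claim documents, for context when reading the accuracy numbers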
|
|
|
## Now predlabel 90 |
|
confusion_matrix(cardsfull.predlabel90, ytrue) |
|
accuracy(cardsfull.predlabel90, ytrue) |
|
true_positive_rate(cardsfull.predlabel90, ytrue)

false_negative_rate(cardsfull.predlabel90, ytrue)
|
|
|
## Now predlabel 85 |
|
confusion_matrix(cardsfull.predlabel85, ytrue) |
|
accuracy(cardsfull.predlabel85, ytrue) |
|
true_positive_rate(cardsfull.predlabel85, ytrue)
|
false_negative_rate(cardsfull.predlabel85, ytrue) |
|
|
|
## Now predlabel 80 |
|
confusion_matrix(cardsfull.predlabel80, ytrue) |
|
accuracy(cardsfull.predlabel80, ytrue) |
|
true_positive_rate(cardsfull.predlabel80, ytrue)
|
false_negative_rate(cardsfull.predlabel80, ytrue) |
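
## Precision for each threshold via ppv (positive predictive value), which together
## with true_positive_rate (recall) covers the precision/recall promised above;
## assumes the installed MLJ exports the ppv measure alias

ppv(cardsfull.predlabel80, ytrue)

ppv(cardsfull.predlabel85, ytrue)

ppv(cardsfull.predlabel90, ytrue)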
|
|