## Embed all of the CARDS Data
import OstreaCultura as OC
using CSV, DataFrames

df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan"
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)

model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Now, query the CARDS data using the Climate Misinformation Claims
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = .8
top_k = 400 # top_k for the initial query, set equal to the total number of claims

@time OC.query(claim, indexname, namespace, top_k=top_k)

# Query all claims in a loop, assigning each returned match to the claim that retrieved it
classified = DataFrame()
@time for i in 1:size(claims)[1]
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    end
    result.assigned_claim .= claims.Claims[i]
    global classified = vcat(classified, result) # `global` so the loop works when run as a script
end

# Write the classified data to a csv file
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)

## Assign labels at each similarity threshold
classified.predlabel80 .= 1 # every returned match is treated as clearing the 0.80 query threshold
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= .90] .= 1
# "0_0" is the CARDS no-misinformation label, so any other label counts as a true positive
classified.ytrue .= [!occursin("0", x) for x in classified.claim]

sclassified = select(classified, r"id|predlabel|score|assigned_claim")
# Group by id and combine to keep, for each id, only the row with the max score
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do grp
    grp[argmax(grp.score), :]
end

# Climate full - get all ids in the namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
id_batches = [x for x in index.list(namespace=namespace)]
## concat all the id batches into a single vector
ids = vcat(id_batches...)
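## Optional sanity check -- a sketch, assuming `pc.Index` wraps the Pinecone
## Python client, whose `describe_index_stats()` reports per-namespace vector
## counts. It confirms the listed ids cover everything that was upserted.
stats = index.describe_index_stats()
println("ids listed: ", length(ids),
        " | vectors in namespace: ", stats.namespaces[namespace].vector_count)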
# Get all the data for those ids
cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/climate_data/data/test_w_ids.csv", cardsfull)
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)

## left join the full data with the predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Coalesce the predlabels: documents the queries never returned are predicted negative.
# Convert 0/1 to Bool so predictions and ytrue share the same two levels.
cardsfull.predlabel80 = Bool.(coalesce.(cardsfull.predlabel80, 0))
cardsfull.predlabel85 = Bool.(coalesce.(cardsfull.predlabel85, 0))
cardsfull.predlabel90 = Bool.(coalesce.(cardsfull.predlabel90, 0))

# Evaluate using MLJ; measures take (predictions, ground truth).
# (A precision/recall sketch follows at the end of the script.)
using MLJ
ytrue = cardsfull.ytrue

## Now predlabel 90
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## Now predlabel 85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## Now predlabel 80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)

CSV.write("data/cards_test_query_top400_results.csv", cardsfull)

using TidierPlots
## By Label: score distribution for true vs. false labels
ggplot(filter(:score => x -> !ismissing(x), cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() +
    labs(x = "Misinfo Label", y = "Score")
    #+ geom_hline()
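## The commented-out geom_hline above could mark the decision thresholds on the
## violin plot. A sketch, assuming TidierPlots' geom_hline accepts a
## `yintercept` keyword:
ggplot(filter(:score => x -> !ismissing(x), cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() +
    geom_hline(yintercept = 0.85) + # the 0.85 cut-off; swap in .80 or .90 as needed
    labs(x = "Misinfo Label", y = "Score")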
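## Precision and recall at each threshold -- a minimal sketch, assuming MLJ's
## measure names `positive_predictive_value` (precision) and
## `true_positive_rate` (recall), and that `true` is treated as the positive
## class for Bool targets.
for col in [:predlabel80, :predlabel85, :predlabel90]
    yhat = cardsfull[!, col]
    println(col, ": precision = ", positive_predictive_value(yhat, ytrue),
            ", recall = ", true_positive_rate(yhat, ytrue))
end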