File size: 3,786 Bytes
143b0d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
## Embed all of the CARDS test data and upsert the vectors into Pinecone.
df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan" (presumably a serialized
# pandas NaN that round-tripped through CSV — confirm against the data file).
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# Embed the "text" column in batches of 96.
out = OC.multi_embeddings(model, df, 96, "text")
# FIX: use the `indexname` variable instead of repeating the string literal,
# so the index name has a single source of truth.
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Query the CARDS data in Pinecone using the Climate Misinformation Library claims.
import OstreaCultura as OC
using DataFrames
# NOTE(review): `claims` is a *pandas* DataFrame obtained through PyCall
# (OC.DataLoader.pd), not a DataFrames.jl DataFrame — indexing below goes
# through PyCall; confirm its indexing convention matches expectations.
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
# First claim/counterclaim, used only for the one-off timing probe below.
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
# Similarity-score cutoff for the 0.80 label (NOTE(review): declared here but
# never passed to OC.query — queries are bounded by top_k only).
threshold = .8
top_k = 400 # top_k for the initial query = to total number of claims 
# Timing check of a single query against the index.
@time OC.query(claim, indexname, namespace, top_k=top_k)

# Query every claim in the library; tag each returned match with the claim
# that produced it and accumulate everything into one DataFrame.
classified = DataFrame()
@time for i in 1:size(claims)[1]
    # NOTE(review): `claims` is a PyCall-wrapped pandas frame — `size` and
    # integer indexing go through PyCall; confirm 1-based indexing behaves
    # as expected here.
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        # Record which claim these matches belong to, then append.
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)    
    end
end

# Checkpoint the raw query results to disk, then reload, so the expensive
# query loop above does not need to be re-run on later passes.
using CSV
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)
## Assign predicted labels at three similarity-score thresholds.
# FIX: predlabel80 was hard-coded to 1 for every row, never applying the
# 0.80 cutoff (the `threshold = .8` declared earlier was unused); it now
# follows the same pattern as predlabel85 and predlabel90.
classified.predlabel80 .= 0
classified.predlabel80[classified.score .>= .80] .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= .90] .= 1
# Ground truth: a row is misinformation unless its claim label contains "0"
# (e.g. the "0_0" no-claim category).
# FIX: was `occursin("1", x)`, which contradicts the `!occursin("0", x)`
# definition used for evaluation later (they disagree for labels such as
# "5_2" that contain neither digit).
classified.ytrue .= [!occursin("0", x) for x in classified.claim]
# Keep only the columns needed for the per-id reduction and evaluation.
sclassified = select(classified, r"id|predlabel|score|assigned_claim")


# Group by id and combine to keep only the id with the max score 
sclassified_grouped = groupby(sclassified, :id) 
sdf = combine(sclassified_grouped) do eh
    eh[argmax(eh.score), :]
end

# Retrieve the full CARDS test set (every vector id) from Pinecone.
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
# index.list yields batches of ids for the namespace; collect them all.
hi = [x for x in index.list(namespace=namespace)]
## Flatten the id batches into a single vector of ids.
ids = vcat(hi...)
# Fetch the stored records for every id.
cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/climate_data/data/test_w_ids.csv", cardsfull)
# Reload the cached fetch (checkpoint written by the commented line above).
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)
## Attach predictions; ids with no query match get `missing` predlabels.
cardsfull = leftjoin(cardsfull, sdf, on=:id)
# Ground truth: misinformation unless the claim label contains "0" (e.g. "0_0").
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Rows left unmatched by the join count as negative predictions.
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)

# Evaluate each thresholded prediction against ground truth with MLJ measures.
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]

# NOTE(review): MLJ measures are called as measure(ŷ, y) — predictions first.
# FIX: true_positive_rate previously received (ytrue, pred), the reverse of
# every other measure call here; argument order is now consistent throughout.

## Now predlabel 90
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## Now predlabel 85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## Now predlabel 80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)

# Persist the joined data with predictions and ground truth.
CSV.write("data/cards_test_query_top400_results.csv", cardsfull)


using TidierPlots

## By Label 
ggplot(filter(:score => x -> !ismissing(x), cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() + labs(x="Misinfo Label", y="Score") #+ geom_hline()