|
|
|
|
|
|
|
import OstreaCultura as OC |
|
# Read the climate misinformation library (claims plus counterclaims) into a DataFrame.
dat = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)

# Keep only the columns whose names match the regex `laims`, then reshape to
# long format: one row per statement, with the originating column name stored
# in `Type` and the statement itself in `text`.
dat = stack(
    select(dat, r"laims"),
    [:Claims, :Counterclaims];
    variable_name=:Type,
    value_name=:text,
)

# Drop rows containing missing values, in place.
dropmissing!(dat)

# Binary target: 1 for rows that came from the `Claims` column, 0 otherwise.
dat.label = Int.(dat.Type .== "Claims")
|
|
|
# Name of the embedding model.
# NOTE(review): `model` is not passed to any call visible in this script —
# confirm whether `OC.multi_embeddings` or later code depends on it.
model = "multilingual-e5-large"

# Embeddings for `dat`, produced by the OstreaCultura helper.
# Presumably one embedding per row of `dat` — TODO confirm against the helper.
embeds = OC.multi_embeddings(dat)

# Labels and features as plain arrays for the classifier.
y = convert(Array, dat[!, :label])
features = convert(Array, embeds.Embeddings)
|
|
|
@sk_import calibration: CalibratedClassifierCV |
|
import ScikitLearn: CrossValidation |
|
@sk_import svm: LinearSVC |
|
import ScikitLearn: CrossValidation |
|
using ScikitLearn.CrossValidation: cross_val_score |
|
# Linear SVM used for cross-validation and for the final fit further down.
# `random_state` is fixed so repeated runs are reproducible.
resvm = LinearSVC(
    C=0.5,
    loss="squared_hinge",
    penalty="l2",
    multi_class="ovr",
    random_state=35552,
    max_iter=2000,
)

# Shuffled 5-fold splitter. The sample count comes from the label vector
# instead of a hard-coded number, so the folds keep matching the data when the
# CSV changes or `dropmissing!` removes a different number of rows.
cv = ScikitLearn.CrossValidation.KFold(
    length(y),
    n_folds=5,
    random_state=134,
    shuffle=true,
)

# Cross-validated scores of the SVM on the embedding features.
out = cross_val_score(resvm, features, y, cv=cv)
|
|
|
using ScikitLearn: metrics |
|
# Cross-validated predictions for every sample, using the same splitter as the
# scoring above.
y_pred = ScikitLearn.CrossValidation.cross_val_predict(resvm, features, y, cv=cv)

# Precision and recall for the positive class (label 1). The true-positive
# count is computed once and shared by both ratios.
pre, rec = let hits = count((y .== 1) .& (y_pred .== 1))
    (hits / count(y_pred .== 1), hits / count(y .== 1))
end
|
|
|
|
|
# Fit the SVM on the complete feature matrix and label vector.
fit!(resvm, features, y)

# Coefficients of the fitted linear model.
sv = resvm.coef_

# Distribution of the coefficients in the first row of `sv`.
histogram(sv[1, :])
|
|
|
# Wrap the SVM in a probability calibrator and fit it on the full data.
calsvm = CalibratedClassifierCV(resvm)
calsvm.fit(features, y)

# Calibrated probabilities for the same `features` the calibrator was fitted
# on, so these are in-sample estimates.
prob_preds = calsvm.predict_proba(features)
|
|
|
# Number of highest-probability rows to inspect per probability column.
top_k = 5

# Rows with the highest probability in column 1, with their probabilities and
# source records. `min(top_k, end)` clamps the slice so that a data set with
# fewer than `top_k` rows does not raise a BoundsError.
# NOTE(review): column order presumably follows `calsvm.classes_`, i.e.
# column 1 = label 0 (Counterclaims) and column 2 = label 1 (Claims) — confirm.
top_indices = sortperm(prob_preds[:, 1], rev=true)[1:min(top_k, end)]

prob_preds[top_indices, :]

dat[top_indices, :]

# Same inspection for column 2.
top_indices = sortperm(prob_preds[:, 2], rev=true)[1:min(top_k, end)]

prob_preds[top_indices, :]

dat[top_indices, :]