|
|
|
import OstreaCultura as OC |
|
|
|
|
|
# Build the combined misinformation library: load both source CSVs, reshape the
# modified library, stack the two tables and write the result back to disk.
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)

# Modified library: discard every column whose name matches "Random", then
# rename `Target` -> `Topic` and `Misinformation Narrative` -> `Narrative`.
oclib = select(oclib, Not(r"Random"))
rename!(oclib, "Target" => "Topic", "Misinformation Narrative" => "Narrative")

# `Claims` reuses the `Narrative` column; `Model` is filled from `Topic`.
oclib.Claims = oclib.Narrative
oclib[!, :Model] .= oclib.Topic

# Discard every column whose name matches "Type".
oclib = select(oclib, Not(r"Type"))

# Climate cards: discard every column whose name matches "Sub-narrative".
cards = select(cards, Not(r"Sub-narrative"))

# Stack both tables; `cols=:union` keeps columns that exist in only one of them.
df = vcat(cards, oclib; cols=:union)
CSV.write("data/Combined Misinformation Library.csv", df)
|
|
|
|
|
|
|
# Embed the climate test CSV and upsert the result into the test index.
model = "multilingual-e5-large"
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")

# "text" names the column handed to the embedding call; 96 is presumably the
# batch size (TODO confirm against OC.multi_embeddings).
test_embeds = OC.multi_embeddings(
    pc,
    model,
    df,
    96,
    "text",
)

OC.upsert_data(test_embeds, "test-index", "test-namespace")
|
|
|
|
|
# Embed the indicator test CSV and upsert it into its own test namespace.
model = "multilingual-e5-large"
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
test_embeds = OC.multi_embeddings(model, df, 96, "text")

# Columns removed (in place, on the pandas object) before the upsert.
dropped_columns = [
    "channelID", "MessageID", "AccountID", "topics", "weak topics",
    "contexts", "indicators", "CSV_File",
]
test_embeds.drop(columns=dropped_columns, inplace=true)

OC.upsert_data(
    test_embeds,
    "test-index",
    "indicator-test-namespace-2";
    chunk_size=96,
)
|
|
|
|
|
|
|
# Embed the modified misinformation library and upsert it into the v1 index.
model = "multilingual-e5-large"
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")

# Two independent in-place edits on the pandas result: remove the "Random ID"
# column and rename the embedded column to 'text'.
out.drop(columns=["Random ID"], inplace=true)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)

OC.upsert_data(
    out,
    "ostreacultura-v1",
    "modified-misinfo-library";
    chunk_size=96,
)
|
|
|
|
|
import XLSX, DataFrames |
|
# Assemble the expanded misinformation library from the topic sheets of the
# workbook into one table with the columns Model, Narrative and Instances.
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]

# Read every sheet from the workbook that is already open (instead of
# re-reading the file once per sheet) and stack the three shared columns.
# A comprehension replaces the top-level `for` loop that reassigned `out`:
# when this file is run non-interactively, such an assignment creates a new
# local `out` inside the loop and `vcat(out, df)` fails with UndefVarError.
out = reduce(vcat, [
    select(DataFrame(XLSX.gettable(xf[sheet])), [:Model, :Narrative, :Instances])
    for sheet in allsheets
])

# Rows without an instance fall back to their narrative text.
out.Instances = coalesce.(out.Instances, out.Narrative)

# Drop incomplete rows while `missing` is still detectable. Converting first
# would turn every `missing` into the literal text "missing", which a later
# `dropmissing` can no longer remove.
dropmissing!(out)

# Store every column as plain strings.
for col in names(out)
    out[!, col] = string.(out[!, col])
end

out = unique(out)
|
# Embed the expanded library (text taken from the `Instances` column) and
# upsert it into the v1 index.
model = "multilingual-e5-large"

out = OC.multi_embeddings(dropmissing(out); textcol="Instances")

# Rename the embedded column to 'text', matching the other upserts in this
# file. The previous mapping used 'Misinformation Narrative', a column this
# table never has (it is built from Model/Narrative/Instances), and pandas
# ignores unknown labels in `rename`, so nothing was renamed.
# NOTE(review): assumes OC.multi_embeddings keeps the `Instances` column name
# in its result and that OC.upsert_data expects a 'text' column - confirm.
out.rename(py"{'Instances' : 'text'}", axis=1, inplace=true)

OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)
|
|
|
|
|
using BenchmarkTools |
|
# Time 1000 classification queries against the expanded library namespace.
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"

# Single query with the arguments shared by every call in this section.
classify_in_library(text) =
    OC.classify_claim(text, "", indexname, namespace; top_k=5, include_values=false)

# Results are collected in order; the whole loop is timed as one unit.
classified = []
@time for row in 1:1000
    push!(classified, classify_in_library(claims.text[row]))
end

# One standalone query for the first claim.
classify_in_library(claims.text[1])
|
|
|
|