# misinfo_detection_app/scripts/database_scratch.jl
# (scratch script; committed by stefanjwojcik — "add scripts", rev 143b0d4)
## Uploading the data to the database
import OstreaCultura as OC
# FIX: CSV.read / DataFrame / select / Not / rename! / vcat(...; cols=:union)
# below are used unqualified, but the file never brought CSV or DataFrames
# into scope (only `import XLSX, DataFrames` much later, which does not
# export the names). Without this line the script fails at the first call.
using CSV, DataFrames
### Creating a long database of claims
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
## OC Library modification:
## 1. Drop the "Random ID" column (regex match on the column name)
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target to Topic
rename!(oclib, :Target => :Topic)
## 3. Rename Misinformation Narrative to Narrative
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add column Claims, populated from Narrative
## (note: `=` aliases the same vector — Claims and Narrative share storage)
oclib[!, :Claims] = oclib[!, :Narrative];
## Add a Model column copied element-wise from Topic
oclib[!, :Model] .= oclib[!, :Topic];
## Drop any column whose name matches "Type"
oclib = select(oclib, Not(r"Type"))
## Cards modification:
## 1. Drop Sub-narrative
cards = select(cards, Not(r"Sub-narrative"))
## Stack the two frames; cols=:union keeps the union of all columns,
## filling absent ones with `missing`
df = vcat(cards, oclib; cols=:union)
## Save as CSV
CSV.write("data/Combined Misinformation Library.csv", df)
### CREATING TEST SET ON INFERENCE ENDPOINT
# Connect to Pinecone through the inference endpoint, embed the climate test
# set, and push the vectors into the test index.
pc = OC.create_inf_pinecone_context()
climate_df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
embed_model = "multilingual-e5-large"
# Embed the "text" column in batches of 96 rows.
climate_embeds = OC.multi_embeddings(pc, embed_model, climate_df, 96, "text")
# Upload the embedded rows into the test namespace.
OC.upsert_data(climate_embeds, "test-index", "test-namespace")
## CREATING Test Set for Indicator Test
# Load with pandas (via PyCall) so `df` is a Python pandas DataFrame — the
# `.drop(... inplace=true)` call below is a Python method call on that object.
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
# NOTE(review): unlike the climate test-set call above, this omits the `pc`
# context as the first argument — confirm OC.multi_embeddings has a method
# without it, otherwise this is a MethodError.
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
"contexts", "indicators", "CSV_File"], inplace=true)
# Upsert into a separate namespace for the indicator test, batching by 96.
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)
## Creating Initial Library to query against
# Again a pandas DataFrame (PyCall), so rename/drop below are Python calls.
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
# Embed the "Misinformation Narrative" column in batches of 96.
# NOTE(review): no `pc` context passed here either — confirm this matches an
# OC.multi_embeddings method (cf. the 5-argument call with `pc` earlier).
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column Misinformation Narrative to text
# (py"..." builds the Python dict literal expected by pandas rename)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop Random ID
out.drop(columns=["Random ID"], inplace=true)
# Upload into the production index under its own namespace.
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)
## Access the working database
import XLSX
# FIX: DataFrame / select / names / unique below are used unqualified;
# `import DataFrames` alone does not export them.
using DataFrames
# Workbook handle (unused below — kept for interactive inspection of sheets).
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
# Stack the (Model, Narrative, Instances) columns of every sheet into one table.
# FIX: the original `for sheet in allsheets; ...; out = vcat(out, df); end`
# assigned to the global `out` inside a top-level loop — ambiguous soft scope
# in a non-interactive script (UndefVarError on Julia >= 1.5 when run as a
# file). reduce(vcat, generator) avoids the loop-global assignment entirely.
out = reduce(vcat,
             (select(DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet)),
                     [:Model, :Narrative, :Instances]) for sheet in allsheets))
# If an Instance is missing, fall back to its Narrative. This must happen
# BEFORE the string conversion below, which would turn `missing` into the
# literal string "missing".
out[ismissing.(out.Instances), :Instances] .= out[ismissing.(out.Instances), :Narrative]
# Convert every column to String, in place. (The original used a throwaway
# comprehension purely for its side effects; a plain loop says what it means.)
for col in names(out)
    out[!, col] = string.(out[!, col])
end
# drop duplicate instances
out = unique(out)
model = "multilingual-e5-large"
# Embed the "Instances" column.
# NOTE(review): all columns were stringified above, so any `missing` is now
# the string "missing" — `dropmissing` here is most likely a no-op; verify.
# NOTE(review): this keyword form (textcol=...) differs from the positional
# multi_embeddings calls earlier — presumably a different OC method; confirm.
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# Rename column Misinformation Narrative to text
# NOTE(review): `out` was built from Model/Narrative/Instances, so a
# "Misinformation Narrative" column probably does not exist here; pandas
# rename silently no-ops on unknown labels — confirm a 'text' column actually
# reaches upsert_data (comment looks copy-pasted from the block above).
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Upload the expanded (per-instance) library into its own namespace.
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)
## How long does it take to query and classify 1000 claims
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
# Results accumulator (Vector{Any} — fine for a scratch benchmark).
classified = []
## TODO: Adjust for longer text by splitting
# Wall-clock timing of 1000 sequential classify calls. @time includes JIT
# compilation of the first iteration; use BenchmarkTools.@btime for
# steady-state numbers.
# NOTE(review): `claims` is a pandas DataFrame, so claims.text[i] is Python
# label-based indexing — with the default 0-based RangeIndex, i in 1:1000
# skips row 0 and raises KeyError at i=1000 on a 1000-row file; confirm.
@time for i in 1:1000
claim = claims.text[i]
push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end
# Single-call spot check (interactive inspection of one classification result).
OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)