## Uploading the data to the database
##
## Pipeline script: builds a combined misinformation-claims library from CSV/XLSX
## sources, embeds text via OC.multi_embeddings, and upserts the vectors into
## Pinecone indexes through the OstreaCultura (OC) project module.
import OstreaCultura as OC

### Creating a long database of claims
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)

## OC Library modification:
## 1. Drop Random ID
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target to Topic
rename!(oclib, :Target => :Topic)
## 3. Rename Misinformation Narrative to Narrative
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add column Claims, populate with Narrative
oclib[!, :Claims] = oclib[!, :Narrative];
## Model -> Topic
oclib[!, :Model] .= oclib[!, :Topic];
## Drop Type
oclib = select(oclib, Not(r"Type"))

## Cards modification:
## 1. Drop Sub-narrative
cards = select(cards, Not(r"Sub-narrative"))

## Vcat the two dataframes with cols = :union
df = vcat(cards, oclib; cols=:union)
## Save as CSV
CSV.write("data/Combined Misinformation Library.csv", df)

### CREATING TEST SET ON INFERENCE ENDPOINT
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(pc, model, df, 96, "text")
## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")

## CREATING Test Set for Indicator Test
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
# NOTE(review): this call omits the `pc` context that the call above passes as the
# first argument — confirm OC.multi_embeddings really has both method signatures.
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics", "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)

## Creating Initial Library to query against
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column Misinformation Narrative to text (pandas rename via a Python dict literal)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop Random ID
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)

## Access the working database
import XLSX, DataFrames
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
#df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", "LGBTQ"))
out = DataFrame()
for sheet in allsheets
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # select model, narrative, instances
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end
# if Instance is missing, fill with Narrative
out[ismissing.(out.Instances), :Instances] .= out[ismissing.(out.Instances), :Narrative]
# convert all columns to string
for col in names(out)
    out[!, col] = string.(out[!, col])
end
# drop duplicate instances
out = unique(out)
model = "multilingual-e5-large"
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# Rename column Misinformation Narrative to text
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)

## How long does it take to query and classify 1000 claims
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
@time for i in 1:1000
    claim = claims.text[i]
    push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end
OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)