# Spaces:
# Sleeping
# Sleeping
## Uploading the data to the database
import OstreaCultura as OC

### Creating a long database of claims
# NOTE(review): `CSV` and `DataFrames` must already be in scope for these
# calls; no `using CSV, DataFrames` appears above this point — confirm.
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)

## OC Library modification:
## 1. Drop the "Random ID" column (matched by regex)
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target -> Topic
rename!(oclib, :Target => :Topic)
## 3. Rename "Misinformation Narrative" -> "Narrative"
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add a Claims column, populated from Narrative
oclib[!, :Claims] = oclib[!, :Narrative];
## 5. Add a Model column, populated from Topic
oclib[!, :Model] .= oclib[!, :Topic];
## 6. Drop any column whose name matches "Type"
oclib = select(oclib, Not(r"Type"))

## Cards modification: drop the Sub-narrative column
cards = select(cards, Not(r"Sub-narrative"))

## Stack the two frames; cols=:union keeps the union of all columns,
## filling `missing` where a frame lacks one.
df = vcat(cards, oclib; cols=:union)
## Persist the combined library
CSV.write("data/Combined Misinformation Library.csv", df)
### CREATING TEST SET ON INFERENCE ENDPOINT
# Build a Pinecone inference context, embed the climate test claims, and
# upsert the vectors into the test index.
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
# Embed the "text" column in batches of 96
test_embeds = OC.multi_embeddings(pc, model, df, 96, "text")
## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")
## CREATING Test Set for Indicator Test
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
# NOTE(review): unlike the climate test set above, no Pinecone context `pc`
# is passed here — confirm `OC.multi_embeddings` has a method without it.
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings
# (pandas in-place drop via PyCall).
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
                          "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)
## Creating Initial Library to query against
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
# Embed the "Misinformation Narrative" column in batches of 96
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column "Misinformation Narrative" -> "text" (pandas via PyCall)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop the Random ID column
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)
## Access the working database
import XLSX, DataFrames
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
#df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", "LGBTQ"))

# Concatenate the Model/Narrative/Instances columns from every sheet.
out = DataFrame()
for sheet in allsheets
    # `global` avoids the non-interactive soft-scope ambiguity warning when
    # reassigning the top-level `out` inside the loop.
    global out
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # select model, narrative, instances
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end

# Where Instances is missing, fall back to the Narrative text.
mask = ismissing.(out.Instances)
out[mask, :Instances] .= out[mask, :Narrative]

# Convert every column to String (plain loop instead of a side-effect
# comprehension whose result array was discarded).
for col in names(out)
    out[!, col] = string.(out[!, col])
end

# Drop duplicate rows
out = unique(out)
model = "multilingual-e5-large"
# NOTE(review): earlier calls pass (model, df, batch, column) positionally;
# this one relies on defaults plus `textcol` — confirm that method exists.
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# NOTE(review): the embedded text column here is "Instances", not
# "Misinformation Narrative", so this rename is likely a no-op copy/paste
# from the block above — confirm the intended column name.
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)
## How long does it take to query and classify 1000 claims
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
# NOTE(review): `claims` is a pandas DataFrame via PyCall, so `claims.text[i]`
# indexes by Python's 0-based labels — iterating 1:1000 skips row 0 and asks
# for label 1000; confirm the intended range (0:999 vs 1:1000).
@time for i in 1:1000
    claim = claims.text[i]
    push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end

# Single-claim sanity check of the classification call
OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)