File size: 3,904 Bytes
143b0d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
## Uploading the data to the database
# NOTE(review): CSV and DataFrames names are used below but never brought into
# scope in this file (only `import XLSX, DataFrames` appears much later, and a
# plain `import` would not export `DataFrame`/`select`/`rename!` unqualified).
# Loading them explicitly here is a no-op if they are already in the session.
using CSV, DataFrames
import OstreaCultura as OC

### Creating a long database of claims
# Merge the hand-curated counterclaim cards with the modified misinformation
# library into one long-format CSV of claims.
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)

## OC Library modification:
## 1. Drop the "Random ID" column (matched by regex)
oclib = select(oclib, Not(r"Random"))
## 2./3. Rename Target -> Topic and "Misinformation Narrative" -> Narrative
##       (rename! accepts multiple pairs, so one call replaces the two)
rename!(oclib, :Target => :Topic, "Misinformation Narrative" => "Narrative")
## 4. Add a Claims column populated from Narrative
##    (plain `=` assignment aliases the Narrative vector — no copy is made)
oclib[!, :Claims] = oclib[!, :Narrative];
## Model <- Topic (broadcast `.=` fills a fresh column with copied values)
oclib[!, :Model] .= oclib[!, :Topic];
## Drop the Type column
oclib = select(oclib, Not(r"Type"))

## Cards modification:
## 1. Drop Sub-narrative
cards = select(cards, Not(r"Sub-narrative"))

## Vertically concatenate; cols=:union keeps the union of the two column sets,
## filling absent values with `missing`.
df = vcat(cards, oclib; cols=:union)
## Persist the combined library
CSV.write("data/Combined Misinformation Library.csv", df)


### CREATING TEST SET ON INFERENCE ENDPOINT
# Embed the climate test set through the Pinecone inference endpoint and
# push the resulting vectors into the test index/namespace.
pc = OC.create_inf_pinecone_context()
climate_test = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
# batch size 96, embedding the "text" column
test_embeds = OC.multi_embeddings(pc, model, climate_test, 96, "text")

## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")

## CREATING Test Set for Indicator Test
# Embed the indicator test set and upsert it into its own namespace.
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
# NOTE(review): unlike the climate test-set call above, this omits the `pc`
# (Pinecone context) first argument — confirm OC.multi_embeddings really has a
# method with this signature, or whether `pc` was accidentally dropped.
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings
# (pandas-style call via PyCall; inplace=true mutates `test_embeds`)
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
    "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)


## Creating Initial Library to query against
# Embed the "Misinformation Narrative" column of the modified library and
# upsert it into the production index.
library = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, library, 96, "Misinformation Narrative")
# Rename "Misinformation Narrative" -> "text"
# (PyCall converts the Julia Dict to a Python dict for pandas.rename)
out.rename(Dict("Misinformation Narrative" => "text"), axis=1, inplace=true)
# Drop the "Random ID" column in place
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)

## Access the working database
import XLSX, DataFrames
# NOTE(review): `xf` is never used below — each sheet is re-read via readtable.
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
out = DataFrame()
for sheet in allsheets
    # `global` makes the soft-scope assignment to `out` explicit, so the
    # script runs without ambiguity warnings outside the REPL.
    global out
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # Keep only the columns shared by every sheet
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end
# If Instances is missing, fall back to the Narrative text. Hoist the mask so
# it is computed once and both sides of the assignment stay aligned.
missing_inst = ismissing.(out.Instances)
out[missing_inst, :Instances] .= out[missing_inst, :Narrative]
# Convert every column to String. Note: any remaining `missing` becomes the
# literal string "missing", so the dropmissing below is effectively a no-op.
foreach(col -> out[!, col] = string.(out[!, col]), names(out))
# Drop duplicate rows
out = unique(out)
model = "multilingual-e5-large"
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# BUGFIX: the embedded text lives in the "Instances" column here — this frame
# never had a "Misinformation Narrative" column, so the previous mapping
# {'Misinformation Narrative': 'text'} silently matched nothing and no "text"
# column was ever created. Rename Instances -> text instead.
out.rename(Dict("Instances" => "text"), axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)

## How long does it take to query and classify 1000 claims
# NOTE(review): BenchmarkTools is loaded but only Base's @time is used below.
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
# Time the full query-and-classify pass over the first 1000 claims.
@time for idx in 1:1000
    result = OC.classify_claim(claims.text[idx], "", indexname, namespace;
        top_k=5, include_values=false)
    push!(classified, result)
end

# Single-claim spot check against the same index/namespace.
OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)