# Spaces:
# Sleeping
# Sleeping
## Uploading the data to the database
import OstreaCultura as OC

### Creating a long database of claims
# NOTE(review): `CSV` and `DataFrames` must already be in scope for these
# calls; no `using CSV, DataFrames` appears above this point — confirm.
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)

## OC Library modification:
## 1. Drop the "Random ID" column (matched by regex)
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target -> Topic
rename!(oclib, :Target => :Topic)
## 3. Rename "Misinformation Narrative" -> "Narrative"
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add a Claims column, populated from Narrative
oclib[!, :Claims] = oclib[!, :Narrative];
## 5. Add a Model column, populated from Topic
oclib[!, :Model] .= oclib[!, :Topic];
## 6. Drop any column whose name matches "Type"
oclib = select(oclib, Not(r"Type"))

## Cards modification: drop the Sub-narrative column
cards = select(cards, Not(r"Sub-narrative"))

## Stack the two frames; cols=:union keeps the union of all columns,
## filling `missing` where a frame lacks one.
df = vcat(cards, oclib; cols=:union)
## Persist the combined library
CSV.write("data/Combined Misinformation Library.csv", df)
### CREATING TEST SET ON INFERENCE ENDPOINT
# Build a Pinecone inference context, embed the climate test claims, and
# upsert the vectors into the test index.
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
# Embed the "text" column in batches of 96
test_embeds = OC.multi_embeddings(pc, model, df, 96, "text")
## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")
## CREATING Test Set for Indicator Test
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
# NOTE(review): unlike the climate test set above, no Pinecone context `pc`
# is passed here — confirm `OC.multi_embeddings` has a method without it.
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings
# (pandas in-place drop via PyCall).
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
                          "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)
## Creating Initial Library to query against
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
# Embed the "Misinformation Narrative" column in batches of 96
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column "Misinformation Narrative" -> "text" (pandas via PyCall)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop the Random ID column
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)
## Access the working database
import XLSX, DataFrames
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
#df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", "LGBTQ"))

# Concatenate the Model/Narrative/Instances columns from every sheet.
out = DataFrame()
for sheet in allsheets
    # `global` avoids the non-interactive soft-scope ambiguity warning when
    # reassigning the top-level `out` inside the loop.
    global out
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # select model, narrative, instances
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end

# Where Instances is missing, fall back to the Narrative text.
mask = ismissing.(out.Instances)
out[mask, :Instances] .= out[mask, :Narrative]

# Convert every column to String (plain loop instead of a side-effect
# comprehension whose result array was discarded).
for col in names(out)
    out[!, col] = string.(out[!, col])
end

# Drop duplicate rows
out = unique(out)
model = "multilingual-e5-large"
# NOTE(review): earlier calls pass (model, df, batch, column) positionally;
# this one relies on defaults plus `textcol` — confirm that method exists.
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# NOTE(review): the embedded text column here is "Instances", not
# "Misinformation Narrative", so this rename is likely a no-op copy/paste
# from the block above — confirm the intended column name.
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)
## How long does it take to query and classify 1000 claims
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
# NOTE(review): `claims` is a pandas DataFrame via PyCall, so `claims.text[i]`
# indexes by Python's 0-based labels — iterating 1:1000 skips row 0 and asks
# for label 1000; confirm the intended range (0:999 vs 1:1000).
@time for i in 1:1000
    claim = claims.text[i]
    push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end

# Single-claim sanity check of the classification call
OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)