Spaces:

stefanjwojcik
/

misinfo_detection_app

Running

App Files Files Community

misinfo_detection_app / src /deprecated /Narrative.jl

stefanjwojcik

Upload 24 files

48bb68b verified 5 months ago

raw

history blame contribute delete

9.18 kB

	## Structure of a Narrative

	# Build a short random identifier by Sqids-encoding a pair of random integers.
	# Each integer is drawn from 1:100, so there are 100 * 100 possible input pairs.
	function randid()
	    sqids_config = Sqids.configure() # configuration created locally, no arguments
	    random_pair = [rand(1:100), rand(1:100)]
	    return Sqids.encode(sqids_config, random_pair)
	end

	# Return the integer `value` of the period between `unix2datetime(0)` and the
	# current `now()` (a millisecond count; `ts_to_time` divides it by 1000).
	function timestamp()
	    elapsed = now() - unix2datetime(0)
	    return elapsed.value
	end

	"""
	    ts_to_time(ts)

	Convert `ts`, a millisecond count such as the one returned by `timestamp()`, into a
	`DateTime` by scaling it to seconds and handing it to `unix2datetime`.

	Example:
	ts_to_time(timestamp()) # approximately equal to now()
	"""
	function ts_to_time(ts)
	    seconds = ts / 1000
	    return unix2datetime(seconds)
	end

	"""
	    Claim

	A single statement that supports a misinformation narrative, paired with its rebuttal.

	Fields:
	- id: unique identifier for the claim
	- claim: text of the claim
	- counterclaim: text of the counterclaim
	- claimembedding: embedding of the claim, or `nothing` when not yet computed
	- counterclaimembedding: embedding of the counterclaim, or `nothing` when not yet computed
	- created_at: integer timestamp of creation (`createClaim` fills it from `timestamp()`)
	- updated_at: integer timestamp of the last update
	- source: source of the claim
	- keywords: keywords associated with the claim, or `nothing`
	"""
	mutable struct Claim
	    id::String
	    claim::String
	    counterclaim::String
	    claimembedding::Union{Nothing, Vector{Float32}}
	    counterclaimembedding::Union{Nothing, Vector{Float32}}
	    created_at::Int64
	    updated_at::Int64
	    source::String
	    keywords::Union{Nothing, Vector{String}}
	end

	"""
	    createClaim(claim::String, counterclaim::String, source::String[, keywords])

	Create a new `Claim` with the given claim text, counterclaim text, and source.

	- `keywords` is optional: pass a `Vector{String}`, or omit it to store `nothing`.
	- Both embeddings start out as `nothing`.
	- The id comes from `randid()`; `created_at` and `updated_at` share a single
	  `timestamp()` reading, so a freshly created claim always has equal values.

	Example:
	createClaim("Solar panels poison the soil and reduce crop yields",
	    "There is no evidence that solar panels poison the soil or reduce crop yields",
	    "Facebook post")

	createClaim("Solar panels poison the soil and reduce crop yields",
	    "There is no evidence that solar panels poison the soil or reduce crop yields",
	    "Facebook post",
	    ["solar", "soil"])
	"""
	function createClaim(
	    claim::String,
	    counterclaim::String,
	    source::String,
	    keywords::Union{Array{String, 1}, Nothing}=nothing,
	)
	    # Read the clock once; two separate calls could straddle a millisecond tick.
	    created = timestamp()
	    return Claim(randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords)
	end


	"""
	Narrative: a collection of claims that support a misinformation narrative

	id: unique identifier for the narrative
	title: descriptive title of the narrative
	topic: broad type of narrative
	target: target group/topic of the narrative
	narrativesummary: base narrative text
	claims: list of Claim objects (use `Claim[]` for a narrative with no claims yet)

	Example:
	example_narrative = Narrative(
	randid(),
	"Solar panels harm farmland",
	"Climate misinformation",
	"Solar energy",
	"Solar installations are claimed to poison soil and reduce crop yields",
	Claim[])
	"""
	mutable struct Narrative
	id::String
	title::String # descriptive title (e.g., Solar panels harm farmland)
	topic::String # broad type of narrative (e.g., climate misinformation)
	target::String # target group/topic of the narrative
	narrativesummary::String # base narrative text summarising what the claims assert
	claims::Vector{Claim} # list of Claim objects; must be a vector, `nothing` is not accepted
	end

	"""
	NarrativeSet: a container holding a vector of Narrative objects.

	## TODO (from the original author)
	- The intent is that applying a narrative set over a database performs classification
	  using all of its narratives; no such behaviour is implemented in this definition.

	"""
	mutable struct NarrativeSet
	narratives::Vector{Narrative}
	end

	import Base: show
	## Pretty-print a Narrative: four labelled header lines, a "Claims:" line,
	## then one " - " bullet per claim text.
	function show(io::IO, narrative::Narrative)
	    println(io, "Narrative: ", narrative.title)
	    println(io, "Topic: ", narrative.topic)
	    println(io, "Target: ", narrative.target)
	    println(io, "Narrative Summary: ", narrative.narrativesummary)
	    println(io, "Claims:")
	    foreach(narrative.claims) do entry
	        println(io, " - ", entry.claim)
	    end
	end

	"""
	    add_claim!(narrative::Narrative, claim::Claim)

	Append `claim` to the end of `narrative.claims`, mutating the narrative in place.
	Returns the updated claims vector (the result of `push!`).

	Example:
	add_claim!(example_narrative, example_claim)
	"""
	function add_claim!(narrative::Narrative, claim::Claim)
	    # No blank line may separate the docstring above from this definition,
	    # otherwise the docstring is not attached to the function.
	    return push!(narrative.claims, claim)
	end

	# Remove every claim whose id equals `claim_id`.
	# The claims field is rebound to a newly filtered vector (the old vector is not
	# mutated), and that new vector is returned.
	function remove_claim!(narrative::Narrative, claim_id::String)
	    survivors = filter(narrative.claims) do entry
	        entry.id != claim_id
	    end
	    narrative.claims = survivors
	    return survivors
	end

	# Flatten a Narrative into a DataFrame with one row per claim.
	# The narrative title is passed as a scalar alongside the per-claim column vectors;
	# every Claim field becomes a column of the same name.
	function narrative_to_dataframe(narrative::Narrative)
	    entries = narrative.claims
	    return DataFrame(
	        narrative_title = narrative.title,
	        id = map(c -> c.id, entries),
	        claim = map(c -> c.claim, entries),
	        counterclaim = map(c -> c.counterclaim, entries),
	        claimembedding = map(c -> c.claimembedding, entries),
	        counterclaimembedding = map(c -> c.counterclaimembedding, entries),
	        created_at = map(c -> c.created_at, entries),
	        updated_at = map(c -> c.updated_at, entries),
	        source = map(c -> c.source, entries),
	        keywords = map(c -> c.keywords, entries),
	    )
	end

	"""
	    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)

	Collapse a dataframe into a narrative: each row becomes one `Claim`, read from the
	columns `id`, `claim`, `counterclaim`, `claimembedding`, `counterclaimembedding`,
	`created_at`, `updated_at`, `source` and `keywords`.

	The resulting `Narrative` gets a fresh `randid()` id and empty `topic` and `target`.
	"""
	function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
	    collected = Claim[]
	    for row in eachrow(df)
	        entry = Claim(
	            row.id,
	            row.claim,
	            row.counterclaim,
	            row.claimembedding,
	            row.counterclaimembedding,
	            row.created_at,
	            row.updated_at,
	            row.source,
	            row.keywords,
	        )
	        push!(collected, entry)
	    end
	    return Narrative(randid(), narrative_title, "", "", narrative_summary, collected)
	end

	"""
	    deduplicate_claims_in_narrative!(narrative::Narrative)

	Drop every claim whose `claim` text repeats that of an earlier claim, keeping the
	first occurrence of each text and preserving the original order. Returns `narrative`.

	Duplicates are dropped by position, not by id, so the first occurrence survives
	even when the repeated entries share an id (for example the same `Claim` object
	added twice).
	"""
	function deduplicate_claims_in_narrative!(narrative::Narrative)
	    # `unique(f, itr)` keeps the first element seen for each distinct `f(x)`.
	    distinct_claims = unique(entry -> entry.claim, narrative.claims)
	    # Rebind the field only when something was actually dropped, so a narrative
	    # without duplicates keeps its existing claims vector.
	    if length(distinct_claims) != length(narrative.claims)
	        narrative.claims = distinct_claims
	    end
	    return narrative
	end

	"""
	    candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame

	Embed the text column of `candidates` and store the result in an "Embeddings" column.

	- Input: `candidates`, a dataframe holding the text to embed
	- Keywords: `model_id` (default "text-embedding-3-small") is forwarded to
	  `create_chunked_embeddings`; `textcol` (default "text") names the text column
	- Output: the same `candidates` dataframe, modified in place, with one embedding per row
	- Throws an error when `textcol` is not a column of `candidates`

	Example:
	cand_embeddings = candidate_embeddings(candidates; textcol="message")
	"""
	function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
	    model_id = get(kwargs, :model_id, "text-embedding-3-small")
	    textcol = get(kwargs, :textcol, "text")
	    # Check that the text column exists. Unary `!` binds tighter than `in`, so the
	    # membership test has to be parenthesised; `names` returns strings, hence `string`.
	    if !(string(textcol) in names(candidates))
	        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
	    end
	    ## Data Embeddings
	    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
	    ## Add vector of embeddings to dataset (the comprehension narrows the element type)
	    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
	    return candidates
	end
	## Embeddings

	"""
	    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

	Request embeddings for `texts` in consecutive slices of at most `chunk_size` items.
	Each slice is sent through `create_embeddings` using `ENV["OPENAI_API_KEY"]`, and the
	"embedding" entries of every response are concatenated into a single vector, in
	input order.

	Example:
	df = CSV.read("data/random_300k.csv", DataFrame)
	df = filter(:message => x -> occursin(Regex("climate"), x), df)
	embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
	"""
	function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
	    total = length(texts)
	    per_chunk = []
	    for first_idx in 1:chunk_size:total
	        # The final slice may be shorter than chunk_size
	        last_idx = min(first_idx + chunk_size - 1, total)
	        batch = texts[first_idx:last_idx]
	        resp = create_embeddings(ENV["OPENAI_API_KEY"], batch; model_id=model_id)
	        push!(per_chunk, [item["embedding"] for item in resp.response["data"]])
	    end
	    return vcat(per_chunk...)
	end

	"""
	    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

	Compute embeddings for the claim and counterclaim text of every claim in `narrative`
	via `create_chunked_embeddings`, and store them in each claim's `claimembedding` and
	`counterclaimembedding` fields.

	- bang because it modifies the narrative object in place
	- returns `nothing`

	Example:
	include("src/ExampleNarrative.jl")
	include("src/Narrative.jl")
	climate_narrative = create_example_narrative();
	generate_openai_claim_embeddings_from_narrative!(climate_narrative)
	"""
	function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
	    entries = narrative.claims
	    ## claim embeddings
	    claim_vectors = create_chunked_embeddings([entry.claim for entry in entries])
	    for idx in eachindex(entries)
	        entries[idx].claimembedding = claim_vectors[idx]
	    end
	    ## counterclaim embeddings
	    counter_vectors = create_chunked_embeddings([entry.counterclaim for entry in entries])
	    for idx in eachindex(entries)
	        entries[idx].counterclaimembedding = counter_vectors[idx]
	    end
	    return nothing
	end

	"""
	    candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame

	Embed the text column of `candidates` and store the result in an "Embeddings" column.

	- Input: `candidates`, a dataframe holding the text to embed
	- Keywords: `model_id` (default "text-embedding-3-small") is forwarded to
	  `create_chunked_embeddings`; `textcol` (default "text") names the text column
	- Output: the same `candidates` dataframe, modified in place, with one embedding per row
	- Throws an error when `textcol` is not a column of `candidates`

	Example:
	cand_embeddings = candidate_openai_embeddings(candidates; textcol="message")
	"""
	function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
	    model_id = get(kwargs, :model_id, "text-embedding-3-small")
	    textcol = get(kwargs, :textcol, "text")
	    # Check that the text column exists. Unary `!` binds tighter than `in`, so the
	    # membership test has to be parenthesised; `names` returns strings, hence `string`.
	    if !(string(textcol) in names(candidates))
	        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
	    end
	    ## Data Embeddings
	    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
	    ## Add vector of embeddings to dataset (the comprehension narrows the element type)
	    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
	    return candidates
	end