## Structure of a Narrative

"""
    randid()

Return a short random identifier string.

The id is the Sqids encoding (default configuration) of two integers, each drawn
uniformly from `1:100`.
"""
function randid()
    # NOTE(review): only 100 * 100 = 10_000 distinct inputs exist, so two calls can
    # return the same id -- confirm this is acceptable wherever ids must be unique.
    config = Sqids.configure()  # Local configuration
    return Sqids.encode(config, [rand(1:100), rand(1:100)])
end

"""
    timestamp()

Return the time elapsed between `unix2datetime(0)` and `now()` as an integer number of
milliseconds. `ts_to_time` performs the inverse conversion.
"""
function timestamp()
    return (now() - unix2datetime(0)).value
end

"""
    ts_to_time(ts)

Convert a millisecond timestamp (as produced by `timestamp()`) back into a `DateTime`.

Example (equal up to the time elapsed between the two calls):

    ts_to_time(timestamp()) == now()
"""
function ts_to_time(ts)
    # `unix2datetime` expects seconds, `timestamp()` produces milliseconds.
    return unix2datetime(ts / 1000)
end

"""
Claim: something that supports a misinformation narrative

    id: unique identifier for the claim
    claim: text of the claim
    counterclaim: text of the counterclaim
    claimembedding: embedding of the claim (`nothing` until computed)
    counterclaimembedding: embedding of the counterclaim (`nothing` until computed)
    created_at: time the claim was created (`createClaim` stores `timestamp()`)
    updated_at: time the claim was last updated (`createClaim` stores `timestamp()`)
    source: source of the claim
    keywords: keywords associated with the claim (or `nothing`)
"""
mutable struct Claim
    id::String
    claim::String                                            # claim text
    counterclaim::String                                     # counterclaim text
    claimembedding::Union{Array{Float32, 1}, Nothing}        # embedding of the claim
    counterclaimembedding::Union{Array{Float32, 1}, Nothing} # embedding of the counterclaim
    created_at::Int64                                        # time the claim was created
    updated_at::Int64                                        # time the claim was last updated
    source::String                                           # source of the claim
    keywords::Union{Array{String, 1}, Nothing}               # keywords associated with the claim
end

"""
    createClaim(claim::String, counterclaim::String, source::String, keywords=nothing)

Create a new `Claim` with the given claim, counterclaim, source and (optionally)
keywords. The id comes from `randid()`, both timestamps from `timestamp()`, and the
claim and counterclaim embeddings are set to `nothing`.

Example:

    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post")
"""
function createClaim(
    claim::String,
    counterclaim::String,
    source::String,
    keywords::Union{Array{String, 1}, Nothing}=nothing,
)
    # `keywords` defaults to `nothing` so the three-argument call shown above works;
    # the `Claim.keywords` field already admits `nothing`.
    return Claim(
        randid(), claim, counterclaim, nothing, nothing,
        timestamp(), timestamp(), source, keywords,
    )
end

"""
Narrative: a collection of claims that support a misinformation narrative

    id: unique identifier for the narrative
    title: descriptive title of the narrative
    topic: broad type of narrative (e.g., anti-semitism)
    target: target group/topic of the narrative
    narrativesummary: base narrative text
    claims: list of Claim objects

Example:

    example_narrative = Narrative(
        randid(),
        "Jews killed Jesus",
        "Anti-semitism",
        "Jews",
        "Jews are responsible for the death of Jesus",
        Claim[])
"""
mutable struct Narrative
    id::String
    title::String             # descriptive title (e.g., Jews killed Jesus)
    topic::String             # broad type of narrative (e.g., anti-semitism)
    target::String            # target group/topic of the narrative
    narrativesummary::String  # base narrative text (e.g., Jews are responsible for the death of Jesus)
    claims::Vector{Claim}     # list of Claim objects
end

"""
## TODO: When you have a lot of narratives, you can create a NarrativeSet
- If you apply a narrative set over a database, it will perform classification
  using all the narratives
"""
mutable struct NarrativeSet
    narratives::Vector{Narrative}
end

import Base: show

## Make the Narrative pretty to show
"""
    show(io::IO, narrative::Narrative)

Print the narrative's title, topic, target and summary, followed by the text of each
of its claims, one per line.
"""
function show(io::IO, narrative::Narrative)
    println(io, "Narrative: $(narrative.title)")
    println(io, "Topic: $(narrative.topic)")
    println(io, "Target: $(narrative.target)")
    println(io, "Narrative Summary: $(narrative.narrativesummary)")
    println(io, "Claims:")
    for claim in narrative.claims
        println(io, " - $(claim.claim)")
    end
    return nothing
end

"""
    add_claim!(narrative::Narrative, claim::Claim)

Append `claim` to `narrative.claims` and return the updated claims vector.

Example:

    add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    return push!(narrative.claims, claim)
end

"""
    remove_claim!(narrative::Narrative, claim_id::String)

Drop every claim whose `id` equals `claim_id` and return the remaining claims.
`narrative.claims` is replaced by a new, filtered vector.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    narrative.claims = filter(c -> c.id != claim_id, narrative.claims)
    return narrative.claims
end

"""
    narrative_to_dataframe(narrative::Narrative)

Return a `DataFrame` with one row per claim of `narrative`. Columns are
`narrative_title` (the narrative's title repeated on every row) followed by one column
per `Claim` field.
"""
function narrative_to_dataframe(narrative::Narrative)
    claims = narrative.claims
    return DataFrame(
        narrative_title = narrative.title,
        id = [c.id for c in claims],
        claim = [c.claim for c in claims],
        counterclaim = [c.counterclaim for c in claims],
        claimembedding = [c.claimembedding for c in claims],
        counterclaimembedding = [c.counterclaimembedding for c in claims],
        created_at = [c.created_at for c in claims],
        updated_at = [c.updated_at for c in claims],
        source = [c.source for c in claims],
        keywords = [c.keywords for c in claims],
    )
end

"""
    dataframe_to_narrative(df, narrative_title, narrative_summary; topic="", target="")

Collapse a dataframe into a narrative.

Each row of `df` becomes one `Claim`; the columns `id`, `claim`, `counterclaim`,
`claimembedding`, `counterclaimembedding`, `created_at`, `updated_at`, `source` and
`keywords` are read. The narrative receives a fresh id from `randid()`. `topic` and
`target` default to empty strings.
"""
function dataframe_to_narrative(
    df::DataFrame,
    narrative_title::String,
    narrative_summary::String;
    topic::String="",
    target::String="",
)
    claims = [
        Claim(
            row.id, row.claim, row.counterclaim,
            row.claimembedding, row.counterclaimembedding,
            row.created_at, row.updated_at, row.source, row.keywords,
        ) for row in eachrow(df)
    ]
    return Narrative(randid(), narrative_title, topic, target, narrative_summary, claims)
end

"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims whose claim text repeats an earlier claim, keeping the first occurrence
of each text and preserving order. Returns `narrative`.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # Deduplicate on the claim text itself rather than removing by id: ids are not
    # guaranteed unique, so removal by id could also drop the first occurrence or an
    # unrelated claim that happens to share an id with a duplicate.
    deduped = unique(c -> c.claim, narrative.claims)
    if length(deduped) != length(narrative.claims)
        narrative.claims = deduped
    end
    return narrative
end

# Shared implementation of `candidate_embeddings` / `candidate_openai_embeddings`:
# embed column `textcol` of `candidates` with `embedder` and store the result in the
# "Embeddings" column of the same dataframe.
function _embed_text_column!(embedder, candidates::DataFrame, textcol, model_id)
    # check if text column exists
    # (the negation must wrap the whole `in` test: `!textcol in xs` parses as
    # `(!textcol) in xs` and throws a MethodError for a String)
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    cand_embeddings = embedder(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end

"""
    candidate_embeddings(candidates::DataFrame; model_id, textcol, kwargs...)

## Embeddings of candidate data

- Input: dataframe of candidate texts
- Output: the same dataframe, with an added "Embeddings" column

The input dataframe is modified in place and returned. Throws an error if `textcol`
is not a column of `candidates`.

# Keywords
- `model_id`: embedding model (default "text-embedding-3-small")
- `textcol`: name of the column holding the text (default "text")
- any other keyword is accepted and ignored
"""
function candidate_embeddings(
    candidates::DataFrame;
    model_id="text-embedding-3-small",
    textcol="text",
    kwargs...,
)::DataFrame
    # NOTE(review): `create_chunked_embeddings` is not defined in this file; it is
    # presumably provided elsewhere in the package -- confirm.
    return _embed_text_column!(create_chunked_embeddings, candidates, textcol, model_id)
end

## Embeddings
"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Embed `texts` by calling `create_embeddings` on consecutive chunks of at most
`chunk_size` texts, using the API key in `ENV["OPENAI_API_KEY"]`. Returns one
embedding per text, in input order (an empty vector for empty input).

Example:

    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(
    texts; model_id="text-embedding-3-small", chunk_size=1000
)
    ntexts = length(texts)
    # Nothing to embed: skip the API entirely.
    ntexts == 0 && return []
    api_key = ENV["OPENAI_API_KEY"]
    ## Chunk the data
    embeddings = []
    for start in 1:chunk_size:ntexts
        stop = min(start + chunk_size - 1, ntexts)
        resp = create_embeddings(api_key, texts[start:stop]; model_id=model_id)
        push!(embeddings, [x["embedding"] for x in resp.response["data"]])
    end
    # Flatten the per-chunk vectors into a single vector of embeddings.
    return vcat(embeddings...)
end

"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

## Embeddings of narrative claims
- bang because it modifies the narrative object in place

Compute embeddings for the claim text and the counterclaim text of every claim in
`narrative` and store them in `claimembedding` / `counterclaimembedding`.
Returns `nothing`.

Example:

    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    ## claim embeddings
    claim_embeddings = create_openai_chunked_embeddings(
        [c.claim for c in narrative.claims]
    )
    for (i, claim) in enumerate(narrative.claims)
        claim.claimembedding = claim_embeddings[i]
    end
    ## counterclaim embeddings
    counterclaim_embeddings = create_openai_chunked_embeddings(
        [c.counterclaim for c in narrative.claims]
    )
    for (i, claim) in enumerate(narrative.claims)
        claim.counterclaimembedding = counterclaim_embeddings[i]
    end
    return nothing
end

"""
    candidate_openai_embeddings(candidates::DataFrame; model_id, textcol, kwargs...)

## Embeddings of candidate data (OpenAI)

- Input: dataframe of candidate texts
- Output: the same dataframe, with an added "Embeddings" column computed by
  `create_openai_chunked_embeddings`

The input dataframe is modified in place and returned. Throws an error if `textcol`
is not a column of `candidates`.

# Keywords
- `model_id`: embedding model (default "text-embedding-3-small")
- `textcol`: name of the column holding the text (default "text")
- any other keyword is accepted and ignored
"""
function candidate_openai_embeddings(
    candidates::DataFrame;
    model_id="text-embedding-3-small",
    textcol="text",
    kwargs...,
)::DataFrame
    return _embed_text_column!(
        create_openai_chunked_embeddings, candidates, textcol, model_id
    )
end