stefanjwojcik's picture
Upload 24 files
48bb68b verified
## Structure of a Narrative
"""
    randid()

Return a Sqids-encoded identifier built from two random integers.

Each integer is drawn with `rand(1:100)`, so only 100 x 100 distinct number pairs
can be encoded; the returned ids are not guaranteed to be unique.
"""
function randid()
    sqids = Sqids.configure()  # local configuration, rebuilt on every call
    pair = [rand(1:100), rand(1:100)]
    return Sqids.encode(sqids, pair)
end
"""
    timestamp()

Return the integer number of milliseconds elapsed between `unix2datetime(0)` and the
current `now()`.
"""
function timestamp()
    elapsed = now() - unix2datetime(0)
    return elapsed.value
end
"""
    ts_to_time(ts)

Convert a millisecond timestamp (as produced by `timestamp()`) into a `DateTime`.

`ts_to_time(timestamp())` is approximately `now()`.
"""
function ts_to_time(ts)
    seconds = ts / 1000  # unix2datetime expects seconds, the timestamp is in milliseconds
    return unix2datetime(seconds)
end
"""
    Claim

Something that supports a misinformation narrative.

# Fields
- `id`: unique identifier for the claim
- `claim`: text of the claim
- `counterclaim`: text of the counterclaim
- `claimembedding`: embedding of the claim, or `nothing` when not computed
- `counterclaimembedding`: embedding of the counterclaim, or `nothing` when not computed
- `created_at`: when the claim was created, stored as an `Int64` timestamp
- `updated_at`: when the claim was last updated, stored as an `Int64` timestamp
- `source`: source of the claim
- `keywords`: keywords associated with the claim, or `nothing`
"""
mutable struct Claim
    id::String
    claim::String                                         # text of the claim
    counterclaim::String                                  # text of the counterclaim
    claimembedding::Union{Vector{Float32}, Nothing}       # claim embedding, if any
    counterclaimembedding::Union{Vector{Float32}, Nothing} # counterclaim embedding, if any
    created_at::Int64                                     # creation timestamp
    updated_at::Int64                                     # last-update timestamp
    source::String                                        # where the claim came from
    keywords::Union{Vector{String}, Nothing}              # associated keywords, if any
end
"""
    createClaim(claim::String, counterclaim::String, source::String,
                keywords::Array{String, 1}=String[])

Create a new `Claim` with the given claim, counterclaim, source, and keywords.

The id comes from `randid()`. The claim and counterclaim embeddings are set to
`nothing`, and `created_at` / `updated_at` are both set to the same `timestamp()`
value. `keywords` is optional and defaults to an empty list, so the three-argument
form shown below works.

Example:
createClaim("Solar panels poison the soil and reduce crop yields",
"There is no evidence that solar panels poison the soil or reduce crop yields",
"Facebook post")
"""
function createClaim(
    claim::String,
    counterclaim::String,
    source::String,
    keywords::Array{String, 1}=String[],
)
    # Take the timestamp once so a freshly created claim has created_at == updated_at.
    created = timestamp()
    return Claim(
        randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords
    )
end
"""
Narrative: a collection of claims that support a misinformation narrative
id: unique identifier for the narrative
title: descriptive title of the narrative
topic: broad type of narrative (e.g., anti-semitism)
target: target group/topic of the narrative
narrativesummary: base narrative text
claims: list of Claim objects (use `Claim[]` for a narrative with no claims yet)
Example:
example_narrative = Narrative(
randid(),
"Jews killed Jesus",
"Anti-semitism",
"Jews",
"Jews are responsible for the death of Jesus",
Claim[])
"""
mutable struct Narrative
id::String
title::String # descriptive title (e.g., Jews killed Jesus)
topic::String # broad type of narrative (e.g., anti-semitism)
target::String # target group/topic of the narrative
narrativesummary::String # base narrative text (e.g., Jews are responsible for the death of Jesus)
claims::Vector{Claim} # list of Claim objects; must be a vector, `nothing` is not accepted
end
"""
NarrativeSet: a container holding a list of Narrative objects
narratives: list of Narrative objects
## TODO: When you have a lot of narratives, you can create a NarrativeSet
- Planned (no such logic appears in this struct): applying a narrative set over a database will perform classification using all the narratives
"""
mutable struct NarrativeSet
narratives::Vector{Narrative}
end
import Base: show
## Pretty-print a Narrative: title, topic, target and summary, then one line per claim
function show(io::IO, narrative::Narrative)
    header = (
        "Narrative: $(narrative.title)",
        "Topic: $(narrative.topic)",
        "Target: $(narrative.target)",
        "Narrative Summary: $(narrative.narrativesummary)",
        "Claims:",
    )
    foreach(line -> println(io, line), header)
    # Only the claim text is printed; counterclaims and metadata are omitted
    foreach(narrative.claims) do entry
        println(io, " - $(entry.claim)")
    end
    return nothing
end
"""
    add_claim!(narrative::Narrative, claim::Claim)

Append `claim` to the end of `narrative.claims` and return that claims vector.

Example:
add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    claims = narrative.claims
    push!(claims, claim)
    return claims
end
"""
    remove_claim!(narrative::Narrative, claim_id::String)

Replace `narrative.claims` with a new vector that omits every claim whose `id`
equals `claim_id`, and return that new vector.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    kept = filter(narrative.claims) do existing
        existing.id != claim_id
    end
    narrative.claims = kept
    return kept
end
"""
    narrative_to_dataframe(narrative::Narrative)

Flatten a narrative into a `DataFrame` with one row per claim.

The columns are `narrative_title` (the narrative's title repeated on every row)
followed by one column per `Claim` field: `id`, `claim`, `counterclaim`,
`claimembedding`, `counterclaimembedding`, `created_at`, `updated_at`, `source`,
and `keywords`.
"""
function narrative_to_dataframe(narrative::Narrative)
    claims = narrative.claims
    return DataFrame(;
        narrative_title=narrative.title,
        id=map(c -> c.id, claims),
        claim=map(c -> c.claim, claims),
        counterclaim=map(c -> c.counterclaim, claims),
        claimembedding=map(c -> c.claimembedding, claims),
        counterclaimembedding=map(c -> c.counterclaimembedding, claims),
        created_at=map(c -> c.created_at, claims),
        updated_at=map(c -> c.updated_at, claims),
        source=map(c -> c.source, claims),
        keywords=map(c -> c.keywords, claims),
    )
end
"""
    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)

Collapse a dataframe into a narrative.

Each row of `df` becomes one `Claim`, read from the columns `id`, `claim`,
`counterclaim`, `claimembedding`, `counterclaimembedding`, `created_at`,
`updated_at`, `source`, and `keywords`. The resulting `Narrative` gets a fresh id
from `randid()`, the given title and summary, and empty strings for `topic` and
`target`.
"""
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
    claims = Claim[]
    for row in eachrow(df)
        entry = Claim(
            row.id,
            row.claim,
            row.counterclaim,
            row.claimembedding,
            row.counterclaimembedding,
            row.created_at,
            row.updated_at,
            row.source,
            row.keywords,
        )
        push!(claims, entry)
    end
    # topic and target are not recoverable from the dataframe, so they are left empty
    return Narrative(randid(), narrative_title, "", "", narrative_summary, claims)
end
"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims whose `claim` text repeats an earlier claim in the narrative.

The first occurrence of each distinct claim text is kept and the original order is
preserved. `narrative.claims` is replaced with the deduplicated vector and the
narrative itself is returned.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # Deduplicate on the claim text directly instead of removing by id: when the
    # duplicated claims share an id (same Claim added twice, duplicated rows, or an
    # id collision), removing by id would also delete the copy meant to be kept.
    narrative.claims = unique(c -> c.claim, narrative.claims)
    return narrative
end
"""
    candidate_embeddings(candidates::DataFrame; kwargs...)

Compute embeddings for the text column of `candidates` and store them in a new
`"Embeddings"` column.

- Input: `candidates`, a dataframe with a text column
- Output: the same dataframe (modified in place) with an added `"Embeddings"` column

Recognized keyword arguments:
- `model_id`: embedding model passed to `create_chunked_embeddings`
  (default `"text-embedding-3-small"`)
- `textcol`: name of the column holding the text (default `"text"`)

Throws an error when `textcol` is not a column of `candidates`.
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # (parenthesized: `!textcol in xs` parses as `(!textcol) in xs` and fails on a String)
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): create_chunked_embeddings is not defined in this part of the file -
    # confirm it exists elsewhere in the project.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
## Embeddings
"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Request embeddings for `texts` in consecutive chunks of at most `chunk_size` items
and return the per-text embeddings concatenated in input order.

Each chunk is sent through `create_embeddings` using the key stored in
`ENV["OPENAI_API_KEY"]`; the `"embedding"` entry of every item in the response's
`"data"` list is collected.

Example:
df = CSV.read("data/random_300k.csv", DataFrame)
df = filter(:message => x -> occursin(Regex("climate"), x), df)
embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    total = length(texts)
    batches = []
    ## Walk over the data one chunk at a time
    for first_idx in 1:chunk_size:total
        last_idx = min(first_idx + chunk_size - 1, total)
        resp = create_embeddings(
            ENV["OPENAI_API_KEY"], texts[first_idx:last_idx]; model_id=model_id
        )
        batch = [item["embedding"] for item in resp.response["data"]]
        push!(batches, batch)
    end
    # Flatten the per-chunk lists into a single list with one embedding per text
    return vcat(batches...)
end
"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

## Embeddings of narrative claims
- bang because it modifies the narrative object in place

Embeds every claim text and stores the result in each claim's `claimembedding`,
then does the same for the counterclaim texts and `counterclaimembedding`.
Returns `nothing`.

Example:
include("src/ExampleNarrative.jl")
include("src/Narrative.jl")
climate_narrative = create_example_narrative();
generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    ## claim embeddings
    claim_texts = [c.claim for c in narrative.claims]
    claim_vectors = create_chunked_embeddings(claim_texts)
    for (i, entry) in enumerate(narrative.claims)
        entry.claimembedding = claim_vectors[i]
    end
    ## counterclaim embeddings
    counter_texts = [c.counterclaim for c in narrative.claims]
    counter_vectors = create_chunked_embeddings(counter_texts)
    for (i, entry) in enumerate(narrative.claims)
        entry.counterclaimembedding = counter_vectors[i]
    end
    return nothing
end
"""
    candidate_openai_embeddings(candidates::DataFrame; kwargs...)

## Embeddings of candidate data
Compute embeddings for the text column of `candidates` and store them in a new
`"Embeddings"` column.

- Input: `candidates`, a dataframe with a text column
- Output: the same dataframe (modified in place) with an added `"Embeddings"` column

Recognized keyword arguments:
- `model_id`: embedding model passed to `create_chunked_embeddings`
  (default `"text-embedding-3-small"`)
- `textcol`: name of the column holding the text (default `"text"`)

Throws an error when `textcol` is not a column of `candidates`.
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # (parenthesized: `!textcol in xs` parses as `(!textcol) in xs` and fails on a String)
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): this calls create_chunked_embeddings, which is not defined in this
    # part of the file - confirm whether create_openai_chunked_embeddings was intended.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end