|
|
|
|
|
"""
    randid()

Return a short random identifier string produced by `Sqids.encode`.

Two random positive integers are drawn and encoded with a default
`Sqids.configure()` configuration. The integers are sampled from
`1:typemax(Int32)`, which gives roughly 4.6e18 possible input pairs, so
accidental collisions between identifiers are very unlikely. Uniqueness is
probabilistic, not guaranteed.
"""
function randid()
    config = Sqids.configure()
    # Sampling each number from only 1:100 allows just 10_000 distinct inputs, which
    # makes identifier collisions likely after a few hundred calls.
    parts = rand(1:typemax(Int32), 2)
    return Sqids.encode(config, parts)
end
|
|
|
"""
    timestamp()

Return the number of milliseconds elapsed between `unix2datetime(0)` and the
current `now()`, as an integer.

`now()` is called without arguments, so the value reflects the system clock as
reported by `now()` rather than an explicit UTC reading.
"""
function timestamp()
    current = now()
    epoch = unix2datetime(0)
    elapsed = current - epoch
    return elapsed.value
end
|
|
|
"""
    ts_to_time(ts)

Convert `ts`, a millisecond count such as the one returned by `timestamp()`,
into a `DateTime`.

The value is divided by 1000 and handed to `unix2datetime`, so
`ts_to_time(timestamp())` is (up to the time spent in the calls) equal to `now()`.
"""
function ts_to_time(ts)
    seconds = ts / 1000
    return unix2datetime(seconds)
end
|
|
|
"""
    Claim

A single claim that supports a misinformation narrative, together with its
counterclaim.

# Fields
- `id::String`: unique identifier for the claim.
- `claim::String`: text of the claim.
- `counterclaim::String`: text of the counterclaim.
- `claimembedding`: embedding of the claim as a `Vector{Float32}`, or `nothing`
  when no embedding has been stored.
- `counterclaimembedding`: embedding of the counterclaim as a `Vector{Float32}`,
  or `nothing` when no embedding has been stored.
- `created_at::Int64`: when the claim was created, stored as an integer timestamp.
- `updated_at::Int64`: when the claim was last updated, stored as an integer
  timestamp.
- `source::String`: source of the claim.
- `keywords`: keywords attached to the claim as a `Vector{String}`, or `nothing`.
"""
mutable struct Claim
    id::String
    claim::String
    counterclaim::String
    claimembedding::Union{Nothing,Vector{Float32}}
    counterclaimembedding::Union{Nothing,Vector{Float32}}
    created_at::Int64
    updated_at::Int64
    source::String
    keywords::Union{Nothing,Vector{String}}
end
|
|
|
"""
    createClaim(claim::String, counterclaim::String, source::String)
    createClaim(claim::String, counterclaim::String, source::String, keywords)

Create a new `Claim` with the given claim text, counterclaim text and source.

The identifier comes from `randid()`. `created_at` and `updated_at` are both set
to the same `timestamp()` value. The claim and counterclaim embeddings are set
to `nothing`.

# Arguments
- `claim`: text of the claim.
- `counterclaim`: text of the counterclaim.
- `source`: source of the claim.
- `keywords`: optional `Vector{String}` of keywords; defaults to `nothing`.

# Example
    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post")
"""
function createClaim(
    claim::String,
    counterclaim::String,
    source::String,
    keywords::Union{Array{String,1},Nothing}=nothing,
)
    # Take one timestamp so created_at and updated_at are identical for a new claim;
    # two separate clock reads could differ by a millisecond.
    created = timestamp()
    return Claim(
        randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords
    )
end
|
|
|
|
|
"""
    Narrative

A collection of claims that support a misinformation narrative.

# Fields
- `id::String`: unique identifier for the narrative.
- `title::String`: descriptive title of the narrative.
- `topic::String`: broad type of narrative.
- `target::String`: target group/topic of the narrative.
- `narrativesummary::String`: base narrative text.
- `claims::Vector{Claim}`: list of `Claim` objects. Pass `Claim[]` for a
  narrative that has no claims yet.

# Example
    example_narrative = Narrative(
        randid(),
        "Solar panels harm farmland",
        "Climate",
        "Solar energy",
        "Solar panels poison the soil and reduce crop yields",
        Claim[])
"""
mutable struct Narrative
    id::String
    title::String
    topic::String
    target::String
    narrativesummary::String
    claims::Vector{Claim}
end
|
|
|
"""
    NarrativeSet

Container for many `Narrative` objects.

# Fields
- `narratives::Vector{Narrative}`: the narratives in the set.

TODO: the intended use is that applying a narrative set over a database
performs classification using all the narratives. This type only stores the
narratives.
"""
mutable struct NarrativeSet
    narratives::Vector{Narrative}
end
|
|
|
import Base: show |
|
|
|
"""
    show(io::IO, narrative::Narrative)

Write a multi-line, human-readable description of `narrative` to `io`: its
title, topic, target and summary on one line each, then a `Claims:` header
followed by one line per claim containing the claim text.
"""
function Base.show(io::IO, narrative::Narrative)
    header = (
        "Narrative: $(narrative.title)",
        "Topic: $(narrative.topic)",
        "Target: $(narrative.target)",
        "Narrative Summary: $(narrative.narrativesummary)",
        "Claims:",
    )
    for line in header
        println(io, line)
    end
    for entry in narrative.claims
        println(io, " - $(entry.claim)")
    end
    return nothing
end
|
|
|
"""
    add_claim!(narrative::Narrative, claim::Claim)

Append `claim` to the end of `narrative.claims`, modifying `narrative` in place.

Returns the updated `narrative.claims` vector (the result of `push!`).

# Example
    add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    # The docstring sits directly above the definition (no blank line in between)
    # so that it is attached to this function.
    return push!(narrative.claims, claim)
end
|
|
|
"""
    remove_claim!(narrative::Narrative, claim_id::String)

Remove every claim whose `id` equals `claim_id` from `narrative`.

`narrative.claims` is replaced by a new, filtered vector, and that vector is
returned. Claims with a different id keep their relative order.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    remaining = filter(narrative.claims) do existing
        existing.id != claim_id
    end
    narrative.claims = remaining
    return remaining
end
|
|
|
"""
    narrative_to_dataframe(narrative::Narrative)

Flatten `narrative` into a `DataFrame` with one row per claim.

The columns are `narrative_title` (the narrative title repeated on every row)
followed by one column per `Claim` field: `id`, `claim`, `counterclaim`,
`claimembedding`, `counterclaimembedding`, `created_at`, `updated_at`, `source`
and `keywords`.
"""
function narrative_to_dataframe(narrative::Narrative)
    claims = narrative.claims
    return DataFrame(;
        narrative_title=narrative.title,
        id=map(c -> c.id, claims),
        claim=map(c -> c.claim, claims),
        counterclaim=map(c -> c.counterclaim, claims),
        claimembedding=map(c -> c.claimembedding, claims),
        counterclaimembedding=map(c -> c.counterclaimembedding, claims),
        created_at=map(c -> c.created_at, claims),
        updated_at=map(c -> c.updated_at, claims),
        source=map(c -> c.source, claims),
        keywords=map(c -> c.keywords, claims),
    )
end
|
|
|
"""
    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String;
                           topic::String="", target::String="")

Collapse a dataframe into a `Narrative`.

Each row of `df` becomes one `Claim`. The dataframe must therefore provide the
columns `id`, `claim`, `counterclaim`, `claimembedding`,
`counterclaimembedding`, `created_at`, `updated_at`, `source` and `keywords`
(the claim columns produced by `narrative_to_dataframe`).

# Arguments
- `df`: dataframe with one claim per row.
- `narrative_title`: title of the resulting narrative.
- `narrative_summary`: summary text of the resulting narrative.

# Keywords
- `topic`: topic of the resulting narrative; defaults to the empty string.
- `target`: target of the resulting narrative; defaults to the empty string.

# Returns
A new `Narrative` with a fresh identifier from `randid()`.
"""
function dataframe_to_narrative(
    df::DataFrame,
    narrative_title::String,
    narrative_summary::String;
    topic::String="",
    target::String="",
)
    # Typed comprehension: the result is a Vector{Claim} even when df has no rows.
    claims = Claim[
        Claim(
            row.id,
            row.claim,
            row.counterclaim,
            row.claimembedding,
            row.counterclaimembedding,
            row.created_at,
            row.updated_at,
            row.source,
            row.keywords,
        ) for row in eachrow(df)
    ]
    return Narrative(randid(), narrative_title, topic, target, narrative_summary, claims)
end
|
|
|
"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims with duplicated claim text from `narrative`.

For every distinct `claim` text, only the first claim carrying that text is
kept; the remaining claims keep their original order. `narrative.claims` is
replaced by the deduplicated vector.

# Returns
The same `narrative` object.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # Deduplicate on the claim text itself rather than removing by id: removing by
    # id would also drop the first occurrence (or an unrelated claim) whenever two
    # claims share an id.
    narrative.claims = unique(c -> c.claim, narrative.claims)
    return narrative
end
|
|
|
"""
    candidate_embeddings(candidates::DataFrame; model_id="text-embedding-3-small",
                         textcol="text", kwargs...)

Compute embeddings for the text column of `candidates` and store them in a
column named `"Embeddings"`.

Note that `candidates` itself is modified (the column is added to it) and the
same dataframe is returned.

# Keywords
- `model_id`: model identifier forwarded to `create_chunked_embeddings`.
- `textcol`: name of the column holding the text to embed (default `"text"`).
- any other keyword is accepted and ignored.

# Throws
An error if `textcol` is not a column of `candidates`.

# Example
    cand_embeddings = candidate_embeddings(df; textcol="message")
"""
function candidate_embeddings(
    candidates::DataFrame; model_id="text-embedding-3-small", textcol="text", kwargs...
)::DataFrame
    # The negation must wrap the whole membership test: `!textcol in names(...)`
    # parses as `(!textcol) in names(...)` and fails on a String.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end

    # NOTE(review): `create_chunked_embeddings` is not defined in the visible part of
    # this file; presumably it is provided elsewhere - confirm.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)

    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
|
|
|
|
|
"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Request embeddings for `texts` in consecutive chunks of at most `chunk_size`
items and return all embeddings as one vector, in input order.

Each chunk is sent with `create_embeddings`, using the API key read from
`ENV["OPENAI_API_KEY"]`. The `"embedding"` entry of every item in the
response's `"data"` field is collected. An empty `texts` results in no request
and an empty vector.

# Example
    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    total = length(texts)
    batches = map(1:chunk_size:total) do first_idx
        last_idx = min(first_idx + chunk_size - 1, total)
        resp = create_embeddings(
            ENV["OPENAI_API_KEY"], texts[first_idx:last_idx]; model_id=model_id
        )
        [item["embedding"] for item in resp.response["data"]]
    end
    return vcat(batches...)
end
|
|
|
"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

Compute embeddings for every claim and counterclaim text in `narrative` and
store them in the `claimembedding` and `counterclaimembedding` fields of the
corresponding claims.

The function has a bang in its name because it modifies the claims of
`narrative` in place; it returns `nothing`.

# Example
    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    claims = narrative.claims

    # Claim texts are embedded and stored before the counterclaim request is made.
    claim_vectors = create_chunked_embeddings(map(c -> c.claim, claims))
    for i in eachindex(claims)
        claims[i].claimembedding = claim_vectors[i]
    end

    counter_vectors = create_chunked_embeddings(map(c -> c.counterclaim, claims))
    for i in eachindex(claims)
        claims[i].counterclaimembedding = counter_vectors[i]
    end

    return nothing
end
|
|
|
"""
    candidate_openai_embeddings(candidates::DataFrame; model_id="text-embedding-3-small",
                                textcol="text", kwargs...)

Compute embeddings for the text column of the candidate data `candidates` and
store them in a column named `"Embeddings"`.

Note that `candidates` itself is modified (the column is added to it) and the
same dataframe is returned.

# Keywords
- `model_id`: model identifier forwarded to `create_chunked_embeddings`.
- `textcol`: name of the column holding the text to embed (default `"text"`).
- any other keyword is accepted and ignored.

# Throws
An error if `textcol` is not a column of `candidates`.

# Example
    cand_embeddings = candidate_openai_embeddings(df; textcol="message")
"""
function candidate_openai_embeddings(
    candidates::DataFrame; model_id="text-embedding-3-small", textcol="text", kwargs...
)::DataFrame
    # The negation must wrap the whole membership test: `!textcol in names(...)`
    # parses as `(!textcol) in names(...)` and fails on a String.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end

    # NOTE(review): this calls `create_chunked_embeddings`, which is not defined in
    # the visible part of this file; confirm whether that is intended or whether
    # `create_openai_chunked_embeddings` should be used here.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)

    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
|
|