File size: 9,183 Bytes
48bb68b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
## Structure of a Narrative
"""
    randid()

Return an identifier built by Sqids-encoding two random integers, each drawn from `1:100`.

A Sqids configuration is created (with no arguments) on every call. Only 10,000 distinct
input pairs exist, so the returned identifiers are not guaranteed to be unique.
"""
function randid()
    sqids_config = Sqids.configure()
    random_pair = [rand(1:100), rand(1:100)]
    return Sqids.encode(sqids_config, random_pair)
end
"""
    timestamp()

Return the time elapsed between `unix2datetime(0)` and `now()` as an integer number of
milliseconds.
"""
function timestamp()
    since_epoch = now() - unix2datetime(0)
    return since_epoch.value
end
"""
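    ts_to_time(ts)

Convert a millisecond timestamp, as produced by `timestamp()`, back into a `DateTime`.
`ts_to_time(timestamp())` is approximately equal to `now()`.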
"""
function ts_to_time(ts)
    # `ts` is divided by 1000 because `unix2datetime` takes seconds.
    seconds = ts / 1000
    return unix2datetime(seconds)
end
"""
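    Claim

A single claim that supports a misinformation narrative, together with its counterclaim.

# Fields
- `id`: unique identifier for the claim
- `claim`: text of the claim
- `counterclaim`: text of the counterclaim
- `claimembedding`: embedding of the claim, or `nothing`
- `counterclaimembedding`: embedding of the counterclaim, or `nothing`
- `created_at`: time the claim was created (integer timestamp)
- `updated_at`: time the claim was last updated (integer timestamp)
- `source`: source of the claim
- `keywords`: keywords associated with the claim, or `nothing`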
"""
mutable struct Claim
    # Identity and text
    id::String
    claim::String
    counterclaim::String
    # Embeddings; `nothing` is allowed for both
    claimembedding::Union{Nothing, Vector{Float32}}
    counterclaimembedding::Union{Nothing, Vector{Float32}}
    # Creation / last-update times stored as integers
    created_at::Int64
    updated_at::Int64
    # Provenance and tagging; `keywords` may be `nothing`
    source::String
    keywords::Union{Nothing, Vector{String}}
end
"""
    createClaim(claim::String, counterclaim::String, source::String, keywords::Array{String, 1})

Create a new `Claim` from the given claim, counterclaim, source, and keywords.
The claim and counterclaim embeddings are set to `nothing`.

Example:

    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post",
                ["solar", "soil"])
"""
function createClaim(claim::String, counterclaim::String, source::String,
                     keywords::Array{String, 1}=String[])
    # `keywords` is optional so that a three-argument call (claim, counterclaim, source)
    # works; the default is a fresh empty vector on every call.
    #
    # The clock is read once so that a newly created claim always has
    # created_at == updated_at.
    created = timestamp()
    # Both embeddings start as `nothing`.
    return Claim(randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords)
end
"""
    Narrative

A collection of claims that support a misinformation narrative.

# Fields
- `id`: unique identifier for the narrative
- `title`: descriptive title of the narrative
- `topic`: broad type of narrative (e.g., anti-semitism)
- `target`: target group/topic of the narrative
- `narrativesummary`: base narrative text
- `claims`: list of `Claim` objects

Example:

    example_narrative = Narrative(
        randid(),
        "Jews killed Jesus",
        "Anti-semitism",
        "Jews",
        "Jews are responsible for the death of Jesus",
        Claim[])  # `claims` must be a vector of `Claim`; start with an empty one
"""
mutable struct Narrative
    id::String
    # Descriptive fields, all plain text
    title::String             # e.g., Jews killed Jesus
    topic::String             # broad type of narrative, e.g., anti-semitism
    target::String            # target group/topic of the narrative
    narrativesummary::String  # e.g., Jews are responsible for the death of Jesus
    # Claims attached to this narrative
    claims::Array{Claim, 1}
end
"""
    NarrativeSet

A collection of narratives.

TODO: once there are many narratives, applying a `NarrativeSet` over a database should
perform classification using all of its narratives.
"""
mutable struct NarrativeSet
    # The narratives that make up the set.
    narratives::Array{Narrative, 1}
end
import Base: show
"""
    show(io::IO, narrative::Narrative)

Print a human-readable summary of `narrative` to `io`: title, topic, target and summary,
one per line, followed by a `Claims:` heading and one line per claim text.
"""
function Base.show(io::IO, narrative::Narrative)
    header_lines = (
        "Narrative: $(narrative.title)",
        "Topic: $(narrative.topic)",
        "Target: $(narrative.target)",
        "Narrative Summary: $(narrative.narrativesummary)",
        "Claims:",
    )
    foreach(line -> println(io, line), header_lines)
    foreach(claim -> println(io, " - $(claim.claim)"), narrative.claims)
    return nothing
end
"""
add_claim!(narrative::Narrative, claim::Claim)
Add a claim to a narrative.
Example:
add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    # Appends in place; the updated claims vector is returned, as `push!` does.
    existing = narrative.claims
    return push!(existing, claim)
end
"""
    remove_claim!(narrative::Narrative, claim_id::String)

Drop every claim of `narrative` whose `id` equals `claim_id`.

The narrative's `claims` field is rebound to a new, filtered vector, which is also the
return value.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    remaining = filter(narrative.claims) do candidate
        candidate.id != claim_id
    end
    narrative.claims = remaining
    return remaining
end
"""
    narrative_to_dataframe(narrative::Narrative)

Flatten `narrative` into a `DataFrame` with one row per claim.

The columns are `narrative_title` (the narrative's title) followed by one column per
`Claim` field: `id`, `claim`, `counterclaim`, `claimembedding`, `counterclaimembedding`,
`created_at`, `updated_at`, `source`, and `keywords`.
"""
function narrative_to_dataframe(narrative::Narrative)
    claims = narrative.claims
    return DataFrame(
        narrative_title = narrative.title,
        id = map(c -> c.id, claims),
        claim = map(c -> c.claim, claims),
        counterclaim = map(c -> c.counterclaim, claims),
        claimembedding = map(c -> c.claimembedding, claims),
        counterclaimembedding = map(c -> c.counterclaimembedding, claims),
        created_at = map(c -> c.created_at, claims),
        updated_at = map(c -> c.updated_at, claims),
        source = map(c -> c.source, claims),
        keywords = map(c -> c.keywords, claims),
    )
end
"""
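Collapse a data frame of claims into a `Narrative`.

Each row becomes one `Claim`, built from the columns `id`, `claim`, `counterclaim`,
`claimembedding`, `counterclaimembedding`, `created_at`, `updated_at`, `source`, and
`keywords`. The narrative receives a new id from `randid()`; the data frame carries no
topic or target, so those are empty strings unless supplied separately.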
"""
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String;
                                topic::String="", target::String="")
    # `topic` and `target` are not stored in the data frame. They default to empty
    # strings, and callers can pass them as keywords to restore a narrative in full.

    # One Claim per row, with fields read from the identically named columns.
    to_claim(row) = Claim(row.id, row.claim, row.counterclaim, row.claimembedding,
                          row.counterclaimembedding, row.created_at, row.updated_at,
                          row.source, row.keywords)

    # The typed comprehension yields a Vector{Claim} even when `df` has no rows.
    claims = Claim[to_claim(row) for row in eachrow(df)]

    # The narrative always gets a newly generated id.
    return Narrative(randid(), narrative_title, topic, target, narrative_summary, claims)
end
"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims whose claim text repeats that of an earlier claim, keeping the first
occurrence of each text and preserving the original order.

`narrative.claims` is rebound to a new vector only when at least one duplicate is found.
Returns `narrative`.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # Survivors are chosen by position, not by id. Removing duplicates by id would also
    # delete the first occurrence whenever the duplicates share an id, for example when
    # the same Claim object was added twice.
    deduplicated = unique(c -> c.claim, narrative.claims)
    if length(deduplicated) != length(narrative.claims)
        narrative.claims = deduplicated
    end
    return narrative
end
"""
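    candidate_embeddings(candidates::DataFrame; textcol="text", model_id="text-embedding-3-small")

Embed the text column of `candidates` and store the result in an `"Embeddings"` column.

- Input: data frame of candidate texts; `textcol` names the column that holds the text
- Output: the same data frame (modified in place) with one embedding per row in `"Embeddings"`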
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    # Recognised keywords: `model_id` (embedding model name) and `textcol` (text column).
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    # Normalise to a String so that a Symbol column name can be compared with `names`.
    textcol = string(get(kwargs, :textcol, "text"))

    # The parentheses matter: `!textcol in names(...)` parses as `(!textcol) in names(...)`
    # and throws a MethodError because `!` is not defined for strings.
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end

    # NOTE(review): `create_chunked_embeddings` is not defined in the visible part of this
    # file; confirm that it is provided elsewhere.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)

    # Re-collecting narrows the element type before the column is stored. The column is
    # added to `candidates` itself, so the caller's data frame is modified.
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
## Embeddings
"""
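    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Embed `texts` by calling `create_embeddings` on successive chunks of at most `chunk_size`
texts, and return all embeddings in input order.

Example:

    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)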
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    total = length(texts)
    # One entry per request; each entry is the list of embeddings from that response.
    batches = []
    for first_idx in 1:chunk_size:total
        last_idx = min(first_idx + chunk_size - 1, total)
        # NOTE(review): if `create_embeddings` is OpenAI.jl's, the model is its third
        # positional argument and extra keywords are forwarded with the request.
        # Confirm that passing `model_id` as a keyword actually selects the model.
        response = create_embeddings(ENV["OPENAI_API_KEY"],
                                     texts[first_idx:last_idx]; model_id=model_id)
        batch = [item["embedding"] for item in response.response["data"]]
        push!(batches, batch)
    end
    # Concatenate the per-request lists; with no input this is an empty `Any[]`.
    return vcat(batches...)
end
"""
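    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

Compute embeddings for every claim and counterclaim of `narrative` and store them on the
claims. The name ends in a bang because the narrative object is modified in place.

Example:

    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)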
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    claims = narrative.claims

    # Call the OpenAI helper defined in this file, in line with this function's name.
    # Both requests finish before any claim is touched, so a failed request leaves the
    # narrative unchanged.
    claim_embeddings = create_openai_chunked_embeddings([c.claim for c in claims])
    counterclaim_embeddings = create_openai_chunked_embeddings([c.counterclaim for c in claims])

    # Embeddings come back in input order, so index i belongs to claim i.
    for i in eachindex(claims)
        claims[i].claimembedding = claim_embeddings[i]
        claims[i].counterclaimembedding = counterclaim_embeddings[i]
    end
    return nothing
end
"""
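    candidate_openai_embeddings(candidates::DataFrame; textcol="text", model_id="text-embedding-3-small")

Embed the text column of `candidates` and store the result in an `"Embeddings"` column.

- Input: data frame of candidate texts; `textcol` names the column that holds the text
- Output: the same data frame (modified in place) with one embedding per row in `"Embeddings"`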
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    # Recognised keywords: `model_id` (embedding model name) and `textcol` (text column).
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    # Normalise to a String so that a Symbol column name can be compared with `names`.
    textcol = string(get(kwargs, :textcol, "text"))

    # The parentheses matter: `!textcol in names(...)` parses as `(!textcol) in names(...)`
    # and throws a MethodError because `!` is not defined for strings.
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end

    # Call the OpenAI helper defined in this file, in line with this function's name.
    cand_embeddings = create_openai_chunked_embeddings(candidates[:, textcol]; model_id=model_id)

    # Re-collecting narrows the element type before the column is stored. The column is
    # added to `candidates` itself, so the caller's data frame is modified.
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
|