Spaces:
Sleeping
Sleeping
## Structure of a Narrative
"""
    randid()

Return an id obtained by Sqids-encoding two random integers, each drawn from `1:100`.

Only 100 × 100 input combinations exist, so ids can repeat across calls.
"""
function randid()
    # Default Sqids configuration, built locally on every call
    sqids_config = Sqids.configure()
    parts = [rand(1:100), rand(1:100)]
    return Sqids.encode(sqids_config, parts)
end
"""
    timestamp()

Return the number of milliseconds between `unix2datetime(0)` and `now()` as an integer.

NOTE(review): `now()` is called without a time zone argument while the reference
point comes from `unix2datetime` — confirm that mixing the two is intended.
"""
function timestamp()
    elapsed = now() - unix2datetime(0)
    return elapsed.value
end
"""
    ts_to_time(ts)

Convert `ts`, a millisecond count such as the one returned by `timestamp()`,
into a `DateTime` by dividing by 1000 and passing the result to `unix2datetime`.

Example:
    ts_to_time(timestamp())   # ≈ now(), up to the time elapsed between the two calls
"""
ts_to_time(ts) = unix2datetime(ts / 1000)
"""
    Claim

Something that supports a misinformation narrative, stored together with its counterclaim.

Fields:
- `id`: unique identifier for the claim
- `claim`: text of the claim
- `counterclaim`: text of the counterclaim
- `claimembedding`: embedding of the claim, or `nothing`
- `counterclaimembedding`: embedding of the counterclaim, or `nothing`
- `created_at`: integer timestamp of when the claim was created
- `updated_at`: integer timestamp of when the claim was last updated
- `source`: source of the claim
- `keywords`: keywords associated with the claim, or `nothing`
"""
mutable struct Claim
    id::String
    # Texts
    claim::String
    counterclaim::String
    # Embeddings; `nothing` is allowed as a placeholder value
    claimembedding::Union{Nothing, Vector{Float32}}
    counterclaimembedding::Union{Nothing, Vector{Float32}}
    # Integer timestamps (`createClaim` fills them from `timestamp()`)
    created_at::Int64
    updated_at::Int64
    # Provenance and tagging
    source::String
    keywords::Union{Nothing, Vector{String}}
end
"""
    createClaim(claim::String, counterclaim::String, source::String)
    createClaim(claim::String, counterclaim::String, source::String, keywords)

Create a new Claim object with the given claim, counterclaim, source and
(optionally) keywords. `keywords` defaults to `nothing`.

The id comes from `randid()`, the claim and counterclaim embeddings are set to
nothing, and `created_at` / `updated_at` share a single `timestamp()` value.

Example:
    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post")
"""
function createClaim(claim::String, counterclaim::String, source::String,
                     keywords::Union{Array{String, 1}, Nothing}=nothing)
    id = randid()
    # Read the clock once so a fresh claim always has created_at == updated_at
    created = timestamp()
    return Claim(id, claim, counterclaim, nothing, nothing, created, created, source, keywords)
end
"""
    Narrative

A collection of claims that support a misinformation narrative.

Fields:
- `id`: unique identifier for the narrative
- `title`: descriptive title of the narrative
- `topic`: broad type of narrative
- `target`: target group/topic of the narrative
- `narrativesummary`: base narrative text
- `claims`: list of Claim objects

Example:
    example_narrative = Narrative(
        randid(),
        "Solar panels harm farmland",
        "Climate",
        "Solar energy",
        "Solar panels poison the soil and reduce crop yields",
        Claim[])
"""
mutable struct Narrative
    id::String
    # Descriptive fields
    title::String             # descriptive title
    topic::String             # broad type of narrative
    target::String            # target group/topic of the narrative
    narrativesummary::String  # base narrative text
    # Supporting material
    claims::Vector{Claim}     # list of Claim objects
end
"""
    NarrativeSet

Container holding several narratives.

TODO: once there are many narratives, applying a NarrativeSet over a database
should perform classification using all of the narratives it holds.
"""
mutable struct NarrativeSet
    narratives::Vector{Narrative}  # the narratives that make up the set
end
import Base: show | |
## Pretty-printing for Narrative: the header fields first, then one bullet per claim
function Base.show(io::IO, narrative::Narrative)
    println(io, "Narrative: $(narrative.title)")
    println(io, "Topic: $(narrative.topic)")
    println(io, "Target: $(narrative.target)")
    println(io, "Narrative Summary: $(narrative.narrativesummary)")
    println(io, "Claims:")
    foreach(narrative.claims) do entry
        println(io, " - $(entry.claim)")
    end
end
"""
    add_claim!(narrative::Narrative, claim::Claim)

Append `claim` to the end of `narrative.claims` and return that claims vector.

Example:
    add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    return push!(narrative.claims, claim)
end
"""
    remove_claim!(narrative::Narrative, claim_id::String)

Replace `narrative.claims` with a new vector that omits every claim whose `id`
equals `claim_id`, and return that new vector.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    kept = filter(narrative.claims) do c
        c.id != claim_id
    end
    narrative.claims = kept
    return kept
end
"""
    narrative_to_dataframe(narrative::Narrative)

Flatten a narrative into a DataFrame with one row per claim. Each Claim field
becomes a column, plus a `narrative_title` column carrying `narrative.title`.
The narrative's id, topic, target and summary are not included.
"""
function narrative_to_dataframe(narrative::Narrative)
    claims = narrative.claims
    return DataFrame(
        narrative_title = narrative.title,
        id = [c.id for c in claims],
        claim = [c.claim for c in claims],
        counterclaim = [c.counterclaim for c in claims],
        claimembedding = [c.claimembedding for c in claims],
        counterclaimembedding = [c.counterclaimembedding for c in claims],
        created_at = [c.created_at for c in claims],
        updated_at = [c.updated_at for c in claims],
        source = [c.source for c in claims],
        keywords = [c.keywords for c in claims],
    )
end
"""
    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)

Collapse a dataframe into a narrative: each row of `df` becomes one Claim, read
from the columns `id`, `claim`, `counterclaim`, `claimembedding`,
`counterclaimembedding`, `created_at`, `updated_at`, `source` and `keywords`.

The narrative gets a fresh `randid()` id and empty `topic` and `target` strings.
"""
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
    claims = Claim[]
    for row in eachrow(df)
        push!(claims, Claim(row.id, row.claim, row.counterclaim,
                            row.claimembedding, row.counterclaimembedding,
                            row.created_at, row.updated_at,
                            row.source, row.keywords))
    end
    return Narrative(randid(), narrative_title, "", "", narrative_summary, claims)
end
"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims whose `claim` text repeats that of an earlier claim. The first
occurrence of each text is kept and the order of the remaining claims is
preserved. `narrative.claims` is replaced and the narrative is returned.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # Deduplicate on the text itself instead of collecting ids and removing by id:
    # when two claims share an id, id-based removal also drops the occurrence that
    # was meant to be kept (and any unrelated claim carrying the same id).
    narrative.claims = unique(c -> c.claim, narrative.claims)
    return narrative
end
"""
    candidate_embeddings(candidates::DataFrame; kwargs...)

Embed the text column of `candidates` with `create_chunked_embeddings` and store
the result in an "Embeddings" column.

- Input: dataframe of candidate texts
- Keywords: `model_id` (default "text-embedding-3-small"), `textcol` (default "text")
- Output: the same dataframe, modified in place, with the "Embeddings" column added

Raises an error when `textcol` is not a column of `candidates`.

Example:
    cand = candidate_embeddings(df; textcol="message")
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # `!textcol in names(...)` applies `!` to textcol before `in` and throws a
    # MethodError, so the membership test is parenthesised. `string` lets a Symbol
    # column name match the String names of the dataframe.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): create_chunked_embeddings is not defined in this section — confirm it exists elsewhere
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
## Embeddings
"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Request embeddings for `texts` in consecutive slices of at most `chunk_size`
elements, calling `create_embeddings` with `ENV["OPENAI_API_KEY"]` for each
slice, and return the embeddings of all slices concatenated in input order.

Example:
    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    total = length(texts)
    batches = []
    ## Walk over the data one chunk at a time
    for first_idx in 1:chunk_size:total
        last_idx = min(first_idx + chunk_size - 1, total)
        resp = create_embeddings(ENV["OPENAI_API_KEY"],
                                 texts[first_idx:last_idx]; model_id=model_id)
        push!(batches, [item["embedding"] for item in resp.response["data"]])
    end
    return vcat(batches...)
end
"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

Compute embeddings for every claim text and then for every counterclaim text of
`narrative` via `create_chunked_embeddings`, storing them in each claim's
`claimembedding` and `counterclaimembedding` fields.

- bang because it modifies the narrative object in place
- returns `nothing`

Example:
    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    ## claim embeddings: computed and stored before the counterclaims are requested
    claim_vecs = create_chunked_embeddings([c.claim for c in narrative.claims])
    for i in 1:length(narrative.claims)
        narrative.claims[i].claimembedding = claim_vecs[i]
    end
    ## counterclaim embeddings
    counter_vecs = create_chunked_embeddings([c.counterclaim for c in narrative.claims])
    for i in 1:length(narrative.claims)
        narrative.claims[i].counterclaimembedding = counter_vecs[i]
    end
    return nothing
end
"""
    candidate_openai_embeddings(candidates::DataFrame; kwargs...)

Embed the text column of `candidates` with `create_chunked_embeddings` and store
the result in an "Embeddings" column.

- Input: dataframe of candidate texts
- Keywords: `model_id` (default "text-embedding-3-small"), `textcol` (default "text")
- Output: the same dataframe, modified in place, with the "Embeddings" column added

Raises an error when `textcol` is not a column of `candidates`.

Example:
    cand = candidate_openai_embeddings(df; textcol="message")
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # `!textcol in names(...)` applies `!` to textcol before `in` and throws a
    # MethodError, so the membership test is parenthesised. `string` lets a Symbol
    # column name match the String names of the dataframe.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): create_chunked_embeddings is not defined in this section — confirm it exists elsewhere
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end