File size: 9,183 Bytes
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
## Structure of a Narrative 

"""
    randid()

Return a Sqids-encoded identifier built from two random integers, each drawn
from `1:100`.

NOTE(review): only 100 × 100 number pairs can be drawn, so identifiers are not
guaranteed to be unique.
"""
function randid()
    sqids_config = Sqids.configure()  # default configuration, rebuilt on every call
    numbers = [rand(1:100) for _ in 1:2]
    return Sqids.encode(sqids_config, numbers)
end

"""
    timestamp()

Return the number of milliseconds elapsed between `unix2datetime(0)` and `now()`
as an integer.

The inverse conversion is `ts_to_time`.
"""
function timestamp()
    epoch = unix2datetime(0)
    elapsed = now() - epoch  # a millisecond period
    return elapsed.value
end

"""
    ts_to_time(ts)

Convert a millisecond timestamp, as produced by `timestamp()`, back to a `DateTime`.

Round trip: `ts_to_time(timestamp()) == now()` (to millisecond resolution).
"""
function ts_to_time(ts)
    seconds = ts / 1000  # `unix2datetime` expects seconds, `ts` is in milliseconds
    return unix2datetime(seconds)
end

"""
    Claim: something that supports a misinformation narrative

    id: unique identifier for the claim
    claim: text of the claim
    counterclaim: text of the counterclaim
    claimembedding: embedding of the claim, or nothing when not computed yet
    counterclaimembedding: embedding of the counterclaim, or nothing when not computed yet
    created_at: integer timestamp of when the claim was created
    updated_at: integer timestamp of when the claim was last updated
    source: source of the claim
    keywords: keywords associated with the claim, or nothing
"""
mutable struct Claim
    id::String
    claim::String
    counterclaim::String
    # Embeddings are optional: `nothing` until they have been generated.
    claimembedding::Union{Vector{Float32}, Nothing}
    counterclaimembedding::Union{Vector{Float32}, Nothing}
    # Integer timestamps (see `timestamp()`).
    created_at::Int64
    updated_at::Int64
    source::String
    keywords::Union{Vector{String}, Nothing}
end

"""
    createClaim(claim::String, counterclaim::String, source::String, keywords=String[])

    Create a new Claim object with the given claim, counterclaim, source and keywords.
    The claim and counterclaim embeddings are set to nothing by default.
    `keywords` is optional and defaults to an empty list.
    `created_at` and `updated_at` are set to the same timestamp.

    Example:
    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post")

    createClaim("Solar panels poison the soil and reduce crop yields",
                "There is no evidence that solar panels poison the soil or reduce crop yields",
                "Facebook post",
                ["solar", "soil"])
"""
function createClaim(claim::String, counterclaim::String, source::String,
                     keywords::Array{String, 1}=String[])
    # Take the clock reading once so a brand-new claim never shows
    # `updated_at` different from `created_at`.
    created = timestamp()
    return Claim(randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords)
end


"""
    Narrative: a collection of claims that support a misinformation narrative

    id: unique identifier for the narrative
    title: descriptive title of the narrative
    topic: broad type of narrative (e.g., anti-semitism)
    target: target group/topic of the narrative
    narrativesummary: base narrative text
    claims: list of Claim objects (use `Claim[]` for a narrative without claims yet)

    Example: 
    example_narrative = Narrative(
        randid(),
        "Jews killed Jesus",
        "Anti-semitism",
        "Jews", 
        "Jews are responsible for the death of Jesus", 
        Claim[])
"""
mutable struct Narrative
    id::String
    title::String  # descriptive title (e.g., Jews killed Jesus)
    topic::String  # broad type of narrative (e.g., anti-semitism)
    target::String  # target group/topic of the narrative
    narrativesummary::String  # base narrative text (e.g., Jews are responsible for the death of Jesus)
    claims::Vector{Claim}  # list of Claim objects; grown by add_claim!, replaced by remove_claim!
end

"""
    NarrativeSet: a container holding several narratives

    narratives: list of Narrative objects

    TODO: when there are many narratives, group them in a NarrativeSet. Applying a
    narrative set over a database is intended to perform classification using all
    of its narratives (not implemented in this part of the file).
"""
mutable struct NarrativeSet 
    narratives::Vector{Narrative}
end

import Base: show
"""
    show(io::IO, narrative::Narrative)

Print a readable, multi-line summary of `narrative` to `io`: title, topic,
target and summary, followed by one bullet line per claim text.
"""
function show(io::IO, narrative::Narrative)
    header_lines = [
        "Narrative: $(narrative.title)",
        "Topic: $(narrative.topic)",
        "Target: $(narrative.target)",
        "Narrative Summary: $(narrative.narrativesummary)",
        "Claims:",
    ]
    foreach(line -> println(io, line), header_lines)
    # One bullet per claim, in stored order.
    foreach(entry -> println(io, "  - $(entry.claim)"), narrative.claims)
    return nothing
end

"""
    add_claim!(narrative::Narrative, claim::Claim)

    Add a claim to the end of a narrative's claim list, modifying the narrative in place.
    Returns the narrative's updated list of claims.

    Example: 
    add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    # No blank line may separate the docstring from this definition,
    # otherwise the docstring is not attached to the function.
    return push!(narrative.claims, claim)
end

"""
    remove_claim!(narrative::Narrative, claim_id::String)

Remove every claim whose `id` equals `claim_id` from `narrative`.

The narrative's `claims` field is replaced by a new, filtered list, which is
also the return value.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    remaining = filter(narrative.claims) do existing
        existing.id != claim_id
    end
    narrative.claims = remaining
    return remaining
end

"""
    narrative_to_dataframe(narrative::Narrative)

Flatten `narrative` into a `DataFrame` with one row per claim.

The columns are `narrative_title` (the narrative's title, given as a single
value alongside the per-claim columns) followed by one column per `Claim` field:
`id`, `claim`, `counterclaim`, `claimembedding`, `counterclaimembedding`,
`created_at`, `updated_at`, `source` and `keywords`.
"""
function narrative_to_dataframe(narrative::Narrative)
    # Gather each Claim field into its own column vector first.
    ids = [c.id for c in narrative.claims]
    claim_texts = [c.claim for c in narrative.claims]
    counter_texts = [c.counterclaim for c in narrative.claims]
    claim_vectors = [c.claimembedding for c in narrative.claims]
    counter_vectors = [c.counterclaimembedding for c in narrative.claims]
    created = [c.created_at for c in narrative.claims]
    updated = [c.updated_at for c in narrative.claims]
    sources = [c.source for c in narrative.claims]
    keyword_lists = [c.keywords for c in narrative.claims]

    return DataFrame(
        narrative_title = narrative.title,
        id = ids,
        claim = claim_texts,
        counterclaim = counter_texts,
        claimembedding = claim_vectors,
        counterclaimembedding = counter_vectors,
        created_at = created,
        updated_at = updated,
        source = sources,
        keywords = keyword_lists,
    )
end

"""
    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)

Collapse a dataframe into a narrative.

Every row of `df` becomes one `Claim`, read from the columns `id`, `claim`,
`counterclaim`, `claimembedding`, `counterclaimembedding`, `created_at`,
`updated_at`, `source` and `keywords`. The returned `Narrative` gets a fresh
`randid()` identifier, the given title and summary, and empty `topic` and
`target` strings.
"""
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
    rebuilt = Claim[]
    for r in eachrow(df)
        entry = Claim(
            r.id,
            r.claim,
            r.counterclaim,
            r.claimembedding,
            r.counterclaimembedding,
            r.created_at,
            r.updated_at,
            r.source,
            r.keywords,
        )
        push!(rebuilt, entry)
    end
    # Topic and target are not taken from the dataframe, so they are left empty.
    return Narrative(randid(), narrative_title, "", "", narrative_summary, rebuilt)
end

"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Drop claims whose `claim` text repeats the text of an earlier claim, keeping the
first occurrence of each text in its original position.

Duplicates are identified by text only, never by `id`, so two copies that happen
to share an id are still reduced to exactly one claim.

# Returns
The same `narrative`, modified in place. Its `claims` list is only replaced when
at least one duplicate was found.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    # `unique(f, itr)` keeps the first element for every distinct value of `f`,
    # in a single pass and preserving order.
    first_occurrences = unique(entry -> entry.claim, narrative.claims)
    # Leave the existing list untouched when nothing was removed.
    if length(first_occurrences) != length(narrative.claims)
        narrative.claims = first_occurrences
    end
    return narrative
end

"""
    candidate_embeddings(candidates::DataFrame; model_id="text-embedding-3-small", textcol="text")

Compute embeddings for the text column of `candidates` and store them in an
`"Embeddings"` column.

- Input: `candidates`, a dataframe holding the candidate texts. It is modified in
  place even though the function name carries no `!`.
- Keyword `model_id`: forwarded to `create_chunked_embeddings`.
- Keyword `textcol`: name of the text column (string or symbol), default `"text"`.
- Output: the same `candidates` dataframe with the `"Embeddings"` column set.
- Throws an error when `textcol` is not a column of `candidates`.

Example:
    cand = candidate_embeddings(df; textcol="message")
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # Unary `!` binds tighter than `in`, so the membership test must be
    # parenthesised; `names` returns strings, hence the `string(...)`.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): `create_chunked_embeddings` is not defined in this part of
    # the file; it is assumed to be provided elsewhere.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
## Embeddings 

"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Request embeddings for `texts` in consecutive chunks of at most `chunk_size`
items and return the per-text embeddings concatenated in input order.

Each chunk is sent through `create_embeddings` using the API key read from
`ENV["OPENAI_API_KEY"]`.

Example:
    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    n_texts = length(texts)
    per_chunk = []
    for lo in 1:chunk_size:n_texts
        # The last chunk may be shorter than `chunk_size`.
        hi = min(lo + chunk_size - 1, n_texts)
        reply = create_embeddings(ENV["OPENAI_API_KEY"], texts[lo:hi]; model_id=model_id)
        chunk_vectors = [item["embedding"] for item in reply.response["data"]]
        push!(per_chunk, chunk_vectors)
    end
    # Flatten the list of chunks into one list of embeddings.
    return vcat(per_chunk...)
end

"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

Compute embeddings for every claim and counterclaim text of `narrative` and
store them on the claims (`claimembedding`, `counterclaimembedding`).

- bang because it modifies the narrative object in place
- returns nothing

Example:
    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    ## claim embeddings
    claim_texts = [entry.claim for entry in narrative.claims]
    claim_vectors = create_chunked_embeddings(claim_texts)
    for (idx, entry) in enumerate(narrative.claims)
        entry.claimembedding = claim_vectors[idx]
    end
    ## counterclaim embeddings
    counter_texts = [entry.counterclaim for entry in narrative.claims]
    counter_vectors = create_chunked_embeddings(counter_texts)
    for (idx, entry) in enumerate(narrative.claims)
        entry.counterclaimembedding = counter_vectors[idx]
    end
    return nothing
end

"""
    candidate_openai_embeddings(candidates::DataFrame; model_id="text-embedding-3-small", textcol="text")

Compute embeddings for the text column of `candidates` and store them in an
`"Embeddings"` column.

- Input: `candidates`, a dataframe holding the candidate texts. It is modified in
  place even though the function name carries no `!`.
- Keyword `model_id`: forwarded to `create_chunked_embeddings`.
- Keyword `textcol`: name of the text column (string or symbol), default `"text"`.
- Output: the same `candidates` dataframe with the `"Embeddings"` column set.
- Throws an error when `textcol` is not a column of `candidates`.

Example:
    cand = candidate_openai_embeddings(df; textcol="message")
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # check if text column exists
    # Unary `!` binds tighter than `in`, so the membership test must be
    # parenthesised; `names` returns strings, hence the `string(...)`.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): this calls `create_chunked_embeddings`, which is not defined
    # in this part of the file; confirm whether
    # `create_openai_chunked_embeddings` was intended here.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end