File size: 6,789 Bytes
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
 
 
 
 
48bb68b
 
 
 
05a2a0c
48bb68b
 
 
 
 
 
05a2a0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
 
 
 
 
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
48bb68b
 
 
 
 
 
 
 
05a2a0c
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
## Embeddings 

"""
    string_to_float32_vector(str::String)::Vector{Float32}

Parse a string of the form `"Float32[1.0, 2.3, 1.5f-8]"` (the `string()`
representation of a `Vector{Float32}`, e.g. read back from a CSV cell) into
a `Vector{Float32}`. Returns an empty vector for `"Float32[]"`.
"""
function string_to_float32_vector(str::String)::Vector{Float32}
    s = strip(str)
    # Remove the "Float32" prefix explicitly. The previous character-set
    # strip(['F','l','o','a','t','3','2','[',']']) also removed leading or
    # trailing '2'/'3' digits belonging to the data, silently corrupting
    # values such as "2.3]" -> "2.".
    if startswith(s, "Float32")
        s = s[8:end]  # "Float32" is ASCII, so byte index 8 is safe
    end
    s = strip(s, ['[', ']'])
    # Float32 literals use 'f' exponents (e.g. "1.5f-8"); parse needs 'e'.
    s = replace(s, 'f' => 'e')
    # Empty payload ("Float32[]") would make parse("") throw.
    isempty(strip(s)) && return Float32[]
    return Float32[parse(Float32, strip(el)) for el in split(s, ",")]
end

"""
    dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}

Parse the stringified embedding vectors in column `col` of `df` (see
`string_to_float32_vector`) and concatenate them into a matrix with one
embedding per column.
"""
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # reduce(hcat, ...) avoids splatting one argument per row into hcat,
    # which is slow and stack-hungry for large DataFrames.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end

"""
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged. 

#Example: 
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
function create_chunked_text(text; chunk_size=280)
    ## Chunk the data
    chunks = []
    for chunk in 1:chunk_size:length(text)
        push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
    end
    return chunks
end
"""

function create_chunked_text(text::String; chunk_size::Int=280)
    chunks = []
    start_idx = 1
    while start_idx <= lastindex(text)
        end_idx = start_idx
        for _ in 1:chunk_size
            end_idx = nextind(text, end_idx, 1)
            if end_idx > lastindex(text)
                end_idx = lastindex(text)
                break
            end
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end

"""
## Embeddings of text from the small encoder

text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings(text)

"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        println("Error: ", e)
        return zeros(Float32, 384)
    end
end

"""
## Embeddings of text from the large encoder

text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings_big(text)
LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        println("Error: ", e)
        return zeros(Float32, 768)
    end
end

"""
# This is the core function - takes in a string of any length and returns the embeddings

text = repeat("This is a test. ", 100)
mini_embed(text)

# Test to embed truthseeker subsample 
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds 
ts.Embeddings = ts_embed
CSV.write("data/truthseeker_subsample_embed.csv", ts)

## embed fact check data 
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# drop missing text 
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed.csv", fc)

narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
# drop missing text 
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run 
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)

"""
function mini_embed(text::String)
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings.(chunked_text)
    mean(embeddings)
end

"""
    maxi_embed(text::String)

Chunk `text` into ≤280-character pieces, embed each with the large
(768-dim) encoder, and return the mean of the chunk embeddings.
"""
function maxi_embed(text::String)
    pieces = create_chunked_text(text)
    return mean(map(generate_embeddings_large, pieces))
end

"""
# Get distance and classification 

ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # get the index of the column with the smallest distance
    return distances[argmin(distances, dims=2)][:, 1], argmin(distances, dims=2)[:, 1]
end

"""
# Get the dot product of the two matrices

ind, scores = dotproduct_distances(fc_embed, ts_embed)

ts.scores = scores

# Group by target and get the max score 
ts_grouped = combine(groupby(ts, :target), :scores => mean)
# show the matched text 
ts.fc_text = fc.text[ind]

"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # multiply each column of the narrative matrix by the target vector
    dprods = narrative_matrix' * target_matrix    
    # get maximum dotproduct and index of the row
    max_dot = argmax(dprods, dims=1)[1, :]
    return first.(Tuple.(max_dot)), dprods[max_dot]
end

"""
    dotproduct_topk(narrative_matrix, target_vector, k)

Return `(indices, scores)` of the `k` largest dot products between the
columns of `narrative_matrix` and `target_vector`, sorted descending.
"""
function dotproduct_topk(narrative_matrix, target_vector, k)
    # dprods[i] = dot(narrative column i, target_vector)
    dprods = narrative_matrix' * target_vector
    # partialsortperm finds only the top k (O(n + k log k)) instead of fully
    # sorting all n scores with sortperm (O(n log n)). collect() keeps the
    # return type a plain Vector{Int}, as before.
    topk = collect(partialsortperm(dprods, 1:k; rev=true))
    return topk, dprods[topk]
end

"""
# Get the top k scores

using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)

OC.fast_topk(fc_embed, fc, ts.statement[1], 5)

## How fast to get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds 
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    #target_vector = mini_embed(text)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds] 
    else 
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    if !hasproperty(narratives, :claimReviewUrl)
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    vec_of_dicts = [Dict("score" => scores[i], 
                        "text" => narratives.text[ind], 
                        "claimUrl" => narratives.claimReviewUrl[ind], 
                        "policy" => policy[i], 
                        "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end

"""
    load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")

Read a CSV of pre-embedded fact-check rows and return `(embeddings, table)`,
where `embeddings` is the matrix parsed from the `:Embeddings` column
(one embedding per column) and `table` is the full `DataFrame`.
"""
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end