## Embeddings
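"""
## Round-trip sketch for `string_to_float32_vector` (illustrative values; assumes
## the embedding column was written with Julia's default `string` of a Float32 vector)
v = Float32[0.1, 0.25, -3.5]
s = string(v)                      # "Float32[0.1, 0.25, -3.5]"
string_to_float32_vector(s) == v   # true
"""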
function string_to_float32_vector(str::String)::Vector{Float32}
    # Remove the "Float32[" prefix and the "]" suffix. Stripping a character
    # set would also eat digits at the ends of the data (e.g. the trailing
    # "32" in "...2.32]"), so replace the exact delimiters instead
    str = replace(strip(str), "Float32[" => "", "]" => "")
    # Replace Julia's 'f' exponent with 'e' so parse accepts scientific notation
    str = replace(str, 'f' => 'e')
    # Split the string by commas and parse each element as a Float32
    return Float32[parse(Float32, strip(el)) for el in split(str, ",")]
end
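"""
## Sketch: convert a DataFrame column of stringified embeddings into a d × n
## matrix, one column per DataFrame row (toy data, not a real file)
using DataFrames
df = DataFrame(Embeddings = [string(rand(Float32, 4)) for _ in 1:3])
M = dfdat_to_matrix(df, :Embeddings)   # 4×3 Matrix{Float32}
"""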
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # Parse each row's stringified embedding and hcat into a d × nrow matrix
    return hcat([string_to_float32_vector(row[col]) for row in eachrow(df)]...)
end
"""
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged.
#Example:
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
function create_chunked_text(text; chunk_size=280)
## Chunk the data
chunks = []
for chunk in 1:chunk_size:length(text)
push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
end
return chunks
end
"""
function create_chunked_text(text::String; chunk_size::Int=280)
    # Partition the valid character indices into blocks of at most
    # `chunk_size`, then slice the string on those boundaries; this is
    # UTF-8 safe and gives chunks of exactly `chunk_size` characters
    # (the last one may be shorter)
    chunks = String[]
    for block in Iterators.partition(collect(eachindex(text)), chunk_size)
        push!(chunks, text[first(block):last(block)])
    end
    return chunks
end
"""
## Embeddings of text from the small encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings(text)
"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        # fall back to a zero vector matching the small encoder's 384 dimensions
        println("Error: ", e)
        return zeros(Float32, 384)
    end
end
"""
## Embeddings of text from the large encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings_big(text)
LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        # fall back to a zero vector matching the large encoder's 768 dimensions
        println("Error: ", e)
        return zeros(Float32, 768)
    end
end
"""
# This is the core function: it takes a string of any length and returns its averaged chunk embedding
text = repeat("This is a test. ", 100)
mini_embed(text)
# Test to embed truthseeker subsample
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds
ts.Embeddings = ts_embed
CSV.write("data/truthseeker_subsample_embed.csv", ts)
## embed fact check data
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# drop missing text
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed.csv", fc)
narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
# drop missing text
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)
"""
function mini_embed(text::String)
    # embed each chunk with the small encoder and average the chunk embeddings
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings.(chunked_text)
    return mean(embeddings)
end
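"""
## Same pattern with the large (768-dimensional) encoder, e.g.:
text = repeat("This is a test. ", 100)
maxi_embed(text)
"""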
function maxi_embed(text::String)
    # embed each chunk with the large encoder and average the chunk embeddings
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings_large.(chunked_text)
    return mean(embeddings)
end
"""
# Get distance and classification
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # for each target (row), return the smallest distance and the index of
    # the narrative column that achieves it
    idx = vec(argmin(distances, dims=2))
    return distances[idx], last.(Tuple.(idx))
end
"""
# Get the dot product of the two matrices
ind, scores = dotproduct_distances(fc_embed, ts_embed)
ts.scores = scores
# Group by target and average the scores
ts_grouped = combine(groupby(ts, :target), :scores => mean)
# show the matched text
ts.fc_text = fc.text[ind]
"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # dot every narrative column with every target column: an N × M score matrix
    dprods = narrative_matrix' * target_matrix
    # for each target (column), the best-matching narrative row and its score
    max_dot = argmax(dprods, dims=1)[1, :]
    return first.(Tuple.(max_dot)), dprods[max_dot]
end
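"""
## Top-k retrieval sketch with synthetic data (dimensions are illustrative):
## columns of the narrative matrix are embeddings, the query is a single vector
M = rand(Float32, 384, 100)
q = rand(Float32, 384)
inds, scores = dotproduct_topk(M, q, 5)
"""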
function dotproduct_topk(narrative_matrix, target_vector, k)
    # dot every narrative column with the target vector
    dprods = narrative_matrix' * target_vector
    # indices of the top k dot products (partialsortperm avoids a full sort)
    topk = partialsortperm(dprods, 1:k, rev=true)
    return topk, dprods[topk]
end
"""
# Get the top k scores
using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)
OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
## How fast to get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    # embed the query with the large encoder; `narrative_matrix` must hold
    # embeddings from the same encoder (swap in mini_embed for the small one)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    # note: this adds a placeholder column to the caller's DataFrame
    if !hasproperty(narratives, :claimReviewUrl)
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    vec_of_dicts = [Dict("score" => scores[i],
                         "text" => narratives.text[ind],
                         "claimUrl" => narratives.claimReviewUrl[ind],
                         "policy" => policy[i],
                         "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end
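"""
## Sketch: load the precomputed fact-check embeddings (assumes the default CSV
## written by the mini_embed run above exists) and query them
fc_embed, fc = load_fasttext_embeddings()
inds, scores = dotproduct_topk(fc_embed, mini_embed("This is a test."), 5)
fc.text[inds]
"""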
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    # read precomputed embeddings and return them as a matrix plus the DataFrame
    fc = CSV.read(file, DataFrame)
    fc_embed = dfdat_to_matrix(fc, :Embeddings)
    return fc_embed, fc
end