# misinfo_detection_app/src/Embeddings.jl
## Embeddings
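# NOTE (assumption, not in the original file): the functions below rely on CSV.read,
# DataFrame/eachrow, pairwise/CosineDist, and mean. In the package these are presumably
# imported by the enclosing module, which is also assumed to provide the `Encoder`
# sub-module used below; for standalone use something like the following would be needed:
# using CSV, DataFrames, Distances, Statistics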
function string_to_float32_vector(str::AbstractString)::Vector{Float32}
    # Remove the "Float32[" prefix and the "]" suffix. (A character-set `strip` would also
    # eat trailing digits such as '2' or '3' from the last element, corrupting values.)
    str = replace(str, r"^\s*(Float32)?\[" => "")
    str = replace(str, r"\]\s*$" => "")
    # Replace 'f' with 'e' so Float32 literals like "1.0f0" parse as scientific notation
    str = replace(str, 'f' => 'e')
    # Split on commas and parse each element into a Float32
    return Float32[parse(Float32, strip(el)) for el in split(str, ",")]
end
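"""
## Convert a DataFrame column of stringified Float32 vectors into a d×n matrix, one column per row.
## Hedged usage sketch (toy values, not from the original data):
df = DataFrame(Embeddings = ["Float32[1.0, 2.0]", "Float32[3.0, 4.0]"])
M = dfdat_to_matrix(df, :Embeddings)  # 2×2 Matrix{Float32}
"""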
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # One embedding per row, stacked as columns; reduce(hcat, ...) avoids splatting a large collection
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end
"""
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged.
#Example:
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
function create_chunked_text(text; chunk_size=280)
## Chunk the data
chunks = []
for chunk in 1:chunk_size:length(text)
push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
end
return chunks
end
"""
function create_chunked_text(text::String; chunk_size::Int=280)
    chunks = String[]
    start_idx = 1
    while start_idx <= lastindex(text)
        # Advance character by character with nextind so multi-byte characters are never
        # split; each chunk holds at most `chunk_size` characters.
        end_idx = start_idx
        for _ in 1:chunk_size-1
            next_idx = nextind(text, end_idx)
            if next_idx > lastindex(text)
                break
            end
            end_idx = next_idx
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end
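# Hedged sketch (not in the original file): chunk boundaries respect multi-byte characters,
# e.g. create_chunked_text("αβγδε"; chunk_size=2) == ["αβ", "γδ", "ε"]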
"""
## Embeddings of text from the small encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings(text)
"""
function generate_embeddings(text::String)
try
return Encoder.get_embeddings(text)
catch e
println("Error: ", e)
return zeros(Float32, 384)
end
end
"""
## Embeddings of text from the large encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings_big(text)
LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
try
return Encoder.get_embeddings_big(text)
catch e
println("Error: ", e)
return zeros(Float32, 768)
end
end
"""
# This is the core function - takes in a string of any length and returns the embeddings
text = repeat("This is a test. ", 100)
mini_embed(text)
# Test to embed truthseeker subsample
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds
ts.Embeddings = ts_embed
CSV.write("data/truthseeker_subsample_embed.csv", ts)
## embed fact check data
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# drop missing text
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed.csv", fc)
narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
# drop missing text
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)
"""
function mini_embed(text::String)
    # Chunk to at most 280 characters, embed each chunk with the small encoder,
    # and return the elementwise mean of the chunk embeddings
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings.(chunked_text)
    mean(embeddings)
end
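"""
## Large-encoder analogue of `mini_embed`: chunk the text, embed each chunk with the big
## encoder, and return the elementwise mean. Hedged sketch (assumes the large encoder
## returns 768-dimensional vectors, matching the fallback in `generate_embeddings_large`):
text = repeat("This is a test. ", 100)
embd = maxi_embed(text)  # Vector{Float32} of length 768
"""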
function maxi_embed(text::String)
chunked_text = create_chunked_text(text)
embeddings = generate_embeddings_large.(chunked_text)
mean(embeddings)
end
"""
# Get distance and classification
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    # Cosine distance between each target column and each narrative column
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # For each target: the smallest distance and the column index of the closest narrative
    closest = argmin(distances, dims=2)[:, 1]
    return distances[closest], last.(Tuple.(closest))
end
"""
# Get the dot product of the two matrices
ind, scores = dotproduct_distances(fc_embed, ts_embed)
ts.scores = scores
# Group by target and get the mean score
ts_grouped = combine(groupby(ts, :target), :scores => mean)
# show the matched text
ts.fc_text = fc.text[ind]
"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # Dot products between every narrative column and every target column (narratives × targets)
    dprods = narrative_matrix' * target_matrix
    # For each target column, the row (narrative) index with the maximum dot product and that score
    max_dot = argmax(dprods, dims=1)[1, :]
    return first.(Tuple.(max_dot)), dprods[max_dot]
end
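"""
## Top-k narratives by dot product against a single target vector.
## Hedged sketch on toy columns (not from the original data):
narratives = Float32[1 0 0.6; 0 1 0.8]                  # 2×3 matrix, one narrative per column
target = Float32[0.6, 0.8]
inds, scores = dotproduct_topk(narratives, target, 2)   # indices [3, 2], scores ≈ [1.0, 0.8]
"""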
function dotproduct_topk(narrative_matrix, target_vector, k)
    # Dot product of each narrative column with the target vector
    dprods = narrative_matrix' * target_vector
    # Indices of the k largest dot products, highest first
    topk = sortperm(dprods, rev=true)[1:k]
    return topk, dprods[topk]
end
"""
# Get the top k scores
using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)
OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
## How fast to get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    # Embed the query with the large encoder; `narrative_matrix` is therefore expected to hold
    # large-encoder embeddings (swap in `mini_embed` below if the matrix was built with it)
    #target_vector = mini_embed(text)
    target_vector = maxi_embed(text)
inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
if hasproperty(narratives, :Policy)
policy = narratives.Policy[inds]
narrative = narratives.Narrative[inds]
else
policy = fill("No policy", k)
narrative = fill("No narrative", k)
end
if !hasproperty(narratives, :claimReviewUrl)
narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
end
vec_of_dicts = [Dict("score" => scores[i],
"text" => narratives.text[ind],
"claimUrl" => narratives.claimReviewUrl[ind],
"policy" => policy[i],
"narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
return vec_of_dicts
end
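"""
## Load previously embedded fact-check data and return (embedding matrix, DataFrame).
## Hedged usage sketch; the default path is the file produced by the `mini_embed` script above,
## so pair it with small-encoder queries (use the `_maxi` file for `fast_topk`, as in its example).
fc_embed, fc = load_fasttext_embeddings("data/fact_check_latest_embed.csv")
"""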
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
fc = CSV.read(file, DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
return fc_embed, fc
end