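# NOTE: this section assumes CSV, DataFrames, Distances (pairwise, CosineDist),
# and Statistics (mean) are loaded at the module level, and that Encoder is
# provided by OstreaCultura.

"""
## Parse a stringified Float32 vector (e.g. "Float32[1.0, 2.5]", as written when a Vector{Float32} column is saved with CSV.write) back into a Vector{Float32}.

#Example (illustrative):

string_to_float32_vector("Float32[1.0, 2.5, -3.25]")
"""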
function string_to_float32_vector(str::String)::Vector{Float32}
    # Remove the "Float32[" prefix and the closing "]" as whole substrings.
    # (Stripping the individual characters would also eat digits from the
    # first and last numbers, e.g. "0.32" would lose its trailing "32".)
    str = replace(str, "Float32[" => "")
    str = replace(str, "]" => "")
    # Julia prints Float32 literals with an 'f' exponent (e.g. "1.0f0"),
    # which parse(Float32, ...) does not accept, so convert it to 'e'.
    str = replace(str, 'f' => 'e')
    elements = split(str, ",")
    return Float32[parse(Float32, strip(el)) for el in elements]
end

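"""
## Stack the parsed Float32 embedding vectors stored in column `col` into a matrix with one column per DataFrame row.

#Example (illustrative):

fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
"""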
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # reduce(hcat, ...) stacks the vectors without the splatting overhead
    # of hcat(...) on large DataFrames.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end

""" |
|
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged. |
|
|
|
#Example: |
|
text = repeat("This is a test. ", 100) |
|
chunktext = create_chunked_text(text) |
|
function create_chunked_text(text; chunk_size=280) |
|
## Chunk the data |
|
chunks = [] |
|
for chunk in 1:chunk_size:length(text) |
|
push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))]) |
|
end |
|
return chunks |
|
end |
|
""" |
|
|
|
function create_chunked_text(text::String; chunk_size::Int=280)
    chunks = String[]
    start_idx = 1
    while start_idx <= lastindex(text)
        # Walk forward chunk_size - 1 characters (end_idx already covers the
        # first one), stopping at the end of the string. nextind keeps the
        # index on a character boundary, so multi-byte UTF-8 is never split.
        end_idx = start_idx
        for _ in 1:(chunk_size - 1)
            next = nextind(text, end_idx)
            next > lastindex(text) && break
            end_idx = next
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end
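# Sanity check (illustrative): chunk boundaries come from nextind, so the
# chunks are exact slices that rejoin to the original string, even for
# multi-byte UTF-8 text:
#
#   text = repeat("日本語テキスト. This is a test. ", 20)
#   join(create_chunked_text(text)) == text            # true
#   all(length.(create_chunked_text(text)) .<= 280)    # true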
""" |
|
## Embeddings of text from the small encoder |
|
|
|
text = "This is a test." |
|
using OstreaCultura: Encoder |
|
embd = Encoder.get_embeddings(text) |
|
|
|
""" |
|
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        # Fall back to a zero vector matching the small encoder's
        # 384-dimensional output so downstream averaging still works.
        @warn "generate_embeddings failed; returning zero vector" exception = e
        return zeros(Float32, 384)
    end
end

""" |
|
## Embeddings of text from the large encoder |
|
|
|
text = "This is a test." |
|
using OstreaCultura: Encoder |
|
embd = Encoder.get_embeddings_big(text) |
|
LinearAlgebra.normalize(embd) |
|
""" |
|
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        # Zero vector matching the large encoder's 768-dimensional output.
        @warn "generate_embeddings_large failed; returning zero vector" exception = e
        return zeros(Float32, 768)
    end
end

""" |
|
# This is the core function - takes in a string of any length and returns the embeddings |
|
|
|
text = repeat("This is a test. ", 100) |
|
mini_embed(text) |
|
|
|
# Test to embed truthseeker subsample |
|
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame) |
|
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds |
|
ts.Embeddings = ts_embed |
|
CSV.write("data/truthseeker_subsample_embed.csv", ts) |
|
|
|
## embed fact check data |
|
fc = CSV.read("data/fact_check_latest.csv", DataFrame) |
|
# drop missing text |
|
fc = fc[.!ismissing.(fc.text), :] |
|
fc_embed = mini_embed.(fc.text) # 12 minutes |
|
fc.Embeddings = fc_embed |
|
CSV.write("data/fact_check_latest_embed.csv", fc) |
|
|
|
narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame) |
|
# drop missing text |
|
narrs.text = narrs.ExpandedClaim |
|
narrs = narrs[.!ismissing.(narrs.text), :] |
|
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run |
|
narrs.Embeddings = narratives_embed |
|
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs) |
|
|
|
""" |
|
function mini_embed(text::String)
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings.(chunked_text)
    # Average the per-chunk embeddings into a single vector.
    mean(embeddings)
end

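"""
## Large-encoder counterpart of `mini_embed`: chunks the text, embeds each chunk with the large encoder, and averages the results.

#Example (illustrative):

text = repeat("This is a test. ", 100)
embd = maxi_embed(text) # 768-dimensional
"""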
function maxi_embed(text::String)
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings_large.(chunked_text)
    # Average the per-chunk embeddings into a single vector.
    mean(embeddings)
end

""" |
|
# Get distance and classification |
|
|
|
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame) |
|
ts_embed = dfdat_to_matrix(ts, :Embeddings) |
|
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame) |
|
fc_embed = dfdat_to_matrix(fc, :Embeddings) |
|
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5]) |
|
""" |
|
function distances_and_classification(narrative_matrix, target_matrix)
    # Pairwise cosine distances with columns as observations:
    # rows index targets, columns index narratives.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # For each target, return the distance to its closest narrative and
    # that narrative's integer column index (not a CartesianIndex).
    mins = argmin(distances, dims=2)[:, 1]
    return distances[mins], last.(Tuple.(mins))
end

""" |
|
# Get the dot product of the two matrices |
|
|
|
ind, scores = dotproduct_distances(fc_embed, ts_embed) |
|
|
|
ts.scores = scores |
|
|
|
# Group by target and get the max score |
|
ts_grouped = combine(groupby(ts, :target), :scores => mean) |
|
# show the matched text |
|
ts.fc_text = fc.text[ind] |
|
|
|
""" |
|
function dotproduct_distances(narrative_matrix, target_matrix)
    # Similarity matrix: rows index narratives, columns index targets.
    dprods = narrative_matrix' * target_matrix
    # Best-matching narrative for each target column.
    max_dot = argmax(dprods, dims=1)[1, :]
    return first.(Tuple.(max_dot)), dprods[max_dot]
end

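"""
## Top-k narratives for a single target vector by dot-product similarity.

#Example (illustrative):

target = maxi_embed("This is a test.")
inds, scores = dotproduct_topk(fc_embed, target, 5)
"""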
function dotproduct_topk(narrative_matrix, target_vector, k)
    # Similarity of every narrative column to the target vector.
    dprods = narrative_matrix' * target_vector
    # Indices of the k largest scores, in descending order.
    # (partialsortperm(dprods, 1:k, rev=true) would avoid the full sort.)
    topk = sortperm(dprods, rev=true)[1:k]
    return topk, dprods[topk]
end

""" |
|
# Get the top k scores |
|
|
|
using CSV, DataFrames |
|
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame) |
|
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings) |
|
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame) |
|
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings) |
|
|
|
OC.fast_topk(fc_embed, fc, ts.statement[1], 5) |
|
|
|
## How fast to get the top 5 scores for 3K statements? |
|
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds |
|
""" |
|
function fast_topk(narrative_matrix, narratives, text::String, k)
    # Embed the query with the large encoder, then score it against every
    # narrative column by dot product.
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    # NOTE: mutates `narratives` by adding a placeholder column when claim
    # review URLs are absent.
    if !hasproperty(narratives, :claimReviewUrl)
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    vec_of_dicts = [Dict("score" => scores[i],
                         "text" => narratives.text[ind],
                         "claimUrl" => narratives.claimReviewUrl[ind],
                         "policy" => policy[i],
                         "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end

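"""
## Load a precomputed embeddings CSV and return the embedding matrix along with its source DataFrame.

#Example (illustrative):

fc_embed, fc = load_fasttext_embeddings("data/fact_check_latest_embed.csv")
"""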
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    fc = CSV.read(file, DataFrame)
    # Parse the stringified embedding column back into a Float32 matrix.
    fc_embed = dfdat_to_matrix(fc, :Embeddings)
    return fc_embed, fc
end