# Spaces:
# Sleeping
# Sleeping
## Embeddings
"""
    string_to_float32_vector(str::String) -> Vector{Float32}

Parse the printed form of a `Float32` vector, such as `"Float32[0.1, 2.5f-5]"`, back into
a `Vector{Float32}`.

Everything up to and including the first `[` and everything from the last `]` onwards is
discarded; a string without brackets is treated as a bare comma-separated list. An empty
list yields an empty vector. Throws `ArgumentError` (from `parse`) when an element is not
a valid number.
"""
function string_to_float32_vector(str::String)::Vector{Float32}
    body = strip(str)
    # Cut the wrapper off by position. Stripping a *set* of characters (as in
    # `strip(str, ['3', '2', ...])`) also eats leading/trailing digits of the data
    # itself, e.g. a final element `0.23` would be parsed as `0.`.
    open_idx = findfirst('[', body)
    if open_idx !== nothing
        body = SubString(body, nextind(body, open_idx))
    end
    close_idx = findlast(']', body)
    if close_idx !== nothing
        body = SubString(body, firstindex(body), prevind(body, close_idx))
    end
    isempty(strip(body)) && return Float32[]
    # `Float32` literals print their exponent with `f` (`1.0f-5`); `parse` expects `e`.
    # Only an `f` that sits between a mantissa and an exponent is rewritten, so tokens
    # such as `Inf` are left alone.
    normalized = replace(body, r"(?<=[0-9.])f(?=[-+]?[0-9])" => "e")
    return Float32[parse(Float32, strip(el)) for el in split(normalized, ',')]
end
"""
    dfdat_to_matrix(df::DataFrame, col::Symbol) -> Matrix{Float32}

Build a matrix whose columns are the vectors stored as strings in column `col` of `df`,
one matrix column per data-frame row, in row order.

Each cell is parsed with `string_to_float32_vector`. A data frame with no rows gives an
empty `0×0` matrix.
"""
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    vectors = [string_to_float32_vector(cell) for cell in df[!, col]]
    isempty(vectors) && return Matrix{Float32}(undef, 0, 0)
    # `reduce(hcat, ...)` concatenates the whole collection in one call; splatting it
    # into `hcat(vs...)` passes one argument per row instead.
    return reduce(hcat, vectors)
end
""" | |
## Any piece of text longer than `chunk_size` characters (280 by default) is split into
## smaller pieces; the embeddings of the pieces are averaged afterwards.
# Example:
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
# Earlier implementation, kept for reference. It slices by raw index, which is only
# safe for ASCII text:
function create_chunked_text(text; chunk_size=280)
    ## Chunk the data
    chunks = []
    for chunk in 1:chunk_size:length(text)
        push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
    end
    return chunks
end
""" | |
function create_chunked_text(text::String; chunk_size::Int=280)
    # Split `text` into consecutive pieces of at most `chunk_size` characters each and
    # return them in order as a `Vector{String}`; joining the pieces reproduces `text`.
    # An empty `text` gives an empty vector. Throws `ArgumentError` for `chunk_size < 1`.
    chunk_size > 0 || throw(ArgumentError("chunk_size must be positive, got $chunk_size"))
    chunks = String[]
    last_idx = lastindex(text)
    start_idx = firstindex(text)
    while start_idx <= last_idx
        # A chunk that starts at `start_idx` and holds `chunk_size` characters ends
        # `chunk_size - 1` characters later. Stepping a full `chunk_size` and slicing
        # inclusively would return `chunk_size + 1` characters per chunk.
        # `nextind` walks character boundaries, so multi-byte UTF-8 text is never cut
        # inside a character; past the end it returns an index above `last_idx`.
        end_idx = min(nextind(text, start_idx, chunk_size - 1), last_idx)
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end
""" | |
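## Embeddings of text
## Returns `MiniEncoder.get_embeddings(text)`; if that call fails, the error is printed
## and a 384-element zero `Float32` vector is returned instead.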
""" | |
function generate_embeddings(text::String)
    # Embed `text` with `MiniEncoder.get_embeddings`. This is best-effort: when the
    # encoder throws, the error is printed and an all-zero 384-element `Float32`
    # vector is returned so that a bulk run is not aborted by one bad input.
    # NOTE(review): 384 presumably matches the encoder's output length — confirm.
    try
        return MiniEncoder.get_embeddings(text)
    catch e
        # A user interrupt (Ctrl-C) must stop the run rather than be turned into a
        # zero embedding for the current text.
        e isa InterruptException && rethrow()
        println("Error: ", e)
        return zeros(Float32, 384)
    end
end
""" | |
# This is the core function - it takes a string of any length and returns its embedding.
text = repeat("This is a test. ", 100)
mini_embed(text)
# Test: embed the truthseeker subsample
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds
ts.Embeddings = ts_embed
CSV.write("data/truthseeker_subsample_embed.csv", ts)
## Embed the fact check data
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# drop rows with missing text
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed.csv", fc)
## Embed the expanded claims library
narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
# use the expanded claim as the text, then drop rows with missing text
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)
""" | |
function mini_embed(text::String)
    # Embed a string of any length: split it into chunks, embed every chunk with
    # `generate_embeddings`, and return the element-wise mean of the chunk embeddings.
    chunked_text = create_chunked_text(text)
    # No chunks means `text` is empty; `mean` over an empty collection throws, so the
    # text is handed to the encoder as-is instead (which has its own fallback).
    isempty(chunked_text) && return generate_embeddings(text)
    embeddings = generate_embeddings.(chunked_text)
    return mean(embeddings)
end
""" | |
# Get distances and classification
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
""" | |
function distances_and_classification(narrative_matrix, target_matrix)
    # Pairwise cosine distances, computed column against column (`dims=2`): targets
    # along the rows of the result, narratives along its columns.
    cosine_dists = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Row-wise minimum: the smallest distance per target and where it sits. The
    # positions are `CartesianIndex` values into the distance matrix, not bare
    # column numbers.
    nearest_dist, nearest_pos = findmin(cosine_dists, dims=2)
    return nearest_dist[:, 1], nearest_pos[:, 1]
end
""" | |
# For every target, get the best-matching narrative by dot product
ind, scores = dotproduct_distances(fc_embed, ts_embed)
ts.scores = scores
# Group by target and get the mean score
ts_grouped = combine(groupby(ts, :target), :scores => mean)
# Show the matched text
ts.fc_text = fc.text[ind]
""" | |
function dotproduct_distances(narrative_matrix, target_matrix)
    # Entry (i, j) is the dot product of narrative column i with target column j.
    similarity = narrative_matrix' * target_matrix
    # Column-wise maximum: best score per target and its position in `similarity`.
    best_scores, best_positions = findmax(similarity, dims=1)
    # Keep only the row component of each position, i.e. the narrative index.
    narrative_indices = [pos[1] for pos in best_positions[1, :]]
    return narrative_indices, best_scores[1, :]
end
"""
    dotproduct_topk(narrative_matrix, target_vector, k)

Return `(indices, scores)` for the `k` columns of `narrative_matrix` with the largest dot
product against `target_vector`, best first.

If `k` exceeds the number of columns, all columns are returned instead of raising a
`BoundsError`; a non-positive `k` gives empty results.
"""
function dotproduct_topk(narrative_matrix, target_vector, k)
    # One dot product per column of `narrative_matrix`.
    dprods = narrative_matrix' * target_vector
    # A full stable sort keeps tied scores in their original column order.
    ranking = sortperm(dprods, rev=true)
    topk = ranking[1:min(k, length(ranking))]
    return topk, dprods[topk]
end
""" | |
# Get the top k scores
using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)
OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
## How fast is it to get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds
""" | |
function fast_topk(narrative_matrix, narratives, text::String, k)
    # Embed `text`, find the `k` best-scoring columns of `narrative_matrix`, and
    # describe each hit with the matching row of the `narratives` table. Returns a
    # vector of dictionaries with the keys "score", "text", "claimUrl", "policy" and
    # "narrative", best match first.
    target_vector = mini_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    # Optional columns are looked up independently; a missing one gets a placeholder
    # sized by the number of hits. The placeholders are local values, so the caller's
    # table is never modified.
    policy = _column_or_default(narratives, :Policy, inds, "No policy")
    narrative = _column_or_default(narratives, :Narrative, inds, "No narrative")
    claim_urls = _column_or_default(narratives, :claimReviewUrl, inds, "No URL")
    return [
        Dict(
            "score" => scores[i],
            "text" => narratives.text[ind],
            "claimUrl" => claim_urls[i],
            "policy" => policy[i],
            "narrative" => narrative[i],
        ) for (i, ind) in enumerate(inds)
    ]
end

# Values of column `name` at rows `inds`, or `default` repeated once per requested row
# when `table` has no such column.
function _column_or_default(table, name::Symbol, inds, default)
    hasproperty(table, name) || return fill(default, length(inds))
    return getproperty(table, name)[inds]
end
"""
    load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")

Read the CSV at `file` into a `DataFrame` and convert its `:Embeddings` column into a
matrix with `dfdat_to_matrix`. Returns `(embedding_matrix, table)`.
"""
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end