## Embeddings

# External packages used below; the Encoder submodule provides get_embeddings / get_embeddings_big.
using CSV, DataFrames, Distances, Statistics

function string_to_float32_vector(str::AbstractString)::Vector{Float32}
    # Remove the "Float32[" prefix and the "]" suffix
    str = strip(str, ['F', 'l', 'o', 'a', 't', '3', '2', '[', ']'])
    # Replace 'f' with 'e' so Float32 exponents (e.g. "1.5f-8") parse as scientific notation
    str = replace(str, 'f' => 'e')
    # Split the string by commas to get individual elements
    elements = split(str, ",")
    # Convert each element to Float32 and collect into a vector
    return Float32[parse(Float32, strip(el)) for el in elements]
end

function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # Parse a column of serialized embedding strings into a d × nrow matrix (one column per row of df)
    return hcat([string_to_float32_vector(row[col]) for row in eachrow(df)]...)
end

"""
## Chunk text for embedding
Any piece of text longer than `chunk_size` characters (280 by default) is split into
smaller pieces; the chunk embeddings are averaged downstream (see `mini_embed` / `maxi_embed`).

# Example:
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
"""
function create_chunked_text(text::String; chunk_size::Int=280)
    chunks = String[]
    start_idx = 1
    while start_idx <= lastindex(text)
        # Advance chunk_size - 1 characters (not bytes) with nextind, so each chunk holds
        # at most chunk_size characters and never splits a multi-byte UTF-8 character.
        end_idx = start_idx
        for _ in 1:(chunk_size - 1)
            next_idx = nextind(text, end_idx)
            next_idx > lastindex(text) && break
            end_idx = next_idx
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end

"""
## Embeddings of text from the small encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings(text)
"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        println("Error: ", e)
        # Fall back to a zero vector with the small encoder's dimensionality
        return zeros(Float32, 384)
    end
end

"""
## Embeddings of text from the large encoder
text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings_big(text)
LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        println("Error: ", e)
        # Fall back to a zero vector with the large encoder's dimensionality
        return zeros(Float32, 768)
    end
end
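#=
The session in the next docstring stores embedding vectors in CSV columns and later
parses them back with dfdat_to_matrix. A minimal, self-contained round-trip sketch
(the file name, column names, and random 384-dim "embeddings" here are illustrative):

using CSV, DataFrames
df = DataFrame(text = ["a", "b"],
               Embeddings = [rand(Float32, 384), rand(Float32, 384)])
CSV.write("roundtrip_demo.csv", df)            # vector cells are written as "Float32[...]" strings
df2 = CSV.read("roundtrip_demo.csv", DataFrame)
emb = dfdat_to_matrix(df2, :Embeddings)        # 384×2 Matrix{Float32}; column i is row i of df
@assert emb[:, 1] ≈ df.Embeddings[1]
=#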
", 100) mini_embed(text) # Test to embed truthseeker subsample ts = CSV.read("data/truthseeker_subsample.csv", DataFrame) ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds ts.Embeddings = ts_embed CSV.write("data/truthseeker_subsample_embed.csv", ts) ## embed fact check data fc = CSV.read("data/fact_check_latest.csv", DataFrame) # drop missing text fc = fc[.!ismissing.(fc.text), :] fc_embed = mini_embed.(fc.text) # 12 minutes fc.Embeddings = fc_embed CSV.write("data/fact_check_latest_embed.csv", fc) narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame) # drop missing text narrs.text = narrs.ExpandedClaim narrs = narrs[.!ismissing.(narrs.text), :] narratives_embed = OC.mini_embed.(narrs.text) # seconds to run narrs.Embeddings = narratives_embed CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs) """ function mini_embed(text::String) chunked_text = create_chunked_text(text) embeddings = generate_embeddings.(chunked_text) mean(embeddings) end function maxi_embed(text::String) chunked_text = create_chunked_text(text) embeddings = generate_embeddings_large.(chunked_text) mean(embeddings) end """ # Get distance and classification ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame) ts_embed = dfdat_to_matrix(ts, :Embeddings) fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame) fc_embed = dfdat_to_matrix(fc, :Embeddings) distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5]) """ function distances_and_classification(narrative_matrix, target_matrix) distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2) # get the index of the column with the smallest distance return distances[argmin(distances, dims=2)][:, 1], argmin(distances, dims=2)[:, 1] end """ # Get the dot product of the two matrices ind, scores = dotproduct_distances(fc_embed, ts_embed) ts.scores = scores # Group by target and get the max score ts_grouped = combine(groupby(ts, :target), :scores => mean) # show the matched text ts.fc_text = fc.text[ind] """ function dotproduct_distances(narrative_matrix, target_matrix) # multiply each column of the narrative matrix by the target vector dprods = narrative_matrix' * target_matrix # get maximum dotproduct and index of the row max_dot = argmax(dprods, dims=1)[1, :] return first.(Tuple.(max_dot)), dprods[max_dot] end function dotproduct_topk(narrative_matrix, target_vector, k) # multiply each column of the narrative matrix by the target vector dprods = narrative_matrix' * target_vector # indices of the top k dot products topk = sortperm(dprods, rev=true)[1:k] return topk, dprods[topk] end """ # Get the top k scores using CSV, DataFrames ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame) ts_embed = OC.dfdat_to_matrix(ts, :Embeddings) fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame) fc_embed = OC.dfdat_to_matrix(fc, :Embeddings) OC.fast_topk(fc_embed, fc, ts.statement[1], 5) ## How fast to get the top 5 scores for 3K statements? 
"""
# Get the top k scores
using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)
OC.fast_topk(fc_embed, fc, ts.statement[1], 5)

## How fast can we get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    # The query is embedded with the large encoder, so narrative_matrix must hold
    # maxi_embed (768-dim) embeddings.
    # target_vector = mini_embed(text)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    if !hasproperty(narratives, :claimReviewUrl)
        # add a placeholder URL column so the Dicts below always have a "claimUrl" entry
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    vec_of_dicts = [Dict("score" => scores[i],
                         "text" => narratives.text[ind],
                         "claimUrl" => narratives.claimReviewUrl[ind],
                         "policy" => policy[i],
                         "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end

function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    # Load a CSV of pre-computed embeddings (stored as strings) and return the
    # embedding matrix alongside the source DataFrame.
    fc = CSV.read(file, DataFrame)
    fc_embed = dfdat_to_matrix(fc, :Embeddings)
    return fc_embed, fc
end
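#=
Sketch of a typical end-to-end lookup against a pre-embedded narratives file. The file
path comes from the fast_topk example above (embeddings built with maxi_embed, since
fast_topk embeds the query with the large encoder); the query string is illustrative.

fc_embed, fc = load_fasttext_embeddings("data/fact_check_latest_embed_maxi.csv")
hits = fast_topk(fc_embed, fc, "This is a test.", 5)
for h in hits
    println(round(h["score"], digits=3), "  ", h["text"])
end
=#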