File size: 6,789 Bytes
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
 
 
 
 
48bb68b
 
 
 
05a2a0c
48bb68b
 
 
 
 
 
05a2a0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
 
 
 
 
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a2a0c
48bb68b
 
 
 
 
 
 
 
05a2a0c
 
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
## Embeddings 

"""
    string_to_float32_vector(str::String)::Vector{Float32}

Parse a string of the form `"Float32[1.0, 2.3, 1.5f-8]"` (the `string()`
representation of a `Vector{Float32}`, e.g. read back from a CSV cell) into
a `Vector{Float32}`. Returns an empty vector for `"Float32[]"`.
"""
function string_to_float32_vector(str::String)::Vector{Float32}
    s = strip(str)
    # Remove the "Float32" prefix explicitly. The previous character-set
    # strip(['F','l','o','a','t','3','2','[',']']) also removed leading or
    # trailing '2'/'3' digits belonging to the data, silently corrupting
    # values such as "2.3]" -> "2.".
    if startswith(s, "Float32")
        s = s[8:end]  # "Float32" is ASCII, so byte index 8 is safe
    end
    s = strip(s, ['[', ']'])
    # Float32 literals use 'f' exponents (e.g. "1.5f-8"); parse needs 'e'.
    s = replace(s, 'f' => 'e')
    # Empty payload ("Float32[]") would make parse("") throw.
    isempty(strip(s)) && return Float32[]
    return Float32[parse(Float32, strip(el)) for el in split(s, ",")]
end

"""
    dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}

Parse the stringified embedding vectors in column `col` of `df` (see
`string_to_float32_vector`) and concatenate them into a matrix with one
embedding per column.
"""
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # reduce(hcat, ...) avoids splatting one argument per row into hcat,
    # which is slow and stack-hungry for large DataFrames.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end

"""
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged. 

#Example: 
text = repeat("This is a test. ", 100)
chunktext = create_chunked_text(text)
function create_chunked_text(text; chunk_size=280)
    ## Chunk the data
    chunks = []
    for chunk in 1:chunk_size:length(text)
        push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
    end
    return chunks
end
"""

function create_chunked_text(text::String; chunk_size::Int=280)
    chunks = []
    start_idx = 1
    while start_idx <= lastindex(text)
        end_idx = start_idx
        for _ in 1:chunk_size
            end_idx = nextind(text, end_idx, 1)
            if end_idx > lastindex(text)
                end_idx = lastindex(text)
                break
            end
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end

"""
## Embeddings of text from the small encoder

text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings(text)

"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        println("Error: ", e)
        return zeros(Float32, 384)
    end
end

"""
## Embeddings of text from the large encoder

text = "This is a test."
using OstreaCultura: Encoder
embd = Encoder.get_embeddings_big(text)
LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        println("Error: ", e)
        return zeros(Float32, 768)
    end
end

"""
# This is the core function - takes in a string of any length and returns the embeddings

text = repeat("This is a test. ", 100)
mini_embed(text)

# Test to embed truthseeker subsample 
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds 
ts.Embeddings = ts_embed
CSV.write("data/truthseeker_subsample_embed.csv", ts)

## embed fact check data 
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# drop missing text 
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed.csv", fc)

narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
# drop missing text 
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run 
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)

"""
function mini_embed(text::String)
    chunked_text = create_chunked_text(text)
    embeddings = generate_embeddings.(chunked_text)
    mean(embeddings)
end

"""
    maxi_embed(text::String)

Chunk `text` into ≤280-character pieces, embed each with the large
(768-dim) encoder, and return the mean of the chunk embeddings.
"""
function maxi_embed(text::String)
    pieces = create_chunked_text(text)
    return mean(map(generate_embeddings_large, pieces))
end

"""
# Get distance and classification 

ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
fc_embed = dfdat_to_matrix(fc, :Embeddings)
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # get the index of the column with the smallest distance
    return distances[argmin(distances, dims=2)][:, 1], argmin(distances, dims=2)[:, 1]
end

"""
# Get the dot product of the two matrices

ind, scores = dotproduct_distances(fc_embed, ts_embed)

ts.scores = scores

# Group by target and get the max score 
ts_grouped = combine(groupby(ts, :target), :scores => mean)
# show the matched text 
ts.fc_text = fc.text[ind]

"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # multiply each column of the narrative matrix by the target vector
    dprods = narrative_matrix' * target_matrix    
    # get maximum dotproduct and index of the row
    max_dot = argmax(dprods, dims=1)[1, :]
    return first.(Tuple.(max_dot)), dprods[max_dot]
end

"""
    dotproduct_topk(narrative_matrix, target_vector, k)

Return `(indices, scores)` of the `k` largest dot products between the
columns of `narrative_matrix` and `target_vector`, sorted descending.
"""
function dotproduct_topk(narrative_matrix, target_vector, k)
    # dprods[i] = dot(narrative column i, target_vector)
    dprods = narrative_matrix' * target_vector
    # partialsortperm finds only the top k (O(n + k log k)) instead of fully
    # sorting all n scores with sortperm (O(n log n)). collect() keeps the
    # return type a plain Vector{Int}, as before.
    topk = collect(partialsortperm(dprods, 1:k; rev=true))
    return topk, dprods[topk]
end

"""
# Get the top k scores

using CSV, DataFrames
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
fc = CSV.read("data/fact_check_latest_embed_maxi.csv", DataFrame)
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)

OC.fast_topk(fc_embed, fc, ts.statement[1], 5)

## How fast to get the top 5 scores for 3K statements?
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds 
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    #target_vector = mini_embed(text)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds] 
    else 
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    if !hasproperty(narratives, :claimReviewUrl)
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    vec_of_dicts = [Dict("score" => scores[i], 
                        "text" => narratives.text[ind], 
                        "claimUrl" => narratives.claimReviewUrl[ind], 
                        "policy" => policy[i], 
                        "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end

"""
    load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")

Read a CSV of pre-embedded fact-check rows and return `(embeddings, table)`,
where `embeddings` is the matrix parsed from the `:Embeddings` column
(one embedding per column) and `table` is the full `DataFrame`.
"""
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end