Spaces:

stefanjwojcik
/

misinfo_detection_app

Running

File size: 9,183 Bytes

48bb68b

## Structure of a Narrative 

"""
    randid()

Return a random identifier string. Two integers are drawn from `1:100` and
encoded with Sqids using a freshly created default configuration.

NOTE(review): only 100 x 100 input pairs are possible, so identifiers can
repeat once many claims exist -- confirm this is acceptable for callers that
look claims up by id.
"""
function randid()
    sqids_config = Sqids.configure()  # default settings, built per call
    first_part = rand(1:100)
    second_part = rand(1:100)
    return Sqids.encode(sqids_config, [first_part, second_part])
end

"""
    timestamp()

Return the number of milliseconds between `unix2datetime(0)` and `now()` as an
integer.

`now()` is the local wall-clock time, so the value equals true Unix epoch
milliseconds only when the local time zone is UTC.
"""
function timestamp()
    elapsed = now() - unix2datetime(0)  # a Millisecond period
    return elapsed.value
end

"""
    ts_to_time(ts)

Convert a millisecond count `ts` (as produced by `timestamp()`) back into a
`DateTime`, so `ts_to_time(timestamp())` reproduces the `now()` value the
timestamp was taken from.
"""
function ts_to_time(ts)
    seconds = ts / 1000  # unix2datetime expects seconds, ts is in milliseconds
    return unix2datetime(seconds)
end

"""
    Claim: something that supports a misinformation narrative 

    id: unique identifier for the claim
    claim: text of the claim
    counterclaim: text of the counterclaim
    claimembedding: embedding of the claim, or `nothing` when not yet computed
    counterclaimembedding: embedding of the counterclaim, or `nothing` when not yet computed
    created_at: integer timestamp of when the claim was created
    updated_at: integer timestamp of when the claim was last updated
    source: source of the claim
    keywords: keywords associated with the claim, or `nothing`

    The struct is mutable so embeddings can be filled in after construction.
    `createClaim` fills `created_at` / `updated_at` from `timestamp()`.
"""
mutable struct Claim
    id::String
    claim::String  # claim text
    counterclaim::String  # counterclaim text
    claimembedding::Union{Array{Float32, 1}, Nothing}  # embedding of the claim; nothing until computed
    counterclaimembedding::Union{Array{Float32, 1}, Nothing}  # embedding of the counterclaim; nothing until computed
    created_at::Int64  # creation time as an integer timestamp (see timestamp())
    updated_at::Int64  # last-update time as an integer timestamp (see timestamp())
    source::String  # source of the claim
    keywords::Union{Array{String, 1}, Nothing}  # keywords associated with the claim
end

"""
    createClaim(claim::String, counterclaim::String, source::String,
                keywords::Array{String, 1}=String[])

Create a new `Claim` with a random id from the given claim text, counterclaim
text, source and (optionally) keywords.

Both embeddings start as `nothing`. `created_at` and `updated_at` are set to
the same `timestamp()` value. `keywords` defaults to an empty vector, so the
three-argument form shown below works.

    Example: 
    createClaim("Solar panels poison the soil and reduce crop yields", 
                "There is no evidence that solar panels poison the soil or reduce crop yields", 
                "Facebook post")

    createClaim("Solar panels poison the soil and reduce crop yields", 
                "There is no evidence that solar panels poison the soil or reduce crop yields", 
                "Facebook post",
                ["solar", "soil"])
"""
function createClaim(
    claim::String,
    counterclaim::String,
    source::String,
    keywords::Array{String, 1}=String[],
)
    # Take a single reading so a fresh claim never has updated_at != created_at.
    created = timestamp()
    return Claim(
        randid(), claim, counterclaim, nothing, nothing, created, created, source, keywords
    )
end


"""
    Narrative: a collection of claims that support a misinformation narrative

    id: unique identifier for the narrative
    title: descriptive title of the narrative
    topic: broad type of narrative (e.g., anti-semitism)
    target: target group/topic of the narrative
    narrativesummary: base narrative text
    claims: list of Claim objects (use `Claim[]` for a narrative with no claims yet)

    Example: 
    example_narrative = Narrative(
        randid(),
        "Jews killed Jesus",
        "Anti-semitism",
        "Jews", 
        "Jews are responsible for the death of Jesus", 
        Claim[])
"""
mutable struct Narrative
    id::String
    title::String  # descriptive title (e.g., Jews killed Jesus)
    topic::String  # broad type of narrative (e.g., anti-semitism)
    target::String  # target group/topic of the narrative
    narrativesummary::String  # base narrative text (e.g., Jews are responsible for the death of Jesus)
    claims::Vector{Claim}  # list of Claim objects; must be a vector, `nothing` is not accepted
end

"""
    NarrativeSet: a container holding several narratives

    narratives: list of Narrative objects

## TODO: When you have a lot of narratives, you can create a NarrativeSet
- If you apply a narrative set over a database, it will perform classification using all the narratives
  (planned behavior; nothing in this definition implements it yet)

"""
mutable struct NarrativeSet 
    narratives::Vector{Narrative}  # the narratives grouped in this set
end

import Base: show
"""
    show(io::IO, narrative::Narrative)

Write a multi-line, human-readable summary of `narrative` to `io`: title,
topic, target and summary on one line each, then a "Claims:" header followed
by one indented bullet per claim text.
"""
function Base.show(io::IO, narrative::Narrative)
    # Header fields, one per line.
    println(io, "Narrative: $(narrative.title)")
    println(io, "Topic: $(narrative.topic)")
    println(io, "Target: $(narrative.target)")
    println(io, "Narrative Summary: $(narrative.narrativesummary)")
    # Claim texts as an indented bullet list.
    println(io, "Claims:")
    foreach(narrative.claims) do entry
        println(io, "  - $(entry.claim)")
    end
    return nothing
end

"""
    add_claim!(narrative::Narrative, claim::Claim)

Append `claim` to the end of `narrative.claims`, modifying the narrative in
place. Returns the updated claims vector (the result of `push!`).

    Example: 
    add_claim!(example_narrative, example_claim)
"""
function add_claim!(narrative::Narrative, claim::Claim)
    # The docstring must sit directly above the definition (no blank line in
    # between) to be attached to this method.
    return push!(narrative.claims, claim)
end

"""
    remove_claim!(narrative::Narrative, claim_id::String)

Remove every claim whose `id` equals `claim_id` from `narrative`.

The surviving claims are collected into a new vector which replaces
`narrative.claims`; that new vector is returned.
"""
function remove_claim!(narrative::Narrative, claim_id::String)
    survivors = filter(narrative.claims) do entry
        entry.id != claim_id
    end
    narrative.claims = survivors
    return survivors
end

"""
    narrative_to_dataframe(narrative::Narrative)

Flatten `narrative` into a `DataFrame` with one row per claim.

Columns: `narrative_title` (the narrative's title, passed as a single value
alongside the per-claim vectors), then one column per `Claim` field: `id`,
`claim`, `counterclaim`, `claimembedding`, `counterclaimembedding`,
`created_at`, `updated_at`, `source`, `keywords`.
"""
function narrative_to_dataframe(narrative::Narrative)
    entries = narrative.claims
    return DataFrame(;
        narrative_title = narrative.title,
        id = map(c -> c.id, entries),
        claim = map(c -> c.claim, entries),
        counterclaim = map(c -> c.counterclaim, entries),
        claimembedding = map(c -> c.claimembedding, entries),
        counterclaimembedding = map(c -> c.counterclaimembedding, entries),
        created_at = map(c -> c.created_at, entries),
        updated_at = map(c -> c.updated_at, entries),
        source = map(c -> c.source, entries),
        keywords = map(c -> c.keywords, entries),
    )
end

"""
    dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)

Collapse a dataframe into a narrative.

Each row of `df` becomes one `Claim`, read from the columns `id`, `claim`,
`counterclaim`, `claimembedding`, `counterclaimembedding`, `created_at`,
`updated_at`, `source` and `keywords`. The resulting `Narrative` gets a fresh
random id, the given title and summary, and empty `topic` and `target`
strings.
"""
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
    collected = Claim[]
    for row in eachrow(df)
        entry = Claim(
            row.id,
            row.claim,
            row.counterclaim,
            row.claimembedding,
            row.counterclaimembedding,
            row.created_at,
            row.updated_at,
            row.source,
            row.keywords,
        )
        push!(collected, entry)
    end
    # topic and target are not stored in the dataframe, so they are left blank
    return Narrative(randid(), narrative_title, "", "", narrative_summary, collected)
end

"""
    deduplicate_claims_in_narrative!(narrative::Narrative)

Remove claims whose `claim` text repeats that of an earlier claim in
`narrative`, keeping the first occurrence of each text and preserving order.
Returns `narrative`.

Duplicates are identified by position rather than by id, so the first
occurrence survives even when duplicated claims share an id, and claims with
a different text are never removed because of an id collision.
"""
function deduplicate_claims_in_narrative!(narrative::Narrative)
    seen_texts = Set{String}()
    kept = Claim[]
    for entry in narrative.claims
        if !(entry.claim in seen_texts)
            push!(seen_texts, entry.claim)
            push!(kept, entry)
        end
    end
    # Only swap the vector when something was actually dropped.
    if length(kept) != length(narrative.claims)
        narrative.claims = kept
    end
    return narrative
end

"""
    candidate_embeddings(candidates::DataFrame; kwargs...)

Compute an embedding for every row of `candidates` and store the result in a
column named `"Embeddings"`. `candidates` itself is modified and returned.

# Keywords
- `model_id`: embedding model name, default `"text-embedding-3-small"`.
- `textcol`: name of the column holding the text to embed, default `"text"`.

# Throws
- `ErrorException` when `textcol` is not a column of `candidates`.
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # Parenthesised on purpose: `!textcol in xs` parses as `(!textcol) in xs`
    # and fails on a string. `names` returns strings, so a Symbol column name
    # is normalised with `string` before the membership test.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): create_chunked_embeddings is not defined in this part of
    # the file -- confirm it exists elsewhere in the project.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
## Embeddings 

"""
    create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)

Embed `texts` by calling `create_embeddings` on consecutive slices of at most
`chunk_size` items and concatenating the per-item embeddings in input order.

The API key is read from `ENV["OPENAI_API_KEY"]` for each request. Empty
`texts` returns an empty vector without making a request.

# Throws
- `ArgumentError` when `chunk_size` is less than 1.

# Examples
    df = CSV.read("data/random_300k.csv", DataFrame)
    df = filter(:message => x -> occursin(Regex("climate"), x), df)
    embeds = create_openai_chunked_embeddings(df[:, "message"]; chunk_size=10)
"""
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    # A negative step would make the range below empty and silently return
    # nothing, so reject non-positive sizes up front.
    chunk_size >= 1 ||
        throw(ArgumentError("chunk_size must be at least 1, got $chunk_size"))
    total = length(texts)
    parts = []
    for start in 1:chunk_size:total
        stop = min(start + chunk_size - 1, total)
        resp = create_embeddings(ENV["OPENAI_API_KEY"], texts[start:stop]; model_id=model_id)
        push!(parts, [item["embedding"] for item in resp.response["data"]])
    end
    # One vector per chunk -> one flat vector with an entry per input text.
    return vcat(parts...)
end

"""
    generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)

Fill in `claimembedding` and `counterclaimembedding` for every claim of
`narrative`. The narrative's claims are modified in place (hence the bang);
the function returns `nothing`.

Claim texts are embedded and assigned first, then counterclaim texts.

    Example:
    include("src/ExampleNarrative.jl")
    include("src/Narrative.jl")
    climate_narrative = create_example_narrative();
    generate_openai_claim_embeddings_from_narrative!(climate_narrative)
"""
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    # NOTE(review): create_chunked_embeddings is not defined in this part of
    # the file -- confirm it exists elsewhere in the project.
    ## claim embeddings
    claim_vectors = create_chunked_embeddings(map(c -> c.claim, narrative.claims))
    for idx in eachindex(narrative.claims)
        narrative.claims[idx].claimembedding = claim_vectors[idx]
    end
    ## counterclaim embeddings
    counter_vectors = create_chunked_embeddings(map(c -> c.counterclaim, narrative.claims))
    for idx in eachindex(narrative.claims)
        narrative.claims[idx].counterclaimembedding = counter_vectors[idx]
    end
    return nothing
end

"""
    candidate_openai_embeddings(candidates::DataFrame; kwargs...)

Compute an embedding for every row of `candidates` and store the result in a
column named `"Embeddings"`. `candidates` itself is modified and returned.

# Keywords
- `model_id`: embedding model name, default `"text-embedding-3-small"`.
- `textcol`: name of the column holding the text to embed, default `"text"`.

# Throws
- `ErrorException` when `textcol` is not a column of `candidates`.
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # Parenthesised on purpose: `!textcol in xs` parses as `(!textcol) in xs`
    # and fails on a string. `names` returns strings, so a Symbol column name
    # is normalised with `string` before the membership test.
    if !(string(textcol) in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): this calls create_chunked_embeddings, which is not defined
    # in this part of the file; create_openai_chunked_embeddings may have been
    # intended -- confirm.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id)
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end