## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool;
## a sketch of such a file is shown below

using CSV
using DataFrames
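## A minimal ~/.bigqueryrc might look like the following; the project id is
## illustrative (not from this repo), and the [query] section applies flags
## only to `bq query` (see the BigQuery CLI docs for the full flag list):
##
##   project_id = my-gcp-project
##   [query]
##   --use_legacy_sql=false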


"""
## ostreacultura_bq_auth()
- Activate the service account using the credentials file
"""
function ostreacultura_bq_auth()
    if isfile("ostreacultura-credentials.json")
        run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
    else
        println("Credentials file not found")
    end
end
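
## A hypothetical smoke test (not in the original file): activate the service
## account, then run a trivial query to confirm the CLI is authenticated.
function bq_smoke_test()
    ostreacultura_bq_auth()
    return bq("SELECT 1 AS one")
end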

"""
## bq(query::String)
- Run a BigQuery query and return the result as a DataFrame

Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
"""
function bq(query::String)
    tname = tempname()
    ## Redirect the CSV output of the bq CLI into a temporary file
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname))
    df = CSV.read(tname, DataFrame)
    ## Clean up the temporary file before returning
    rm(tname; force=true)
    return df
end
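
## A streaming variant of `bq` (a sketch, not part of the original API): pipe
## the CLI output straight into CSV.read and skip the temporary file entirely.
## Assumes the bq CLI is on PATH and already authenticated.
function bq_stream(query::String)
    return open(io -> CSV.read(io, DataFrame),
                `bq query --use_legacy_sql=false --format=csv $query`)
end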

"""
## bq_db(query::String, db::String)
- Run a BigQuery query and save to a database

Example: 
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
"""
function bq_db(query::String, db::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db))
end
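
## A compressed variant (a sketch, not in the original file): chain the query
## output through gzip before it hits disk. Assumes gzip is on PATH; `pipeline`
## accepts a chain of commands ending in a destination filename.
function bq_db_gz(query::String, path::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, `gzip`, path))
end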

"""
 one token is roughly 3/4 of a word

"""
function token_estimate(allstrings::Vector{String})
    ## Tokenize the strings
    tokens = [split(x) for x in allstrings]
    ## Estimate the number of tokens
    token_estimate = sum([length(x) for x in tokens])
    return token_estimate * 4 / 3
end
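
## A quick worked example of the heuristic: 2 + 3 = 5 words estimate to
## 5 * 4/3 ≈ 6.67 tokens.
@assert token_estimate(["hello world", "three words here"]) ≈ 5 * 4 / 3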

"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Greedily group strings into chunks whose estimated token count stays below max_tokens
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Estimated tokens per string (one token ≈ 3/4 of a word); the original
    ## compared raw word counts against the token budget, undercounting by 4/3
    ntokens = [length(split(s)) * 4 / 3 for s in allstrings]
    ## Greedily fill chunks until the next string would exceed the budget
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        if chunk_tokens + ntokens[i] < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += ntokens[i]
        else
            isempty(chunk) || push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = ntokens[i]
        end
    end
    ## Keep the final partial chunk
    isempty(chunk) || push!(chunks, chunk)
    return chunks
end
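
## Worked example: three 3-word strings estimate to 4 tokens each, so with a
## 10-token budget the first two share a chunk and the third starts a new one.
@assert chunk_by_tokens(["a b c", "d e f", "g h i"], 10) ==
        [["a b c", "d e f"], ["g h i"]]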