## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool

using CSV
using DataFrames

"""
## ostreacultura_bq_auth()
- Activate the service account using the credentials file
"""
function ostreacultura_bq_auth()
    if isfile("ostreacultura-credentials.json")
        run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
    else
        println("Credentials file not found")
    end
end

"""
## bq(query::String)
- Run a BigQuery query and return the result as a DataFrame

Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
"""
function bq(query::String)
    tname = tempname()
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname))
    return CSV.read(tname, DataFrame)
end

"""
## bq_db(query::String, db::String)
- Run a BigQuery query and write the CSV result to the file at path `db`

Example: bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
"""
function bq_db(query::String, db::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db))
end

"""
## token_estimate(allstrings::Vector{String})
- Estimate the total token count of a set of strings

Uses the rule of thumb that one token is roughly 3/4 of a word,
so n words correspond to roughly n * 4/3 tokens.
"""
function token_estimate(allstrings::Vector{String})
    ## Split each string into whitespace-delimited words
    words = [split(x) for x in allstrings]
    ## Total word count, scaled by ~4/3 tokens per word
    n_words = sum(length, words; init=0)
    return n_words * 4 / 3
end

"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Greedily group strings into chunks whose estimated token count stays under max_tokens
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Estimated tokens per string (~4/3 tokens per word, matching token_estimate),
    ## so the budget check below is in tokens, not raw word counts
    string_tokens = [length(split(x)) * 4 / 3 for x in allstrings]
    ## Greedily fill chunks up to the token budget
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        if chunk_tokens + string_tokens[i] < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += string_tokens[i]
        else
            ## Close out the current chunk (unless it is empty) and start a new one
            isempty(chunk) || push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = string_tokens[i]
        end
    end
    ## Keep the final partial chunk (skipped if the input was empty)
    isempty(chunk) || push!(chunks, chunk)
    return chunks
end
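
## A minimal end-to-end sketch, not part of the original utilities: authenticate
## with the service account, then run the docstring's example query. Assumes
## ostreacultura-credentials.json sits in the working directory and the gcloud
## and bq CLIs are installed; the table name is taken from the examples above.
function demo_bq_roundtrip()
    ostreacultura_bq_auth()
    return bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
end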
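
## A hedged usage sketch, also not part of the original utilities: chunk a set
## of documents so each chunk's estimated token count stays under max_tokens
## (the 8191 default matches the input limit of OpenAI's embedding models,
## presumably why it was chosen), then report per-chunk sizes. `docs` is any
## Vector{String}, e.g. a text column pulled down with bq(...).
function demo_chunking(docs::Vector{String}; max_tokens::Int=8191)
    chunks = chunk_by_tokens(docs, max_tokens)
    for (i, chunk) in enumerate(chunks)
        println("chunk $i: ", length(chunk), " strings, ~",
                round(Int, token_estimate(chunk)), " estimated tokens")
    end
    return chunks
end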