## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool

using CSV, DataFrames
"""
## ostreacultura_bq_auth()
- Activate the service account using the credentials file
"""
function ostreacultura_bq_auth()
    if isfile("ostreacultura-credentials.json")
        run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
    else
        println("Credentials file not found")
    end
end
"""
## bq(query::String)
- Run a BigQuery query and return the result as a DataFrame
Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
"""
function bq(query::String)
    ## Write the query result to a temporary CSV file, then read it into a DataFrame
    tname = tempname()
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname))
    return CSV.read(tname, DataFrame)
end
"""
## bq_db(query::String, db::String)
- Run a BigQuery query and save to a database
Example:
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
"""
function bq_db(query::String, db::String)
run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db))
end
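
## Hedged workflow sketch: authenticate once, then pull a small sample into memory and
## also persist a copy to disk. The table name is taken from the docstring examples above.
# ostreacultura_bq_auth()
# df = bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
# bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")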
"""
one token is roughly 3/4 of a word
"""
function token_estimate(allstrings::Vector{String})
## Tokenize the strings
tokens = [split(x) for x in allstrings]
## Estimate the number of tokens
token_estimate = sum([length(x) for x in tokens])
return token_estimate * 4 / 3
end
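
## Quick sanity check of the 4/3 scaling on a made-up input: 6 words estimate to 8 tokens.
# token_estimate(["one two three", "four five six"])  # ≈ 8.0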
"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Greedily group consecutive strings into chunks whose estimated token count stays below max_tokens
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Split each string into whitespace-delimited words
    tokens = [split(x) for x in allstrings]
    ## Accumulate strings into the current chunk until its estimated token count would exceed the limit
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        ## Estimate this string's tokens as its word count * 4/3, consistent with token_estimate
        est = length(tokens[i]) * 4 / 3
        if chunk_tokens + est < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += est
        else
            push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = est
        end
    end
    ## Keep the final (possibly partial) chunk
    isempty(chunk) || push!(chunks, chunk)
    return chunks
end
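
## Hedged usage sketch on made-up strings with a small max_tokens, assuming the word-count * 4/3
## estimate above: each string estimates to 4 tokens, so the first two fit under 10 and the third starts a new chunk.
# chunk_by_tokens(["a b c", "d e f", "g h i"], 10)  # -> [["a b c", "d e f"], ["g h i"]]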