## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool

using CSV
using DataFrames
""" | |
## ostreacultura_bq_auth() | |
- Activate the service account using the credentials file | |
""" | |
function ostreacultura_bq_auth() | |
if isfile("ostreacultura-credentials.json") | |
run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`) | |
else | |
println("Credentials file not found") | |
end | |
end | |
""" | |
## bq(query::String) | |
- Run a BigQuery query and return the result as a DataFrame | |
Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10") | |
""" | |
function bq(query::String) | |
tname = tempname() | |
run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname)) | |
return CSV.read(tname, DataFrame) | |
end | |
""" | |
## bq_db(query::String, db::String) | |
- Run a BigQuery query and save to a database | |
Example: | |
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv") | |
""" | |
function bq_db(query::String, db::String) | |
run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db)) | |
end | |
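
## A minimal end-to-end sketch (the `demo_bq_workflow` wrapper is hypothetical,
## not part of the original API): authenticate once, then pull a sample both
## into a DataFrame and out to a CSV file. Assumes the gcloud and bq CLIs are
## installed and the table below exists in your project.
function demo_bq_workflow()
    ostreacultura_bq_auth()
    df = bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
    bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
    return df
end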
""" | |
one token is roughly 3/4 of a word | |
""" | |
function token_estimate(allstrings::Vector{String}) | |
## Tokenize the strings | |
tokens = [split(x) for x in allstrings] | |
## Estimate the number of tokens | |
token_estimate = sum([length(x) for x in tokens]) | |
return token_estimate * 4 / 3 | |
end | |
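
## Quick sanity check of the heuristic (hypothetical example strings, not from
## the original file): two 4-word strings give 8 words ≈ 8 * 4/3 estimated tokens.
## julia> token_estimate(["climate change is real", "oysters filter sea water"])
## 10.666666666666666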
"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Split a vector of strings into chunks whose estimated token count
  stays below max_tokens, using the same 4/3 tokens-per-word heuristic
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Split each string into whitespace-delimited words
    words = [split(x) for x in allstrings]
    ## Greedily fill chunks, comparing estimated tokens (words * 4/3)
    ## against the limit rather than raw word counts
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        string_tokens = length(words[i]) * 4 / 3
        if chunk_tokens + string_tokens < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += string_tokens
        else
            push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = string_tokens
        end
    end
    push!(chunks, chunk)
    return chunks
end
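
## Sketch of how the helpers above might feed an embeddings API that caps
## input at 8191 tokens per request (the default max_tokens). `embed_fn` and
## `embed_in_chunks` are placeholders/assumptions, not part of the original
## file; swap in whatever embedding client you actually use.
function embed_in_chunks(allstrings::Vector{String}, embed_fn::Function; max_tokens::Int=8191)
    results = []
    for chunk in chunk_by_tokens(allstrings, max_tokens)
        ## Each chunk's estimated token count stays below max_tokens
        push!(results, embed_fn(chunk))
    end
    return results
end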