|
|
|
|
|
|
|
|
|
""" |
|
## ostreacultura_bq_auth() |
|
- Activate the service account using the credentials file |
|
""" |
|
function ostreacultura_bq_auth(keyfile::AbstractString="ostreacultura-credentials.json")
    # Activate a gcloud service account from a JSON key file.
    #
    # Arguments:
    #   keyfile - path to the service-account key file; defaults to
    #             "ostreacultura-credentials.json" in the current working directory,
    #             so existing zero-argument calls behave as before.
    #
    # Returns the value of `run` for the gcloud command when the key file exists,
    # otherwise prints a message and returns `nothing`.
    if !isfile(keyfile)
        println("Credentials file not found")
        return nothing
    end
    # Interpolating into a backtick command passes `keyfile` as part of a single
    # argument, so paths containing spaces need no extra quoting.
    return run(`gcloud auth activate-service-account --key-file=$keyfile`)
end
|
|
|
""" |
|
## bq(query::String) |
|
- Run a BigQuery query and return the result as a DataFrame |
|
|
|
Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10") |
|
""" |
|
function bq(query::AbstractString)
    # Run `query` through the `bq` command-line tool (standard SQL, CSV output)
    # and parse the CSV it prints into a DataFrame.
    #
    # Arguments:
    #   query - the SQL text; it is passed to `bq` as a single argument.
    #
    # Returns the DataFrame produced by `CSV.read`.
    # Throws if the `bq` process exits with a non-zero status.
    cmd = `bq query --use_legacy_sql=false --format=csv $query`
    # Capture stdout in memory instead of redirecting it to a `tempname()` file:
    # the previous version created a new temporary file on every call and never
    # removed it.
    csv_bytes = read(cmd)
    return CSV.read(IOBuffer(csv_bytes), DataFrame)
end
|
|
|
""" |
|
## bq_db(query::String, db::String) |
|
- Run a BigQuery query and write the CSV result to the file at path `db`
|
|
|
Example: |
|
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv") |
|
""" |
|
function bq_db(query::String, db::String)
    # Run `query` through the `bq` command-line tool (standard SQL, CSV output)
    # and redirect the command's standard output to the file named by `db`.
    # Returns whatever `run` returns for the redirected command.
    bq_cmd = `bq query --use_legacy_sql=false --format=csv $query`
    redirected = pipeline(bq_cmd; stdout=db)
    return run(redirected)
end
|
|
|
""" |
|
Estimate the total number of tokens in a collection of strings, assuming one token is roughly 3/4 of a word (so tokens ≈ words × 4/3).
|
|
|
""" |
|
function token_estimate(allstrings::AbstractVector{<:AbstractString})
    # Estimate the total token count of `allstrings`.
    #
    # Arguments:
    #   allstrings - any vector of strings (`String`, `SubString`, ...), not only
    #                `Vector{String}`.
    #
    # Returns a Float64: the number of whitespace-separated words across all
    # strings multiplied by 4/3 (one token is taken to be about 3/4 of a word).

    # `init=0` makes the empty-input result an explicit 0 rather than relying on
    # the element type inferred for an empty intermediate array.
    nwords = mapreduce(s -> length(split(s)), +, allstrings; init=0)
    return nwords * 4 / 3
end
|
|
|
function chunk_by_tokens(allstrings::AbstractVector{<:AbstractString}, max_tokens::Integer=8191)
    # Split `allstrings` into consecutive chunks whose estimated token count stays
    # below `max_tokens`, preserving the input order.
    #
    # Arguments:
    #   allstrings - vector of strings to group.
    #   max_tokens - token budget per chunk (default 8191).
    #
    # Returns a vector of chunks, each a vector with the element type of
    # `allstrings`. Empty input yields no chunks. A single string whose estimate
    # already reaches the budget is placed in a chunk of its own.
    T = eltype(allstrings)
    chunks = Vector{T}[]
    chunk = T[]
    chunk_tokens = 0.0
    for s in allstrings
        # Same heuristic as `token_estimate`: one token is roughly 3/4 of a word.
        # The budget was previously compared against raw word counts, which let
        # chunks exceed `max_tokens` by about a third.
        ntokens = length(split(s)) * 4 / 3
        # Close the current chunk only if it already holds something; otherwise an
        # oversized first string would emit a spurious empty chunk.
        if !isempty(chunk) && chunk_tokens + ntokens >= max_tokens
            push!(chunks, chunk)
            chunk = T[]
            chunk_tokens = 0.0
        end
        push!(chunk, s)
        chunk_tokens += ntokens
    end
    # Flush the trailing chunk, skipping it when the input was empty.
    isempty(chunk) || push!(chunks, chunk)
    return chunks
end