## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool;
## a sketch of such a file is shown below

using CSV
using DataFrames
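## A minimal ~/.bigqueryrc might look like the following; the project id is
## illustrative (not from this repo), and the [query] section applies flags
## only to `bq query` (see the BigQuery CLI docs for the full flag list):
##
##   project_id = my-gcp-project
##   [query]
##   --use_legacy_sql=false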


"""
## ostreacultura_bq_auth()
- Activate the service account using the credentials file
"""
function ostreacultura_bq_auth()
    if isfile("ostreacultura-credentials.json")
        run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
    else
        println("Credentials file not found")
    end
end
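
## A hypothetical smoke test (not in the original file): activate the service
## account, then run a trivial query to confirm the CLI is authenticated.
function bq_smoke_test()
    ostreacultura_bq_auth()
    return bq("SELECT 1 AS one")
end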

"""
## bq(query::String)
- Run a BigQuery query and return the result as a DataFrame

Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
"""
function bq(query::String)
    tname = tempname()
    ## Redirect the CSV output of the bq CLI into a temporary file
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname))
    df = CSV.read(tname, DataFrame)
    ## Clean up the temporary file before returning
    rm(tname; force=true)
    return df
end
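
## A streaming variant of `bq` (a sketch, not part of the original API): pipe
## the CLI output straight into CSV.read and skip the temporary file entirely.
## Assumes the bq CLI is on PATH and already authenticated.
function bq_stream(query::String)
    return open(io -> CSV.read(io, DataFrame),
                `bq query --use_legacy_sql=false --format=csv $query`)
end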

"""
## bq_db(query::String, db::String)
- Run a BigQuery query and save to a database

Example: 
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
"""
function bq_db(query::String, db::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db))
end
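
## A compressed variant (a sketch, not in the original file): chain the query
## output through gzip before it hits disk. Assumes gzip is on PATH; `pipeline`
## accepts a chain of commands ending in a destination filename.
function bq_db_gz(query::String, path::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, `gzip`, path))
end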

"""
 one token is roughly 3/4 of a word

"""
function token_estimate(allstrings::Vector{String})
    ## Tokenize the strings
    tokens = [split(x) for x in allstrings]
    ## Estimate the number of tokens
    token_estimate = sum([length(x) for x in tokens])
    return token_estimate * 4 / 3
end
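
## A quick worked example of the heuristic: 2 + 3 = 5 words estimate to
## 5 * 4/3 ≈ 6.67 tokens.
@assert token_estimate(["hello world", "three words here"]) ≈ 5 * 4 / 3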

"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Greedily group strings into chunks whose estimated token count stays below max_tokens
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Estimated tokens per string (one token ≈ 3/4 of a word); the original
    ## compared raw word counts against the token budget, undercounting by 4/3
    ntokens = [length(split(s)) * 4 / 3 for s in allstrings]
    ## Greedily fill chunks until the next string would exceed the budget
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        if chunk_tokens + ntokens[i] < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += ntokens[i]
        else
            isempty(chunk) || push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = ntokens[i]
        end
    end
    ## Keep the final partial chunk
    isempty(chunk) || push!(chunks, chunk)
    return chunks
end
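
## Worked example: three 3-word strings estimate to 4 tokens each, so with a
## 10-token budget the first two share a chunk and the third starts a new one.
@assert chunk_by_tokens(["a b c", "d e f", "g h i"], 10) ==
        [["a b c", "d e f"], ["g h i"]]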