## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command line tool

using CSV
using DataFrames
""" | |
## ostreacultura_bq_auth() | |
- Activate the service account using the credentials file | |
""" | |
function ostreacultura_bq_auth() | |
if isfile("ostreacultura-credentials.json") | |
run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`) | |
else | |
println("Credentials file not found") | |
end | |
end | |
""" | |
## bq(query::String) | |
- Run a BigQuery query and return the result as a DataFrame | |
Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10") | |
""" | |
function bq(query::String) | |
tname = tempname() | |
run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname)) | |
return CSV.read(tname, DataFrame) | |
end | |
""" | |
## bq_db(query::String, db::String) | |
- Run a BigQuery query and save to a database | |
Example: | |
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv") | |
""" | |
function bq_db(query::String, db::String) | |
run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db)) | |
end | |
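
## A minimal end-to-end sketch (the `demo_bq_workflow` wrapper is hypothetical,
## not part of the original API): authenticate once, then pull a sample both
## into a DataFrame and out to a CSV file. Assumes the gcloud and bq CLIs are
## installed and the table below exists in your project.
function demo_bq_workflow()
    ostreacultura_bq_auth()
    df = bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
    bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
    return df
end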
""" | |
one token is roughly 3/4 of a word | |
""" | |
function token_estimate(allstrings::Vector{String}) | |
## Tokenize the strings | |
tokens = [split(x) for x in allstrings] | |
## Estimate the number of tokens | |
token_estimate = sum([length(x) for x in tokens]) | |
return token_estimate * 4 / 3 | |
end | |
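
## Quick sanity check of the heuristic (hypothetical example strings, not from
## the original file): two 4-word strings give 8 words ≈ 8 * 4/3 estimated tokens.
## julia> token_estimate(["climate change is real", "oysters filter sea water"])
## 10.666666666666666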
"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Split a vector of strings into chunks whose estimated token count
  stays below max_tokens, using the same 4/3 tokens-per-word heuristic
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Split each string into whitespace-delimited words
    words = [split(x) for x in allstrings]
    ## Greedily fill chunks, comparing estimated tokens (words * 4/3)
    ## against the limit rather than raw word counts
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in eachindex(allstrings)
        string_tokens = length(words[i]) * 4 / 3
        if chunk_tokens + string_tokens < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += string_tokens
        else
            push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = string_tokens
        end
    end
    push!(chunks, chunk)
    return chunks
end
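
## Sketch of how the helpers above might feed an embeddings API that caps
## input at 8191 tokens per request (the default max_tokens). `embed_fn` and
## `embed_in_chunks` are placeholders/assumptions, not part of the original
## file; swap in whatever embedding client you actually use.
function embed_in_chunks(allstrings::Vector{String}, embed_fn::Function; max_tokens::Int=8191)
    results = []
    for chunk in chunk_by_tokens(allstrings, max_tokens)
        ## Each chunk's estimated token count stays below max_tokens
        push!(results, embed_fn(chunk))
    end
    return results
end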