Spaces:

stefanjwojcik
/

misinfo_detection_app

Sleeping

App Files Files Community

misinfo_detection_app / scripts /google_fact_check_api.jl

stefanjwojcik

add scripts

143b0d4 verified 5 months ago

raw

history blame

11 kB

	###
	using HTTP
	using JSON3, DataFrames, ProgressMeter, Dates

	const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
	const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]

	"""
	## Search Google Fact Check API

	## API specs here:
	https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search

	## Example:
	response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)

	responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)

	response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
	"""
	function search_claims(;
	query::String = "",
	languageCode::String = "en-US", # bcp-47 language code
	reviewPublisherSiteFilter::String = "",
	maxAgeDays::Int = 7,
	pageSize::Int = 20,
	pageToken::String = "",
	offset::Int = 0)

	# Prepare the base URL
	url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

	# Build query parameters
	params = Dict("key" => ENV["GOOGLECLOUD"])
	if !isempty(query)
	params["query"] = query
	end
	if !isempty(languageCode)
	params["languageCode"] = languageCode
	end
	if !isempty(reviewPublisherSiteFilter)
	params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
	end
	if maxAgeDays > 0
	params["maxAgeDays"] = string(maxAgeDays)
	end
	if pageSize != 10
	params["pageSize"] = string(pageSize)
	end
	if !isempty(pageToken)
	params["pageToken"] = pageToken
	elseif offset > 0
	params["offset"] = string(offset)
	end

	# Make the HTTP GET request
	response = HTTP.get(url, query=params)

	# Parse the JSON response
	return JSON3.read(response.body)
	end

	"""
	## Convert the search response to a tabular format
	qu= "Video shows Kamala (Harris) responding to someone"
	response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
	searchresponse_to_tabular(response)
	"""
	function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
	# Initialize the results array
	try
	results = DataFrame(
	text = String[get(x, :text, "") for x in response.claims],
	claimant = String[get(x, :claimant, "") for x in response.claims],
	claimDate = String[get(x, :claimDate, "") for x in response.claims],
	claimReviewPublisher = String[get(x[:claimReview][1], "publisher", Dict())["site"] for x in response.claims],
	claimReviewTitle = String[get(x[:claimReview][1], "title", "") for x in response.claims],
	claimReviewTextualRating = String[get(x[:claimReview][1], "textualRating", "") for x in response.claims],
	claimReviewUrl = String[get(x[:claimReview][1], "url", "") for x in response.claims])
	return results
	catch
	return DataFrame()
	end
	end

	"""
	## Paginate Google Fact Check API results
	use the pageToken to get the next page of results

	## NOTES:
	- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
	- If you have reviewPublisherSiteFilter, then query can be empty.
	"""
	function paginate_claims(;
	query::String = "",
	languageCode::String = "en-US", # bcp-47 language code
	reviewPublisherSiteFilter::String = "",
	maxAgeDays::Int = 7,
	pageSize::Int = 20,
	pageToken::String = "",
	offset::Int = 0,
	delay::Float64 = 1/(300/60)) # allows reqs per minute = 300

	# Initialize the results array
	results = []

	# Get the first page of results
	response = search_claims(query=query,
	languageCode=languageCode,
	reviewPublisherSiteFilter=reviewPublisherSiteFilter,
	maxAgeDays=maxAgeDays,
	pageSize=pageSize,
	pageToken=pageToken,
	offset=offset)
	push!(results, response)

	# Get the next page of results
	while haskey(response, "nextPageToken")
	sleep(delay)
	pageToken = response["nextPageToken"]
	response = search_claims(query=query,
	languageCode=languageCode,
	reviewPublisherSiteFilter=reviewPublisherSiteFilter,
	maxAgeDays=maxAgeDays,
	pageSize=pageSize,
	pageToken=pageToken,
	offset=offset)
	push!(results, response)
	end

	return results
	end

	"""
	# script to daily check for new fact-checks for each category
	allfacts = periodic_fact_check(365*8)
	## Save the results to a CSV file
	using CSV, Dates
	CSV.write("data/google_fact_checks$(today()).csv", allfacts)

	allfacts= filter(:claimReviewTextualRating => x -> !contains(x, r" accurate\| true"), allfacts)

	CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)

	"""
	function periodic_fact_check(max_days::Int = 1)
	allresults = DataFrame[]
	for category in query_categories
	println("getting Category: $category")
	paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
	if any(haskey.(paginated_results, "claims"))
	results = [searchresponse_to_tabular(page) for page in paginated_results]
	## concat the results
	results = vcat(results...)
	results[!, :category] .= category
	push!(allresults, results)
	end
	end
	return vcat(allresults...)
	end

	function get_latest_fact_checks()
	download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
	end

	"""
	d = Dict(
	:author => Dict(
	:name => "John Doe"
	)
	)
	safe_get(d, (:author, :name), "No name")
	"""
	function safe_get(dict::Dict, keys::Tuple, default=nothing)
	current = dict
	for key in keys
	if haskey(current, key)
	current = current[key]
	else
	return default
	end
	end
	return current
	end

	function safe_datetime(date::Union{DateTime, Missing})
	return date
	end

	## Convert date string to DateTime object without throwing an error
	function safe_datetime(date::String)
	try
	return Dates.DateTime(date)
	catch
	try
	Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
	catch
	try
	Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
	catch
	## If all attempts fail
	return missing
	end
	end
	end
	end

	"""
	## Load the entire fact check JSON file
	- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
	df, errors = load_fact_check_json()
	"""
	function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
	if get_latest
	get_latest_fact_checks()
	end
	df = JSON3.read(file)
	dfout = DataFrame[]
	errors = 0
	error_index = Int64[]
	for (i, data) in enumerate(df[:dataFeedElement])
	try
	d = Dict(data[:item][1])
	results = DataFrame(
	text = get(d, :claimReviewed, ""),
	claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
	claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
	claimReviewDate = get(d, :datePublished, ""),
	claimReviewPublisher = get(d[:author], :name, ""),
	claimReviewTitle = "",
	claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
	claimReviewUrl = get(data, :url, "")
	)
	push!(dfout, results)
	catch
	push!(error_index, i)
	errors += 1
	end
	end
	return (vcat(dfout...), error_index)
	end

	"""
	## Format the date columns in the DataFrame
	- drop rows where both date columns are missing

	df, errors = load_fact_check_json("data/fact_check_latest.json")
	format_date_cols!(df)
	"""
	function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
	# Drop where date = ""
	for col in date_cols
	df[!, col] = safe_datetime.(df[!, col])
	end
	# Drop if both date columns are missing
	df[:, :contains_date] = (ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate)) .== false
	subset!(df, :contains_date)
	end

	"""
	## Gets the latest date in the DataFrame from current date columns
	- used to identify the latest fact-checks in the datasets
	df, errs = load_fact_check_json("data/fact_check_latest.json")
	get_latest_date(df)
	"""
	function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
	if typeof(df.claimDate[1]) == String
	format_date_cols!(df)
	end
	## Identify any dates that are in the future - must be miscoded, set to missing
	for col in date_cols
	df[!, col] = [ coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
	end
	maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
	maxcolumn = date_cols[argmax(maxdates)]
	return maximum(maxdates), maxcolumn
	end

	"""
	## Identify the fact-checks in the latest dataset that are not in the previous dataset
	- use claimReviewDate to identify differences
	- get the latest claimReviewDate in current_data
	- get the latest claimReviewDate in previous_data
	- select the rows in current_data where claimReviewDate > latest_claimReviewDate

	Example:
	previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
	current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
	CSV.write("data/fact_check_latest.csv", current_data)
	new_fact_checks = get_new_fact_checks(current_data, previous_data)
	"""
	function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
	latest_of_newest, datecol = get_latest_date(current_data)
	latest_of_previous, datecol = get_latest_date(previous_data)
	# Get the indices of the new fact-checks
	if latest_of_newest > latest_of_previous
	return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
	else
	return DataFrame()
	end
	end