|
|
|
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads
|
|
|
const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"] |
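# Example review-publisher domains accepted by reviewPublisherSiteFilter (see the notes on paginate_claims below).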
|
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"] |
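# The API key is read from ENV["GOOGLECLOUD"] in search_claims below; the
# variable name is this script's convention. Set it before use, e.g.:
#   ENV["GOOGLECLOUD"] = "your-api-key"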
|
|
|
""" |
|
## Search Google Fact Check API |
|
|
|
## API specs here: |
|
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search |
|
|
|
## Examples:
|
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0) |
|
|
|
responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20) |
|
|
|
response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200) |
|
""" |
|
function search_claims(; |
|
query::String = "", |
|
languageCode::String = "en-US", |
|
reviewPublisherSiteFilter::String = "", |
|
maxAgeDays::Int = 7, |
|
pageSize::Int = 20, |
|
pageToken::String = "", |
|
offset::Int = 0) |
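    # Only non-empty / non-default keyword arguments are forwarded as query
    # parameters; the API applies its own defaults for anything omitted.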
|
|
|
|
|
url = "https://factchecktools.googleapis.com/v1alpha1/claims:search" |
|
|
|
|
|
params = Dict("key" => ENV["GOOGLECLOUD"]) |
|
if !isempty(query) |
|
params["query"] = query |
|
end |
|
if !isempty(languageCode) |
|
params["languageCode"] = languageCode |
|
end |
|
if !isempty(reviewPublisherSiteFilter) |
|
params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter |
|
end |
|
if maxAgeDays > 0 |
|
params["maxAgeDays"] = string(maxAgeDays) |
|
end |
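    # The API's default pageSize appears to be 10, so only send the parameter
    # when the caller asks for something different.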
|
if pageSize != 10 |
|
params["pageSize"] = string(pageSize) |
|
end |
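    # pageToken and offset are alternative pagination mechanisms; a non-empty
    # pageToken takes precedence.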
|
if !isempty(pageToken) |
|
params["pageToken"] = pageToken |
|
elseif offset > 0 |
|
params["offset"] = string(offset) |
|
end |
|
|
|
|
|
response = HTTP.get(url, query=params) |
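    # HTTP.get throws on non-2xx responses by default, so quota or auth
    # failures surface as exceptions here.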
|
|
|
|
|
return JSON3.read(response.body) |
|
end |
|
|
|
""" |
|
## Convert the search response to a tabular format |
|
qu= "Video shows Kamala (Harris) responding to someone" |
|
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20) |
|
searchresponse_to_tabular(response) |
|
""" |
|
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame |
|
|
|
try |
|
results = DataFrame( |
|
text = String[get(x, :text, "") for x in response.claims], |
|
claimant = String[get(x, :claimant, "") for x in response.claims], |
|
claimDate = String[get(x, :claimDate, "") for x in response.claims], |
|
claimReviewPublisher = String[get(x[:claimReview][1], "publisher", Dict())["site"] for x in response.claims], |
|
claimReviewTitle = String[get(x[:claimReview][1], "title", "") for x in response.claims], |
|
claimReviewTextualRating = String[get(x[:claimReview][1], "textualRating", "") for x in response.claims], |
|
claimReviewUrl = String[get(x[:claimReview][1], "url", "") for x in response.claims]) |
|
return results |
|
    catch
        # A response with no claims (or malformed ones) yields an empty table.
        return DataFrame()
|
end |
|
end |
|
|
|
""" |
|
## Paginate Google Fact Check API results
Uses the nextPageToken from each response to fetch subsequent pages until none remains.
|
|
|
## NOTES: |
|
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc. |
|
- If you have reviewPublisherSiteFilter, then query can be empty. |
|
""" |
|
function paginate_claims(; |
|
query::String = "", |
|
languageCode::String = "en-US", |
|
reviewPublisherSiteFilter::String = "", |
|
maxAgeDays::Int = 7, |
|
pageSize::Int = 20, |
|
pageToken::String = "", |
|
offset::Int = 0, |
|
    delay::Float64 = 60 / 300)  # seconds between requests: throttles to ~300 requests/minute
|
|
|
|
|
results = [] |
|
|
|
|
|
response = search_claims(query=query, |
|
languageCode=languageCode, |
|
reviewPublisherSiteFilter=reviewPublisherSiteFilter, |
|
maxAgeDays=maxAgeDays, |
|
pageSize=pageSize, |
|
pageToken=pageToken, |
|
offset=offset) |
|
push!(results, response) |
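    # Keep requesting pages while the API returns a nextPageToken.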
|
|
|
|
|
    while haskey(response, :nextPageToken)
        sleep(delay)
        pageToken = response[:nextPageToken]
|
response = search_claims(query=query, |
|
languageCode=languageCode, |
|
reviewPublisherSiteFilter=reviewPublisherSiteFilter, |
|
maxAgeDays=maxAgeDays, |
|
pageSize=pageSize, |
|
pageToken=pageToken, |
|
offset=offset) |
|
push!(results, response) |
|
end |
|
|
|
return results |
|
end |
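# A minimal end-to-end sketch using the two functions above (the filter value
# and time window are illustrative, not defaults):
#   pages = paginate_claims(reviewPublisherSiteFilter="politifact.com", maxAgeDays=30)
#   df = vcat(searchresponse_to_tabular.(pages)...)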
|
|
|
""" |
|
## Script to check daily for new fact-checks in each category
|
allfacts = periodic_fact_check(365*8) |
|
## Save the results to a CSV file |
|
using CSV, Dates |
|
CSV.write("data/google_fact_checks$(today()).csv", allfacts) |
|
|
|
## Keep only claims whose rating does not contain " accurate" or " true"
allfacts = filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)
|
|
|
CSV.write("data/google_fact_checks2024-11-14.csv", allfacts) |
|
|
|
""" |
|
function periodic_fact_check(max_days::Int = 1) |
|
allresults = DataFrame[] |
|
for category in query_categories |
|
        println("Fetching category: $category")
|
paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200) |
|
        if any(haskey.(paginated_results, :claims))
|
results = [searchresponse_to_tabular(page) for page in paginated_results] |
|
|
|
results = vcat(results...) |
|
results[!, :category] .= category |
|
push!(allresults, results) |
|
end |
|
end |
|
return vcat(allresults...) |
|
end |
|
|
|
function get_latest_fact_checks()
    mkpath("data")  # ensure the output directory exists
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json",
                       "data/fact_check_latest.json")
end
|
|
|
""" |
|
## Safely get a nested dictionary value
Walks `keys` in order and returns `default` if any key along the path is missing.

d = Dict(
|
:author => Dict( |
|
:name => "John Doe" |
|
) |
|
) |
|
safe_get(d, (:author, :name), "No name") |
|
""" |
|
function safe_get(dict::Dict, keys::Tuple, default=nothing) |
|
current = dict |
|
for key in keys |
|
if haskey(current, key) |
|
current = current[key] |
|
else |
|
return default |
|
end |
|
end |
|
return current |
|
end |
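# safe_datetime has a passthrough method for values that are already parsed
# (or missing), and a String method that tries several timestamp formats.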
|
|
|
function safe_datetime(date::Union{DateTime, Missing}) |
|
return date |
|
end |
|
|
|
|
|
function safe_datetime(date::String)
    # Try the default ISO parser first, then fall back to the timestamp
    # variants that appear in the fact-check feeds; return missing if
    # nothing matches.
    try
        return Dates.DateTime(date)
    catch
    end
    for fmt in (dateformat"yyyy-mm-ddTHH:MM:SS.sssZ",
                dateformat"yyyy-mm-ddTHH:MM:SSZ")
        try
            return Dates.DateTime(date, fmt)
        catch
        end
    end
    return missing
end
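# e.g. safe_datetime("2020-05-01T12:30:00Z") isa DateTime  # parsed by a fallback format
#      safe_datetime("not a date") === missing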
|
|
|
""" |
|
## Load the entire fact check JSON file |
|
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json |
|
df, errors = load_fact_check_json() |
|
""" |
|
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false) |
|
if get_latest |
|
get_latest_fact_checks() |
|
end |
|
    json = JSON3.read(read(file, String))  # JSON3.read parses JSON text, not a file path
    dfout = DataFrame[]
    errors = 0
    error_index = Int64[]
    for (i, data) in enumerate(json[:dataFeedElement])
|
try |
|
d = Dict(data[:item][1]) |
|
results = DataFrame( |
|
text = get(d, :claimReviewed, ""), |
|
claimant = safe_get(d, (:itemReviewed, :author, :name), ""), |
|
claimDate = safe_get(d, (:itemReviewed, :datePublished), ""), |
|
claimReviewDate = get(d, :datePublished, ""), |
|
                claimReviewPublisher = safe_get(d, (:author, :name), ""),
|
claimReviewTitle = "", |
|
claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""), |
|
claimReviewUrl = get(data, :url, "") |
|
) |
|
push!(dfout, results) |
|
catch |
|
push!(error_index, i) |
|
errors += 1 |
|
end |
|
end |
|
return (vcat(dfout...), error_index) |
|
end |
|
|
|
""" |
|
## Format the date columns in the DataFrame |
|
- drop rows where both date columns are missing |
|
|
|
df, errors = load_fact_check_json("data/fact_check_latest.json") |
|
format_date_cols!(df) |
|
""" |
|
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate]) |
|
|
|
for col in date_cols |
|
df[!, col] = safe_datetime.(df[!, col]) |
|
end |
|
|
|
    # Keep rows where at least one of the two date columns parsed successfully.
    df[!, :contains_date] = .!(ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate))
    subset!(df, :contains_date)
|
end |
|
|
|
""" |
|
## Get the latest date in the DataFrame across the date columns
|
- used to identify the latest fact-checks in the datasets |
|
df, errs = load_fact_check_json("data/fact_check_latest.json") |
|
get_latest_date(df) |
|
""" |
|
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate]) |
|
    # Parse the date columns first if they are still raw strings.
    if df.claimDate[1] isa String
|
format_date_cols!(df) |
|
end |
|
|
|
for col in date_cols |
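        # Dates in the future are assumed to be data errors and set to missing.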
|
df[!, col] = [ coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]] |
|
end |
|
maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols] |
|
maxcolumn = date_cols[argmax(maxdates)] |
|
return maximum(maxdates), maxcolumn |
|
end |
|
|
|
""" |
|
## Identify the fact-checks in the latest dataset that are not in the previous dataset |
|
- use claimReviewDate to identify differences |
|
- get the latest claimReviewDate in current_data |
|
- get the latest claimReviewDate in previous_data |
|
- select the rows in current_data where claimReviewDate > latest_claimReviewDate |
|
|
|
Example: |
|
previous_data, errs = load_fact_check_json("data/fact_check_latest.json") |
|
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true) |
|
CSV.write("data/fact_check_latest.csv", current_data) |
|
new_fact_checks = get_new_fact_checks(current_data, previous_data) |
|
""" |
|
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    # Only the cutoff date is needed from previous_data; filter current_data on
    # its own latest date column (the second call previously overwrote datecol).
    latest_of_previous, _ = get_latest_date(previous_data)
|
|
|
if latest_of_newest > latest_of_previous |
|
return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :] |
|
else |
|
return DataFrame() |
|
end |
|
end |
|
|
|
|