### Google Fact Check API utilities
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads

const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration",
                          "LGBTQ", "sexual and reproductive health"]

"""
## Search the Google Fact Check API
## API specs: https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search
## Examples:
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)
responsenyt = search_claims(query="climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)
response = search_claims(query="climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(; query::String = "",
                         languageCode::String = "en-US",          # BCP-47 language code
                         reviewPublisherSiteFilter::String = "",
                         maxAgeDays::Int = 7,
                         pageSize::Int = 20,
                         pageToken::String = "",
                         offset::Int = 0)
    # Base URL of the claims:search endpoint
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    # Build the query parameters; the API key is read from the GOOGLECLOUD environment variable
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10
        params["pageSize"] = string(pageSize)
    end
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end

    # Make the HTTP GET request
    response = HTTP.get(url, query = params)

    # Parse the JSON response
    return JSON3.read(response.body)
end

"""
## Convert a search response to a tabular format
qu = "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode = "en-US", maxAgeDays = 700, pageSize = 20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Build one column per field; a malformed response falls through to an empty DataFrame
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end

"""
## Paginate Google Fact Check API results, using the pageToken to fetch each next page
## NOTES:
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site.
  You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you pass reviewPublisherSiteFilter, then query can be empty.
"""
function paginate_claims(; query::String = "",
                           languageCode::String = "en-US",        # BCP-47 language code
                           reviewPublisherSiteFilter::String = "",
                           maxAgeDays::Int = 7,
                           pageSize::Int = 20,
                           pageToken::String = "",
                           offset::Int = 0,
                           delay::Float64 = 1/(300/60))           # 0.2 s between requests, i.e. 300 requests per minute
    # Collect one JSON3.Object per page
    results = []

    # Get the first page of results
    response = search_claims(query=query, languageCode=languageCode,
                             reviewPublisherSiteFilter=reviewPublisherSiteFilter,
                             maxAgeDays=maxAgeDays, pageSize=pageSize,
                             pageToken=pageToken, offset=offset)
    push!(results, response)

    # Keep requesting pages while the API returns a nextPageToken
    while haskey(response, :nextPageToken)
        sleep(delay)
        pageToken = response[:nextPageToken]
        response = search_claims(query=query, languageCode=languageCode,
                                 reviewPublisherSiteFilter=reviewPublisherSiteFilter,
                                 maxAgeDays=maxAgeDays, pageSize=pageSize,
                                 pageToken=pageToken, offset=offset)
        push!(results, response)
    end
    return results
end

"""
# Script to check daily for new fact-checks in each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates
CSV.write("data/google_fact_checks\$(today()).csv", allfacts)
allfacts = filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)
CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)
"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode = "en-US",
                                            maxAgeDays = max_days, pageSize = 200)
        if any(haskey.(paginated_results, :claims))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## concatenate the per-page tables and tag them with the query category
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end
function get_latest_fact_checks()
    # Download the latest ClaimReview feed from Data Commons
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json",
                       "data/fact_check_latest.json")
end

"""
## Safely walk a chain of nested keys, returning `default` if any key is missing
d = Dict(
    :author => Dict(
        :name => "John Doe"
    )
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end

## Already-parsed dates (or missings) pass through unchanged
function safe_datetime(date::Union{DateTime, Missing})
    return date
end

## Convert a date string to a DateTime object without throwing an error
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)
    catch
        try
            return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all attempts fail, return missing
                return missing
            end
        end
    end
end

"""
## Load the entire fact check JSON file
- the updated dataset can be found at:
  https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String = "data/fact_check_latest.json"; get_latest = false)
    if get_latest
        get_latest_fact_checks()
    end
    feed = JSON3.read(read(file, String))
    dfout = DataFrame[]
    errors = 0
    error_index = Int64[]
    for (i, data) in enumerate(feed[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, ""))
            push!(dfout, results)
        catch
            # Record the index of any element that does not match the expected schema
            push!(error_index, i)
            errors += 1
        end
    end
    return (vcat(dfout...), error_index)
end

"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing
df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol} = [:claimDate, :claimReviewDate])
    # Parse each date column in place; unparseable strings (including "") become missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop rows where both date columns are missing
    df[:, :contains_date] = (ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate)) .== false
    subset!(df, :contains_date)
end
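## Sketch of the load-and-clean flow above (hypothetical helper, not part of the original
## script; the file path mirrors the default used by `load_fact_check_json`). It shows where
## `safe_datetime` returning `missing` matters: rows with no parseable date in either column
## are dropped by `format_date_cols!`.
function example_load_and_clean(path::String = "data/fact_check_latest.json")
    df, error_index = load_fact_check_json(path)
    n_raw = nrow(df)
    format_date_cols!(df)       # parses the date columns in place and drops undated rows
    println("kept $(nrow(df)) of $n_raw rows; $(length(error_index)) records failed to parse")
    return df
end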
"""
## Get the latest date in the DataFrame from the current date columns
- used to identify the latest fact-checks in the datasets
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol} = [:claimDate, :claimReviewDate])
    if df.claimDate[1] isa String
        format_date_cols!(df)
    end
    ## Dates in the future must be miscoded, so set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end

"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate
Example:
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    latest_of_previous, _ = get_latest_date(previous_data)
    # Keep the rows of current_data that are newer than anything in previous_data
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end
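## Sketch (hypothetical helper, not part of the original script): the API path
## (`periodic_fact_check`) and the Data Commons feed (`load_fact_check_json`) yield nearly the
## same columns, so the two tables can be stacked with `cols = :union`; columns present in only
## one source (e.g. `category`, `claimReviewDate`) are filled with `missing`.
function example_combined_table(max_days::Int = 7)
    api_facts = periodic_fact_check(max_days)
    feed_facts, _ = load_fact_check_json("data/fact_check_latest.json")
    return vcat(api_facts, feed_facts; cols = :union)
end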