using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads # Base.download is deprecated in Julia >= 1.6
const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]
"""
## Search Google Fact Check API
## API specs here:
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search
## Example:
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)
responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)
response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(;
        query::String = "",
        languageCode::String = "en-US", # BCP-47 language code
        reviewPublisherSiteFilter::String = "",
        maxAgeDays::Int = 7,
        pageSize::Int = 20,
        pageToken::String = "",
        offset::Int = 0)
    # Base URL of the claims:search endpoint
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    # Build the query parameters; the API key is read from the environment
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10 # 10 is the API default, so only send other values
        params["pageSize"] = string(pageSize)
    end
    # pageToken and offset are mutually exclusive; pageToken takes precedence
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end
    # Make the HTTP GET request and parse the JSON response
    response = HTTP.get(url, query=params)
    return JSON3.read(response.body)
end
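## A minimal smoke test for search_claims. It assumes ENV["GOOGLECLOUD"]
## holds a Google Cloud API key with the Fact Check Tools API enabled
## (the variable name is this project's convention, not a Google default).
## demo_search_claims is an illustrative helper, not part of the pipeline.
function demo_search_claims()
    r = search_claims(query = "climate change", pageSize = 5)
    # A response with no matches carries no :claims key
    return haskey(r, :claims) ? length(r[:claims]) : 0
end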
"""
## Convert the search response to a tabular format
qu= "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Returns an empty DataFrame when the response carries no :claims key
    # or any record is malformed
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            # JSON3 keys are Symbols; guard the nested publisher/site lookup
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end
"""
## Paginate Google Fact Check API results
use the pageToken to get the next page of results
## NOTES:
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you have reviewPublisherSiteFilter, then query can be empty.
"""
function paginate_claims(;
        query::String = "",
        languageCode::String = "en-US", # BCP-47 language code
        reviewPublisherSiteFilter::String = "",
        maxAgeDays::Int = 7,
        pageSize::Int = 20,
        pageToken::String = "",
        offset::Int = 0,
        delay::Float64 = 1/(300/60)) # 0.2s delay caps the rate at 300 requests per minute
    # Collect one JSON3.Object per page
    results = []
    # Get the first page of results
    response = search_claims(query=query,
        languageCode=languageCode,
        reviewPublisherSiteFilter=reviewPublisherSiteFilter,
        maxAgeDays=maxAgeDays,
        pageSize=pageSize,
        pageToken=pageToken,
        offset=offset)
    push!(results, response)
    # Follow nextPageToken until the API stops returning one
    while haskey(response, :nextPageToken)
        sleep(delay)
        pageToken = response[:nextPageToken]
        response = search_claims(query=query,
            languageCode=languageCode,
            reviewPublisherSiteFilter=reviewPublisherSiteFilter,
            maxAgeDays=maxAgeDays,
            pageSize=pageSize,
            pageToken=pageToken,
            offset=offset)
        push!(results, response)
    end
    return results
end
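## A minimal sketch of composing pagination with tabulation: fetch all pages
## for one query, convert each page, and stack them into a single DataFrame.
## demo_paginate (and its query/parameter choices) is illustrative, not part
## of the pipeline.
function demo_paginate()
    pages = paginate_claims(query = "immigration", maxAgeDays = 30, pageSize = 50)
    return vcat([searchresponse_to_tabular(p) for p in pages]...)
end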
"""
# script to daily check for new fact-checks for each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates
CSV.write("data/google_fact_checks$(today()).csv", allfacts)
allfacts= filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)
CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)
"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        # Skip categories where no page returned any claims
        if any(haskey.(paginated_results, :claims))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## Concat the per-page results and tag them with the category
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end
function get_latest_fact_checks()
    mkpath("data") # make sure the target directory exists
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end
"""
d = Dict(
:author => Dict(
:name => "John Doe"
)
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end
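## The missing-key path returns the supplied default instead of throwing.
## A quick check, reusing the docstring's `d` above:
##   safe_get(d, (:author, :email), missing)  # -> missing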
## Pass through values that are already DateTime (or missing)
function safe_datetime(date::Union{DateTime, Missing})
    return date
end
## Convert a date string to a DateTime object without throwing an error:
## try the default ISO format, then ISO 8601 variants with a trailing Z,
## and return missing if every attempt fails
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)
    catch
        try
            return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all attempts fail
                return missing
            end
        end
    end
end
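## Examples of the formats safe_datetime handles; anything unparseable
## comes back as missing rather than throwing:
##   safe_datetime("2024-11-14T10:00:00")       # DateTime
##   safe_datetime("2024-11-14T10:00:00.000Z")  # DateTime
##   safe_datetime("not a date")                # missing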
"""
## Load the entire fact check JSON file
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    # JSON3.read parses a JSON string, so read the file contents first
    json = JSON3.read(read(file, String))
    dfout = DataFrame[]
    error_index = Int64[] # positions of feed elements that failed to parse
    for (i, data) in enumerate(json[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            push!(error_index, i)
        end
    end
    return (vcat(dfout...), error_index)
end
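## A minimal sketch of loading the feed and checking how many elements
## failed to parse (error_index holds the positions of the skipped records):
##   df, error_index = load_fact_check_json("data/fact_check_latest.json")
##   length(error_index), nrow(df)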
"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing
df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date strings; unparseable values become missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop rows where both date columns are missing
    df[:, :contains_date] = .!(ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate))
    subset!(df, :contains_date)
end
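## After format_date_cols!, the date columns typically have eltype
## Union{Missing, DateTime} and rows missing both dates are gone:
##   format_date_cols!(df)
##   eltype(df.claimDate)  # Union{Missing, DateTime}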
"""
## Gets the latest date in the DataFrame from current date columns
- used to identify the latest fact-checks in the datasets
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date columns first if they are still strings
    if nonmissingtype(eltype(df.claimDate)) <: AbstractString
        format_date_cols!(df)
    end
    ## Dates in the future must be miscoded - set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end
"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate
Example:
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    # Discard the previous dataset's date column so it does not shadow datecol
    latest_of_previous, _ = get_latest_date(previous_data)
    # Return the rows of current_data dated after the previous dataset's latest date
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end
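## A hedged end-to-end sketch of the daily update: read the existing snapshot
## as the baseline, download the latest feed, and extract only the newly added
## fact-checks. demo_daily_update is illustrative, not part of the pipeline;
## the file path follows the docstring examples above.
function demo_daily_update()
    previous_data, _ = load_fact_check_json("data/fact_check_latest.json")
    current_data, _ = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
    return get_new_fact_checks(current_data, previous_data)
end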