### Google Fact Check Tools API: query, paginate, tabulate, and diff fact-check data
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates, Downloads

const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]
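## `fact_check_sources` is intended for the `reviewPublisherSiteFilter` argument of
## `search_claims`/`paginate_claims` below; `query_categories` drives `periodic_fact_check`.
## A per-source pull might look like this sketch (untested; assumes a valid API key
## in ENV["GOOGLECLOUD"]):
# for source in fact_check_sources
#     resp = search_claims(reviewPublisherSiteFilter=source, maxAgeDays=30, pageSize=50)
# end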

"""
## Search Google Fact Check API

## API specs here: 
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search

## Example: 
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)

responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)

response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(;
                       query::String = "",
                       languageCode::String = "en-US", # BCP-47 language code
                       reviewPublisherSiteFilter::String = "",
                       maxAgeDays::Int = 7,
                       pageSize::Int = 20,
                       pageToken::String = "",
                       offset::Int = 0)

    # Prepare the base URL
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    # Build query parameters
    params = Dict("key" => ENV["GOOGLECLOUD"]) # API key read from the GOOGLECLOUD env var
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10 # 10 is the API's default page size
        params["pageSize"] = string(pageSize)
    end
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end

    # Make the HTTP GET request
    response = HTTP.get(url, query=params)
    
    # Parse the JSON response
    return JSON3.read(response.body)
end
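
## `HTTP.get` throws `HTTP.StatusError` on non-2xx responses (e.g. an exhausted
## quota). A defensive wrapper might look like this sketch (`try_search_claims`
## is a hypothetical helper, not part of the original script):
# function try_search_claims(; kwargs...)
#     try
#         return search_claims(; kwargs...)
#     catch e
#         e isa HTTP.StatusError && return nothing
#         rethrow()
#     end
# end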

"""
## Convert the search response to a tabular format
qu= "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Flatten the first claimReview of each claim into one row per claim
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        # Pages without a `claims` key (or with malformed entries) yield an
        # empty DataFrame, which `vcat` ignores downstream
        return DataFrame()
    end
end

"""
## Paginate Google Fact Check API results 
use the pageToken to get the next page of results

## NOTES: 
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you have reviewPublisherSiteFilter, then query can be empty. 
"""
function paginate_claims(;
                         query::String = "",
                         languageCode::String = "en-US", # BCP-47 language code
                         reviewPublisherSiteFilter::String = "",
                         maxAgeDays::Int = 7,
                         pageSize::Int = 20,
                         pageToken::String = "",
                         offset::Int = 0, 
                         delay::Float64 = 60/300) # 60s / 300 requests = 0.2s between requests, i.e. 300 requests per minute

    # Initialize the results array
    results = []

    # Get the first page of results
    response = search_claims(query=query,
                             languageCode=languageCode,
                             reviewPublisherSiteFilter=reviewPublisherSiteFilter,
                             maxAgeDays=maxAgeDays,
                             pageSize=pageSize,
                             pageToken=pageToken,
                             offset=offset)
    push!(results, response)

    # Keep requesting pages while the API returns a nextPageToken
    while haskey(response, :nextPageToken)
        sleep(delay) # stay under the rate limit
        pageToken = response.nextPageToken
        response = search_claims(query=query,
                                 languageCode=languageCode,
                                 reviewPublisherSiteFilter=reviewPublisherSiteFilter,
                                 maxAgeDays=maxAgeDays,
                                 pageSize=pageSize,
                                 pageToken=pageToken,
                                 offset=offset)
        push!(results, response)
    end

    return results
end
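
## Usage sketch (untested; assumes a valid API key in ENV["GOOGLECLOUD"]):
# pages = paginate_claims(reviewPublisherSiteFilter="politifact.com", maxAgeDays=30, pageSize=100)
# df = reduce(vcat, searchresponse_to_tabular.(pages))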

"""
# script to daily check for new fact-checks for each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates 
CSV.write("data/google_fact_checks$(today()).csv", allfacts)

allfacts= filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)

CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)

"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        if any(haskey.(paginated_results, "claims"))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## concat the results
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end

## Download the full ClaimReview feed from Data Commons
function get_latest_fact_checks()
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end
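
## `Downloads.download` writes straight to the target path, so the `data/`
## directory must already exist. A sketch:
# mkpath("data")
# get_latest_fact_checks()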

"""
d = Dict(
    :author => Dict(
        :name => "John Doe"
    )
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end
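
## Note: only the first argument is constrained to `Dict`; intermediate values
## may be `JSON3.Object`s, which also support `haskey`/`getindex` with `Symbol`
## keys, which is how `load_fact_check_json` uses it below.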

## Already-parsed or missing values pass through unchanged
function safe_datetime(date::Union{DateTime, Missing})
    return date
end

## Convert a date string to a DateTime without throwing; returns `missing` on failure
function safe_datetime(date::String)
    try
        # Default parser handles plain ISO 8601 (yyyy-mm-ddTHH:MM:SS[.sss])
        return Dates.DateTime(date)
    catch
        try
            # Trailing Z with fractional seconds
            return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                # Trailing Z without fractional seconds
                return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                # All parse attempts failed
                return missing
            end
        end
    end
end
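
## Inputs the fallback chain is meant to cover (a sketch):
# safe_datetime("2024-11-14T12:30:00")       # ISO 8601, handled by the default parser
# safe_datetime("2024-11-14T12:30:00.123Z")  # trailing Z with fractional seconds
# safe_datetime("not a date")                # -> missing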

"""
## Load the entire fact check JSON file
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    feed = JSON3.read(read(file, String)) # JSON3.read parses JSON text, not a file path
    dfout = DataFrame[]
    error_index = Int64[]
    for (i, data) in enumerate(feed[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = safe_get(d, (:author, :name), ""), # safe_get avoids a KeyError when :author is missing
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            # Record the index of any feed element that failed to parse
            push!(error_index, i)
        end
    end
    return (vcat(dfout...), error_index)
end

"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing

df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse date strings; unparseable values become `missing`
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop rows where both date columns are missing
    df[!, :contains_date] = .!(ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate))
    subset!(df, :contains_date)
end
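
## Note: the helper column `:contains_date` stays on `df` after filtering;
## drop it with `select!(df, Not(:contains_date))` if it gets in the way.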

"""
## Get the latest date across the DataFrame's date columns
- used to identify the latest fact-checks in the dataset
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date columns first if they are still raw strings
    if eltype(df.claimDate) <: Union{Missing, AbstractString}
        format_date_cols!(df)
    end
    ## Dates in the future must be miscoded; set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end

"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data 
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate 

Example: 
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    latest_of_previous, _ = get_latest_date(previous_data) # discard the column so `datecol` from current_data is not overwritten
    # Keep rows in current_data dated after the previous dataset's latest date
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end