using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads # Base.download is deprecated in Julia >= 1.6
const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]
"""
## Search Google Fact Check API
## API specs here:
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search
## Example:
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)
responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)
response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(;
        query::String = "",
        languageCode::String = "en-US", # BCP-47 language code
        reviewPublisherSiteFilter::String = "",
        maxAgeDays::Int = 7,
        pageSize::Int = 20,
        pageToken::String = "",
        offset::Int = 0)
    # Base URL of the claims:search endpoint
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    # Build the query parameters; the API key is read from the environment
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10 # 10 is the API default, so only send other values
        params["pageSize"] = string(pageSize)
    end
    # pageToken and offset are mutually exclusive; pageToken takes precedence
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end
    # Make the HTTP GET request and parse the JSON response
    response = HTTP.get(url, query=params)
    return JSON3.read(response.body)
end
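## A minimal smoke test for search_claims. It assumes ENV["GOOGLECLOUD"]
## holds a Google Cloud API key with the Fact Check Tools API enabled
## (the variable name is this project's convention, not a Google default).
## demo_search_claims is an illustrative helper, not part of the pipeline.
function demo_search_claims()
    r = search_claims(query = "climate change", pageSize = 5)
    # A response with no matches carries no :claims key
    return haskey(r, :claims) ? length(r[:claims]) : 0
end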
"""
## Convert the search response to a tabular format
qu= "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Returns an empty DataFrame when the response carries no :claims key
    # or any record is malformed
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            # JSON3 keys are Symbols; guard the nested publisher/site lookup
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end
"""
## Paginate Google Fact Check API results
use the pageToken to get the next page of results
## NOTES:
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you have reviewPublisherSiteFilter, then query can be empty.
"""
function paginate_claims(;
        query::String = "",
        languageCode::String = "en-US", # BCP-47 language code
        reviewPublisherSiteFilter::String = "",
        maxAgeDays::Int = 7,
        pageSize::Int = 20,
        pageToken::String = "",
        offset::Int = 0,
        delay::Float64 = 1/(300/60)) # 0.2s delay caps the rate at 300 requests per minute
    # Collect one JSON3.Object per page
    results = []
    # Get the first page of results
    response = search_claims(query=query,
        languageCode=languageCode,
        reviewPublisherSiteFilter=reviewPublisherSiteFilter,
        maxAgeDays=maxAgeDays,
        pageSize=pageSize,
        pageToken=pageToken,
        offset=offset)
    push!(results, response)
    # Follow nextPageToken until the API stops returning one
    while haskey(response, :nextPageToken)
        sleep(delay)
        pageToken = response[:nextPageToken]
        response = search_claims(query=query,
            languageCode=languageCode,
            reviewPublisherSiteFilter=reviewPublisherSiteFilter,
            maxAgeDays=maxAgeDays,
            pageSize=pageSize,
            pageToken=pageToken,
            offset=offset)
        push!(results, response)
    end
    return results
end
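## A minimal sketch of composing pagination with tabulation: fetch all pages
## for one query, convert each page, and stack them into a single DataFrame.
## demo_paginate (and its query/parameter choices) is illustrative, not part
## of the pipeline.
function demo_paginate()
    pages = paginate_claims(query = "immigration", maxAgeDays = 30, pageSize = 50)
    return vcat([searchresponse_to_tabular(p) for p in pages]...)
end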
"""
# script to daily check for new fact-checks for each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates
CSV.write("data/google_fact_checks$(today()).csv", allfacts)
allfacts= filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)
CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)
"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        # Skip categories where no page returned any claims
        if any(haskey.(paginated_results, :claims))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## Concat the per-page results and tag them with the category
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end
function get_latest_fact_checks()
    mkpath("data") # make sure the target directory exists
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end
"""
d = Dict(
:author => Dict(
:name => "John Doe"
)
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end
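## The missing-key path returns the supplied default instead of throwing.
## A quick check, reusing the docstring's `d` above:
##   safe_get(d, (:author, :email), missing)  # -> missing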
## Pass through values that are already DateTime (or missing)
function safe_datetime(date::Union{DateTime, Missing})
    return date
end
## Convert a date string to a DateTime object without throwing an error:
## try the default ISO format, then ISO 8601 variants with a trailing Z,
## and return missing if every attempt fails
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)
    catch
        try
            return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                return Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all attempts fail
                return missing
            end
        end
    end
end
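## Examples of the formats safe_datetime handles; anything unparseable
## comes back as missing rather than throwing:
##   safe_datetime("2024-11-14T10:00:00")       # DateTime
##   safe_datetime("2024-11-14T10:00:00.000Z")  # DateTime
##   safe_datetime("not a date")                # missing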
"""
## Load the entire fact check JSON file
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    # JSON3.read parses a JSON string, so read the file contents first
    json = JSON3.read(read(file, String))
    dfout = DataFrame[]
    error_index = Int64[] # positions of feed elements that failed to parse
    for (i, data) in enumerate(json[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            push!(error_index, i)
        end
    end
    return (vcat(dfout...), error_index)
end
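## A minimal sketch of loading the feed and checking how many elements
## failed to parse (error_index holds the positions of the skipped records):
##   df, error_index = load_fact_check_json("data/fact_check_latest.json")
##   length(error_index), nrow(df)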
"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing
df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date strings; unparseable values become missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop rows where both date columns are missing
    df[:, :contains_date] = .!(ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate))
    subset!(df, :contains_date)
end
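## After format_date_cols!, the date columns typically have eltype
## Union{Missing, DateTime} and rows missing both dates are gone:
##   format_date_cols!(df)
##   eltype(df.claimDate)  # Union{Missing, DateTime}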
"""
## Gets the latest date in the DataFrame from current date columns
- used to identify the latest fact-checks in the datasets
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date columns first if they are still strings
    if nonmissingtype(eltype(df.claimDate)) <: AbstractString
        format_date_cols!(df)
    end
    ## Dates in the future must be miscoded - set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end
"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate
Example:
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    # Discard the previous dataset's date column so it does not shadow datecol
    latest_of_previous, _ = get_latest_date(previous_data)
    # Return the rows of current_data dated after the previous dataset's latest date
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end
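## A hedged end-to-end sketch of the daily update: read the existing snapshot
## as the baseline, download the latest feed, and extract only the newly added
## fact-checks. demo_daily_update is illustrative, not part of the pipeline;
## the file path follows the docstring examples above.
function demo_daily_update()
    previous_data, _ = load_fact_check_json("data/fact_check_latest.json")
    current_data, _ = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
    return get_new_fact_checks(current_data, previous_data)
end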