# misinfo_detection_app/scripts/claims_from_communitynotes.jl
# Last commit: "add scripts" (143b0d4, verified) by stefanjwojcik
## We're going to use OpenAI.jl to find the root misleading claims based on the given contextual notes
using OpenAI, Dates, DataFrames, CSV
## First, we need to set the API key.
## It is read from the OPENAI_API_KEY environment variable; indexing `ENV` with a
## name that is not set throws a `KeyError`, so the variable must be exported
## before this script is loaded.
api_key = ENV["OPENAI_API_KEY"]
## System prompt sent as the `system` message by `create_notes_claim`.
## It instructs the model to reconstruct, in a single sentence, the misleading
## claim that a set of Community Notes was written in response to, and gives two
## worked Notes -> Claim examples.
systemprompt = """
Generate a plausible misleading claim based on the provided notes on the original misleading statement.
Your task is to create an original misleading claim that aligns with the information given in the corresponding notes. Use the notes to understand how the misleading statement could be presented, but do not copy directly from any information or break from the misleading nature of the statement.
# Steps
1. Review the provided notes that detail information about the misleading claim.
2. Identify the key misleading aspect or the central theme that could be easily misrepresented.
3. Structure a concise, misleading statement or claim that could have reasonably informed the notes.
# Output Format
The output should be a single sentence representing the original misleading claim. Ensure the claim is clear enough to align with what the notes provide but presents the same misleading perspective.
# Examples
**Notes:** "Whales are indeed mammals. Marine mammals are able to 'stay hydrated' because their kidneys have evolved to excrete more salt and reclaim more water than humans and many other mammals can. They also obtain water from their food. This is widely documented, for example in [reputable link]"
**Generated Claim:** "Whales are not actually mammals. If humans (land mammals) can't drink water - try it! - how can supposed sea mammals like whales stay hydrated?"
**Notes:** "[The supplement] does have some clinical trials showing side effects, including fatigue, but these are not significant enough to require a full FDA warning."
**Generated Claim:** "Clinically tested with no significant side effects found."
(The real examples may vary in complexity or phrasing, but the misleading nature must always be consistent with the supplied notes.)
"""
"""
    create_notes_claim(claimprompt, systemprompt=systemprompt; model="gpt-4o")

Generate a misleading claim based on the provided notes by sending them to an
OpenAI chat model.

# Arguments
- `claimprompt`: text of the contextual note(s); sent as the `user` message.
- `systemprompt`: instructions sent as the `system` message. Defaults to the
  global `systemprompt` defined in this script.

# Keywords
- `model`: name of the chat model passed to `OpenAI.create_chat`
  (default `"gpt-4o"`).

# Returns
Whatever `OpenAI.create_chat` returns, unmodified. As the examples below show,
the generated text is read from `response.response.choices[1].message.content`.

# Throws
- `KeyError` if the `OPENAI_API_KEY` environment variable is not set.

# Examples
```julia
claimprompt = "Forbes has a good rundown of the investigation and the Washington Post has a fuller picture of where the investigation is headed. Gaetz seems to be deliberately misleading his readers about the timeline of any investigation with this tweet. https://www.forbes.com/sites/rachelsandler/2021/03/30/gop-rep-matt-gaetz-reportedly-under-investigation-over-relationship-with-17-year-old-girl/?sh=7da3be1a23f4 https://www.washingtonpost.com/politics/2022/01/27/sex-trafficking-allegations-matt-gaetz/"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)

claimprompt = "The Jan 6th riots were encouraged by the sitting US President saying to his followers to 'stop the steal' not just protest it. As well as laying groundwork well before. https://www.nytimes.com/2021/01/10/us/trump-speech-riot.html https://www.wsj.com/articles/trump-and-his-allies-set-the-stage-for-riot-well-before-january-6-11610156283 Four people in the crowd on Jan 6th died as well as five officers shortly after. https://www.nytimes.com/2022/01/05/us/politics/jan-6-capitol-deaths.html"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)
# "The sitting US President merely suggested peaceful protests during the events leading up to January 6th, with no evidence of any incitement or preparation for violence."
```
"""
function create_notes_claim(claimprompt, systemprompt=systemprompt; model="gpt-4o")
    # System instructions first, then the notes as the user turn.
    messages = [
        Dict("role" => "system", "content" => systemprompt),
        Dict("role" => "user", "content" => claimprompt),
    ]
    # The key is looked up at call time, so it can be set after the script is loaded.
    return OpenAI.create_chat(ENV["OPENAI_API_KEY"], model, messages)
end
"""
    get_community_notes(; path_or_db="db", path="data/community_notes/", kwargs...)

Pull in the community notes data and the note status table, from either the
database or local TSV files.

# Keywords
- `path_or_db`: `"db"` (default) loads both tables through `bq`; any other value
  reads `notes.tsv` and `noteStatusHistory.tsv` from `path`.
- `path`: directory holding the TSV files; only used when `path_or_db != "db"`.
- `kwargs...`: any further keywords are accepted and ignored, so existing callers
  that pass extra options keep working.

# Returns
The note status table. When reading from files it is reduced to one row per
`noteId`, keeping the last `status` value seen for that note.

NOTE(review): the notes table is loaded but not yet used; the merge with note
status and the filters mentioned in the original header comment are not
implemented in this function. Assumes the status file has `noteId` and `status`
columns, as the original code did — TODO confirm against the actual export.
"""
function get_community_notes(; path_or_db="db", path="data/community_notes/", kwargs...)
    if path_or_db == "db"
        ## Load the community notes data
        community_notes = bq("SELECT * FROM ostreacultura.community_notes.notes")
        note_status = bq("SELECT * FROM ostreacultura.community_notes.note_status")
    else
        ## Load the community notes data
        community_notes = CSV.read(joinpath(path, "notes.tsv"), DataFrame; delim='\t')
        status_history = CSV.read(
            joinpath(path, "noteStatusHistory.tsv"), DataFrame; delim='\t'
        )
        ## Get latest status: one row per noteId, last status in file order
        note_status = combine(groupby(status_history, :noteId), :status => last => :status)
    end
    ## TODO: get the latest note in the data (not implemented yet)
    return note_status
end