stefanjwojcik committed · Commit 143b0d4 · verified · Parent(s): 9ff0a35

add scripts

scripts/CARDS_climate_training_classification.jl ADDED
@@ -0,0 +1,92 @@
## Embed all of the CARDS Data
import OstreaCultura as OC
using DataFrames, CSV

df = OC.DataLoader.pd.read_csv("data/climate_training.csv")
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Now, query the CARDS data using the Climate Misinformation Claims
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 10_000 # top_k for the initial query; set equal to the total number of claims
@time OC.query_claims(claim, counterclaim, indexname, namespace)

# Query every claim in a loop, then assign the claim to its top-k matches
classified = DataFrame()
@time for i in 1:size(claims)[1]
    result = OC.query_claims(string(claims.Claims[i]), string(claims.Counterclaims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a CSV file
CSV.write("data/cards_top10000_results.csv", classified)
classified = CSV.read("data/cards_top10000_results.csv", DataFrame)

## Assign labels at the 0.80, 0.85, and 0.90 score thresholds
# every returned row already clears the 0.80 query threshold, so predlabel80 is all ones
classified.predlabel80 .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.claim_score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.claim_score .>= .90] .= 1
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|claim_score|assigned_claim")

# Group by id and keep only the row with the max score for each id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do eh
    eh[argmax(eh.claim_score), :]
end

# Climate full - get all ids in the namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
hi = [x for x in index.list(namespace=namespace)]
## Concatenate all the ids
ids = vcat(hi...)
# Get all the data
#cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/cards_full.csv", cardsfull)
cardsfull = CSV.read("data/cards_full.csv", DataFrame)
## Left join the full data with the predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Coalesce all the predlabels (rows with no match become 0)
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)

# Get precision and recall
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]

## predlabel90 (MLJ measures take predictions first, then ground truth)
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## predlabel85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## predlabel80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)
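# The heading above promises precision and recall, but only accuracy, TPR, and
# FNR are computed. A minimal sketch of the missing metrics, assuming MLJ's
# measures accept these 0/1 vectors as above (ppv is MLJ's positive predictive
# value, i.e. precision; recall is the same as true_positive_rate):
precision90 = ppv(cardsfull.predlabel90, ytrue)
recall90 = true_positive_rate(cardsfull.predlabel90, ytrue)
f1_90 = 2 * precision90 * recall90 / (precision90 + recall90)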
scripts/CARDStestclassification.jl ADDED
@@ -0,0 +1,104 @@
## Embed the CARDS test split
import OstreaCultura as OC
using DataFrames, CSV

df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan"
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Now, query the test data using the Climate Misinformation Claims
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 400 # top_k for the initial query; set equal to the total number of claims
@time OC.query(claim, indexname, namespace, top_k=top_k)

# Query every claim in a loop, then assign the claim to its top-k matches
classified = DataFrame()
@time for i in 1:size(claims)[1]
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a CSV file
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)

## Assign labels at the 0.80, 0.85, and 0.90 score thresholds
classified.predlabel80 .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= .90] .= 1
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|score|assigned_claim")

# Group by id and keep only the row with the max score for each id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do eh
    eh[argmax(eh.score), :]
end

# Climate full - get all ids in the namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
hi = [x for x in index.list(namespace=namespace)]
## Concatenate all the ids
ids = vcat(hi...)
# Get all the data
cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/climate_data/data/test_w_ids.csv", cardsfull)
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)
## Left join the full data with the predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Coalesce all the predlabels (rows with no match become 0)
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)

# Get precision and recall
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]

## predlabel90 (MLJ measures take predictions first, then ground truth)
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## predlabel85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## predlabel80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)

CSV.write("data/cards_test_query_top400_results.csv", cardsfull)

using TidierPlots

## Score distribution by label
ggplot(filter(:score => x -> !ismissing(x), cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() + labs(x="Misinfo Label", y="Score") #+ geom_hline()
scripts/CardsExample.jl ADDED
@@ -0,0 +1,31 @@
import OstreaCultura as OC
using DataFrames, XLSX, CSV

df = DataFrame(XLSX.readtable("data/Misinformation Library with counterclaims.xlsx", "Climate"))
CSV.write("data/Climate Misinformation Library with counterclaims.csv", df)
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
indexname = "ostreacultura-v1"
namespace = "cards-data"
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = .8
top_k = 100 # top_k for the initial query
#OC.query_claims(claims.Claims[1], claims.Counterclaims[1], indexname, namespace)

# Write a loop to query all claims, then assign the claim to the top k values
classified = DataFrame()
for i in 1:size(claims)[1]
    result = OC.query_claims(string(claims.Claims[i]), string(claims.Counterclaims[i]), indexname, namespace; top_k=100, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a csv file (CSV is already loaded above)
CSV.write("data/cards_top100_results.csv", classified)
##
scripts/claims_from_communitynotes.jl ADDED
@@ -0,0 +1,80 @@
## We're going to use OpenAI.jl to find the root misleading claims based on the given contextual notes
using OpenAI, Dates, DataFrames, CSV
using Query   # provides the @groupby/@map pipeline used below

## First, we need to set the API key
api_key = ENV["OPENAI_API_KEY"]

systemprompt = """
Generate a plausible misleading claim based on the provided notes on the original misleading statement.

Your task is to create an original misleading claim that aligns with the information given in the corresponding notes. Use the notes to understand how the misleading statement could be presented, but do not copy directly from any information or break from the misleading nature of the statement.

# Steps

1. Review the provided notes that detail information about the misleading claim.
2. Identify the key misleading aspect or the central theme that could be easily misrepresented.
3. Structure a concise, misleading statement or claim that could have reasonably informed the notes.

# Output Format

The output should be a single sentence representing the original misleading claim. Ensure the claim is clear enough to align with what the notes provide but presents the same misleading perspective.

# Examples

**Notes:** "Whales are indeed mammals. Marine mammals are able to 'stay hydrated' because their kidneys have evolved to excrete more salt and reclaim more water than humans and many other mammals can. They also obtain water from their food. This is widely documented, for example in [reputable link]"
**Generated Claim:** "Whales are not actually mammals. If humans (land mammals) can't drink water - try it! - how can supposed sea mammals like whales stay hydrated?"

**Notes:** "[The supplement] does have some clinical trials showing side effects, including fatigue, but these are not significant enough to require a full FDA warning."
**Generated Claim:** "Clinically tested with no significant side effects found."

(The real examples may vary in complexity or phrasing, but the misleading nature must always be consistent with the supplied notes.)
"""

"""
## create_notes_claim: Function to generate a misleading claim based on the provided notes

# Example
claimprompt = "Forbes has a good rundown of the investigation and the Washington Post has a fuller picture of where the investigation is headed. Gaetz seems to be deliberately misleading his readers about the timeline of any investigation with this tweet. https://www.forbes.com/sites/rachelsandler/2021/03/30/gop-rep-matt-gaetz-reportedly-under-investigation-over-relationship-with-17-year-old-girl/?sh=7da3be1a23f4 https://www.washingtonpost.com/politics/2022/01/27/sex-trafficking-allegations-matt-gaetz/"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)

# Example (inner quotes must be escaped inside the string literal)
claimprompt = "The Jan 6th riots were encouraged by the sitting US President saying to his followers to \"stop the steal\" not just protest it. As well as laying groundwork well before. https://www.nytimes.com/2021/01/10/us/trump-speech-riot.html https://www.wsj.com/articles/trump-and-his-allies-set-the-stage-for-riot-well-before-january-6-11610156283 Four people in the crowd on Jan 6th died as well as five officers shortly after. https://www.nytimes.com/2022/01/05/us/politics/jan-6-capitol-deaths.html"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)
# "The sitting US President merely suggested peaceful protests during the events leading up to January 6th, with no evidence of any incitement or preparation for violence."
"""
function create_notes_claim(claimprompt, systemprompt=systemprompt)
    response = OpenAI.create_chat(
        ENV["OPENAI_API_KEY"],
        "gpt-4o",
        [Dict("role" => "system", "content" => systemprompt),
         Dict("role" => "user", "content" => claimprompt)]
    )
    return response
end

"""
## Function to pull in community notes data - merge in note status and generate filters
"""
function get_community_notes(; kwargs...)
    path_or_db = get(kwargs, :path_or_db, "db")   # keyword-argument keys are Symbols, not Strings
    path = get(kwargs, :path, "data/community_notes/")
    if path_or_db == "db"
        ## Load the community notes data (bq is the project's BigQuery helper)
        community_notes = bq("SELECT * FROM ostreacultura.community_notes.notes")
        note_status = bq("SELECT * FROM ostreacultura.community_notes.note_status")
    else
        ## Load the community notes data from the TSV exports
        community_notes = CSV.read(joinpath(path, "notes.tsv"), DataFrame; delim='\t')
        note_status = CSV.read(joinpath(path, "noteStatusHistory.tsv"), DataFrame; delim='\t')
        ## Get latest status
        note_status = note_status |>
            @groupby(_.noteId) |>
            @map({noteId=key(_), status=last(_.status)}) |> DataFrame
    end
    ## TODO: Get the latest note in the data
    return community_notes, note_status
end
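# A minimal usage sketch, assuming the public Community Notes TSV exports
# (notes.tsv, noteStatusHistory.tsv) have been downloaded to data/community_notes/:
notes, status = get_community_notes(path_or_db="file", path="data/community_notes/")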
scripts/dashapp.jl ADDED
@@ -0,0 +1,57 @@
using Dash
using DataFrames
using PlotlyJS

# Sample data for demonstration
df = DataFrame(
    text = ["Example text $i" for i in 1:10],
    classification = repeat(["Misinformation", "Legitimate"], 5),
    score = rand(0:100, 10)
)

app = dash()

app.layout = html_div() do
    [
        dcc_input(id="search-box", type="text", placeholder="Enter text to search for", style=Dict("width" => "100%")),
        html_button("Search", id="search-button", n_clicks=0),
        dash_datatable(
            id="results-table",
            columns=[Dict("name" => i, "id" => i) for i in names(df)],
            data=Dict.(pairs.(eachrow(df))),
            row_selectable="multi",
            selected_rows=[]
        ),
        dcc_graph(id="score-distribution")
    ]
end

callback!(
    app,
    Output("results-table", "data"),
    Output("score-distribution", "figure"),
    Input("search-button", "n_clicks"),
    State("search-box", "value")
) do n_clicks, search_value
    # search_value is `nothing` until the user has typed something, so guard before isempty
    if n_clicks > 0 && !isnothing(search_value) && !isempty(search_value)
        filtered_data = filter(row -> occursin(search_value, row.text), df)
        data_dict = Dict.(pairs.(eachrow(filtered_data)))

        scores = filtered_data[!, :score]
        fig = plot(
            bar(x=1:length(scores), y=scores, marker_color="blue"),
            Layout(title="Score Distribution", xaxis_title="Index", yaxis_title="Score")
        )

        return data_dict, fig
    else
        empty_data = Dict.(pairs.(eachrow(DataFrame())))
        empty_fig = plot(
            bar(x=[], y=[], marker_color="blue"),
            Layout(title="Score Distribution", xaxis_title="Index", yaxis_title="Score")
        )
        return empty_data, empty_fig
    end
end

run_server(app, "0.0.0.0", debug=true)
scripts/database_scratch.jl ADDED
@@ -0,0 +1,97 @@
## Uploading the data to the database
import OstreaCultura as OC
using CSV, DataFrames, XLSX   # used immediately below

### Creating a long database of claims
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
## OC Library modification:
## 1. Drop Random ID
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target to Topic
rename!(oclib, :Target => :Topic)
## 3. Rename Misinformation Narrative to Narrative
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add column Claims, populate with Narrative
oclib[!, :Claims] = oclib[!, :Narrative];
## Model -> Topic
oclib[!, :Model] .= oclib[!, :Topic];
## Drop Type
oclib = select(oclib, Not(r"Type"))

## Cards modification:
## 1. Drop Sub-narrative
cards = select(cards, Not(r"Sub-narrative"))

## Vcat the two dataframes with cols = :union
df = vcat(cards, oclib; cols=:union)
## Save as CSV
CSV.write("data/Combined Misinformation Library.csv", df)


### CREATING TEST SET ON INFERENCE ENDPOINT
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(pc, model, df, 96, "text")

## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")

## CREATING Test Set for Indicator Test
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings (test_embeds is a pandas object)
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
    "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)


## Creating Initial Library to query against
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column Misinformation Narrative to text (the py"..." string macro requires `using PyCall`)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop Random ID
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)

## Access the working database
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
#df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", "LGBTQ"))
out = DataFrame()
for sheet in allsheets
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # Select model, narrative, instances
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end
# If Instances is missing, fill with Narrative
out[ismissing.(out.Instances), :Instances] .= out[ismissing.(out.Instances), :Narrative]
# Convert all columns to string
[out[!, col] = string.(out[!, col]) for col in names(out)]
# Drop duplicate instances
out = unique(out)
model = "multilingual-e5-large"
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# Rename column Misinformation Narrative to text
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)

## How long does it take to query and classify 1000 claims?
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
@time for i in 1:1000
    claim = claims.text[i]
    push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end

OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)
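# BenchmarkTools is loaded above but only @time is used; a per-claim timing
# sketch with @belapsed (globals interpolated with $, as BenchmarkTools expects):
t = @belapsed OC.classify_claim($(claims.text[1]), "", $indexname, $namespace; top_k=5, include_values=false)
println("seconds per claim: ", t)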
scripts/expansive_claims_with_LLM.jl ADDED
@@ -0,0 +1,168 @@
## We're going to use OpenAI.jl to expand upon very simple claims we already have
using OpenAI, Dates, DataFrames, CSV, ProgressMeter, JSON3

## First, we need to set the API key
api_key = ENV["OPENAI_API_KEY"]

systemprompt = """
Create a conversation between a misinformed user and a fact-checker. Given a misleading claim, expand on that claim to make it sound credible, then provide the fact-checker's response to correct it. Structure the conversation as alternating exchanges, with each misleading claim followed by a fact-checked response.

# Steps

1. Elaborate on the misleading claim, providing reasoning that a misinformed user might use to justify their belief.
2. Construct a response from the fact-checker that addresses each erroneous point, correcting the misinformation using clear and reliable information.
3. Alternate between "User" and "Fact-checker" dialogue, ensuring there are **at least 2 exchanges** per conversation.
4. Present results such that each interaction is divided into separate payloads for an API response.

# Output Format

Result should be formatted as JSON without code blocks:
{
  "user_statements": [
    {
      "message": "[First misinformed user statement]"
    },
    {
      "message": "[Second misinformed user statement if needed]"
    }
  ],
  "fact_checker_responses": [
    {
      "message": "[Fact-checker's response to the first user statement]"
    },
    {
      "message": "[Fact-checker's response to the second user statement if needed]"
    }
  ]
}

# Examples

Input:

The earth is flat

Output:

{
  "user_statements": [
    {
      "message": "I've heard that the Earth is flat because if it were round, we would all fall off. Plus, they say there's no real proof of a round Earth, just some photoshopped images by space agencies. It just makes sense when you think about it."
    }
  ],
  "fact_checker_responses": [
    {
      "message": "Actually, the Earth isn't flat. Gravity keeps everything attached to the Earth's surface regardless of where we are on the globe, which explains why we don't fall off. Additionally, countless photos and scientific missions over decades have demonstrated that the Earth is round. The images of Earth from space are verified by experts worldwide and they come from many different agencies and companies, not just government entities. Private organizations, like SpaceX, have also provided evidence that the Earth is round."
    }
  ]
}

Input:

Vaccines are dangerous

Output:

{
  "user_statements": [
    {
      "message": "I read somewhere that vaccines are dangerous because they contain harmful chemicals like mercury, and they can cause severe diseases. Isn't that a huge risk to take?"
    }
  ],
  "fact_checker_responses": [
    {
      "message": "Vaccines do contain ingredients to help enhance their effectiveness, but they are used in very small, safe amounts. For instance, mercury is found in the form of Thimerosal, which serves as a preservative to prevent contamination and has been repeatedly found to be safe in those minimal amounts. Moreover, most modern vaccines no longer contain any mercury at all. Decades of research have shown that vaccines are far safer than the dangerous diseases they prevent, protecting millions of lives worldwide."
    }
  ]
}

# Notes

- Ensure each claim is expanded to appear credible, using reasoning or information one might encounter from unreliable sources.
- Fact-checking responses should be direct and supported with verified facts.
- Keep each user statement clearly differentiated from the fact-checker's response to make it easy to parse through the API."""


"""
## create_expansive_claim: Function to generate a misleading claim based on a very simple claim

# Example
claimprompt = "vaccines are dangerous"
response = create_expansive_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)
"""
function create_expansive_claim(claimprompt, systemprompt=systemprompt)
    response = OpenAI.create_chat(
        ENV["OPENAI_API_KEY"],
        "gpt-4o",
        [Dict("role" => "system", "content" => systemprompt),
         Dict("role" => "user", "content" => claimprompt)]
    )
    return response
end

"""
## Function to parse the result of the expansive claim generation
"""
function get_misinfo_claim(response; kwargs...)
    # Strip literal newlines so JSON3 can parse the message content
    json_string = replace(response.response.choices[1].message.content, "\n" => "")
    json_content_response = JSON3.read(json_string)
    user_statements = String[]
    for statement in json_content_response["user_statements"]
        push!(user_statements, statement["message"])
    end

    return user_statements
end

"""
## Function to generate expansive claims based on a library of claims

# Example
expansive_claims_library = expansive_combined_library()
query_categories = ["climate change", "jewish people", "black people",
    "immigration", "LGBTQ", "sexual and reproductive health"]
replace_dict = Dict("Climate Change" => "climate change",
    "Anti-semitic" => "jewish people",
    "Black" => "black people",
    "Immigration" => "immigration",
    "LGBTQ" => "LGBTQ",
    "Reproductive health" => "sexual and reproductive health")
## Use replace_dict to generate a category wherever .Model equals the dict key
expansive_claims_library[!, :category] = [replace_dict[x] for x in expansive_claims_library.Model]
expansive_claims_library[!, :text] = expansive_claims_library.ExpandedClaim
CSV.write("data/expansive_claims_library.csv", expansive_claims_library)
"""
function expansive_combined_library(path::String = "data/Combined Misinformation Library.csv")
    ## Load the expansive claims library
    expansive_claims_library = CSV.read(path, DataFrame)
    expansive_claims_library[!, :ExpandedClaim] .= ""
    @showprogress for (i, claim) in enumerate(expansive_claims_library.Claims)
        response = create_expansive_claim(claim)
        user_statements = get_misinfo_claim(response)
        expansive_claims_library[i, :ExpandedClaim] = user_statements[1]
    end
    return expansive_claims_library
end

"""
include("scripts/expansive_claims_with_LLM.jl")
cl = CSV.read("data/expansive_claims_library.csv", DataFrame)
fill_expansive_claims_library!(cl)
CSV.write("data/expansive_claims_library_expanded.csv", cl)
"""
function fill_expansive_claims_library!(cl::DataFrame)
    # Get all those with missing expanded claims
    missing_claims = findall(ismissing, cl.ExpandedClaim)
    @showprogress for i in missing_claims
        claim = cl.Claims[i]
        response = create_expansive_claim(claim)
        user_statements = get_misinfo_claim(response)
        cl[i, :ExpandedClaim] = user_statements[1]
    end
end
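# End-to-end sketch: expand one claim and pull out the generated user
# statements (assumes OPENAI_API_KEY is set and the model honors the JSON
# schema requested in the systemprompt above):
resp = create_expansive_claim("The earth is flat")
statements = get_misinfo_claim(resp)
println(first(statements))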
scripts/google_fact_check_api.jl ADDED
@@ -0,0 +1,302 @@
###
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads   # Base's bare download() is deprecated in favor of Downloads.download

const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]

"""
## Search Google Fact Check API

## API specs here:
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search

## Example:
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)

responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)

response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0)

    # Prepare the base URL
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    # Build query parameters
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10
        params["pageSize"] = string(pageSize)
    end
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end

    # Make the HTTP GET request
    response = HTTP.get(url, query=params)

    # Parse the JSON response
    return JSON3.read(response.body)
end

"""
## Convert the search response to a tabular format
qu = "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Flatten the claims array into one row per claim; if the response has no
    # claims (or an unexpected shape), fall back to an empty DataFrame
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            # nested gets so a missing publisher/site yields "" instead of a KeyError
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end

"""
## Paginate Google Fact Check API results
use the pageToken to get the next page of results

## NOTES:
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you have reviewPublisherSiteFilter, then query can be empty.
"""
function paginate_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0,
    delay::Float64 = 1/(300/60)) # keeps requests per minute <= 300

    # Initialize the results array
    results = []

    # Get the first page of results
    response = search_claims(query=query,
        languageCode=languageCode,
        reviewPublisherSiteFilter=reviewPublisherSiteFilter,
        maxAgeDays=maxAgeDays,
        pageSize=pageSize,
        pageToken=pageToken,
        offset=offset)
    push!(results, response)

    # Get the next page of results
    while haskey(response, "nextPageToken")
        sleep(delay)
        pageToken = response["nextPageToken"]
        response = search_claims(query=query,
            languageCode=languageCode,
            reviewPublisherSiteFilter=reviewPublisherSiteFilter,
            maxAgeDays=maxAgeDays,
            pageSize=pageSize,
            pageToken=pageToken,
            offset=offset)
        push!(results, response)
    end

    return results
end

"""
# Script to check daily for new fact-checks in each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates
CSV.write("data/google_fact_checks$(today()).csv", allfacts)

allfacts = filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)

CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)
"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        if any(haskey.(paginated_results, "claims"))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## Concatenate the per-page results
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end

function get_latest_fact_checks()
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end

"""
d = Dict(
    :author => Dict(
        :name => "John Doe"
    )
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end

function safe_datetime(date::Union{DateTime, Missing})
    return date
end

## Convert a date string to a DateTime object without throwing an error
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)
    catch
        try
            Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all attempts fail
                return missing
            end
        end
    end
end

"""
## Load the entire fact check JSON file
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    # JSON3.read expects JSON text, not a file path, so read the file first
    df = JSON3.read(read(file, String))
    dfout = DataFrame[]
    errors = 0
    error_index = Int64[]
    for (i, data) in enumerate(df[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            push!(error_index, i)
            errors += 1
        end
    end
    return (vcat(dfout...), error_index)
end

"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing

df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date strings; unparseable values become missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop if both date columns are missing
    df[:, :contains_date] = (ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate)) .== false
    subset!(df, :contains_date)
end

"""
## Gets the latest date in the DataFrame from the current date columns
- used to identify the latest fact-checks in the datasets
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    if typeof(df.claimDate[1]) == String
        format_date_cols!(df)
    end
    ## Any dates in the future must be miscoded; set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end

"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate

Example:
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    latest_of_previous, datecol = get_latest_date(previous_data)
    # Keep only the rows newer than the previous dataset's latest date
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end
scripts/misinfo.jl ADDED
@@ -0,0 +1,145 @@
##
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assigning a label only if the distance to the closest claim is smaller (see the sketch after get_distances! below)

## Analysis:
# What is the distribution of distances by assigned narrative and label?

### UTILITIES ####
# Count rows per combination of the given columns (like R's table)
function table(df::DataFrame, cols::Vector{Symbol})
    combine(groupby(df, cols), nrow)
end
#########

"""
## Embeddings to recover narratives
narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false)
    if !regenerate && isfile("data/narrative_embeddings.jld2")
        return load_object("data/narrative_embeddings.jld2")
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
    ## Narrative embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    ## Add vector of embeddings to dataset
    narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/narrative_embeddings.jld2", narratives)
    return narratives
end

"""
# This is the testing data
target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false)
    if !regenerate && isfile("data/test_embeddings.jld2")
        return load_object("data/test_embeddings.jld2")
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
    ## Test-text embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    ## Add vector of embeddings to dataset
    df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/test_embeddings.jld2", df_test)
    return df_test
end

"""
### Each example's embedding becomes a column of the matrix, so pairs can be compared column-wise (fast)
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
## Show the results - text, closest narrative
first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function search(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # get the index of the column with the smallest distance
        narrative_index = argmin(distances, dims=2)
        return narrative_index
    end
    # Search for the closest narrative for each test example
    narrative_assignment = search(narrative_matrix, target_matrix)
    target_embeddings[:, "Closest Narrative"] = [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]]
    return target_embeddings
end

function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function embedding_distances(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # distance to the closest narrative for each row
        return distances[argmin(distances, dims=2)][:, 1]
    end
    # Record the distance to the closest narrative for each test example
    target_embeddings[:, "Dist"] = embedding_distances(narrative_matrix, target_matrix)
    return target_embeddings
end
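# A minimal sketch of TODO item 4 above, assuming counterclaim distances are
# computed the same way as the claim distances in get_distances! (the
# claim_dists/counter_dists vectors here are hypothetical inputs):
function claim_vs_counterclaim!(target_embeddings, claim_dists::Vector, counter_dists::Vector; threshold=0.2)
    # Label as misinfo only when a row is both close enough to a claim and
    # closer to the claim than to its counterclaim
    target_embeddings[!, "MisinfoPred"] = (claim_dists .< threshold) .& (claim_dists .< counter_dists)
    return target_embeddings
end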
## Add vector of embeddings to the test dataset

# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification

## STEPS: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo

# Get the embeddings for the narratives
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label
using TidierPlots

## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if the distance is less than 0.2
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2

## Precision and Recall
using MLJ

y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)            # MLJ measures take (predictions, ground truth)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)

## Top 10 closest narratives
target_embeddings |>
    (data -> filter(:label => x -> x .== 1.0, data)) |>
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |>
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))
scripts/single_climate_example.jl ADDED
@@ -0,0 +1,44 @@
include("src/narrative_construction.jl")

# Create a new claim
claim1 = createClaim("Antarctica is gaining ice and is not actually warming",
    "Antarctica is losing ice year by year due to the effects of climate change",
    "Facebook post",
    ["antarctica", "global warming", "climate change"])

claim2 = createClaim("It's natural cycles/variation in weather, not global warming",
    "There is substantial evidence that the current warming is not due to natural cycles",
    "Facebook post",
    ["natural cycles", "global warming", "climate change"])

climate_narrative = Narrative(
    randid(),
    "Climate Change Denial",
    "Climate Change",
    "Scientists, Elites",
    "The science behind climate change is inconclusive or flawed",
    Claim[])

# Add the claims to the narrative
add_claim!(climate_narrative, claim1)
add_claim!(climate_narrative, claim2)


## Show the narrative
climate_narrative

## Now, let's deploy the narrative to the database

candidate_data = DataFrame()

"""
## Deploy narrative does the following:
1. Retrieves the data from the database
2. Filters the data based on the keywords in the claims
3. Generates embeddings for the claims and the data
4. Computes the similarity between the claims and the data
5. Returns the top 10 most similar data points
"""
deploy_narrative_model!(climate_narrative, threshold=.2)
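# A minimal sketch of the similarity step described in the docstring above,
# assuming each embedding is a Vector{Float64} and using Distances.jl's cosine
# distance (illustrative only; the actual implementation lives in
# src/narrative_construction.jl):
using Distances
function topk_similar(claim_emb::Vector{Float64}, data_embs::Vector{Vector{Float64}}, k::Int=10)
    dists = [cosine_dist(claim_emb, e) for e in data_embs]   # smaller = more similar
    return sortperm(dists)[1:min(k, length(dists))]
end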
scripts/upsert_climate_test.py ADDED
@@ -0,0 +1,20 @@
## Chunk and upload vectors from a DataFrame to Pinecone
import os

import pandas as pd
from pinecone import Pinecone

# chunk_and_embed, create_vectors_from_df, chunk_df_and_upsert, and sqids are
# assumed to come from this project's helper modules

## Working Example 1
df = pd.read_csv('data/climate_test.csv')
pc = Pinecone(os.environ["PINECONE_API_KEY"])  # read the key from the environment instead of hardcoding it
index = pc.Index("test-index")  # create the index handle after the client exists
model = "multilingual-e5-large"
df = chunk_and_embed(pc, model, df)
df['id'] = [sqids.encode([i, i + 1, i + 2]) for i in range(len(df))]
# drop everything except 'Embeddings', 'text', and 'CSV_File'
#df = df[['Embeddings', 'text', 'id']]
vectors = create_vectors_from_df(df)
index.upsert(
    vectors=vectors[0:12],
    namespace="test-namespace"
)
chunk_df_and_upsert(index, df, chunk_size=100)
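# One possible shape for the chunk_df_and_upsert helper used above, shown as an
# illustrative sketch (the real helper is assumed to live in this project's
# shared modules):
def chunk_df_and_upsert_sketch(index, df, chunk_size=100, namespace="test-namespace"):
    """Upsert vectors in chunks to stay under Pinecone's per-request size limits."""
    vectors = create_vectors_from_df(df)
    for start in range(0, len(vectors), chunk_size):
        index.upsert(vectors=vectors[start:start + chunk_size], namespace=namespace)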