stefanjwojcik committed · Commit 143b0d4 · verified · Parent(s): 9ff0a35

add scripts

scripts/CARDS_climate_training_classification.jl ADDED
@@ -0,0 +1,92 @@
## Embed all of the CARDS Data
import OstreaCultura as OC
using DataFrames, CSV

df = OC.DataLoader.pd.read_csv("data/climate_training.csv")
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Now, query the CARDS data using the Climate Misinformation Claims
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 10_000 # top_k for the initial query; set equal to the total number of claims
@time OC.query_claims(claim, counterclaim, indexname, namespace)

# Query every claim in a loop, then assign the claim to its top-k matches
classified = DataFrame()
@time for i in 1:size(claims)[1]
    result = OC.query_claims(string(claims.Claims[i]), string(claims.Counterclaims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a CSV file
CSV.write("data/cards_top10000_results.csv", classified)
classified = CSV.read("data/cards_top10000_results.csv", DataFrame)

## Assign labels at the 0.80, 0.85, and 0.90 score thresholds
# every returned row already clears the 0.80 query threshold, so predlabel80 is all ones
classified.predlabel80 .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.claim_score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.claim_score .>= .90] .= 1
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|claim_score|assigned_claim")

# Group by id and keep only the row with the max score for each id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do eh
    eh[argmax(eh.claim_score), :]
end

# Climate full - get all ids in the namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
hi = [x for x in index.list(namespace=namespace)]
## Concatenate all the ids
ids = vcat(hi...)
# Get all the data
#cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/cards_full.csv", cardsfull)
cardsfull = CSV.read("data/cards_full.csv", DataFrame)
## Left join the full data with the predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Coalesce all the predlabels (rows with no match become 0)
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)

# Get precision and recall
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]

## predlabel90 (MLJ measures take predictions first, then ground truth)
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## predlabel85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## predlabel80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)
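# The heading above promises precision and recall, but only accuracy, TPR, and
# FNR are computed. A minimal sketch of the missing metrics, assuming MLJ's
# measures accept these 0/1 vectors as above (ppv is MLJ's positive predictive
# value, i.e. precision; recall is the same as true_positive_rate):
precision90 = ppv(cardsfull.predlabel90, ytrue)
recall90 = true_positive_rate(cardsfull.predlabel90, ytrue)
f1_90 = 2 * precision90 * recall90 / (precision90 + recall90)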
scripts/CARDStestclassification.jl ADDED
@@ -0,0 +1,104 @@
## Embed the CARDS test split
import OstreaCultura as OC
using DataFrames, CSV

df = CSV.read("data/climate_data/data/test.csv", DataFrame)
# Drop rows whose text is the literal string "nan"
df = filter(row -> row.text != "nan", df)
df = OC.df_to_pd(df)
model = "multilingual-e5-large"
indexname = "ostreacultura-v1"
namespace = "cards-data-test"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, indexname, namespace, chunk_size=96)

# Now, query the test data using the Climate Misinformation Claims
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = 0.8
top_k = 400 # top_k for the initial query; set equal to the total number of claims
@time OC.query(claim, indexname, namespace, top_k=top_k)

# Query every claim in a loop, then assign the claim to its top-k matches
classified = DataFrame()
@time for i in 1:size(claims)[1]
    result = OC.query(string(claims.Claims[i]), indexname, namespace; top_k=top_k, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a CSV file
CSV.write("data/cards_test_query_top400_results.csv", classified)
classified = CSV.read("data/cards_test_query_top400_results.csv", DataFrame)

## Assign labels at the 0.80, 0.85, and 0.90 score thresholds
classified.predlabel80 .= 1
classified.predlabel85 .= 0
classified.predlabel85[classified.score .>= .85] .= 1
classified.predlabel90 .= 0
classified.predlabel90[classified.score .>= .90] .= 1
classified.ytrue .= [occursin("1", x) for x in classified.claim]
sclassified = select(classified, r"id|predlabel|score|assigned_claim")

# Group by id and keep only the row with the max score for each id
sclassified_grouped = groupby(sclassified, :id)
sdf = combine(sclassified_grouped) do eh
    eh[argmax(eh.score), :]
end

# Climate full - get all ids in the namespace
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
hi = [x for x in index.list(namespace=namespace)]
## Concatenate all the ids
ids = vcat(hi...)
# Get all the data
cardsfull = OC.fetch_data(ids, indexname, namespace)
#CSV.write("data/climate_data/data/test_w_ids.csv", cardsfull)
cardsfull = CSV.read("data/climate_data/data/test_w_ids.csv", DataFrame)
## Left join the full data with the predictions
cardsfull = leftjoin(cardsfull, sdf, on=:id)
cardsfull.ytrue .= [!occursin("0", x) for x in cardsfull.claim]
# Coalesce all the predlabels (rows with no match become 0)
cardsfull.predlabel80 .= coalesce.(cardsfull.predlabel80, 0)
cardsfull.predlabel85 .= coalesce.(cardsfull.predlabel85, 0)
cardsfull.predlabel90 .= coalesce.(cardsfull.predlabel90, 0)

# Get precision and recall
using MLJ
ytrue = [!occursin("0", x) for x in cardsfull.claim]

## predlabel90 (MLJ measures take predictions first, then ground truth)
confusion_matrix(cardsfull.predlabel90, ytrue)
accuracy(cardsfull.predlabel90, ytrue)
true_positive_rate(cardsfull.predlabel90, ytrue)
false_negative_rate(cardsfull.predlabel90, ytrue)

## predlabel85
confusion_matrix(cardsfull.predlabel85, ytrue)
accuracy(cardsfull.predlabel85, ytrue)
true_positive_rate(cardsfull.predlabel85, ytrue)
false_negative_rate(cardsfull.predlabel85, ytrue)

## predlabel80
confusion_matrix(cardsfull.predlabel80, ytrue)
accuracy(cardsfull.predlabel80, ytrue)
true_positive_rate(cardsfull.predlabel80, ytrue)
false_negative_rate(cardsfull.predlabel80, ytrue)

CSV.write("data/cards_test_query_top400_results.csv", cardsfull)

using TidierPlots

## Score distribution by label
ggplot(filter(:score => x -> !ismissing(x), cardsfull), @aes(x = ytrue, y = score)) +
    geom_violin() + labs(x="Misinfo Label", y="Score") #+ geom_hline()
scripts/CardsExample.jl ADDED
@@ -0,0 +1,31 @@
import OstreaCultura as OC
using DataFrames, XLSX, CSV

df = DataFrame(XLSX.readtable("data/Misinformation Library with counterclaims.xlsx", "Climate"))
CSV.write("data/Climate Misinformation Library with counterclaims.csv", df)
claims = OC.DataLoader.pd.read_csv("data/Climate Misinformation Library with counterclaims.csv")
indexname = "ostreacultura-v1"
namespace = "cards-data"
claim = claims.Claims[1]
counterclaim = claims.Counterclaims[1]
threshold = .8
top_k = 100 # top_k for the initial query
#OC.query_claims(claims.Claims[1], claims.Counterclaims[1], indexname, namespace)

# Write a loop to query all claims, then assign the claim to the top k values
classified = DataFrame()
for i in 1:size(claims)[1]
    result = OC.query_claims(string(claims.Claims[i]), string(claims.Counterclaims[i]), indexname, namespace; top_k=100, include_values=false)
    if nrow(result) == 0
        println("No results found for claim: ", claims.Claims[i])
        continue
    else
        result.assigned_claim .= claims.Claims[i]
        classified = vcat(classified, result)
    end
end

# Write the classified data to a csv file (CSV is already loaded above)
CSV.write("data/cards_top100_results.csv", classified)
##
scripts/claims_from_communitynotes.jl ADDED
@@ -0,0 +1,80 @@
## We're going to use OpenAI.jl to find the root misleading claims based on the given contextual notes
using OpenAI, Dates, DataFrames, CSV
using Query   # provides the @groupby/@map pipeline used below

## First, we need to set the API key
api_key = ENV["OPENAI_API_KEY"]

systemprompt = """
Generate a plausible misleading claim based on the provided notes on the original misleading statement.

Your task is to create an original misleading claim that aligns with the information given in the corresponding notes. Use the notes to understand how the misleading statement could be presented, but do not copy directly from any information or break from the misleading nature of the statement.

# Steps

1. Review the provided notes that detail information about the misleading claim.
2. Identify the key misleading aspect or the central theme that could be easily misrepresented.
3. Structure a concise, misleading statement or claim that could have reasonably informed the notes.

# Output Format

The output should be a single sentence representing the original misleading claim. Ensure the claim is clear enough to align with what the notes provide but presents the same misleading perspective.

# Examples

**Notes:** "Whales are indeed mammals. Marine mammals are able to 'stay hydrated' because their kidneys have evolved to excrete more salt and reclaim more water than humans and many other mammals can. They also obtain water from their food. This is widely documented, for example in [reputable link]"
**Generated Claim:** "Whales are not actually mammals. If humans (land mammals) can't drink water - try it! - how can supposed sea mammals like whales stay hydrated?"

**Notes:** "[The supplement] does have some clinical trials showing side effects, including fatigue, but these are not significant enough to require a full FDA warning."
**Generated Claim:** "Clinically tested with no significant side effects found."

(The real examples may vary in complexity or phrasing, but the misleading nature must always be consistent with the supplied notes.)
"""

"""
## create_notes_claim: Function to generate a misleading claim based on the provided notes

# Example
claimprompt = "Forbes has a good rundown of the investigation and the Washington Post has a fuller picture of where the investigation is headed. Gaetz seems to be deliberately misleading his readers about the timeline of any investigation with this tweet. https://www.forbes.com/sites/rachelsandler/2021/03/30/gop-rep-matt-gaetz-reportedly-under-investigation-over-relationship-with-17-year-old-girl/?sh=7da3be1a23f4 https://www.washingtonpost.com/politics/2022/01/27/sex-trafficking-allegations-matt-gaetz/"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)

# Example (inner quotes must be escaped inside the string literal)
claimprompt = "The Jan 6th riots were encouraged by the sitting US President saying to his followers to \"stop the steal\" not just protest it. As well as laying groundwork well before. https://www.nytimes.com/2021/01/10/us/trump-speech-riot.html https://www.wsj.com/articles/trump-and-his-allies-set-the-stage-for-riot-well-before-january-6-11610156283 Four people in the crowd on Jan 6th died as well as five officers shortly after. https://www.nytimes.com/2022/01/05/us/politics/jan-6-capitol-deaths.html"
response = create_notes_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)
# "The sitting US President merely suggested peaceful protests during the events leading up to January 6th, with no evidence of any incitement or preparation for violence."
"""
function create_notes_claim(claimprompt, systemprompt=systemprompt)
    response = OpenAI.create_chat(
        ENV["OPENAI_API_KEY"],
        "gpt-4o",
        [Dict("role" => "system", "content" => systemprompt),
         Dict("role" => "user", "content" => claimprompt)]
    )
    return response
end

"""
## Function to pull in community notes data - merge in note status and generate filters
"""
function get_community_notes(; kwargs...)
    path_or_db = get(kwargs, :path_or_db, "db")   # keyword-argument keys are Symbols, not Strings
    path = get(kwargs, :path, "data/community_notes/")
    if path_or_db == "db"
        ## Load the community notes data (bq is the project's BigQuery helper)
        community_notes = bq("SELECT * FROM ostreacultura.community_notes.notes")
        note_status = bq("SELECT * FROM ostreacultura.community_notes.note_status")
    else
        ## Load the community notes data from the TSV exports
        community_notes = CSV.read(joinpath(path, "notes.tsv"), DataFrame; delim='\t')
        note_status = CSV.read(joinpath(path, "noteStatusHistory.tsv"), DataFrame; delim='\t')
        ## Get latest status
        note_status = note_status |>
            @groupby(_.noteId) |>
            @map({noteId=key(_), status=last(_.status)}) |> DataFrame
    end
    ## TODO: Get the latest note in the data
    return community_notes, note_status
end
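# A minimal usage sketch, assuming the public Community Notes TSV exports
# (notes.tsv, noteStatusHistory.tsv) have been downloaded to data/community_notes/:
notes, status = get_community_notes(path_or_db="file", path="data/community_notes/")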
scripts/dashapp.jl ADDED
@@ -0,0 +1,57 @@
using Dash
using DataFrames
using PlotlyJS

# Sample data for demonstration
df = DataFrame(
    text = ["Example text $i" for i in 1:10],
    classification = repeat(["Misinformation", "Legitimate"], 5),
    score = rand(0:100, 10)
)

app = dash()

app.layout = html_div() do
    [
        dcc_input(id="search-box", type="text", placeholder="Enter text to search for", style=Dict("width" => "100%")),
        html_button("Search", id="search-button", n_clicks=0),
        dash_datatable(
            id="results-table",
            columns=[Dict("name" => i, "id" => i) for i in names(df)],
            data=Dict.(pairs.(eachrow(df))),
            row_selectable="multi",
            selected_rows=[]
        ),
        dcc_graph(id="score-distribution")
    ]
end

callback!(
    app,
    Output("results-table", "data"),
    Output("score-distribution", "figure"),
    Input("search-button", "n_clicks"),
    State("search-box", "value")
) do n_clicks, search_value
    # search_value is `nothing` until the user has typed something, so guard before isempty
    if n_clicks > 0 && !isnothing(search_value) && !isempty(search_value)
        filtered_data = filter(row -> occursin(search_value, row.text), df)
        data_dict = Dict.(pairs.(eachrow(filtered_data)))

        scores = filtered_data[!, :score]
        fig = plot(
            bar(x=1:length(scores), y=scores, marker_color="blue"),
            Layout(title="Score Distribution", xaxis_title="Index", yaxis_title="Score")
        )

        return data_dict, fig
    else
        empty_data = Dict.(pairs.(eachrow(DataFrame())))
        empty_fig = plot(
            bar(x=[], y=[], marker_color="blue"),
            Layout(title="Score Distribution", xaxis_title="Index", yaxis_title="Score")
        )
        return empty_data, empty_fig
    end
end

run_server(app, "0.0.0.0", debug=true)
scripts/database_scratch.jl ADDED
@@ -0,0 +1,97 @@
## Uploading the data to the database
import OstreaCultura as OC
using CSV, DataFrames, XLSX   # used immediately below

### Creating a long database of claims
cards = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)
oclib = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
## OC Library modification:
## 1. Drop Random ID
oclib = select(oclib, Not(r"Random"))
## 2. Rename Target to Topic
rename!(oclib, :Target => :Topic)
## 3. Rename Misinformation Narrative to Narrative
rename!(oclib, "Misinformation Narrative" => "Narrative")
## 4. Add column Claims, populate with Narrative
oclib[!, :Claims] = oclib[!, :Narrative];
## Model -> Topic
oclib[!, :Model] .= oclib[!, :Topic];
## Drop Type
oclib = select(oclib, Not(r"Type"))

## Cards modification:
## 1. Drop Sub-narrative
cards = select(cards, Not(r"Sub-narrative"))

## Vcat the two dataframes with cols = :union
df = vcat(cards, oclib; cols=:union)
## Save as CSV
CSV.write("data/Combined Misinformation Library.csv", df)


### CREATING TEST SET ON INFERENCE ENDPOINT
pc = OC.create_inf_pinecone_context()
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(pc, model, df, 96, "text")

## Uploading the data to the database
OC.upsert_data(test_embeds, "test-index", "test-namespace")

## CREATING Test Set for Indicator Test
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(model, df, 96, "text")
# Drop all columns except text, id, label, and embeddings (test_embeds is a pandas object)
test_embeds.drop(columns=["channelID", "MessageID", "AccountID", "topics", "weak topics",
    "contexts", "indicators", "CSV_File"], inplace=true)
OC.upsert_data(test_embeds, "test-index", "indicator-test-namespace-2", chunk_size=96)


## Creating Initial Library to query against
df = OC.DataLoader.pd.read_csv("data/Modified Misinformation Library.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "Misinformation Narrative")
# Rename column Misinformation Narrative to text (the py"..." string macro requires `using PyCall`)
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
# Drop Random ID
out.drop(columns=["Random ID"], inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "modified-misinfo-library", chunk_size=96)

## Access the working database
xf = XLSX.readxlsx("data/Misinformation Library.xlsx")
allsheets = ["LGBTQ", "Anti-Semitic", "Reproductive Health", "Climate"]
#df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", "LGBTQ"))
out = DataFrame()
for sheet in allsheets
    df = DataFrame(XLSX.readtable("data/Misinformation Library.xlsx", sheet))
    # Select model, narrative, instances
    df = select(df, [:Model, :Narrative, :Instances])
    out = vcat(out, df)
end
# If Instances is missing, fill with Narrative
out[ismissing.(out.Instances), :Instances] .= out[ismissing.(out.Instances), :Narrative]
# Convert all columns to string
[out[!, col] = string.(out[!, col]) for col in names(out)]
# Drop duplicate instances
out = unique(out)
model = "multilingual-e5-large"
out = OC.multi_embeddings(dropmissing(out); textcol="Instances")
# Rename column Misinformation Narrative to text
out.rename(py"{'Misinformation Narrative' : 'text'}", axis=1, inplace=true)
OC.upsert_data(out, "ostreacultura-v1", "expanded-misinfo-library", chunk_size=96)

## How long does it take to query and classify 1000 claims?
using BenchmarkTools
claims = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
indexname = "ostreacultura-v1"
namespace = "expanded-misinfo-library"
classified = []
## TODO: Adjust for longer text by splitting
@time for i in 1:1000
    claim = claims.text[i]
    push!(classified, OC.classify_claim(claim, "", indexname, namespace; top_k=5, include_values=false))
end

OC.classify_claim(claims.text[1], "", indexname, namespace; top_k=5, include_values=false)
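# BenchmarkTools is loaded above but only @time is used; a per-claim timing
# sketch with @belapsed (globals interpolated with $, as BenchmarkTools expects):
t = @belapsed OC.classify_claim($(claims.text[1]), "", $indexname, $namespace; top_k=5, include_values=false)
println("seconds per claim: ", t)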
scripts/expansive_claims_with_LLM.jl ADDED
@@ -0,0 +1,168 @@
## We're going to use OpenAI.jl to expand upon very simple claims we already have
using OpenAI, Dates, DataFrames, CSV, ProgressMeter, JSON3

## First, we need to set the API key
api_key = ENV["OPENAI_API_KEY"]

systemprompt = """
Create a conversation between a misinformed user and a fact-checker. Given a misleading claim, expand on that claim to make it sound credible, then provide the fact-checker's response to correct it. Structure the conversation as alternating exchanges, with each misleading claim followed by a fact-checked response.

# Steps

1. Elaborate on the misleading claim, providing reasoning that a misinformed user might use to justify their belief.
2. Construct a response from the fact-checker that addresses each erroneous point, correcting the misinformation using clear and reliable information.
3. Alternate between "User" and "Fact-checker" dialogue, ensuring there are **at least 2 exchanges** per conversation.
4. Present results such that each interaction is divided into separate payloads for an API response.

# Output Format

Result should be formatted as JSON without code blocks:
{
  "user_statements": [
    {
      "message": "[First misinformed user statement]"
    },
    {
      "message": "[Second misinformed user statement if needed]"
    }
  ],
  "fact_checker_responses": [
    {
      "message": "[Fact-checker's response to the first user statement]"
    },
    {
      "message": "[Fact-checker's response to the second user statement if needed]"
    }
  ]
}

# Examples

Input:

The earth is flat

Output:

{
  "user_statements": [
    {
      "message": "I've heard that the Earth is flat because if it were round, we would all fall off. Plus, they say there's no real proof of a round Earth, just some photoshopped images by space agencies. It just makes sense when you think about it."
    }
  ],
  "fact_checker_responses": [
    {
      "message": "Actually, the Earth isn't flat. Gravity keeps everything attached to the Earth's surface regardless of where we are on the globe, which explains why we don't fall off. Additionally, countless photos and scientific missions over decades have demonstrated that the Earth is round. The images of Earth from space are verified by experts worldwide and they come from many different agencies and companies, not just government entities. Private organizations, like SpaceX, have also provided evidence that the Earth is round."
    }
  ]
}

Input:

Vaccines are dangerous

Output:

{
  "user_statements": [
    {
      "message": "I read somewhere that vaccines are dangerous because they contain harmful chemicals like mercury, and they can cause severe diseases. Isn't that a huge risk to take?"
    }
  ],
  "fact_checker_responses": [
    {
      "message": "Vaccines do contain ingredients to help enhance their effectiveness, but they are used in very small, safe amounts. For instance, mercury is found in the form of Thimerosal, which serves as a preservative to prevent contamination and has been repeatedly found to be safe in those minimal amounts. Moreover, most modern vaccines no longer contain any mercury at all. Decades of research have shown that vaccines are far safer than the dangerous diseases they prevent, protecting millions of lives worldwide."
    }
  ]
}

# Notes

- Ensure each claim is expanded to appear credible, using reasoning or information one might encounter from unreliable sources.
- Fact-checking responses should be direct and supported with verified facts.
- Keep each user statement clearly differentiated from the fact-checker's response to make it easy to parse through the API."""


"""
## create_expansive_claim: Function to generate a misleading claim based on a very simple claim

# Example
claimprompt = "vaccines are dangerous"
response = create_expansive_claim(claimprompt, systemprompt)
println(response.response.choices[1].message.content)
"""
function create_expansive_claim(claimprompt, systemprompt=systemprompt)
    response = OpenAI.create_chat(
        ENV["OPENAI_API_KEY"],
        "gpt-4o",
        [Dict("role" => "system", "content" => systemprompt),
         Dict("role" => "user", "content" => claimprompt)]
    )
    return response
end

"""
## Function to parse the result of the expansive claim generation
"""
function get_misinfo_claim(response; kwargs...)
    # Strip literal newlines so JSON3 can parse the message content
    json_string = replace(response.response.choices[1].message.content, "\n" => "")
    json_content_response = JSON3.read(json_string)
    user_statements = String[]
    for statement in json_content_response["user_statements"]
        push!(user_statements, statement["message"])
    end

    return user_statements
end

"""
## Function to generate expansive claims based on a library of claims

# Example
expansive_claims_library = expansive_combined_library()
query_categories = ["climate change", "jewish people", "black people",
    "immigration", "LGBTQ", "sexual and reproductive health"]
replace_dict = Dict("Climate Change" => "climate change",
    "Anti-semitic" => "jewish people",
    "Black" => "black people",
    "Immigration" => "immigration",
    "LGBTQ" => "LGBTQ",
    "Reproductive health" => "sexual and reproductive health")
## Use replace_dict to generate a category wherever .Model equals the dict key
expansive_claims_library[!, :category] = [replace_dict[x] for x in expansive_claims_library.Model]
expansive_claims_library[!, :text] = expansive_claims_library.ExpandedClaim
CSV.write("data/expansive_claims_library.csv", expansive_claims_library)
"""
function expansive_combined_library(path::String = "data/Combined Misinformation Library.csv")
    ## Load the expansive claims library
    expansive_claims_library = CSV.read(path, DataFrame)
    expansive_claims_library[!, :ExpandedClaim] .= ""
    @showprogress for (i, claim) in enumerate(expansive_claims_library.Claims)
        response = create_expansive_claim(claim)
        user_statements = get_misinfo_claim(response)
        expansive_claims_library[i, :ExpandedClaim] = user_statements[1]
    end
    return expansive_claims_library
end

"""
include("scripts/expansive_claims_with_LLM.jl")
cl = CSV.read("data/expansive_claims_library.csv", DataFrame)
fill_expansive_claims_library!(cl)
CSV.write("data/expansive_claims_library_expanded.csv", cl)
"""
function fill_expansive_claims_library!(cl::DataFrame)
    # Get all those with missing expanded claims
    missing_claims = findall(ismissing, cl.ExpandedClaim)
    @showprogress for i in missing_claims
        claim = cl.Claims[i]
        response = create_expansive_claim(claim)
        user_statements = get_misinfo_claim(response)
        cl[i, :ExpandedClaim] = user_statements[1]
    end
end
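# End-to-end sketch: expand one claim and pull out the generated user
# statements (assumes OPENAI_API_KEY is set and the model honors the JSON
# schema requested in the systemprompt above):
resp = create_expansive_claim("The earth is flat")
statements = get_misinfo_claim(resp)
println(first(statements))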
scripts/google_fact_check_api.jl ADDED
@@ -0,0 +1,302 @@
###
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads   # Base's bare download() is deprecated in favor of Downloads.download

const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]

"""
## Search Google Fact Check API

## API specs here:
https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search

## Example:
response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0)

responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20)

response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200)
"""
function search_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0)

    # Prepare the base URL
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    # Build query parameters
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10
        params["pageSize"] = string(pageSize)
    end
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0
        params["offset"] = string(offset)
    end

    # Make the HTTP GET request
    response = HTTP.get(url, query=params)

    # Parse the JSON response
    return JSON3.read(response.body)
end

"""
## Convert the search response to a tabular format
qu = "Video shows Kamala (Harris) responding to someone"
response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20)
searchresponse_to_tabular(response)
"""
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Flatten the claims array into one row per claim; if the response has no
    # claims (or an unexpected shape), fall back to an empty DataFrame
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            # nested gets so a missing publisher/site yields "" instead of a KeyError
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end

"""
## Paginate Google Fact Check API results
use the pageToken to get the next page of results

## NOTES:
- 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc.
- If you have reviewPublisherSiteFilter, then query can be empty.
"""
function paginate_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0,
    delay::Float64 = 1/(300/60)) # keeps requests per minute <= 300

    # Initialize the results array
    results = []

    # Get the first page of results
    response = search_claims(query=query,
        languageCode=languageCode,
        reviewPublisherSiteFilter=reviewPublisherSiteFilter,
        maxAgeDays=maxAgeDays,
        pageSize=pageSize,
        pageToken=pageToken,
        offset=offset)
    push!(results, response)

    # Get the next page of results
    while haskey(response, "nextPageToken")
        sleep(delay)
        pageToken = response["nextPageToken"]
        response = search_claims(query=query,
            languageCode=languageCode,
            reviewPublisherSiteFilter=reviewPublisherSiteFilter,
            maxAgeDays=maxAgeDays,
            pageSize=pageSize,
            pageToken=pageToken,
            offset=offset)
        push!(results, response)
    end

    return results
end

"""
# Script to check daily for new fact-checks in each category
allfacts = periodic_fact_check(365*8)
## Save the results to a CSV file
using CSV, Dates
CSV.write("data/google_fact_checks$(today()).csv", allfacts)

allfacts = filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts)

CSV.write("data/google_fact_checks2024-11-14.csv", allfacts)
"""
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("getting Category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        if any(haskey.(paginated_results, "claims"))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## Concatenate the per-page results
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end

function get_latest_fact_checks()
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end

"""
d = Dict(
    :author => Dict(
        :name => "John Doe"
    )
)
safe_get(d, (:author, :name), "No name")
"""
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end

function safe_datetime(date::Union{DateTime, Missing})
    return date
end

## Convert a date string to a DateTime object without throwing an error
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)
    catch
        try
            Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all attempts fail
                return missing
            end
        end
    end
end

"""
## Load the entire fact check JSON file
- the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json
df, errors = load_fact_check_json()
"""
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    # JSON3.read expects JSON text, not a file path, so read the file first
    df = JSON3.read(read(file, String))
    dfout = DataFrame[]
    errors = 0
    error_index = Int64[]
    for (i, data) in enumerate(df[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            push!(error_index, i)
            errors += 1
        end
    end
    return (vcat(dfout...), error_index)
end

"""
## Format the date columns in the DataFrame
- drop rows where both date columns are missing

df, errors = load_fact_check_json("data/fact_check_latest.json")
format_date_cols!(df)
"""
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date strings; unparseable values become missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop if both date columns are missing
    df[:, :contains_date] = (ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate)) .== false
    subset!(df, :contains_date)
end

"""
## Gets the latest date in the DataFrame from the current date columns
- used to identify the latest fact-checks in the datasets
df, errs = load_fact_check_json("data/fact_check_latest.json")
get_latest_date(df)
"""
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    if typeof(df.claimDate[1]) == String
        format_date_cols!(df)
    end
    ## Any dates in the future must be miscoded; set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end

"""
## Identify the fact-checks in the latest dataset that are not in the previous dataset
- use claimReviewDate to identify differences
- get the latest claimReviewDate in current_data
- get the latest claimReviewDate in previous_data
- select the rows in current_data where claimReviewDate > latest_claimReviewDate

Example:
previous_data, errs = load_fact_check_json("data/fact_check_latest.json")
current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true)
CSV.write("data/fact_check_latest.csv", current_data)
new_fact_checks = get_new_fact_checks(current_data, previous_data)
"""
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    latest_of_previous, datecol = get_latest_date(previous_data)
    # Keep only the rows newer than the previous dataset's latest date
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end
scripts/misinfo.jl ADDED
@@ -0,0 +1,145 @@
##
using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots

## TODO: Base Functions
# 1. Create a function to generate embeddings
# 2. Create a function to get the distance to the closest claim, cut based on threshold
# 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold
# 4. Create a function to compare the distance to the closest claim and counterclaim, assigning a label only if the distance to the closest claim is smaller (see the sketch after get_distances! below)

## Analysis:
# What is the distribution of distances by assigned narrative and label?

### UTILITIES ####
# Count rows per combination of the given columns (like R's table)
function table(df::DataFrame, cols::Vector{Symbol})
    combine(groupby(df, cols), nrow)
end
#########

"""
## Embeddings to recover narratives
narrative_embeddings = create_narrative_embeddings()
"""
function create_narrative_embeddings(regenerate=false)
    if !regenerate && isfile("data/narrative_embeddings.jld2")
        return load_object("data/narrative_embeddings.jld2")
    end
    @info "Regenerating narrative embeddings..."
    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)
    ## Narrative embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"])
    ## Add vector of embeddings to dataset
    narratives[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/narrative_embeddings.jld2", narratives)
    return narratives
end

"""
# This is the testing data
target_embeddings = create_test_embeddings()
"""
function create_test_embeddings(regenerate=false)
    if !regenerate && isfile("data/test_embeddings.jld2")
        return load_object("data/test_embeddings.jld2")
    end
    @info "Regenerating test embeddings..."
    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)
    ## Test-text embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])
    ## Add vector of embeddings to dataset
    df_test[!, "Embeddings"] = [x["embedding"] for x in n_embeddings.response["data"]]
    # Save the embeddings
    save_object("data/test_embeddings.jld2", df_test)
    return df_test
end

"""
### Each example's embedding becomes a column of the matrix, so pairs can be compared column-wise (fast)
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
## Show the results - text, closest narrative
first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function search(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # get the index of the column with the smallest distance
        narrative_index = argmin(distances, dims=2)
        return narrative_index
    end
    # Search for the closest narrative for each test example
    narrative_assignment = search(narrative_matrix, target_matrix)
    target_embeddings[:, "Closest Narrative"] = [narrative_embeddings[x[2], "Misinformation Narrative"] for x in narrative_assignment[:, 1]]
    return target_embeddings
end

function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings
    narrative_matrix = hcat(narrative_embeddings[:, "Embeddings"]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    function embedding_distances(narrative_matrix, target_matrix)
        distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
        # distance to the closest narrative for each row
        return distances[argmin(distances, dims=2)][:, 1]
    end
    # Record the distance to the closest narrative for each test example
    target_embeddings[:, "Dist"] = embedding_distances(narrative_matrix, target_matrix)
    return target_embeddings
end
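# A minimal sketch of TODO item 4 above, assuming counterclaim distances are
# computed the same way as the claim distances in get_distances! (the
# claim_dists/counter_dists vectors here are hypothetical inputs):
function claim_vs_counterclaim!(target_embeddings, claim_dists::Vector, counter_dists::Vector; threshold=0.2)
    # Label as misinfo only when a row is both close enough to a claim and
    # closer to the claim than to its counterclaim
    target_embeddings[!, "MisinfoPred"] = (claim_dists .< threshold) .& (claim_dists .< counter_dists)
    return target_embeddings
end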
## Add vector of embeddings to the test dataset

# 3. Generate embeddings of the narratives in multiple languages
# 4. Create a langchain search function to check which narrative is closest to the input narrative
# 5. Figure out how effective the embeddings are in recovering the narrative classification

## STEPS: Models
# 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches
# 2. Train a model on the embeddings to predict the misinfo

# Get the embeddings for the narratives
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label
using TidierPlots

## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()
## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if the distance is less than 0.2
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< 0.2

## Precision and Recall
using MLJ

y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)            # MLJ measures take (predictions, ground truth)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)

## Top 10 closest narratives
target_embeddings |>
    (data -> filter(:label => x -> x .== 1.0, data)) |>
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |>
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))
scripts/single_climate_example.jl ADDED
@@ -0,0 +1,44 @@
include("src/narrative_construction.jl")

# Create a new claim
claim1 = createClaim("Antarctica is gaining ice and is not actually warming",
    "Antarctica is losing ice year by year due to the effects of climate change",
    "Facebook post",
    ["antarctica", "global warming", "climate change"])

claim2 = createClaim("It's natural cycles/variation in weather, not global warming",
    "There is substantial evidence that the current warming is not due to natural cycles",
    "Facebook post",
    ["natural cycles", "global warming", "climate change"])

climate_narrative = Narrative(
    randid(),
    "Climate Change Denial",
    "Climate Change",
    "Scientists, Elites",
    "The science behind climate change is inconclusive or flawed",
    Claim[])

# Add the claims to the narrative
add_claim!(climate_narrative, claim1)
add_claim!(climate_narrative, claim2)


## Show the narrative
climate_narrative

## Now, let's deploy the narrative to the database

candidate_data = DataFrame()

"""
## Deploy narrative does the following:
1. Retrieves the data from the database
2. Filters the data based on the keywords in the claims
3. Generates embeddings for the claims and the data
4. Computes the similarity between the claims and the data
5. Returns the top 10 most similar data points
"""
deploy_narrative_model!(climate_narrative, threshold=.2)
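# A minimal sketch of the similarity step described in the docstring above,
# assuming each embedding is a Vector{Float64} and using Distances.jl's cosine
# distance (illustrative only; the actual implementation lives in
# src/narrative_construction.jl):
using Distances
function topk_similar(claim_emb::Vector{Float64}, data_embs::Vector{Vector{Float64}}, k::Int=10)
    dists = [cosine_dist(claim_emb, e) for e in data_embs]   # smaller = more similar
    return sortperm(dists)[1:min(k, length(dists))]
end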
scripts/upsert_climate_test.py ADDED
@@ -0,0 +1,20 @@
## Chunk and upload vectors from a DataFrame to Pinecone
import os

import pandas as pd
from pinecone import Pinecone

# chunk_and_embed, create_vectors_from_df, chunk_df_and_upsert, and sqids are
# assumed to come from this project's helper modules

## Working Example 1
df = pd.read_csv('data/climate_test.csv')
pc = Pinecone(os.environ["PINECONE_API_KEY"])  # read the key from the environment instead of hardcoding it
index = pc.Index("test-index")  # create the index handle after the client exists
model = "multilingual-e5-large"
df = chunk_and_embed(pc, model, df)
df['id'] = [sqids.encode([i, i + 1, i + 2]) for i in range(len(df))]
# drop everything except 'Embeddings', 'text', and 'CSV_File'
#df = df[['Embeddings', 'text', 'id']]
vectors = create_vectors_from_df(df)
index.upsert(
    vectors=vectors[0:12],
    namespace="test-namespace"
)
chunk_df_and_upsert(index, df, chunk_size=100)
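# One possible shape for the chunk_df_and_upsert helper used above, shown as an
# illustrative sketch (the real helper is assumed to live in this project's
# shared modules):
def chunk_df_and_upsert_sketch(index, df, chunk_size=100, namespace="test-namespace"):
    """Upsert vectors in chunks to stay under Pinecone's per-request size limits."""
    vectors = create_vectors_from_df(df)
    for start in range(0, len(vectors), chunk_size):
        index.upsert(vectors=vectors[start:start + chunk_size], namespace=namespace)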