Upload 24 files
Browse files- .gitattributes +2 -0
- data/Climate Misinformation claims.csv +81 -0
- data/Combined Misinformation Library.csv +197 -0
- data/Indicator_Development.csv +0 -0
- data/Indicator_Test.csv +0 -0
- data/Modified Misinformation Library.csv +97 -0
- data/climate_data/data/README.txt +46 -0
- data/expansive_claims_library_expanded_embed.csv +0 -0
- data/filtered_fact_check_latest_embed.csv +3 -0
- data/random_300k.csv +3 -0
- src/Embeddings.jl +186 -0
- src/Models.jl +182 -0
- src/OstreaCultura.jl +25 -0
- src/PyPineCone.jl +415 -0
- src/bash/update_fact_checks.sh +14 -0
- src/deprecated/Narrative.jl +242 -0
- src/deprecated/NarrativeClassification.jl +107 -0
- src/dev/Utils.jl +73 -0
- src/py_init.jl +14 -0
- src/python/DataLoader.py +344 -0
- src/python/MiniEncoder.py +10 -0
- src/python/__pycache__/DataLoader.cpython-310.pyc +0 -0
- src/python/__pycache__/DataLoader.cpython-312.pyc +0 -0
- src/python/update_fact_check_data.py +83 -0
- src/python/upload_library_hybrid-sparse.py +107 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
|
data/Climate Misinformation claims.csv
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Topic,Narrative,Claim,Instances
|
2 |
+
Climate Change,Global warming is not happening,Ice isn't melting,Antarctica is gaining ice/not warming
|
3 |
+
Climate Change,Global warming is not happening,Ice isn't melting,Greenland is gaining ice/not melting
|
4 |
+
Climate Change,Global warming is not happening,Ice isn't melting,Arctic sea ice isn't vanishing
|
5 |
+
Climate Change,Global warming is not happening,Glaciers aren't vanishing,Glaciers aren't vanishing
|
6 |
+
Climate Change,Global warming is not happening,We're heading into an ice age/global cooling,We're heading into an ice age/global cooling
|
7 |
+
Climate Change,Global warming is not happening,Weather is cold/snowing,Weather is cold/snowing
|
8 |
+
Climate Change,Global warming is not happening,Climate hasn't warmed/changed over the last (few) decade(s),Climate hasn't warmed/changed over the last (few) decade(s)
|
9 |
+
Climate Change,Global warming is not happening,Oceans are cooling/not warming,Oceans are cooling/not warming
|
10 |
+
Climate Change,Global warming is not happening,Sea level rise is exaggerated/not accelerating,Sea level rise is exaggerated/not accelerating
|
11 |
+
Climate Change,Global warming is not happening,Extreme weather isn't increasing/has happened before/isn't linked to climate change,Extreme weather isn't increasing/has happened before/isn't linked to climate change
|
12 |
+
Climate Change,Global warming is not happening,They changed the name from 'global warming' to 'climate change',They changed the name from 'global warming' to 'climate change'
|
13 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,It's the sun/cosmic rays/astronomical
|
14 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,It's geological (includes volcanoes)
|
15 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,It's the ocean/internal variability
|
16 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,Climate has changed naturally/been warm in the past
|
17 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,Human CO2 emissions are tiny compared to natural CO2 emission
|
18 |
+
Climate Change,Climate change is not human caused,It's natural cycles/variation,"It's non-greenhouse gas human climate forcings (aerosols, land use)"
|
19 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,Carbon dioxide is just a trace gas
|
20 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,Greenhouse effect is saturated/logarithmic
|
21 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,Carbon dioxide lags/not correlated with climate change
|
22 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,Water vapor is the most powerful greenhouse gas
|
23 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,There's no tropospheric hot spot
|
24 |
+
Climate Change,Climate change is not human caused,There's no evidence for greenhouse effect/carbon dioxide driving climate change,CO2 was higher in the past
|
25 |
+
Climate Change,Climate change is not human caused,CO2 is not rising/ocean pH is not falling,CO2 is not rising/ocean pH is not falling
|
26 |
+
Climate Change,Climate change is not human caused,Human CO2 emissions are miniscule/not raising atmospheric CO2,Human CO2 emissions are miniscule/not raising atmospheric CO2
|
27 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Climate sensitivity is low/negative feedbacks reduce warming,Climate sensitivity is low/negative feedbacks reduce warming
|
28 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Species/plants/reefs aren't showing climate impacts yet/are benefiting from climate change,Species/plants/reefs aren't showing climate impacts yet/are benefiting from climate change
|
29 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Species/plants/reefs aren't showing climate impacts yet/are benefiting from climate change,Species can adapt to global warming
|
30 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Species/plants/reefs aren't showing climate impacts yet/are benefiting from climate change,Polar bears are not in danger from climate change
|
31 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Species/plants/reefs aren't showing climate impacts yet/are benefiting from climate change,Ocean acidification/coral impacts aren't serious
|
32 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,CO2 is beneficial/not a pollutant,CO2 is beneficial/not a pollutant
|
33 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,CO2 is beneficial/not a pollutant,CO2 is plant food
|
34 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,It's only a few degrees (or less),It's only a few degrees (or less)
|
35 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Climate change does not contribute to human conflict/threaten national security,Climate change does not contribute to human conflict/threaten national security
|
36 |
+
Climate Change,Climate impacts/global warming is beneficial/not bad,Climate change doesn't negatively impact health,Climate change doesn't negatively impact health
|
37 |
+
Climate Change,Climate solutions won't work,Climate solutions won't work,Climate solutions won't work
|
38 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Climate policies (mitigation or adaptation) are harmful
|
39 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Climate policy will increase costs/harm economy/kill jobs
|
40 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Proposed action would weaken national security/national sovereignty/cause conflict
|
41 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Proposed action would actually harm the environment and species
|
42 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Future generations will be richer and better able to adapt
|
43 |
+
Climate Change,Climate solutions won't work,Climate policies (mitigation or adaptation) are harmful,Climate policy limits liberty/freedom/capitalism
|
44 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Climate policies are ineffective/flawed
|
45 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Clean energy/green jobs/businesses won't work
|
46 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Markets/private sector are economically more efficient than government policies
|
47 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Climate policy will make negligible difference to climate change
|
48 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,A single country/region only contributes a small % of global emissions
|
49 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Better to adapt/geoengineer/increase resiliency
|
50 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,Climate action is pointless because of China/India/other countries' emissions
|
51 |
+
Climate Change,Climate solutions won't work,Climate policies are ineffective/flawed,We should invest in technology/reduce poverty/disease first
|
52 |
+
Climate Change,Climate solutions won't work,It's too hard to solve,It's too hard to solve
|
53 |
+
Climate Change,Climate solutions won't work,It's too hard to solve,Climate policy is politically/legally/economically/technically too difficult
|
54 |
+
Climate Change,Climate solutions won't work,It's too hard to solve,Media/public support/acceptance is low/decreasing
|
55 |
+
Climate Change,Climate solutions won't work,Clean energy technology/biofuels won't work,Clean energy technology/biofuels won't work
|
56 |
+
Climate Change,Climate solutions won't work,Clean energy technology/biofuels won't work,Clean energy/biofuels are too expensive/unreliable/counterproductive/harmful
|
57 |
+
Climate Change,Climate solutions won't work,Clean energy technology/biofuels won't work,Carbon Capture & Sequestration (CCS) is unproven/expensive
|
58 |
+
Climate Change,Climate solutions won't work,"People need energy (e.g., from fossil fuels/nuclear)
|
59 |
+
","People need energy (e.g., from fossil fuels/nuclear)
|
60 |
+
"
|
61 |
+
Climate Change,Climate solutions won't work,"People need energy (e.g., from fossil fuels/nuclear)
|
62 |
+
",Fossil fuel reserves are plentiful
|
63 |
+
Climate Change,Climate solutions won't work,"People need energy (e.g., from fossil fuels/nuclear)
|
64 |
+
",Fossil fuels are cheap/good/safe for society/economy/environment
|
65 |
+
Climate Change,Climate solutions won't work,"People need energy (e.g., from fossil fuels/nuclear)
|
66 |
+
",Nuclear power is safe/good for society/economy/environment
|
67 |
+
Climate Change,Climate movement/science is unreliable,Climate movement/science is unreliable,Climate movement/science is unreliable
|
68 |
+
Climate Change,Climate movement/science is unreliable,"Climate-related science is uncertain/unsound/unreliable (data , methods & models)","Climate-related science is uncertain/unsound/unreliable (data , methods & models)"
|
69 |
+
Climate Change,Climate movement/science is unreliable,"Climate-related science is uncertain/unsound/unreliable (data , methods & models)",There's no scientific consensus on climate/the science isn't settled
|
70 |
+
Climate Change,Climate movement/science is unreliable,"Climate-related science is uncertain/unsound/unreliable (data , methods & models)",Proxy data is unreliable (includes hockey stick)
|
71 |
+
Climate Change,Climate movement/science is unreliable,"Climate-related science is uncertain/unsound/unreliable (data , methods & models)",Temperature record is unreliable
|
72 |
+
Climate Change,Climate movement/science is unreliable,"Climate-related science is uncertain/unsound/unreliable (data , methods & models)",Models are wrong/unreliable/uncertain
|
73 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups)
|
74 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Climate movement is religion
|
75 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Media (including bloggers) is alarmist/wrong/political/biased
|
76 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Politicians/government/UN are alarmist/wrong/political/biased
|
77 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Environmentalists are alarmist/wrong/political/biased
|
78 |
+
Climate Change,Climate movement/science is unreliable,Climate movement is alarmist/wrong/political/biased/hypocritical (people or groups),Scientists/academics are alarmist/wrong/political/biased
|
79 |
+
Climate Change,Climate movement/science is unreliable,Climate change (science or policy) is a conspiracy (deception),Climate change (science or policy) is a conspiracy (deception)
|
80 |
+
Climate Change,Climate movement/science is unreliable,Climate change (science or policy) is a conspiracy (deception),Climate policy/renewables is a hoax/scam/conspiracy/secretive
|
81 |
+
Climate Change,Climate movement/science is unreliable,Climate change (science or policy) is a conspiracy (deception),Climate science is a hoax/scam/conspiracy/secretive/money-motivated (includes climategate)
|
data/Combined Misinformation Library.csv
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Topic,Narrative,Claims,Counterclaims,Harm 1,Harm 2
|
2 |
+
Climate Change,Climate Change,Global warming is not happening,Antarctica is gaining ice/not warming,Antarctica is warming,,
|
3 |
+
Climate Change,Climate Change,Global warming is not happening,Greenland is gaining ice/not melting,Greenland is warming,,
|
4 |
+
Climate Change,Climate Change,Global warming is not happening,Arctic sea ice isn't vanishing,Arctic sea ice is vanishing,,
|
5 |
+
Climate Change,Climate Change,Global warming is not happening,Glaciers aren't vanishing,Glaciers are vanishing,,
|
6 |
+
Climate Change,Climate Change,Global warming is not happening,We're heading into an global cooling,We're heading into global warming,,
|
7 |
+
Climate Change,Climate Change,Global warming is not happening,It is cold so global warming isn't happening,It is cold but global warming is still happening,,
|
8 |
+
Climate Change,Climate Change,Global warming is not happening,Climate hasn't changed over the past few decades,Climate has changed ,,
|
9 |
+
Climate Change,Climate Change,Global warming is not happening,Oceans are not warming,Oceans are warming,,
|
10 |
+
Climate Change,Climate Change,Global warming is not happening,Sea level rise is exaggerated,Sea level rise is not exaggerated,,
|
11 |
+
Climate Change,Climate Change,Global warming is not happening,Sea level rise is exaggerated/not accelerating,Sea level rise is accelerating,,
|
12 |
+
Climate Change,Climate Change,Global warming is not happening,Extreme weather isn't increasing/has happened before/isn't linked to climate change,Extreme weather is linked to climate change,,
|
13 |
+
Climate Change,Climate Change,Global warming is not happening,Extreme weather isn't increasing,Extreme weather is increasing,,
|
14 |
+
Climate Change,Climate Change,Global warming is not happening,They changed the name from 'global warming' to 'climate change',They didn't change the name to climate change,,
|
15 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is from cosmic rays,Climate change is not caused by cosmic rays,,
|
16 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is from astronomical forces,Climate change is not caused by astronomical forces,,
|
17 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is from volcanos,Climate change is not from volcanos,,
|
18 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is caused by the oceans,Climate change is not caused by the oceans,,
|
19 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is caused by natural cycles,Climate change is not caused by natural cycles,,
|
20 |
+
Climate Change,Climate Change,Climate change is not human caused,Climate change is normal or natural,Climate change is not normal or natural,,
|
21 |
+
Climate Change,Climate Change,Climate change is not human caused,Human CO2 emissions are tiny compared to natural CO2 emission,Human CO2 emissions are not tiny,,
|
22 |
+
Climate Change,Climate Change,Climate change is not human caused,"It's non-greenhouse gas human climate forcings (aerosols, land use)",,,
|
23 |
+
Climate Change,Climate Change,Climate change is not human caused,Carbon dioxide is just a trace gas,,,
|
24 |
+
Climate Change,Climate Change,Climate change is not human caused,Greenhouse effect is logarithmic,The greenhouse effect is not logarithmic,,
|
25 |
+
Climate Change,Climate Change,Climate change is not human caused,Greenhouse effect is saturated,The greenhouse effect is not saturated,,
|
26 |
+
Climate Change,Climate Change,Climate change is not human caused,Carbon dioxide lags climate change,Carbon dioxide does not lag climate change,,
|
27 |
+
Climate Change,Climate Change,Climate change is not human caused,Carbon dioxide is not correlated with climate change,Carbon dioxide is correlated with climate change,,
|
28 |
+
Climate Change,Climate Change,Climate change is not human caused,Water vapor is the most powerful greenhouse gas,Water vapor is not the most powerful greenhouse gas,,
|
29 |
+
Climate Change,Climate Change,Climate change is not human caused,There is no tropospheric hot spot,There is a tropospheric hot spot,,
|
30 |
+
Climate Change,Climate Change,Climate change is not human caused,CO2 was higher in the past,CO2 is higher today,,
|
31 |
+
Climate Change,Climate Change,Climate change is not human caused,CO2 is not rising,CO2 is not rising,,
|
32 |
+
Climate Change,Climate Change,Climate change is not human caused,Ocean pH is not falling,Ocean pH is falling,,
|
33 |
+
Climate Change,Climate Change,Climate change is not human caused,Human CO2 emissions are not raising atmospheric CO2,Human CO2 emissions are raising atmospheric CO2,,
|
34 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Negative feedbacks reduce warming,Negative feedbacks do not reduce climate change,,
|
35 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Life is not showing signs of climate change,Life is showing signs of climate change,,
|
36 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Life is benefiting from climate change,Life is not benefiting from climate change,,
|
37 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Species can adapt to climate change,species cannot adapt to climate change in time,,
|
38 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Polar bears are not in danger from climate change,Polar bears are in danger from climate change,,
|
39 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Ocean acidification is not serious,Ocean acidification is serious,,
|
40 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Climate impact on coral isn't serious,Climate impact on coral is serious,,
|
41 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,CO2 is not a pollutant,,,
|
42 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,CO2 is beneficial to the environment,,,
|
43 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,CO2 is plant food,,,
|
44 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Climate change is only a few degrees ,Climate change is a big temperature change,,
|
45 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Climate change does not contribute to human conflict/threaten national security,,,
|
46 |
+
Climate Change,Climate Change,Climate impacts/global warming is beneficial/not bad,Climate change doesn't negatively impact health,Climate change does negatively impact health,,
|
47 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate solutions won't work,Climate solutions will work,,
|
48 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policies are harmful,Climate policies are not harmful,,
|
49 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy will reduce jobs,Climate policies will not reduce jobs,,
|
50 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy will harm the economy,Climate policy will not harm the economy,,
|
51 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy would weaken national security/national sovereignty/cause conflict,Climate policies would weaken national security ,,
|
52 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policies would cause international conflict,Climate policies would not cause international conflict,,
|
53 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy would actually harm the environment,Climate policy would not harm the environment,,
|
54 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy limits capitalism,Climate policy does not limit capitalism,,
|
55 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy limits freedom,Climate policy does not limit freedom,,
|
56 |
+
Climate Change,Climate Change,Climate solutions won't work,Green jobs won't work,Green jobs will work,,
|
57 |
+
Climate Change,Climate Change,Climate solutions won't work,Green businesses won't work,Green businesses will work,,
|
58 |
+
Climate Change,Climate Change,Climate solutions won't work,Government policies are less efficient than market solutions,Government policies are not less efficient than market solutions,,
|
59 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy will not make a big difference to climate change,Climate policy will make a difference to climate change,,
|
60 |
+
Climate Change,Climate Change,Climate solutions won't work,Most CO2 emissions come from a single country,Most CO2 emissions do not come from a single country,,
|
61 |
+
Climate Change,Climate Change,Climate solutions won't work,It is better to adapt to climate change than stop it ,It is not better to adapt to climate change than stop it ,,
|
62 |
+
Climate Change,Climate Change,Climate solutions won't work,It is better to geoengineer than stop climate change,It is not better to geoengineer than stop climate change,,
|
63 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate policy is useless because of other countries' emissions ,Climate policy is not useless because of other countries' emissions ,,
|
64 |
+
Climate Change,Climate Change,Climate solutions won't work,We should invest in other public policy areas first,We should not invest in other public policies first ,,
|
65 |
+
Climate Change,Climate Change,Climate solutions won't work,Climate change is too hard to solve,Climate change is not too hard to solve,,
|
66 |
+
Climate Change,Climate Change,Climate solutions won't work,Public support for climate policy is low,Public support for climate policy is not low,,
|
67 |
+
Climate Change,Climate Change,Climate solutions won't work,Clean energy technology won't work,Clean energy technology will work,,
|
68 |
+
Climate Change,Climate Change,Climate solutions won't work,Biofuels won't work,Biofuels won't work,,
|
69 |
+
Climate Change,Climate Change,Climate solutions won't work,Clean energy is too expensive,Clean energy is too expensive,,
|
70 |
+
Climate Change,Climate Change,Climate solutions won't work,Clean energy is too unreliable,Clean energy is not too unreliable,,
|
71 |
+
Climate Change,Climate Change,Climate solutions won't work,Clean energy is harmful,Clean energy is not harmful,,
|
72 |
+
Climate Change,Climate Change,Climate solutions won't work,Carbon Capture and Sequestration won't work,Carbon Capture and Sequestration will work,,
|
73 |
+
Climate Change,Climate Change,Climate solutions won't work,"People need energy from fossil fuels
|
74 |
+
","People do not need energy from fossil fuels
|
75 |
+
",,
|
76 |
+
Climate Change,Climate Change,Climate solutions won't work,Fossil fuel reserves are plentiful,,,
|
77 |
+
Climate Change,Climate Change,Climate solutions won't work,Fossil Fuels are good for society,Fossil Fuels are not good for society,,
|
78 |
+
Climate Change,Climate Change,Climate solutions won't work,Fossil fuels are cheap,Fossil Fuels are not cheap,,
|
79 |
+
Climate Change,Climate Change,Climate solutions won't work,Fossil fuels are safe,Fossil fuels are not safe,,
|
80 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate science is unreliable,Climate science is reliable,Civil Discourse,Violent Extremism
|
81 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate science is uncertain,Climate science is not uncertain,Civil Discourse,Violent Extremism
|
82 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate science is unsound,Climate science is sound,Civil Discourse,Violent Extremism
|
83 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,There is no scientific consensus on climate change,There is scientific consensus on climate change,Civil Discourse,
|
84 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Proxy data on climate change is unreliable,Proxy data on climate change is reliable,Civil Discourse,
|
85 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Temperature record is unreliable,Temperature record is not unreliable,Civil Discourse,
|
86 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate models are wrong,Climate models are not wrong,Civil Discourse,
|
87 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate movement is alarmist,Climate movement is not alarmist,Civil Discourse,Violent Extremism
|
88 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate movement is political,Climate movement is not political,Civil Discourse,Violent Extremism
|
89 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate movement is a religion,Climate movement is not a religion,Civil Discourse,Violent Extremism
|
90 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Media about climate change is alarmist,Media about climate change is not alarmist,Civil Discourse,Violent Extremism
|
91 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Media about climate change is political,Media about climate change is not political,Civil Discourse,Violent Extremism
|
92 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,The UN is wrong on climate change,The UN is right on climate change,Civil Discourse,Violent Extremism
|
93 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,The UN is alarmist on climate change,The UN is not alarmist on climate change,Civil Discourse,Violent Extremism
|
94 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,The government is alarmist about climate change,The government is not alarmist about climate change,Civil Discourse,Violent Extremism
|
95 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Scientists are biased about climate change,Scientists are biased about climate change,Civil Discourse,Violent Extremism
|
96 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Scientists are alarmist about climate change,Scientists are not alarmist about climate change,Civil Discourse,Violent Extremism
|
97 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate change is not a conspiracy ,Climate change is not a conspiracy ,Civil Discourse,Violent Extremism
|
98 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate policies are a scam,Climate policies are not a scam,Civil Discourse,Violent Extremism
|
99 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Clean energy is a conspiracy,Clean energy is not a conspiracy,Civil Discourse,Violent Extremism
|
100 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate technology is a scam,Climate technology is not a scam,Civil Discourse,Violent Extremism
|
101 |
+
Climate Change,Climate Change,Climate movement/science is unreliable,Climate science is a conspiracy,Climate science is not a conspiracy,Civil Discourse,Violent Extremism
|
102 |
+
Anti-semitic,Anti-semitic,Jews are responsible for the death of jesus,Jews are responsible for the death of jesus,,,
|
103 |
+
Anti-semitic,Anti-semitic,Jews are trying to destroy Christianity,Jews are trying to destroy Christianity,,,
|
104 |
+
Anti-semitic,Anti-semitic,Jews conduct ritual murder,Jews conduct ritual murder,,,
|
105 |
+
Anti-semitic,Anti-semitic,Jews use (christian) blood in rituals,Jews use (christian) blood in rituals,,,
|
106 |
+
Anti-semitic,Anti-semitic,Jews are penny pinchers or usurius,Jews are penny pinchers or usurius,,,
|
107 |
+
Anti-semitic,Anti-semitic,Jews are loyal to israel,Jews are loyal to israel,,,
|
108 |
+
Anti-semitic,Anti-semitic,Jews control black politics ,Jews control black politics ,,,
|
109 |
+
Anti-semitic,Anti-semitic,Jews control communism ,Jews control communism ,,,
|
110 |
+
Anti-semitic,Anti-semitic,Jews control democrats,Jews control democrats,,,
|
111 |
+
Anti-semitic,Anti-semitic,Jews control LGBTQ politics,Jews control LGBTQ politics,,,
|
112 |
+
Anti-semitic,Anti-semitic,Jews control liberalism,Jews control liberalism,,,
|
113 |
+
Anti-semitic,Anti-semitic,Jews control the global financial system,Jews control the global financial system,,,
|
114 |
+
Anti-semitic,Anti-semitic,Jews control the UN,Jews control the UN,,,
|
115 |
+
Anti-semitic,Anti-semitic,Jews control the weather,Jews control the weather,,,
|
116 |
+
Anti-semitic,Anti-semitic,Jews control the West,Jews control the West,,,
|
117 |
+
Anti-semitic,Anti-semitic,Jews ran the slave trade,Jews ran the slave trade,,,
|
118 |
+
Anti-semitic,Anti-semitic,Jews run hollywood,Jews run hollywood,,,
|
119 |
+
Anti-semitic,Anti-semitic,Jews run the media,Jews run the media,,,
|
120 |
+
Anti-semitic,Anti-semitic,Antisemitism isn't real,Antisemitism isn't real,,,
|
121 |
+
Anti-semitic,Anti-semitic,Jews provoke antisemitism,Jews provoke antisemitism,,,
|
122 |
+
Anti-semitic,Anti-semitic,Jesus was not jewish,Jesus was not jewish,,,
|
123 |
+
Anti-semitic,Anti-semitic,"Jews are descended from Khazar, not Judea","Jews are descended from Khazar, not Judea",,,
|
124 |
+
Anti-semitic,Anti-semitic,Jews are behind global migration,Jews are behind global migration,,,
|
125 |
+
Anti-semitic,Anti-semitic,Holocaust did not happen,Holocaust did not happen,,,
|
126 |
+
Anti-semitic,Anti-semitic,Jewish life lost during the holocaust is over-estimated,Jewish life lost during the holocaust is over-estimated,,,
|
127 |
+
Anti-semitic,Anti-semitic,Jews are behind multiculturalism,Jews are behind multiculturalism,,,
|
128 |
+
Anti-semitic,Anti-semitic,Jews are making people gay,Jews are making people gay,,,
|
129 |
+
Black,Black,Black lives matter protests were insurrections,Black lives matter protests were insurrections,,,
|
130 |
+
Black,Black,Black lives matter protests were riots,Black lives matter protests were riots,,,
|
131 |
+
Black,Black,Black people are targeting white people in response to George Floyd,Black people are targeting white people in response to George Floyd,,,
|
132 |
+
Black,Black,BLM activists commit non-protest-related crimes,BLM activists commit non-protest-related crimes,,,
|
133 |
+
Black,Black,BLM did the J6 insurrection,BLM did the J6 insurrection,,,
|
134 |
+
Black,Black,BLM seeks to enslave white people,BLM seeks to enslave white people,,,
|
135 |
+
Black,Black,Schools are teaching Black Lives Matter politics,Schools are teaching Black Lives Matter politics,,,
|
136 |
+
Black,Black,African Americans abuse government systems,African Americans abuse government systems,,,
|
137 |
+
Black,Black,African Americans are abnormally violent,African Americans are abnormally violent,,,
|
138 |
+
Black,Black,African Americans are criminals,African Americans are criminals,,,
|
139 |
+
Black,Black,African Americans are dependent on welfare,African Americans are dependent on welfare,,,
|
140 |
+
Black,Black,African Americans are lazy,African Americans are lazy,,,
|
141 |
+
Black,Black,Black people are less intelligent than white people,Black people are less intelligent than white people,,,
|
142 |
+
Black,Black,Democrats push the adoption of critical race theory,Democrats push the adoption of critical race theory,,,
|
143 |
+
Black,Black,Public education promotes critical race theory,Public education promotes critical race theory,,,
|
144 |
+
Black,Black,Public schools teach children critical race theory,Public schools teach children critical race theory,,,
|
145 |
+
Black,Black,Implicit bias doesn't exist,Implicit bias doesn't exist,,,
|
146 |
+
Black,Black,Systemic racism doesn't exist,Systemic racism doesn't exist,,,
|
147 |
+
Black,Black,Most Black people are not descended from slaves,Most Black people are not descended from slaves,,,
|
148 |
+
Black,Black,Black reproduction is meant to eliminate white people,Black reproduction is meant to eliminate white people,,,
|
149 |
+
Black,Black,Companies will not hire whites because of Affirmative Action,Companies will not hire whites because of Affirmative Action,,,
|
150 |
+
Black,Black,Companies will not hire whites because of DEI,Companies will not hire whites because of DEI,,,
|
151 |
+
Immigration,Immigration,Immigrants are bringing diseases to the west,Immigrants are bringing diseases to the west,,,
|
152 |
+
Immigration,Immigration,Immigrants are unvaccinated,Immigrants are unvaccinated,,,
|
153 |
+
Immigration,Immigration,Immigrants are violent,Immigrants are violent,,,
|
154 |
+
Immigration,Immigration,Immigrants commit disproportionate crime,Immigrants commit disproportionate crime,,,
|
155 |
+
Immigration,Immigration,Immigrants poison the blood of the nation,Immigrants poison the blood of the nation,,,
|
156 |
+
Immigration,Immigration,Immigrants are being allowed in to vote in elections,Immigrants are being allowed in to vote in elections,,,
|
157 |
+
Immigration,Immigration,Immigrants stole the 2020 election,Immigrants stole the 2020 election,,,
|
158 |
+
Immigration,Immigration,Immigration is an invasion of western countries,Immigration is an invasion of western countries,,,
|
159 |
+
Immigration,Immigration,Immigration is engineered to replace white people,Immigration is engineered to replace white people,,,
|
160 |
+
Immigration,Immigration,immigrants are given free health care in the united states,immigrants are given free health care in the united states,,,
|
161 |
+
Immigration,Immigration,Immigration is a globalist/multiculturalist conspiracy,Immigration is a globalist/multiculturalist conspiracy,,,
|
162 |
+
Immigration,Immigration,Immigration is a process of deculturalizing the west,Immigration is a process of deculturalizing the west,,,
|
163 |
+
Immigration,Immigration,Immigration is a process of despiritualizing the west,Immigration is a process of despiritualizing the west,,,
|
164 |
+
Immigration,Immigration,immigration is reverse colonization,immigration is reverse colonization,,,
|
165 |
+
Immigration,Immigration,immigration leads to the decline of western civilization,immigration leads to the decline of western civilization,,,
|
166 |
+
Immigration,Immigration,immigration will eliminate the white race through racial mixing,immigration will eliminate the white race through racial mixing,,,
|
167 |
+
LGBTQ,LGBTQ,LGBTQ rights is a form of colonization by the west,LGBTQ rights is a form of colonization by the west,,,
|
168 |
+
LGBTQ,LGBTQ,There are only two genders people are born with,There are only two genders people are born with,,,
|
169 |
+
LGBTQ,LGBTQ,LGBTQ is a disease that can be cured,LGBTQ is a disease that can be cured,,,
|
170 |
+
LGBTQ,LGBTQ,LGBTQ status is a choice,LGBTQ status is a choice,,,
|
171 |
+
LGBTQ,LGBTQ,LGBTQ status is caused by parenting,LGBTQ status is caused by parenting,,,
|
172 |
+
LGBTQ,LGBTQ,LGBTQ is a form of moral degeneracy,LGBTQ is a form of moral degeneracy,,,
|
173 |
+
LGBTQ,LGBTQ,LGBTQ is pushing children to change their gender,LGBTQ is pushing children to change their gender,,,
|
174 |
+
LGBTQ,LGBTQ,LGBTQ people are threats to children,LGBTQ people are threats to children,,,
|
175 |
+
LGBTQ,LGBTQ,LGBTQ people groom children,LGBTQ people groom children,,,
|
176 |
+
LGBTQ,LGBTQ,LGBTQ people threaten the safety of women and children in bathrooms,LGBTQ people threaten the safety of women and children in bathrooms,,,
|
177 |
+
LGBTQ,LGBTQ,LGTBQ people use the LGBTQ identity as a cover for dangerous qualities (e.g. they are secretly a rapist),LGTBQ people use the LGBTQ identity as a cover for dangerous qualities (e.g. they are secretly a rapist),,,
|
178 |
+
LGBTQ,LGBTQ,Gays control the media,Gays control the media,,,
|
179 |
+
LGBTQ,LGBTQ,There is a secret gay agenda/cabal,There is a secret gay agenda/cabal,,,
|
180 |
+
LGBTQ,LGBTQ,Gender affirming care is unsafe,Gender affirming care is unsafe,,,
|
181 |
+
LGBTQ,LGBTQ,Gender-affirming health care is a form of child abuse or mutiliation,Gender-affirming health care is a form of child abuse or mutiliation,,,
|
182 |
+
LGBTQ,LGBTQ,Gender-affirming health care is a form of sterilization,Gender-affirming health care is a form of sterilization,,,
|
183 |
+
LGBTQ,LGBTQ,Most people who transition regret it and want to detransition,Most people who transition regret it and want to detransition,,,
|
184 |
+
LGBTQ,LGBTQ,Being transgender is new or represents a recent trend,Being transgender is new or represents a recent trend,,,
|
185 |
+
LGBTQ,LGBTQ,LGBTQ is part of a social contagion or rapid onset gender dysphoria,LGBTQ is part of a social contagion or rapid onset gender dysphoria,,,
|
186 |
+
LGBTQ,LGBTQ,"Gay marriage is a slippery slope to: pedophilia, bestiality, or polygamy","Gay marriage is a slippery slope to: pedophilia, bestiality, or polygamy",,,
|
187 |
+
LGBTQ,LGBTQ,LGBTQ is an ideological movement pushing gender ideology and transgenderism,LGBTQ is an ideological movement pushing gender ideology and transgenderism,,,
|
188 |
+
LGBTQ,LGBTQ,[Public figure] is secretly trans,[Public figure] is secretly trans,,,
|
189 |
+
LGBTQ,LGBTQ,LGBTQ people can be distinguished by physical features,LGBTQ people can be distinguished by physical features,,,
|
190 |
+
LGBTQ,LGBTQ,LGBTQ people are satanists,LGBTQ people are satanists,,,
|
191 |
+
LGBTQ,LGBTQ,LGBTQ people cannot provide stable homes,LGBTQ people cannot provide stable homes,,,
|
192 |
+
Reproductive health,Reproductive health,Abortion is black genocide,Abortion is black genocide,,,
|
193 |
+
Reproductive health,Reproductive health,Abortion is genocide,Abortion is genocide,,,
|
194 |
+
Reproductive health,Reproductive health,Abortion is white genocide,Abortion is white genocide,,,
|
195 |
+
Reproductive health,Reproductive health,Birth control is black genocide,Birth control is black genocide,,,
|
196 |
+
Reproductive health,Reproductive health,Birth control is genocide,Birth control is genocide,,,
|
197 |
+
Reproductive health,Reproductive health,Birth control is white genocide,Birth control is white genocide,,,
|
data/Indicator_Development.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Indicator_Test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/Modified Misinformation Library.csv
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Target,Type,Misinformation Narrative,Random ID
|
2 |
+
Anti-semitic,Anti-Christian,Jews are responsible for the death of jesus,UdR1EJ
|
3 |
+
Anti-semitic,Anti-Christian,Jews are trying to destroy Christianity,iiQLW3
|
4 |
+
Anti-semitic,Blood Libel,Jews conduct ritual murder,bzvo8C
|
5 |
+
Anti-semitic,Blood Libel,Jews use (christian) blood in rituals,E8Gihk
|
6 |
+
Anti-semitic,Character Assasination,Jews are penny pinchers or usurius,XhKvwR
|
7 |
+
Anti-semitic,Conspiracy,Jews are loyal to israel,gPdJTy
|
8 |
+
Anti-semitic,Conspiracy,Jews control black politics ,jHIGen
|
9 |
+
Anti-semitic,Conspiracy,Jews control communism ,00oDoA
|
10 |
+
Anti-semitic,Conspiracy,Jews control democrats,oX8nUF
|
11 |
+
Anti-semitic,Conspiracy,Jews control LGBTQ politics,z26UiS
|
12 |
+
Anti-semitic,Conspiracy,Jews control liberalism,Gm048t
|
13 |
+
Anti-semitic,Conspiracy,Jews control the global financial system,XrfWiT
|
14 |
+
Anti-semitic,Conspiracy,Jews control the UN,omXiC8
|
15 |
+
Anti-semitic,Conspiracy,Jews control the weather,oUguN7
|
16 |
+
Anti-semitic,Conspiracy,Jews control the West,xnfgPu
|
17 |
+
Anti-semitic,Conspiracy,Jews ran the slave trade,GXx4f1
|
18 |
+
Anti-semitic,Conspiracy,Jews run hollywood,wFhPBW
|
19 |
+
Anti-semitic,Conspiracy,Jews run the media,URbKNx
|
20 |
+
Anti-semitic,Deny marginalization,Antisemitism isn't real,PYXuER
|
21 |
+
Anti-semitic,Deny marginalization,Jews provoke antisemitism,53X1lc
|
22 |
+
Anti-semitic,Ethnic identity,Jesus was not jewish,NMVCa4
|
23 |
+
Anti-semitic,Ethnic identity,"Jews are descended from Khazar, not Judea",BhDWro
|
24 |
+
Anti-semitic,Great replacement,Jews are behind global migration,dVMbzJ
|
25 |
+
Anti-semitic,Holocaust Denial,Holocaust did not happen,JaSIbY
|
26 |
+
Anti-semitic,Holocaust Denial,Jewish life lost during the holocaust is over-estimated,dhOlra
|
27 |
+
Anti-semitic,Western Chauvinism,Jews are behind multiculturalism,GayHrv
|
28 |
+
Anti-semitic,Western Chauvinism,Jews are making people gay,5SYQ2q
|
29 |
+
Black,BLM,Black lives matter protests were insurrections,whcn6U
|
30 |
+
Black,BLM,Black lives matter protests were riots,qjshDE
|
31 |
+
Black,BLM,Black people are targeting white people in response to George Floyd,JJzM7y
|
32 |
+
Black,BLM,BLM activists commit non-protest-related crimes,wCYHg7
|
33 |
+
Black,BLM,BLM did the J6 insurrection,GVHQah
|
34 |
+
Black,BLM,BLM seeks to enslave white people,5nnDbt
|
35 |
+
Black,BLM,Schools are teaching Black Lives Matter politics,f8v3rm
|
36 |
+
Black,Character Assasination,African Americans abuse government systems,LGCKdm
|
37 |
+
Black,Character Assasination,African Americans are abnormally violent,eVd1Eg
|
38 |
+
Black,Character Assasination,African Americans are criminals,SY77H4
|
39 |
+
Black,Character Assasination,African Americans are dependent on welfare,ySNZtE
|
40 |
+
Black,Character Assasination,African Americans are lazy,KJykwB
|
41 |
+
Black,Character Assasination,Black people are less intelligent than white people,UikLfc
|
42 |
+
Black,CRT,Democrats push the adoption of critical race theory,jyF0Yl
|
43 |
+
Black,CRT,Public education promotes critical race theory,YoWcaU
|
44 |
+
Black,CRT,Public schools teach children critical race theory,WiYklo
|
45 |
+
Black,Deny marginalization,Implicit bias doesn't exist,JkHnUH
|
46 |
+
Black,Deny marginalization,Systemic racism doesn't exist,GXMok3
|
47 |
+
Black,Ethnic Identity,Most Black people are not descended from slaves,B96bhS
|
48 |
+
Black,Great replacement,Black reproduction is meant to eliminate white people,lx3WsW
|
49 |
+
Black,Reverse marginalization,Companies will not hire whites because of Affirmative Action,hEZ6KU
|
50 |
+
Black,Reverse marginalization,Companies will not hire whites because of DEI,oOeF3U
|
51 |
+
Immigration,Character Assasination,Immigrants are bringing diseases to the west,G0mUU3
|
52 |
+
Immigration,Character Assasination,Immigrants are unvaccinated,EbtpqX
|
53 |
+
Immigration,Character Assasination,Immigrants are violent,XarKDi
|
54 |
+
Immigration,Character Assasination,Immigrants commit disproportionate crime,dTISzI
|
55 |
+
Immigration,Great replacement,Immigrants poison the blood of the nation,5Cokji
|
56 |
+
Immigration,Great replacement,Immigrants are being allowed in to vote in elections,zdgRli
|
57 |
+
Immigration,Great replacement,Immigrants stole the 2020 election,cEKPsz
|
58 |
+
Immigration,Great replacement,Immigration is an invasion of western countries,L0ZyUA
|
59 |
+
Immigration,Great replacement,Immigration is engineered to replace white people,KEpIQf
|
60 |
+
Immigration,Policies,immigrants are given free health care in the united states,GkNQFl
|
61 |
+
Immigration,Western Chauvinism,Immigration is a globalist/multiculturalist conspiracy,NJ53RR
|
62 |
+
Immigration,Western Chauvinism,Immigration is a process of deculturalizing the west,fKJrv0
|
63 |
+
Immigration,Western Chauvinism,Immigration is a process of despiritualizing the west,EwykD2
|
64 |
+
Immigration,Western Chauvinism,immigration is reverse colonization,iUu1dv
|
65 |
+
Immigration,Western Chauvinism,immigration leads to the decline of western civilization,v5RcgG
|
66 |
+
Immigration,Western Chauvinism,immigration will eliminate the white race through racial mixing,dlbkPD
|
67 |
+
LGBTQ,Anti-liberalism,LGBTQ rights is a form of colonization by the west,98l33O
|
68 |
+
LGBTQ,Anti-science,There are only two genders people are born with,B1RpCU
|
69 |
+
LGBTQ,Anti-science,LGBTQ is a disease that can be cured,e7r1ws
|
70 |
+
LGBTQ,Anti-science,LGBTQ status is a choice,i3TdA8
|
71 |
+
LGBTQ,Anti-science,LGBTQ status is caused by parenting,vdmbmW
|
72 |
+
LGBTQ,Character Assasination,LGBTQ is a form of moral degeneracy,AUxRMf
|
73 |
+
LGBTQ,Character Assasination,LGBTQ is pushing children to change their gender,gTE1iB
|
74 |
+
LGBTQ,Character Assasination,LGBTQ people are threats to children,7TqDW3
|
75 |
+
LGBTQ,Character Assasination,LGBTQ people groom children,RQ8N4o
|
76 |
+
LGBTQ,Character Assasination,LGBTQ people threaten the safety of women and children in bathrooms,6PIKk3
|
77 |
+
LGBTQ,Character Assasination,LGTBQ people use the LGBTQ identity as a cover for dangerous qualities (e.g. they are secretly a rapist),TXX0OT
|
78 |
+
LGBTQ,Conspiracy,Gays control the media,tCHyHj
|
79 |
+
LGBTQ,Conspiracy,There is a secret gay agenda/cabal,6zsNn0
|
80 |
+
LGBTQ,Gender affirming care,Gender affirming care is unsafe,ofwj9b
|
81 |
+
LGBTQ,Gender affirming care,Gender-affirming health care is a form of child abuse or mutiliation,v7fGCm
|
82 |
+
LGBTQ,Gender affirming care,Gender-affirming health care is a form of sterilization,Mo26Zl
|
83 |
+
LGBTQ,Gender affirming care,Most people who transition regret it and want to detransition,xpia1A
|
84 |
+
LGBTQ,Kids these days,Being transgender is new or represents a recent trend,BMyDKR
|
85 |
+
LGBTQ,Kids these days,LGBTQ is part of a social contagion or rapid onset gender dysphoria,KXMEUC
|
86 |
+
LGBTQ,Policies,"Gay marriage is a slippery slope to: pedophilia, bestiality, or polygamy",d753iK
|
87 |
+
LGBTQ,Policies,LGBTQ is an ideological movement pushing gender ideology and transgenderism,vubrAX
|
88 |
+
LGBTQ,Pseudo-science,[Public figure] is secretly trans,2axpUt
|
89 |
+
LGBTQ,Psuedo-science,LGBTQ people can be distinguished by physical features,R6Bv5Q
|
90 |
+
LGBTQ,Satanism',LGBTQ people are satanists,aVunFJ
|
91 |
+
LGBTQ,Western Chauvinism,LGBTQ people cannot provide stable homes,DWHuWO
|
92 |
+
Reproductive health,Abortion,Abortion is black genocide,tKzbS5
|
93 |
+
Reproductive health,Abortion,Abortion is genocide,9TycbG
|
94 |
+
Reproductive health,Abortion,Abortion is white genocide,WNhhGj
|
95 |
+
Reproductive health,Abortion,Birth control is black genocide,0FHlMA
|
96 |
+
Reproductive health,Abortion,Birth control is genocide,9sDtYl
|
97 |
+
Reproductive health,Abortion,Birth control is white genocide,a8GiIm
|
data/climate_data/data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----------------------------------------------------
|
2 |
+
Data used in Coan, Boussalis, Cook, and Nanko (2021)
|
3 |
+
-----------------------------------------------------
|
4 |
+
|
5 |
+
This directory includes two sub-directories that house the main
|
6 |
+
data used during training and in the analysis.
|
7 |
+
|
8 |
+
------------------
|
9 |
+
analysis directory
|
10 |
+
------------------
|
11 |
+
|
12 |
+
The analysis directory includes a single CSV file: cards_for_analysis.csv. The
|
13 |
+
file has the following fields:
|
14 |
+
|
15 |
+
domain: the domain for each organization or blog.
|
16 |
+
|
17 |
+
date: the date the article or blog post was written.
|
18 |
+
|
19 |
+
ctt_status: an indicator for whether the source is a conservative think tank
|
20 |
+
(CTTs). [CTT = True; Blog = False]
|
21 |
+
|
22 |
+
pid: unique paragraph identifier
|
23 |
+
|
24 |
+
claim: the estimated sub-claim based on the RoBERTa-Logistic ensemble described
|
25 |
+
in the paper. [The variable assumes the following format: superclaim_subclaim.
|
26 |
+
For example, 5_1 would represent super-claim 5 ("Climate movement/science is
|
27 |
+
unreliable"), sub-claim 1 ("Science is unreliable").]
|
28 |
+
|
29 |
+
------------------
|
30 |
+
training directory
|
31 |
+
------------------
|
32 |
+
|
33 |
+
The training directory includes 3 CSV files:
|
34 |
+
|
35 |
+
training.csv: annotations used for training
|
36 |
+
validation.csv: the held-out validation set used during training (noisy)
|
37 |
+
test.csv: the held-out test set used to assess final model performance
|
38 |
+
(noise free)
|
39 |
+
|
40 |
+
Each file has the following fields:
|
41 |
+
|
42 |
+
text: the paragraph text that is annotated
|
43 |
+
claim: the annotated sub-claim [The variable assumes the following format:
|
44 |
+
superclaim_subclaim. For example, 5_1 would represent super-claim 5
|
45 |
+
("Climate movement/science is unreliable"), sub-claim 1 ("Science is
|
46 |
+
unreliable").]
|
data/expansive_claims_library_expanded_embed.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/filtered_fact_check_latest_embed.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0b868219291c167703e5eb45b95aceae6fa29779b7cb4d62ef977e2853516829
|
3 |
+
size 145825870
|
data/random_300k.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9843f88f219a0e8d43296ed8e62033affdebf0540cde53c3e1a7c3bac755f8d
|
3 |
+
size 86368740
|
src/Embeddings.jl
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Embeddings
|
2 |
+
|
3 |
+
function string_to_float32_vector(str::String)::Vector{Float32}
    # Parse a serialized vector such as "Float32[1.0f0, 2.5f0]" (the string
    # form CSV round-tripping produces) back into a Vector{Float32}.
    #
    # Remove the type-name prefix, then strip only the surrounding brackets.
    # (The original stripped the character set ['F','l','o','a','t','3','2','[',']']
    # from both ends, which also consumed digits 2/3 that belong to the first
    # or last number — e.g. "Float32[0.123]" parsed as 0.1.)
    s = replace(strip(str), "Float32" => "")
    s = strip(s, ['[', ']'])

    # Julia prints Float32 literals with an 'f' exponent marker; convert it
    # to 'e' so parse(Float32, ...) accepts the element.
    s = replace(s, 'f' => 'e')

    # Guard the empty-vector case ("Float32[]") instead of erroring in parse.
    isempty(strip(s)) && return Float32[]

    # Split on commas and parse each element.
    return Float32[parse(Float32, strip(el)) for el in split(s, ",")]
end
|
16 |
+
|
17 |
+
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # Parse the serialized embedding of every row and stack them as the
    # columns of one Float32 matrix.
    # reduce(hcat, ...) avoids splatting one argument per row — the original
    # hcat(vs...) form is slow and can overflow the stack on large frames.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end
|
20 |
+
|
21 |
+
"""
|
22 |
+
## Any piece of text longer than 280 characters will be chunked into smaller pieces, and the embeddings will be averaged.
|
23 |
+
|
24 |
+
#Example:
|
25 |
+
text = repeat("This is a test. ", 100)
|
26 |
+
chunktext = create_chunked_text(text)
|
27 |
+
function create_chunked_text(text; chunk_size=280)
|
28 |
+
## Chunk the data
|
29 |
+
chunks = []
|
30 |
+
for chunk in 1:chunk_size:length(text)
|
31 |
+
push!(chunks, text[chunk:min(chunk+chunk_size-1, length(text))])
|
32 |
+
end
|
33 |
+
return chunks
|
34 |
+
end
|
35 |
+
"""
|
36 |
+
|
37 |
+
function create_chunked_text(text::String; chunk_size::Int=280)
    # Split `text` into consecutive pieces of at most `chunk_size` characters.
    # Uses nextind/lastindex so multi-byte (Unicode) characters are never cut.
    #
    # Fix: advance chunk_size - 1 characters past start_idx so each chunk
    # holds at most chunk_size characters. The original advanced chunk_size
    # times, producing chunks of chunk_size + 1 characters.
    chunks = String[]
    start_idx = 1
    while start_idx <= lastindex(text)
        end_idx = start_idx
        for _ in 1:(chunk_size - 1)
            nxt = nextind(text, end_idx)
            if nxt > lastindex(text)
                break
            end
            end_idx = nxt
        end
        push!(chunks, text[start_idx:end_idx])
        start_idx = nextind(text, end_idx)
    end
    return chunks
end
|
54 |
+
|
55 |
+
"""
|
56 |
+
## Embeddings of text
|
57 |
+
|
58 |
+
"""
|
59 |
+
function generate_embeddings(text::String)
    # Embed a single (already-chunked) piece of text via the MiniEncoder
    # Python model. On any failure the error is printed and a 384-dim zero
    # vector is returned, so batch embedding never aborts mid-run.
    local emb
    try
        emb = MiniEncoder.get_embeddings(text)
    catch err
        println("Error: ", err)
        emb = zeros(Float32, 384)
    end
    return emb
end
|
67 |
+
|
68 |
+
"""
|
69 |
+
# This is the core function - takes in a string of any length and returns the embeddings
|
70 |
+
|
71 |
+
text = repeat("This is a test. ", 100)
|
72 |
+
mini_embed(text)
|
73 |
+
|
74 |
+
# Test to embed truthseeker subsample
|
75 |
+
ts = CSV.read("data/truthseeker_subsample.csv", DataFrame)
|
76 |
+
ts_embed = mini_embed.(ts.statement) # can embed 3K in 25 seconds
|
77 |
+
ts.Embeddings = ts_embed
|
78 |
+
CSV.write("data/truthseeker_subsample_embed.csv", ts)
|
79 |
+
|
80 |
+
## embed fact check data
|
81 |
+
fc = CSV.read("data/fact_check_latest.csv", DataFrame)
|
82 |
+
# drop missing text
|
83 |
+
fc = fc[.!ismissing.(fc.text), :]
|
84 |
+
fc_embed = mini_embed.(fc.text) # 12 minutes
|
85 |
+
fc.Embeddings = fc_embed
|
86 |
+
CSV.write("data/fact_check_latest_embed.csv", fc)
|
87 |
+
|
88 |
+
narrs = CSV.read("data/expansive_claims_library_expanded.csv", DataFrame)
|
89 |
+
# drop missing text
|
90 |
+
narrs.text = narrs.ExpandedClaim
|
91 |
+
narrs = narrs[.!ismissing.(narrs.text), :]
|
92 |
+
narratives_embed = OC.mini_embed.(narrs.text) # seconds to run
|
93 |
+
narrs.Embeddings = narratives_embed
|
94 |
+
CSV.write("data/expansive_claims_library_expanded_embed.csv", narrs)
|
95 |
+
|
96 |
+
"""
|
97 |
+
function mini_embed(text::String)
    # Embed text of any length: split it into ≤280-character chunks, embed
    # each chunk, and average the chunk embeddings into one vector.
    pieces = create_chunked_text(text)
    vectors = [generate_embeddings(p) for p in pieces]
    return mean(vectors)
end
|
102 |
+
|
103 |
+
"""
|
104 |
+
# Get distance and classification
|
105 |
+
|
106 |
+
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
|
107 |
+
ts_embed = dfdat_to_matrix(ts, :Embeddings)
|
108 |
+
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
|
109 |
+
fc_embed = dfdat_to_matrix(fc, :Embeddings)
|
110 |
+
distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
|
111 |
+
"""
|
112 |
+
function distances_and_classification(narrative_matrix, target_matrix)
    # Cosine distance of every target column against every narrative column.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # Column-wise argmin, computed once (the original evaluated
    # argmin(distances, dims=2) twice).
    nearest = argmin(distances, dims=2)
    # Return the smallest distance per target and the CartesianIndex of the
    # matching narrative column.
    return distances[nearest][:, 1], nearest[:, 1]
end
|
117 |
+
|
118 |
+
"""
|
119 |
+
# Get the dot product of the two matrices
|
120 |
+
|
121 |
+
ind, scores = dotproduct_distances(fc_embed, ts_embed)
|
122 |
+
|
123 |
+
ts.scores = scores
|
124 |
+
|
125 |
+
# Group by target and get the max score
|
126 |
+
ts_grouped = combine(groupby(ts, :target), :scores => mean)
|
127 |
+
# show the matched text
|
128 |
+
ts.fc_text = fc.text[ind]
|
129 |
+
|
130 |
+
"""
|
131 |
+
function dotproduct_distances(narrative_matrix, target_matrix)
    # Similarity of every narrative (column) against every target (column):
    # entry (i, j) is the dot product of narrative i with target j.
    sims = transpose(narrative_matrix) * target_matrix
    # Best-matching narrative for each target column.
    best = argmax(sims, dims=1)[1, :]
    # Extract the row (narrative) index from each CartesianIndex, and the
    # winning similarity score for each target.
    rows = first.(Tuple.(best))
    return rows, sims[best]
end
|
138 |
+
|
139 |
+
function dotproduct_topk(narrative_matrix, target_vector, k)
    # Similarity of each narrative (column) to the single target vector.
    dprods = narrative_matrix' * target_vector
    # partialsortperm finds only the k largest entries — cheaper than the
    # original full sortperm when only the top-k matches are needed.
    # collect() materializes the index view so callers get a plain Vector.
    topk = collect(partialsortperm(dprods, 1:k, rev=true))
    return topk, dprods[topk]
end
|
146 |
+
|
147 |
+
"""
|
148 |
+
# Get the top k scores
|
149 |
+
|
150 |
+
using CSV, DataFrames
|
151 |
+
ts = CSV.read("data/truthseeker_subsample_embed.csv", DataFrame)
|
152 |
+
ts_embed = OC.dfdat_to_matrix(ts, :Embeddings)
|
153 |
+
fc = CSV.read("data/fact_check_latest_embed.csv", DataFrame)
|
154 |
+
fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)
|
155 |
+
|
156 |
+
OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
|
157 |
+
|
158 |
+
## How fast to get the top 5 scores for 3K statements?
|
159 |
+
@time [OC.fast_topk(fc_embed, fc, ts.statement[x], 5) for x in 1:3000] # 63 seconds
|
160 |
+
"""
|
161 |
+
"""
Embed `text` and return the `k` nearest narratives as a vector of Dicts with
keys "score", "text", "claimUrl", "policy", and "narrative".

NOTE(review): this function MUTATES `narratives` — when the frame has no
`claimReviewUrl` column, one is added and filled with "No URL".
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    # Embed the query text, then score it against every narrative column.
    target_vector = mini_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    # Pull policy/narrative labels when present; assumes that a frame with a
    # :Policy column also has a :Narrative column — TODO confirm with callers.
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    # Backfill a claimReviewUrl column so the Dict construction below can
    # index it unconditionally (side effect: mutates the input frame).
    if !hasproperty(narratives, :claimReviewUrl)
        narratives.claimReviewUrl = fill("No URL", size(narratives, 1))
    end
    # One result Dict per matched narrative, in descending-score order.
    vec_of_dicts = [Dict("score" => scores[i],
                         "text" => narratives.text[ind],
                         "claimUrl" => narratives.claimReviewUrl[ind],
                         "policy" => policy[i],
                         "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
    return vec_of_dicts
end
|
181 |
+
|
182 |
+
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    # Load the fact-check table and materialize its Embeddings column as a
    # Float32 matrix (one matrix column per table row). Returns both the
    # matrix and the table so callers can map matches back to rows.
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end
|
src/Models.jl
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Utility Functions
|
2 |
+
## Note: edit ~/.bigqueryrc to set global settings for bq command line tool
|
3 |
+
|
4 |
+
using CSV, DataFrames, JSON3
|
5 |
+
|
6 |
+
function read_json(file_path::String)
    # Read the whole file eagerly, then parse it.
    # (The original passed `open(file_path, "r")` straight into JSON3.read
    # and never closed the returned IOStream — a file-handle leak.)
    return JSON3.read(read(file_path, String))
end
|
10 |
+
|
11 |
+
"""
|
12 |
+
## ostreacultura_bq_auth()
|
13 |
+
- Activate the service account using the credentials file
|
14 |
+
"""
|
15 |
+
"""
## ostreacultura_bq_auth()
- Activate the service account using the credentials file
"""
function ostreacultura_bq_auth()
    # Guard clause: bail out early when the credentials file is absent.
    if !isfile("ostreacultura-credentials.json")
        println("Credentials file not found")
        return
    end
    run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
end
|
22 |
+
|
23 |
+
"""
|
24 |
+
## julia_to_bq_type(julia_type::DataType)
|
25 |
+
- Map Julia types to BigQuery types
|
26 |
+
|
27 |
+
Arguments:
|
28 |
+
- julia_type: The Julia data type to map
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
- The corresponding BigQuery type as a string
|
32 |
+
"""
|
33 |
+
"""
## julia_to_bq_type(julia_type::DataType)
- Map a Julia type to its BigQuery column type.

Arguments:
- julia_type: the Julia data type to map

Returns:
- the corresponding BigQuery type as a String (defaults to "STRING")
"""
function julia_to_bq_type(julia_type::DataType)
    # Exact scalar matches first, then array element types, then a catch-all.
    scalar = get(Dict(String => "STRING", Int64 => "INTEGER", Float64 => "FLOAT"),
                 julia_type, nothing)
    scalar !== nothing && return scalar
    julia_type <: AbstractArray{Float64} && return "FLOAT64"
    julia_type <: AbstractArray{Int64} && return "INTEGER"
    return "STRING"
end
|
48 |
+
|
49 |
+
"""
## create_bq_schema(df::DataFrame)
- Create a BigQuery schema from a DataFrame

Arguments:
- df: The DataFrame to create the schema from

Returns:
- The schema as a JSON string in BigQuery format; array-valued columns become
  REPEATED FLOAT64 fields, everything else is NULLABLE.

Example:
df = DataFrame(text = ["Alice", "Bob"], embed = [rand(3), rand(3)])
create_bq_schema(df)
"""
function create_bq_schema(df::DataFrame)
    fields = map(names(df)) do colname
        coltype = eltype(df[!, colname])
        if coltype <: AbstractArray
            # Embedding-style columns: one REPEATED FLOAT64 field per column.
            Dict("name" => colname, "type" => "FLOAT64", "mode" => "REPEATED")
        else
            Dict("name" => colname, "type" => julia_to_bq_type(coltype), "mode" => "NULLABLE")
        end
    end
    return JSON3.write(fields)
end
|
74 |
+
|
75 |
+
"""
## dataframe_to_json(df::DataFrame, file_path::String)
- Convert a DataFrame to newline-delimited JSON (one object per row) and save
  it to a file, the format expected by `bq load --source_format=NEWLINE_DELIMITED_JSON`.

Arguments:
- df: The DataFrame to convert
- file_path: The path where the JSON file should be saved
"""
function dataframe_to_json(df::DataFrame, file_path::String)
    open(file_path, "w") do io
        for row in eachrow(df)
            # This file only loads JSON3 (`using CSV, DataFrames, JSON3`);
            # the original called the undefined `JSON.print`.
            JSON3.write(io, Dict(col => row[col] for col in names(df)))
            write(io, "\n")
        end
    end
end
|
91 |
+
|
92 |
+
"""
# Function to send a DataFrame to a BigQuery table
## send_to_bq_table(df::DataFrame, dataset_name::String, table_name::String)
- Send a DataFrame to a BigQuery table, which will append if the table already exists

Arguments:
- df: The DataFrame to upload
- dataset_name: The BigQuery dataset name
- table_name: The BigQuery table name

# Example usage
df = DataFrame(text = ["Alice", "Bob"], embed = [rand(3), rand(3)])
send_to_bq_table(df, "climate_truth", "embtest")
"""
function send_to_bq_table(df::DataFrame, dataset_name::String, table_name::String)
    # Temp files for the row payload and the derived schema.
    json_file_path = tempname() * ".json"
    schema_file_path = tempname() * ".json"
    open(io -> write(io, create_bq_schema(df)), schema_file_path, "w")

    # Write the rows as newline-delimited JSON.
    dataframe_to_json(df, json_file_path)

    # Load the JSON into BigQuery via the bq CLI, with the explicit schema.
    run(`bq load --source_format=NEWLINE_DELIMITED_JSON $dataset_name.$table_name $json_file_path $schema_file_path`)

    # Clean up the temp files after the upload.
    rm(json_file_path)
    rm(schema_file_path)
    return nothing
end
|
135 |
+
|
136 |
+
"""
## bq(query::String)
- Run a BigQuery query and return the result as a DataFrame

Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
"""
function bq(query::String)
    outfile = tempname()
    # Stream the CSV-formatted result into a temp file, then parse it.
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, outfile))
    return CSV.read(outfile, DataFrame)
end
|
147 |
+
|
148 |
+
|
149 |
+
"""
## Function to average embeddings over some group

Builds and *returns as a String* (it does not execute) a BigQuery SQL query
that element-wise averages array embeddings within each `group`. Run it with
`bq(...)` / `bq_csv(...)`.

example:
avg_embeddings("ostreacultura.climate_truth.embtest", "text", "embed")
"""
function avg_embeddings(table::String, group::String, embedname::String)
    # Inner query: concatenate all embedding arrays per group value.
    # Outer ARRAY subquery: average position-by-position (UNNEST WITH OFFSET),
    # preserving element order via ORDER BY pos.
    query = """
    SELECT
    $group,
    ARRAY(
    SELECT AVG(value)
    FROM UNNEST($embedname) AS value WITH OFFSET pos
    GROUP BY pos
    ORDER BY pos
    ) AS averaged_array
    FROM (
    SELECT $group, ARRAY_CONCAT_AGG($embedname) AS $embedname
    FROM $table
    GROUP BY $group
    )
    """
    return query
end
|
172 |
+
|
173 |
+
"""
## SAVE results of query to a CSV file

Runs `query` through the `bq` CLI and streams the CSV output straight to `path`.

Example:
bq_csv("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
"""
function bq_csv(query::String, path::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, path))
end
|
182 |
+
|
src/OstreaCultura.jl
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## OSTREA
## Top-level package module: pulls in the Python bridge, embedding helpers,
## and the Pinecone I/O layer. Deprecated components stay commented out.
module OstreaCultura

@info "Loading OstreaCultura.jl"

using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall

# Alias so Julia's DataFrames.DataFrame and Pandas.DataFrame can coexist.
import Pandas.DataFrame as pdataframe

export MiniEncoder

## Load the FC Dataset
#const fc = CSV.read("data/fact_check_latest.csv", DataFrame)
#const fc_embed = OC.dfdat_to_matrix(fc, :Embeddings)

#export multi_embeddings, DataLoader, df_to_pd, pd_to_df, create_pinecone_context

#include("Narrative.jl")
#include("NarrativeClassification.jl")
include("py_init.jl")    # NOTE(review): presumably defines `DataLoader` used below — confirm
include("Embeddings.jl")
include("PyPineCone.jl")
#include("Models.jl")

end
|
src/PyPineCone.jl
ADDED
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### PineCone Embed and I/O Functions
|
2 |
+
|
3 |
+
"""
# This dataset matches the example data from DataLoader.py
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
"""
function example_data()
    # Two toy rows: 4-dim embeddings plus an id and one metadata column.
    vectors = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]
    ids = ["vec1", "vec2"]
    genres = ["drama", "action"]
    return DataFrame(Embeddings = vectors, id = ids, genre = genres)
end
|
17 |
+
|
18 |
+
"""
Convert a pandas DataFrame (PyCall object) into a Julia DataFrame.

df= OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
"""
function pd_to_df(df_pd)
    out = DataFrame()
    # Copy each pandas column's underlying values into a Julia column,
    # preserving column order.
    for colname in df_pd.columns
        out[!, colname] = getproperty(df_pd, colname).values
    end
    return out
end
|
29 |
+
|
30 |
+
"""
Build a Pinecone client (Python object via PyCall) using the API key from the
PINECONE_API_KEY environment variable.

Available functions
pc.create_index - see below
pc.delete_index: pc.delete_index(index_name)
"""
function create_pinecone_context()
    pc = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
    return pc
end
|
39 |
+
|
40 |
+
"""
# Context for inference endpoints

Builds a Pinecone client for embedding/inference calls, keyed by the
PINECONE_API_KEY environment variable.
"""
function create_inf_pinecone_context()
    # Pass the key via the explicit `api_key` keyword for consistency with
    # `create_pinecone_context` (the original passed it positionally).
    pc = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
    return pc
end
|
47 |
+
|
48 |
+
"""
Create a Pinecone index through the Python DataLoader helper.

pc = create_pinecone_context()
create_index("new-index", 4, "cosine", "aws", "us-east-1")
"""
function create_index(name, dimension, metric, cloud, region)
    client = create_pinecone_context()
    return DataLoader.create_index(client, name, dimension, metric, cloud, region)
end
|
56 |
+
|
57 |
+
"""
## upsert_data(df, indexname, namespace; chunk_size=1000)
Upsert a DataFrame of embedded rows into a Pinecone index, `chunk_size` rows
at a time. `id` and `Embeddings` are required columns in the DataFrame.

import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
OC.upsert_data(out, "test-index", "test-namespace")
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    client = create_pinecone_context()
    target = client.Index(indexname)
    return DataLoader.chunk_df_and_upsert(target, df, namespace=namespace, chunk_size=chunk_size)
end
|
79 |
+
|
80 |
+
"""
## How to query data using an existing embedding
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
vector = mydf.Embeddings[1]
OC.query_data("test-index", "test-namespace", vector, 5, true)

Returns the raw Pinecone query response converted to a Julia Dict.
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    client = create_pinecone_context()
    target = client.Index(indexname)
    response = DataLoader.query_data(target, namespace, vector, top_k, include_values)
    return response.to_dict()
end
|
95 |
+
|
96 |
+
"""
## How to query data using an existing hybrid embedding

import OstreaCultura as OC; using DataFrames
dense = OC.embed_query("drama")
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense, OC.DataLoader.empty_sparse_vector(), 5, true, true)

Hybrid (dense + sparse) Pinecone query; returns the raw response as a Dict.
"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    client = create_pinecone_context()
    target = client.Index(indexname)
    response = DataLoader.query_data_with_sparse(target, namespace, dense, sparse, top_k=top_k, include_values=include_values, include_metadata=include_metadata)
    return response.to_dict()
end
|
113 |
+
|
114 |
+
"""
## Querying function for GGWP - using updated hybrid vector
import OstreaCultura as OC
res = OC.search("drama", "oc-hybrid-library-index", "expanded-fact-checks")

Embeds `claim` densely and runs a hybrid query with an empty sparse side.
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    densevec = embed_query(claim)
    # Sparse side is an empty placeholder vector.
    sparsevec = DataLoader.empty_sparse_vector()
    return query_data_with_sparse(indexname, ocmodel, densevec, sparsevec, top_k, include_values, include_metadata)
end
|
127 |
+
|
128 |
+
# Thin wrapper around UnicodePlots.barplot with a default title.
# NOTE(review): `UnicodePlots` is not among the imports visible in
# OstreaCultura.jl (`using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase,
# Distances, PyCall`) — confirm it is loaded elsewhere (e.g. py_init.jl or
# Embeddings.jl), otherwise this call throws an UndefVarError.
function unicodebarplot(x, y, title = "Query Matches")
    UnicodePlots.barplot(x, y, title=title)
end
|
131 |
+
|
132 |
+
# Render the matches of a search result (Dict with "matches" entries carrying
# "score" and "metadata"/"text") as a unicode bar plot of score per snippet.
function searchresult_to_unicodeplot(searchresult)
    scores = [m["score"] for m in searchresult["matches"]]
    text = [m["metadata"]["text"] for m in searchresult["matches"]]
    ## reduce the text to 41 characters. `first(x, 41)` truncates by character
    ## count; the original `x[1:41]` indexed Strings by *byte* and threw a
    ## StringIndexError on multi-byte UTF-8 text (this library handles
    ## multilingual claims, e.g. Spanish).
    text_to_show = [length(x) > 41 ? first(x, 41) * "..." : x for x in text]
    unicodebarplot(text_to_show, scores)
end
|
139 |
+
|
140 |
+
"""
## Search and plot the results

import OstreaCultura as OC
OC.searchplot("drama", "oc-hybrid-library-index", "immigration")
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    result = search(claim, indexname, ocmodel, top_k=top_k,
                    include_values=include_values, include_metadata=include_metadata)
    return searchresult_to_unicodeplot(result)
end
|
153 |
+
|
154 |
+
"""
## multi_embeddings(model, data, chunk_size, textcol)
Embed the `textcol` column of a pandas DataFrame in chunks of `chunk_size`
rows using a Pinecone inference model.

import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
"""
function multi_embeddings(model, data, chunk_size, textcol)
    inference_client = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(inference_client, model, data, chunk_size, textcol)
end
|
168 |
+
|
169 |
+
"""
Embed a *Julia* DataFrame: converts to pandas first, then delegates to the
Python embedder. Keyword overrides: `model`, `chunk_size`, `textcol`.

using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    pdframe = df_to_pd(data)
    # Defaults mirror the positional method.
    model = get(kwargs, :model, "multilingual-e5-large")
    chunk_size = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    inference_client = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(inference_client, model, pdframe, chunk_size, textcol)
end
|
183 |
+
|
184 |
+
"""
## Julia DataFrame to pandas DataFrame

`pdataframe` is the `Pandas.DataFrame` constructor aliased in OstreaCultura.jl.
"""
function df_to_pd(df::DataFrames.DataFrame)
    pdataframe(df)
end
|
190 |
+
|
191 |
+
# Embed a single query string and return its dense embedding vector.
# Extra kwargs are accepted for call-site compatibility but unused.
function embed_query(querytext; kwargs...)
    # Wrap the query in a one-row frame so the DataFrame embedder can run.
    embedded = multi_embeddings(DataFrame(id = "vec1", text = querytext))
    return embedded.Embeddings[1]
end
|
197 |
+
|
198 |
+
"""
## Query with a vector of embeddings
import OstreaCultura as OC
vector = rand(1024)
vecresults = OC.query_w_vector(vector, "test-index", "test-namespace")

Returns a DataFrame with one row per match (id, score, metadata fields, and —
when `include_values=true` — a `values` column holding the stored embedding).
Keywords: `top_k` (default 5), `include_values` (default true).
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    # Raw Pinecone response as a Julia Dict of matches.
    queryresults = DataLoader.query_data(index, namespace, vector, top_k, include_values).to_dict()
    ##
    # Pull the embedding vectors aside first (missing placeholders otherwise),
    # because they cannot be spliced into the per-match DataFrame rows below.
    if include_values
        values_vector = [queryresults["matches"][i]["values"] for i in 1:length(queryresults["matches"])]
    else
        values_vector = [missing for i in 1:length(queryresults["matches"])]
    end
    # drop the "values" key from each dict so it doesn't get added to the DataFrame
    for i in 1:length(queryresults["matches"])
        delete!(queryresults["matches"][i], "values")
    end
    # One DataFrame row per match, concatenated in result order.
    out = DataFrame()
    for i in 1:length(queryresults["matches"])
        out = vcat(out, DataFrame(queryresults["matches"][i]))
    end
    # If desired update this function to add the embeddings to the DataFrame
    if include_values
        out[:, "values"] = values_vector
    end

    return out
end
|
233 |
+
|
234 |
+
"""
Parse a Pinecone fetch response (`.to_dict()` form) into a DataFrame.

One row per fetched vector id, with the stored metadata fields as columns and
an `id` column appended. Returns an empty DataFrame (with an @info message)
when nothing was fetched.

import OstreaCultura as OC
index = OC.create_pinecone_context().Index("test-index")
resultfetch = OC.DataLoader.fetch_data(index, ids, "test-namespace").to_dict()
OC.parse_fetched_results(resultfetch)
"""
function parse_fetched_results(resultfetch)
    if length(resultfetch["vectors"]) > 0
        ids = collect(keys(resultfetch["vectors"]))
        ## Grab the MetaData for each returned id (iteration order of `ids`)
        data = [resultfetch["vectors"][id]["metadata"] for id in ids]
        ## Create a DataFrame from the metadata. `cols=:union` tolerates rows
        ## whose metadata carries differing key sets (missing cells become
        ## `missing`); the original only fell back to :union in a try/catch
        ## after the plain vcat had already thrown.
        out = DataFrame()
        for meta in data
            out = vcat(out, DataFrame(meta), cols=:union)
        end
        out[!, :id] = ids
        return out
    else
        @info "No data found"
        return DataFrame()
    end
end
|
271 |
+
|
272 |
+
"""
Fetch stored vectors/metadata for `ids` from a Pinecone index, in chunks
(Pinecone caps the number of ids per fetch call), and return one combined
DataFrame (see `parse_fetched_results`).

import OstreaCultura as OC
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, "test-index", "test-namespace")
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    result_out = DataFrame()
    for i in 1:ceil(Int, length(ids)/chunk_size)
        # Slice out the i-th chunk of ids (the last chunk may be short).
        chunk = ids[(i-1)*chunk_size+1:min(i*chunk_size, length(ids))]
        resultfetch = DataLoader.fetch_data(index, chunk, namespace).to_dict()
        result_out = vcat(result_out, parse_fetched_results(resultfetch))
    end
    return result_out
end
|
292 |
+
|
293 |
+
"""
## FINAL Query function - embeds, queries, and fetches data
import OstreaCultura as OC
OC.query("drama", "test-index", "test-namespace")

Keywords: `top_k` (default 5), `include_values` (default true).
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    # Embed the text, find the nearest matches, then hydrate them with the
    # stored metadata and join the two frames on id.
    matches = query_w_vector(embed_query(querytext), indexname, namespace,
                             top_k=top_k, include_values=include_values)
    details = fetch_data(matches.id, indexname, namespace)
    return innerjoin(matches, details, on=:id)
end
|
312 |
+
|
313 |
+
# Keep only claim matches whose similarity to the claim beats their similarity
# to the counterclaim. NOTE: `rename!` mutates the caller's DataFrames.
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Rename scores to avoid conflicts in the join.
    rename!(claim_results, :score => :claim_score)
    rename!(counterclaim_results, :score => :counterclaim_score)
    # Left join: keep every claim match even when no counterclaim match exists.
    joined = leftjoin(claim_results, counterclaim_results, on=:id)
    # Treat "no counterclaim match" as a counterclaim score of zero.
    joined.counterclaim_score = coalesce.(joined.counterclaim_score, 0.0)
    # Keep only rows where the claim score is greater than the counterclaim score.
    return joined[joined.claim_score .> joined.counterclaim_score, :]
end
|
325 |
+
|
326 |
+
"""
## Query with claims and counterclaims
import OstreaCultura as OC

hi = OC.query_claims("Climate change is a hoax", "Climate change is real",
                     "test-index", "test-namespace")

Keeps matches that are closer to `claim` than to `counterclaim` and whose
claim score exceeds `threshold` (default 0.8), then fetches and joins the
stored metadata. `top_k` (default 5000) bounds the initial queries.
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000) # top_k for the initial query
    # Embed both sides.
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)
    # Run the two similarity queries.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
    # Keep ids scoring higher on the claim than on the counterclaim.
    allscores = filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Then apply the absolute threshold.
    allscores = allscores[allscores.claim_score .> threshold, :]
    if size(allscores)[1] == 0
        @info "No claims were above the threshold"
        return DataFrame()
    end
    # Hydrate the survivors with their stored metadata and join on id.
    resulting_data = fetch_data(allscores.id, indexname, namespace)
    return innerjoin(allscores, resulting_data, on=:id)
end
|
360 |
+
|
361 |
+
|
362 |
+
"""
## Classify a claim against the existing misinformation library
import OstreaCultura as OC

claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
hi, counterscore = OC.classify_claim(claim, counterclaim, "ostreacultura-v1", "modified-misinfo-library")

Returns `(resulting_data, counterclaim_score)`: `resulting_data` holds the
library entries whose similarity to `claim` exceeds `threshold` (default 0.8),
with their metadata and a `scores` column; `counterclaim_score` is the best
similarity of the counterclaim (0.0 when `counterclaim == ""` or no match).
Keywords: `threshold`, `top_k` (default 10).
"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10) # top_k for the initial query
    claim_vector = embed_query(claim)
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
        # Guard against an empty result set (e.g. an empty namespace); the
        # original indexed `.score[1]` unconditionally.
        counterclaim_score = size(counterclaim_results, 1) > 0 ? counterclaim_results.score[1] : 0.0
    else
        counterclaim_score = 0.0
    end
    # Query the claim embedding and apply the threshold.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    claim_results = claim_results[claim_results.score .> threshold, :]
    ## Fetch the stored metadata for the surviving ids.
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    # BUGFIX: `fetch_data` returns rows keyed by id in *unspecified order*
    # (Pinecone fetch results come back as a dict), so the original
    # positional assignment `resulting_data.scores = claim_results.score`
    # could attach the wrong score to a row. Join on id instead.
    scores_by_id = rename(claim_results[:, [:id, :score]], :score => :scores)
    resulting_data = innerjoin(scores_by_id, resulting_data, on=:id)
    return resulting_data, counterclaim_score
end
|
409 |
+
|
410 |
+
# Fit a BM25 sparse encoder over a ~300k-document sample corpus; returns the
# (vector, bm25) pair produced by the Python-side encoder.
function generate_sparse_model()
    df = DataLoader.pd.read_csv("data/random_300k.csv")
    corpus = df["text"].tolist()
    # This function lives *inside* the OstreaCultura module, so the original
    # `OC.DataLoader.encode_documents` referenced an undefined name `OC`;
    # call the module-local DataLoader binding directly.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end
|
src/bash/update_fact_checks.sh
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Script to run periodic updates for the fact-check model

# Set the working directory (bail out if it is missing so the julia call
# below cannot run against the wrong directory).
cd /home/ubuntu/fact-check || exit 1

# Path to julia
# NOTE(review): the working dir is under /home/ubuntu but julia lives under
# /home/swojcik — confirm both paths are correct for the deployment host.
JULIA=/home/swojcik/.juliaup/bin/julia

# Run load_fact_check_json() from google_fact_check_api.jl to get the latest data.
# The include path must be a Julia *string literal*: the original used
# backticks, which construct a Cmd object in Julia and make include() throw.
$JULIA -e 'include("src/google_fact_check_api.jl"); load_fact_check_json()'

# Run the python script that goes and updates the fact-check model data
|
src/deprecated/Narrative.jl
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Structure of a Narrative
|
2 |
+
|
3 |
+
# Generate a short opaque id by Sqids-encoding two random small integers.
function randid()
    sqids_config = Sqids.configure() # Local configuration
    return Sqids.encode(sqids_config, [rand(1:100), rand(1:100)])
end
|
8 |
+
|
9 |
+
# Current time as milliseconds since the Unix epoch.
function timestamp()
    elapsed = now() - unix2datetime(0)
    return elapsed.value
end
|
12 |
+
|
13 |
+
"""
Inverse of `timestamp()`: convert milliseconds-since-epoch to a DateTime.

ts_to_time(timestamp()) == now()
"""
function ts_to_time(ts)
    seconds = ts / 1000
    return unix2datetime(seconds)
end
|
19 |
+
|
20 |
+
"""
Claim: something that supports a misinformation narrative

id: unique identifier for the claim
claim: text of the claim
counterclaim: text of the counterclaim
claimembedding: embedding of the claim (`nothing` until computed)
counterclaimembedding: embedding of the counterclaim (`nothing` until computed)
created_at: when the claim was created, in milliseconds since the Unix epoch (see `timestamp()`)
updated_at: when the claim was last updated, in milliseconds since the Unix epoch
source: source of the claim
keywords: keywords associated with the claim (may be `nothing`)
"""
mutable struct Claim
    id::String
    claim::String # claim text
    counterclaim::String # counterclaim text
    claimembedding::Union{Array{Float32, 1}, Nothing} # embedding of the claim
    counterclaimembedding::Union{Array{Float32, 1}, Nothing} # embedding of the counterclaim
    created_at::Int64 # date the claim was created
    updated_at::Int64 # date the claim was last updated
    source::String # source of the claim
    keywords::Union{Array{String, 1}, Nothing} # keywords associated with the claim
end
|
44 |
+
|
45 |
+
"""
createClaim(claim::String, counterclaim::String, source::String, keywords::Array{String, 1})

Create a new Claim object with the given claim, counterclaim, source, and keywords.
The claim and counterclaim embeddings are set to nothing by default;
`created_at` and `updated_at` are both stamped with the current time.

Example (NOTE: the required `keywords` argument was missing from the original example):
createClaim("Solar panels poison the soil and reduce crop yields",
    "There is no evidence that solar panels poison the soil or reduce crop yields",
    "Facebook post",
    String["solar", "agriculture"])
"""
function createClaim(claim::String, counterclaim::String, source::String, keywords::Array{String, 1})
    return Claim(randid(), claim, counterclaim, nothing, nothing, timestamp(), timestamp(), source, keywords)
end
|
59 |
+
|
60 |
+
|
61 |
+
"""
Narrative: a collection of claims that support a misinformation narrative

id: unique identifier for the narrative
title: descriptive title of the narrative
topic: broad type of narrative (e.g., anti-semitism)
target: target group/topic of the narrative
narrativesummary: base narrative text
claims: list of Claim objects

Example (NOTE: `claims` must be a Vector{Claim}; the original example passed
`nothing`, which does not convert to Vector{Claim}):
example_narrative = Narrative(
    randid(),
    "Jews killed Jesus",
    "Anti-semitism",
    "Jews",
    "Jews are responsible for the death of Jesus",
    Claim[])
"""
mutable struct Narrative
    id::String
    title::String # descriptive title (e.g., Jews killed Jesus)
    topic::String # broad type of narrative (e.g., anti-semitism)
    target::String # target group/topic of the narrative
    narrativesummary::String # base narrative text (e.g., Jews are responsible for the death of Jesus)
    claims::Vector{Claim} # list of Claim objects
end
|
88 |
+
|
89 |
+
"""
## TODO: When you have a lot of narratives, you can create a NarrativeSet
- If you apply a narrative set over a database, it will perform classification using all the narratives
  (classification logic is not implemented here yet — this is only the container)
"""
mutable struct NarrativeSet
    narratives::Vector{Narrative} # the member narratives, applied together
end
|
97 |
+
|
98 |
+
import Base: show
## Make the Narrative pretty to show - extends Base.show so a Narrative
## renders as its title, topic, target, summary, and one bullet per claim.
function show(io::IO, narrative::Narrative)
    println(io, "Narrative: $(narrative.title)")
    println(io, "Topic: $(narrative.topic)")
    println(io, "Target: $(narrative.target)")
    println(io, "Narrative Summary: $(narrative.narrativesummary)")
    println(io, "Claims:")
    for claim in narrative.claims
        println(io, " - $(claim.claim)")
    end
end
|
110 |
+
|
111 |
+
"""
add_claim!(narrative::Narrative, claim::Claim)

Add a claim to a narrative (appends in place to `narrative.claims`).

Example:
add_claim!(example_narrative, example_claim)
"""

function add_claim!(narrative::Narrative, claim::Claim)
    push!(narrative.claims, claim)
end
|
123 |
+
|
124 |
+
# Remove every claim whose id matches `claim_id`; rebinds `narrative.claims`
# to a freshly built vector.
function remove_claim!(narrative::Narrative, claim_id::String)
    remaining = [c for c in narrative.claims if c.id != claim_id]
    narrative.claims = remaining
end
|
127 |
+
|
128 |
+
# Flatten a Narrative into one DataFrame row per claim, with the narrative
# title repeated on every row.
function narrative_to_dataframe(narrative::Narrative)
    cs = narrative.claims
    column = f -> [f(c) for c in cs]
    return DataFrame(
        narrative_title = narrative.title,
        id = column(c -> c.id),
        claim = column(c -> c.claim),
        counterclaim = column(c -> c.counterclaim),
        claimembedding = column(c -> c.claimembedding),
        counterclaimembedding = column(c -> c.counterclaimembedding),
        created_at = column(c -> c.created_at),
        updated_at = column(c -> c.updated_at),
        source = column(c -> c.source),
        keywords = column(c -> c.keywords),
    )
end
|
141 |
+
|
142 |
+
"""
|
143 |
+
# Collapse a dataframe into a narrative
|
144 |
+
"""
|
145 |
+
# Rebuild a Narrative from a claim-per-row DataFrame (inverse of
# narrative_to_dataframe). Topic and target are left empty; a fresh id is
# generated via `randid` (defined elsewhere in this package).
function dataframe_to_narrative(df::DataFrame, narrative_title::String, narrative_summary::String)
    claims = [Claim(row.id, row.claim, row.counterclaim, row.claimembedding, row.counterclaimembedding, row.created_at, row.updated_at, row.source, row.keywords) for row in eachrow(df)]
    return Narrative(randid(), narrative_title, "", "", narrative_summary, claims)
end
|
149 |
+
|
150 |
+
# Drop claims whose text duplicates an earlier claim, keeping the first
# occurrence of each text (in-place; also returns the narrative for chaining).
function deduplicate_claims_in_narrative!(narrative::Narrative)
    ## check which claims are non-unique in the set
    claims = [claim.claim for claim in narrative.claims]
    # nonunique marks the 2nd and later occurrences of each repeated text
    is_duplicated = nonunique(DataFrame(claim=claims))
    # Get ID's of duplicated claims then remove them
    if length(claims[findall(is_duplicated)]) > 0
        for dupclaim in claims[findall(is_duplicated)]
            id_dup = [claim.id for claim in narrative.claims if claim.claim == dupclaim]
            # Remove all claims except the first one
            [remove_claim!(narrative, id) for id in id_dup[2:end]]
        end
    end
    return narrative
end
|
164 |
+
|
165 |
+
"""
|
166 |
+
## Embeddings to recover narratives
|
167 |
+
cand_embeddings = candidate_embeddings_from_narrative(narrative)
|
168 |
+
- Input: narrative
|
169 |
+
- Output: candidate embeddings - embeddings of text that match the regex defined in claims
|
170 |
+
|
171 |
+
"""
|
172 |
+
"""
    candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame

Embed the text column of `candidates` and attach the vectors as an
`"Embeddings"` column, returning the (mutated) DataFrame.

Keyword arguments:
- `model_id`: embedding model to use (default `"text-embedding-3-small"`).
- `textcol`: name of the text column (default `"text"`).

Throws an error if `textcol` is not a column of `candidates`.
"""
function candidate_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # Bug fix: `!textcol in names(candidates)` parses as `(!textcol) in ...`,
    # which applies `!` to a String and always throws a MethodError.
    # Parenthesize so the membership test is negated instead.
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    # NOTE(review): relies on `create_chunked_embeddings`; this file defines
    # `create_openai_chunked_embeddings` — confirm the callee exists elsewhere.
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id);
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
|
185 |
+
## Embeddings
|
186 |
+
|
187 |
+
"""
|
188 |
+
df = CSV.read("data/random_300k.csv", DataFrame)
|
189 |
+
df = filter(:message => x -> occursin(Regex("climate"), x), df)
|
190 |
+
embeds = create_chunked_embeddings(df[:, "message"]; chunk_size=10)
|
191 |
+
|
192 |
+
"""
|
193 |
+
# Embed `texts` with the OpenAI embeddings API in batches of `chunk_size`,
# returning a flat vector with one embedding per input string.
# Reads the API key from ENV["OPENAI_API_KEY"].
function create_openai_chunked_embeddings(texts; model_id="text-embedding-3-small", chunk_size=1000)
    ## Chunk the data
    embeddings = []
    for chunk in 1:chunk_size:length(texts)
        embeddings_resp = create_embeddings(ENV["OPENAI_API_KEY"],
            texts[chunk:min(chunk+chunk_size-1, length(texts))]; model_id=model_id)
        push!(embeddings, [x["embedding"] for x in embeddings_resp.response["data"]])
    end
    # flatten the per-chunk lists into one vector
    return vcat(embeddings...)
end
|
203 |
+
|
204 |
+
"""
|
205 |
+
## Embeddings of narrative claims
|
206 |
+
- bang because it modifies the narrative object in place
|
207 |
+
include("src/ExampleNarrative.jl")
|
208 |
+
include("src/Narrative.jl")
|
209 |
+
climate_narrative = create_example_narrative();
|
210 |
+
generate_claim_embeddings_from_narrative!(climate_narrative)
|
211 |
+
|
212 |
+
"""
|
213 |
+
# Compute and store embeddings for every claim and counterclaim in-place.
# NOTE(review): calls `create_chunked_embeddings`, but this file defines
# `create_openai_chunked_embeddings` — confirm the helper resolves elsewhere.
function generate_openai_claim_embeddings_from_narrative!(narrative::Narrative)
    ## claim embeddings
    claim_embeddings = create_chunked_embeddings([x.claim for x in narrative.claims])
    [narrative.claims[i].claimembedding = claim_embeddings[i] for i in 1:length(narrative.claims)]
    ## counterclaim embeddings
    counterclaim_embeddings = create_chunked_embeddings([x.counterclaim for x in narrative.claims])
    [narrative.claims[i].counterclaimembedding = counterclaim_embeddings[i] for i in 1:length(narrative.claims)]
    return nothing
end
|
222 |
+
|
223 |
+
"""
|
224 |
+
## Embeddings of candidate data
|
225 |
+
cand_embeddings = candidate_embeddings_from_narrative(narrative)
|
226 |
+
- Input: narrative
|
227 |
+
- Output: candidate embeddings - embeddings of text that match the regex defined in claims
|
228 |
+
|
229 |
+
"""
|
230 |
+
"""
    candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame

OpenAI variant of `candidate_embeddings`: embed the text column of
`candidates` and attach the vectors as an `"Embeddings"` column.

Keyword arguments:
- `model_id`: embedding model to use (default `"text-embedding-3-small"`).
- `textcol`: name of the text column (default `"text"`).

Throws an error if `textcol` is not a column of `candidates`.
"""
function candidate_openai_embeddings(candidates::DataFrame; kwargs...)::DataFrame
    model_id = get(kwargs, :model_id, "text-embedding-3-small")
    textcol = get(kwargs, :textcol, "text")
    # Bug fix: `!textcol in names(candidates)` parses as `(!textcol) in ...`,
    # which applies `!` to a String and always throws. Parenthesize the test.
    if !(textcol in names(candidates))
        error("Text column not found in the dataframe, try specifying the text column using the textcol keyword argument")
    end
    ## Data Embeddings
    cand_embeddings = create_chunked_embeddings(candidates[:, textcol]; model_id=model_id);
    ## Add vector of embeddings to dataset
    candidates[:, "Embeddings"] = [x for x in cand_embeddings]
    return candidates
end
|
src/deprecated/NarrativeClassification.jl
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Database retrieval based on keywords
|
2 |
+
## need to ] add [email protected]
|
3 |
+
|
4 |
+
|
5 |
+
"""
|
6 |
+
## Calculates distances and assigns tentative classification
|
7 |
+
"""
|
8 |
+
# Pairwise cosine distances between the target and narrative embedding
# matrices (one embedding per column, hence dims=2). Returns a tuple:
# (minimum distance per target column, CartesianIndex of the nearest claim).
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # get the index of the column with the smallest distance
    return distances[argmin(distances, dims=2)][:, 1], argmin(distances, dims=2)[:, 1]
end
|
13 |
+
|
14 |
+
"""
|
15 |
+
## Assignments of closest claim and counterclaim to the test data
|
16 |
+
"""
|
17 |
+
# For each target row, record the distance to — and the text of — the closest
# narrative claim (or counterclaim, per the `claim_counter_claim` kwarg).
# Mutates `target_embeddings` by adding "<kind>Dist" and "Closest<kind>"
# columns; returns nothing.
function assignments!(narrative_matrix, target_matrix, narrative_embeddings, target_embeddings; kwargs...)
    claim_counter_claim = get(kwargs, :claim_counter_claim, "claim")
    dists, narrative_assignment = distances_and_classification(narrative_matrix, target_matrix)
    target_embeddings[:, "$(claim_counter_claim)Dist"] = dists
    # x is a CartesianIndex; x[2] is the matched claim's column/row index
    target_embeddings[:, "Closest$(claim_counter_claim)"] = [narrative_embeddings[x[2], claim_counter_claim] for x in narrative_assignment[:, 1]]
    return nothing
end
|
24 |
+
|
25 |
+
"""
|
26 |
+
## Get distances and assign the closest claim to the test data
|
27 |
+
|
28 |
+
include("src/Narrative.jl")
|
29 |
+
include("src/NarrativeClassification.jl")
|
30 |
+
climate_narrative = create_example_narrative();
|
31 |
+
generate_claim_embeddings_from_narrative!(climate_narrative)
|
32 |
+
candidate_data = candidate_embeddings(climate_narrative)
|
33 |
+
get_distances!(climate_narrative, candidate_data)
|
34 |
+
"""
|
35 |
+
# Compute claim and counterclaim cosine distances for every row of
# `target_embeddings` (mutated in place via `assignments!`); returns nothing.
function get_distances!(narrative::Narrative, target_embeddings::DataFrame)
    ## Matrix of embeddings (hcat makes one column per claim)
    narrative_embeddings = narrative_to_dataframe(narrative)
    narrative_matrix = hcat([claim.claimembedding for claim in narrative.claims]...)
    counternarrative_matrix = hcat([claim.counterclaimembedding for claim in narrative.claims]...)
    target_matrix = hcat(target_embeddings[:, "Embeddings"]...)
    # Create a search function
    # Assign the closest claim to the test data
    assignments!(narrative_matrix, target_matrix, narrative_embeddings, target_embeddings, claim_counter_claim="claim")
    # Assign the closest counterclaim to the test data
    assignments!(counternarrative_matrix, target_matrix, narrative_embeddings, target_embeddings, claim_counter_claim="counterclaim")
    return nothing
end
|
48 |
+
|
49 |
+
# Gate logic: label a row positive ("OCLabel" = 1) only when it is BOTH within
# `threshold` cosine distance of some claim AND closer to that claim than to
# the nearest counterclaim. All other rows get 0. Mutates in place.
function apply_gate_logic!(target_embeddings; kwargs...)
    threshold = get(kwargs, :threshold, 0.2)
    # Find those closer to claim than counter claim
    closer_to_claim = findall(target_embeddings[:, "claimDist"] .< target_embeddings[:, "counterclaimDist"])
    # Meets the threshold
    meets_threshold = findall(target_embeddings[:, "claimDist"] .< threshold)
    # Meets the threshold and is closer to claim than counter claim
    target_embeddings[:, "OCLabel"] .= 0
    target_embeddings[intersect(meets_threshold, closer_to_claim), "OCLabel"] .= 1
    return nothing
end
|
60 |
+
|
61 |
+
"""
|
62 |
+
## Deploy the narrative model
|
63 |
+
- Input: narrative, threshold
|
64 |
+
|
65 |
+
include("src/Narrative.jl")
|
66 |
+
include("src/NarrativeClassification.jl")
|
67 |
+
include("src/ExampleNarrative.jl")
|
68 |
+
climate_narrative = create_example_narrative();
|
69 |
+
generate_claim_embeddings_from_narrative!(climate_narrative)
|
70 |
+
candidate_data = candidate_embeddings_from_narrative(climate_narrative)
|
71 |
+
get_distances!(climate_narrative, candidate_data)
|
72 |
+
apply_gate_logic!(candidate_data; threshold=0.2)
|
73 |
+
return_top_labels(candidate_data)
|
74 |
+
|
75 |
+
"""
|
76 |
+
# Return up to `top_labels` positively-labelled rows (OCLabel == 1), sorted by
# ascending claim distance so the best matches come first.
function return_top_labels(target_embeddings; kwargs...)
    top_labels = get(kwargs, :top_labels, 10)
    # Filter to "OCLabel" == 1
    out = target_embeddings[findall(target_embeddings[:, "OCLabel"] .== 1), :]
    # sort by claimDist
    sort!(out, :claimDist)
    # min() guards against fewer than top_labels positive rows
    return out[1:min(top_labels, nrow(out)), :]
end
|
84 |
+
|
85 |
+
# Return all rows gate-labelled positive (OCLabel == 1), unsorted.
function return_positive_candidates(target_embeddings)
    return target_embeddings[findall(target_embeddings[:, "OCLabel"] .== 1), :]
end
|
88 |
+
|
89 |
+
"""
|
90 |
+
## Deploy the narrative model
|
91 |
+
- Input: narrative, threshold
|
92 |
+
|
93 |
+
include("src/Narrative.jl")
|
94 |
+
include("src/NarrativeClassification.jl")
|
95 |
+
include("src/ExampleNarrative.jl")
|
96 |
+
climate_narrative = create_example_narrative();
|
97 |
+
deploy_narrative_model!(climate_narrative; threshold=0.2)
|
98 |
+
"""
|
99 |
+
# End-to-end (deprecated) pipeline: embed the narrative's claims, embed the
# candidate texts from `db`, compute distances, gate-label, and return the
# labelled candidate DataFrame.
# NOTE(review): calls `generate_claim_embeddings_from_narrative!` and
# `candidate_embeddings_from_narrative`, which are not defined under those
# names in this file — confirm they resolve elsewhere before reviving this.
function deploy_narrative_model!(narrative::Narrative; kwargs...)
    threshold = get(kwargs, :threshold, 0.2)
    db = get(kwargs, :db, "data/random_300k.csv")
    generate_claim_embeddings_from_narrative!(narrative)
    candidate_data = candidate_embeddings_from_narrative(narrative; db=db)
    get_distances!(narrative, candidate_data)
    apply_gate_logic!(candidate_data, threshold=threshold)
    return candidate_data
end
|
src/dev/Utils.jl
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Utility Functions
|
2 |
+
## Note: edit ~/.bigqueryrc to set global settings for bq command line tool
|
3 |
+
|
4 |
+
|
5 |
+
"""
|
6 |
+
## ostreacultura_bq_auth()
|
7 |
+
- Activate the service account using the credentials file
|
8 |
+
"""
|
9 |
+
# Activate the GCP service account from the local credentials file.
# Prints a message (rather than erroring) when the file is absent.
function ostreacultura_bq_auth()
    if isfile("ostreacultura-credentials.json")
        run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`)
    else
        println("Credentials file not found")
    end
end
|
16 |
+
|
17 |
+
"""
|
18 |
+
## bq(query::String)
|
19 |
+
- Run a BigQuery query and return the result as a DataFrame
|
20 |
+
|
21 |
+
Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
|
22 |
+
"""
|
23 |
+
# Run a BigQuery SQL query through the `bq` CLI and parse its CSV output
# (written to a temporary file) into a DataFrame.
function bq(query::String)
    tname = tempname()
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname))
    return CSV.read(tname, DataFrame)
end
|
28 |
+
|
29 |
+
"""
|
30 |
+
## bq_db(query::String, db::String)
|
31 |
+
- Run a BigQuery query and save to a database
|
32 |
+
|
33 |
+
Example:
|
34 |
+
bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
|
35 |
+
"""
|
36 |
+
# Run a BigQuery query and stream the CSV result straight into the file `db`.
function bq_db(query::String, db::String)
    run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db))
end
|
39 |
+
|
40 |
+
"""
|
41 |
+
one token is roughly 3/4 of a word
|
42 |
+
|
43 |
+
"""
|
44 |
+
"""
    token_estimate(allstrings::Vector{String})

Rough token count for a batch of strings: total whitespace-separated word
count scaled by 4/3 (one token ≈ 3/4 of a word).
"""
function token_estimate(allstrings::Vector{String})
    wordcount = sum([length(split(s)) for s in allstrings])
    return wordcount * 4 / 3
end
|
51 |
+
|
52 |
+
"""
    chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)

Greedily split `allstrings` into consecutive chunks whose running
whitespace-word count stays below `max_tokens`.

Returns a vector of chunks (each a vector of strings). An empty input
returns an empty vector (previously it returned one empty chunk).

NOTE(review): the limit is compared against raw word counts, not the
4/3-scaled estimate from `token_estimate` — confirm that is intended.
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    # Guard: no strings means no chunks (avoids emitting a single empty chunk).
    isempty(allstrings) && return []
    ## Tokenize the strings (rough whitespace split)
    tokens = [split(x) for x in allstrings]
    ## Chunk the strings greedily; removed the previously-computed-but-unused
    ## total token estimate.
    chunks = []
    chunk = []
    chunk_tokens = 0
    for i in 1:length(allstrings)
        if chunk_tokens + length(tokens[i]) < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += length(tokens[i])
        else
            push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = length(tokens[i])
        end
    end
    push!(chunks, chunk)
    return chunks
end
|
src/py_init.jl
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
DataLoader = PyNULL()
|
3 |
+
MiniEncoder = PyNULL()
|
4 |
+
|
5 |
+
# PyCall module initializer: import the project's Python helper modules once
# at package load time and bind them into the pre-allocated PyNULL
# placeholders above (the standard PyCall pattern for precompiled packages).
function __init__()
    # Import DataLoader.py — make src/python visible to the Python interpreter
    pushfirst!(pyimport("sys")."path", "src/python");
    _DataLoader = pyimport("DataLoader")
    _MiniEncoder = pyimport("MiniEncoder")
    copy!(DataLoader, _DataLoader)
    copy!(MiniEncoder, _MiniEncoder)
end
|
13 |
+
|
14 |
+
|
src/python/DataLoader.py
ADDED
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# pip install pinecone[grpc]
|
2 |
+
#from pinecone import Pinecone
|
3 |
+
from pinecone.grpc import PineconeGRPC as Pinecone
|
4 |
+
import os
|
5 |
+
import pandas as pd
|
6 |
+
import numpy as np
|
7 |
+
from pinecone import ServerlessSpec
|
8 |
+
from pinecone_text.sparse import BM25Encoder
|
9 |
+
|
10 |
+
## ID generation
|
11 |
+
from sqids import Sqids
|
12 |
+
sqids = Sqids()
|
13 |
+
#######
|
14 |
+
#import protobuf_module_pb2
|
15 |
+
#pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
|
16 |
+
|
17 |
+
##### EMBEDDINGS AND ENCODINGS
|
18 |
+
"""
|
19 |
+
## Embed in the inference API
|
20 |
+
df = pd.read_csv('data/Indicator_Test.csv')
|
21 |
+
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
|
22 |
+
model = "multilingual-e5-large"
|
23 |
+
embeddings = bulk_embed(pc, model, df[1:96])
|
24 |
+
|
25 |
+
"""
|
26 |
+
def bulk_embed(pc, model, data, textcol='text'):
    """Embed one batch of texts with the Pinecone inference API.

    Args:
        pc: Pinecone client instance.
        model: embedding model name (e.g. "multilingual-e5-large").
        data: DataFrame (or mapping) whose ``textcol`` entries are embedded.
        textcol: column holding the passage text.

    Returns:
        The raw inference response (iterable of embedding records).
    """
    embeddings = pc.inference.embed(
        model,
        inputs=[x for x in data[textcol]],
        parameters={
            "input_type": "passage"  # passage-style embeddings (vs "query")
        }
    )
    return embeddings
|
35 |
+
|
36 |
+
|
37 |
+
def join_chunked_results(embeddings):
    """Flatten per-batch embedding responses into one list of dense vectors.

    Each element of *embeddings* is a response object whose ``.data`` holds
    records with a ``"values"`` entry (the dense vector). Order is preserved.
    """
    return [record["values"] for chunk in embeddings for record in chunk.data]
|
43 |
+
|
44 |
+
"""
|
45 |
+
## Chunk and embed in the inference API
|
46 |
+
df = pd.read_csv('data/climate_test.csv')
|
47 |
+
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
|
48 |
+
model = "multilingual-e5-large"
|
49 |
+
embeddings = chunk_and_embed(pc, model, df)
|
50 |
+
## Upgrade this function to return a dataframe with the Embeddings as a new column
|
51 |
+
|
52 |
+
"""
|
53 |
+
def chunk_and_embed(pc, model, data, chunk_size=96, textcol='text'):
    """Embed ``data[textcol]`` in batches and attach the results in place.

    Adds an 'Embeddings' column and a sqids-derived 'id' column (ids depend
    only on row position), then returns the mutated DataFrame.
    chunk_size=96 matches the Pinecone inference API batch limit.
    """
    embeddings = []
    for i in range(0, len(data), chunk_size):
        chunk = data[i:min(i + chunk_size, len(data))]
        embeddings.append(bulk_embed(pc, model, chunk, textcol))
    chunked_embeddings = join_chunked_results(embeddings)
    data['Embeddings'] = chunked_embeddings
    # sqids is the module-level encoder instantiated at import time
    data['id'] = [sqids.encode([i, i+1, i+2]) for i in range(len(data))]
    return data
|
62 |
+
|
63 |
+
"""
|
64 |
+
## Query the embeddings
|
65 |
+
query = "What is the impact of climate change on the economy?"
|
66 |
+
embeddings = query_embed(pc, model, query)
|
67 |
+
"""
|
68 |
+
def query_embed(pc, model, query):
    """Embed a single query string; returns its dense vector (list of floats).

    Uses "query" input_type so the vector is comparable against passages
    embedded with input_type="passage" (see bulk_embed).
    """
    embeddings = pc.inference.embed(
        model,
        inputs=query,
        parameters={
            "input_type": "query"
        }
    )
    return embeddings[0]['values']
|
77 |
+
|
78 |
+
"""
|
79 |
+
### Sparse vector encoding
|
80 |
+
- write a function to embed
|
81 |
+
from pinecone_text.sparse import BM25Encoder
|
82 |
+
|
83 |
+
corpus = ["The quick brown fox jumps over the lazy dog",
|
84 |
+
"The lazy dog is brown",
|
85 |
+
"The fox is brown"]
|
86 |
+
|
87 |
+
# Initialize BM25 and fit the corpus.
|
88 |
+
bm25 = BM25Encoder()
|
89 |
+
#bm25.fit(corpus)
|
90 |
+
#bm25 = BM25Encoder.default()
|
91 |
+
doc_sparse_vector = bm25.encode_documents("The brown fox is quick")
|
92 |
+
|
93 |
+
vector, bm25 = encode_documents(corpus)
|
94 |
+
"""
|
95 |
+
def encode_documents(corpus):
    """Fit a BM25 encoder on *corpus* and sparse-encode that same corpus.

    Returns (sparse_vectors, fitted_encoder). Keep the returned encoder so
    later queries are encoded with the same fitted statistics.
    """
    bm25 = BM25Encoder()
    bm25.fit(corpus)
    doc_sparse_vector = bm25.encode_documents(corpus)
    return doc_sparse_vector, bm25
|
100 |
+
|
101 |
+
def encode_query(bm25, query):
    """Sparse-encode a query with a previously fitted BM25 encoder."""
    query_sparse_vector = bm25.encode_queries(query)
    return query_sparse_vector
|
104 |
+
|
105 |
+
"""
|
106 |
+
## Generate format of sparse-dense vectors
|
107 |
+
# Example usage
|
108 |
+
df = pd.read_csv('data/Indicator_Test.csv')
|
109 |
+
df = df.head(3)
|
110 |
+
newdf = create_sparse_embeds(df)
|
111 |
+
newdf['metadata'] = newdf.metadata.to_list()
|
112 |
+
|
113 |
+
"""
|
114 |
+
def create_sparse_embeds(pc, df, textcol='text', idcol='id', model="multilingual-e5-large"):
    """Build hybrid (sparse + dense) vector columns for *df*.

    Side effects: mutates *df* in place (chunk_and_embed adds 'Embeddings'
    and 'id'; 'Embeddings' is then renamed to 'values').

    Returns:
        (fitted BM25 encoder,
         DataFrame with columns [idcol, 'values', 'metadata', 'indices',
         'sparse_values']) — 'metadata' holds all remaining original columns
         as per-row dicts.
    """
    endocs, bm25 = encode_documents(df[textcol].to_list())
    chunk_and_embed(pc, model, df) # this is an in-place operation
    # rename Embeddings to values
    df.rename(columns={'Embeddings': 'values'}, inplace=True)
    df['sparse_values'] = [x['values'] for x in endocs]
    df['indices'] = [x['indices'] for x in endocs]
    df['metadata'] = df.drop(columns=[idcol, 'values', 'indices', 'sparse_values']).to_dict(orient='records')
    df = df[[idcol, 'values', 'metadata', 'indices', 'sparse_values']]
    return bm25, df
|
124 |
+
|
125 |
+
"""
|
126 |
+
## Generate format of sparse-dense vectors
|
127 |
+
# Example usage
|
128 |
+
data = {
|
129 |
+
'id': ['vec1', 'vec2'],
|
130 |
+
'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
|
131 |
+
'metadata': [{'genre': 'drama', 'text': 'this'}, {'genre': 'action'}],
|
132 |
+
'sparse_indices': [[10, 45, 16], [12, 34, 56]],
|
133 |
+
'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
|
134 |
+
}
|
135 |
+
|
136 |
+
df = pd.DataFrame(data)
|
137 |
+
sparse_dense_dicts = create_sparse_dense_dict(df)
|
138 |
+
vecs = create_sparse_dense_vectors_from_df(df)
|
139 |
+
index.upsert(vecs, namespace="example-namespace")
|
140 |
+
|
141 |
+
|
142 |
+
# Example usage
|
143 |
+
df = pd.read_csv('data/Indicator_Test.csv')
|
144 |
+
df = df.head(3)
|
145 |
+
newdf = create_sparse_embeds(df)
|
146 |
+
metadata = df[['text', 'label']].to_dict(orient='records')
|
147 |
+
newdf['metadata'] = metadata
|
148 |
+
vecs = create_sparse_dense_dict(newdf)
|
149 |
+
index.upsert(vecs, namespace="example-namespace")
|
150 |
+
|
151 |
+
"""
|
152 |
+
def create_sparse_dense_dict(df, id_col='id', values_col='values', metadata_col='metadata', sparse_indices_col='indices', sparse_values_col='sparse_values'):
    """Convert a DataFrame into the list-of-dicts layout Pinecone expects
    for hybrid (sparse-dense) upserts.

    Each row becomes ``{'id', 'values', 'metadata', 'sparse_values':
    {'indices', 'values'}}``; column names are configurable via the
    keyword arguments. Row order is preserved.
    """
    vectors = []
    for _, record in df.iterrows():
        sparse_part = {
            'indices': record[sparse_indices_col],
            'values': record[sparse_values_col],
        }
        vectors.append({
            'id': record[id_col],
            'values': record[values_col],
            'metadata': record[metadata_col],
            'sparse_values': sparse_part,
        })
    return vectors
|
168 |
+
|
169 |
+
|
170 |
+
############ UPSERTING DATA
|
171 |
+
|
172 |
+
def create_index(pc, name, dimension, metric, cloud, region):
    """Create a serverless Pinecone index (thin wrapper over pc.create_index).

    Args:
        pc: Pinecone client.
        name: index name.
        dimension: embedding dimensionality.
        metric: similarity metric ("cosine", "dotproduct", ...).
        cloud, region: serverless placement (e.g. "aws", "us-east-1").
    """
    pc.create_index(
        name=name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud=cloud,
            region=region
        )
    )
|
182 |
+
|
183 |
+
#pc.delete_index("example-index")
|
184 |
+
|
185 |
+
#index = pc.Index("test-index")
|
186 |
+
|
187 |
+
"""
|
188 |
+
## Create vectors from a DataFrame to be uploaded to Pinecone
|
189 |
+
import pandas as pd
|
190 |
+
|
191 |
+
# Create a sample DataFrame
|
192 |
+
data = {
|
193 |
+
'Embeddings': [
|
194 |
+
[0.1, 0.2, 0.3, 0.4],
|
195 |
+
[0.2, 0.3, 0.4, 0.5]
|
196 |
+
],
|
197 |
+
'id': ['vec1', 'vec2'],
|
198 |
+
'genre': ['drama', 'action']
|
199 |
+
}
|
200 |
+
df = pd.DataFrame(data)
|
201 |
+
|
202 |
+
vecs = create_vectors_from_df(df)
|
203 |
+
|
204 |
+
# Upload the vectors to Pinecone
|
205 |
+
index.upsert(
|
206 |
+
vectors=vecs,
|
207 |
+
namespace="example-namespace"
|
208 |
+
)
|
209 |
+
"""
|
210 |
+
def create_vectors_from_df(df):
    """Build Pinecone upsert tuples ``(id, embedding, metadata)`` from *df*.

    Every column other than 'id' and 'Embeddings' is folded into the
    per-row metadata dict. Row order is preserved.
    """
    out = []
    for _, row in df.iterrows():
        metadata = row.drop(['Embeddings', 'id']).to_dict()
        out.append((row['id'], row['Embeddings'], metadata))
    return out
|
215 |
+
|
216 |
+
def chunk_upload_vectors(index, vectors, namespace="example-namespace", chunk_size=1000):
    """Upsert *vectors* into *index* in consecutive batches of *chunk_size*.

    Pinecone caps request sizes, so the list is sliced and uploaded one
    batch at a time; an empty list results in no upsert calls.
    """
    total = len(vectors)
    start = 0
    while start < total:
        batch = vectors[start:min(start + chunk_size, total)]
        index.upsert(
            vectors=batch,
            namespace=namespace
        )
        start += chunk_size
|
223 |
+
|
224 |
+
"""
|
225 |
+
## Working Example 2
|
226 |
+
|
227 |
+
df = pd.read_csv('data/Indicator_Test.csv')
|
228 |
+
dfe = DataLoader.chunk_and_embed(pc, model, df)
|
229 |
+
# Keep only text, embeddings, id
|
230 |
+
dfmin = dfe[['text', 'Embeddings', 'id', 'label']]
|
231 |
+
DataLoader.chunk_df_and_upsert(index, dfmin, namespace="indicator-test-namespace", chunk_size=96)
|
232 |
+
|
233 |
+
"""
|
234 |
+
def chunk_df_and_upsert(index, df, namespace="new-namespace", chunk_size=1000):
    """Convert *df* rows to Pinecone vectors and upsert them in batches.

    Expects *df* to have 'id' and 'Embeddings' columns (see
    create_vectors_from_df); remaining columns become metadata.
    """
    vectors = create_vectors_from_df(df)
    chunk_upload_vectors(index, vectors, namespace, chunk_size)
|
237 |
+
|
238 |
+
#### QUERYING DATA
|
239 |
+
"""
|
240 |
+
namespace = "namespace"
|
241 |
+
vector = [0.1, 0.2, 0.3, 0.4]
|
242 |
+
top_k = 3
|
243 |
+
include_values = True
|
244 |
+
"""
|
245 |
+
def query_data(index, namespace, vector, top_k=3, include_values=True):
    """Dense-only nearest-neighbour query against *namespace*.

    *vector* must support ``.tolist()`` (e.g. a numpy array). Returns the
    raw Pinecone query response.
    """
    out = index.query(
        namespace=namespace,
        vector=vector.tolist(),
        top_k=top_k,
        include_values=include_values
    )
    return out
|
253 |
+
|
254 |
+
"""
|
255 |
+
Example:
|
256 |
+
|
257 |
+
"""
|
258 |
+
def query_data_with_sparse(index, namespace, vector, sparse_vector, top_k=5, include_values=True, include_metadata=True):
    """Hybrid (dense + sparse) query against *namespace*.

    Unlike query_data, *vector* is passed through as-is (a plain list), and
    *sparse_vector* is a dict with 'indices' and 'values'. Returns the raw
    Pinecone query response.
    """
    out = index.query(
        namespace=namespace,
        vector=vector,
        sparse_vector=sparse_vector,
        top_k=top_k,
        include_metadata=include_metadata,
        include_values=include_values
    )
    return out
|
268 |
+
|
269 |
+
# create sparse vector with zero weighting
|
270 |
+
def empty_sparse_vector():
    """Return a placeholder sparse vector with zero weight.

    Useful for running dense-only queries against a hybrid (sparse-dense)
    index, which still requires a sparse component to be supplied.
    """
    return dict(indices=[1], values=[0.0])
|
275 |
+
|
276 |
+
|
277 |
+
"""
|
278 |
+
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
|
279 |
+
index = pc.Index("test-index")
|
280 |
+
namespace = "test-namespace"
|
281 |
+
vector = np.random.rand(1024)
|
282 |
+
top_k = 3
|
283 |
+
include_values = True
|
284 |
+
filter={
|
285 |
+
"label": {"$lt": 2}
|
286 |
+
}
|
287 |
+
query_data_with_filter(index, namespace, vector, top_k, include_values, filter)
|
288 |
+
"""
|
289 |
+
def query_data_with_filter(index, namespace, vector, top_k=3, include_values=True, filter=None):
    """Dense query with an optional metadata filter (Pinecone filter syntax,
    e.g. {"label": {"$lt": 2}}). *vector* must support ``.tolist()``."""
    out = index.query(
        namespace=namespace,
        vector=vector.tolist(),
        top_k=top_k,
        include_values=include_values,
        filter=filter
    )
    return out
|
298 |
+
|
299 |
+
"""
|
300 |
+
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
|
301 |
+
ids = ["UkfgLgeYW9wo", "GkkzUYYOcooB"]
|
302 |
+
indexname = "ostreacultura-v1"
|
303 |
+
namespace = "cards-data"
|
304 |
+
index = pc.Index(indexname)
|
305 |
+
DL.fetch_data(index, ids, namespace)
|
306 |
+
|
307 |
+
"""
|
308 |
+
def fetch_data(index, ids, namespace):
    """Fetch vectors by id from *namespace* (wrapper over index.fetch)."""
    out = index.fetch(ids=ids, namespace=namespace)
    return out
|
311 |
+
|
312 |
+
|
313 |
+
def get_all_ids_from_namespace(index, namespace):
    """List vector ids in *namespace*.

    NOTE(review): index.list appears to return a paginated generator, not a
    flat list — confirm callers iterate it accordingly.
    """
    ids = index.list(namespace=namespace)
    return ids
|
316 |
+
|
317 |
+
"""
|
318 |
+
## Hybrid search weighting - Alpa is equal to the weight of the dense vector
|
319 |
+
dense = [0.1, 0.2, 0.3, 0.4]
|
320 |
+
sparse_vector={
|
321 |
+
'indices': [10, 45, 16],
|
322 |
+
'values': [0.5, 0.5, 0.2]
|
323 |
+
}
|
324 |
+
dense, sparse = hybrid_score_norm(dense, sparse, alpha=1.0)
|
325 |
+
"""
|
326 |
+
def hybrid_score_norm(dense, sparse, alpha: float):
    """Weight a dense/sparse vector pair for hybrid search.

    Implements the convex combination ``alpha * dense + (1 - alpha) * sparse``:
    dense values are scaled by ``alpha``, sparse values by ``1 - alpha``;
    sparse indices are passed through unchanged.

    Args:
        dense: dense vector as a list of floats.
        sparse: dict with ``indices`` and ``values`` keys.
        alpha: dense-vector weight; must lie in [0, 1].

    Returns:
        (weighted_dense, weighted_sparse) tuple.

    Raises:
        ValueError: if ``alpha`` is outside [0, 1].
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    sparse_weight = 1 - alpha
    weighted_sparse = {
        'indices': sparse['indices'],
        'values': [v * sparse_weight for v in sparse['values']],
    }
    weighted_dense = [v * alpha for v in dense]
    return weighted_dense, weighted_sparse
|
343 |
+
|
344 |
+
#############
|
src/python/MiniEncoder.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Mini Encoder
|
2 |
+
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
|
5 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
6 |
+
|
7 |
+
def get_embeddings(sentences):
    """Encode sentence(s) with the module-level all-MiniLM-L6-v2 model.

    Accepts a string or a list of strings; returns whatever
    SentenceTransformer.encode produces (presumably a numpy array of
    384-dim embeddings — confirm against the model card).
    """
    embeddings = model.encode(sentences)
    return embeddings
|
10 |
+
|
src/python/__pycache__/DataLoader.cpython-310.pyc
ADDED
Binary file (4.98 kB). View file
|
|
src/python/__pycache__/DataLoader.cpython-312.pyc
ADDED
Binary file (7.88 kB). View file
|
|
src/python/update_fact_check_data.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## SCRIPT TO UPDATE THE FACT CHECK DATA
|
2 |
+
#######################################
|
3 |
+
from pinecone.grpc import PineconeGRPC as Pinecone
|
4 |
+
import os
|
5 |
+
import pandas as pd
|
6 |
+
import numpy as np
|
7 |
+
from pinecone import ServerlessSpec
|
8 |
+
from pinecone_text.sparse import BM25Encoder
|
9 |
+
import sys
|
10 |
+
sys.path.append('src/python')
|
11 |
+
import DataLoader
|
12 |
+
pc = Pinecone(api_key="5faec954-a6c5-4af5-a577-89dbd2e4e5b0", pool_threads=50) # <-- make sure to set this)
|
13 |
+
##############################
|
14 |
+
|
15 |
+
df = pd.read_csv('data/fact_check_latest.csv')
|
16 |
+
# Drop non-unique text values
|
17 |
+
df = df.drop_duplicates(subset=['text'])
|
18 |
+
# skip rows where text is NaN
|
19 |
+
df = df.dropna(subset=['text'])
|
20 |
+
## for 'claimReviewTitle' and 'claimReviewUrl' columns, fill NaN with empty string
|
21 |
+
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')
|
22 |
+
# now, check for NaN values in 'claimReviewUrl' column
|
23 |
+
## get top three rows
|
24 |
+
# get text and MessageID
|
25 |
+
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
|
26 |
+
#metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
|
27 |
+
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
|
28 |
+
newdf.loc[:, 'metadata'] = metadata
|
29 |
+
|
30 |
+
## Take a look at rows where sparse_values is an empty array
|
31 |
+
sparse_lengths = [len(x) for x in newdf['sparse_values']]
|
32 |
+
## Drop newdf rows where the sparse length is zero
|
33 |
+
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)
|
34 |
+
vecs = DataLoader.create_sparse_dense_dict(newdf)
|
35 |
+
index = pc.Index("oc-hybrid-library-index")
|
36 |
+
for i in range(0, len(vecs), 400):
|
37 |
+
end_index = min(i + 400, len(vecs))
|
38 |
+
index.upsert(vecs[i:end_index], namespace="expanded-fact-checks")
|
39 |
+
print(f"Upserted vectors")
|
40 |
+
|
41 |
+
#####################################
|
42 |
+
### Querying performance for TruthSeeker Subset
|
43 |
+
df = pd.read_csv('data/truthseeker_subsample.csv')
|
44 |
+
corpus = df['claim'].tolist()
|
45 |
+
|
46 |
+
"""
|
47 |
+
## Function query, return score, title, link
|
48 |
+
Example: get_score_title_link(corpus[0], pc, index)
|
49 |
+
"""
|
50 |
+
def get_score_title_link(querytext, pc, index):
    """Query the "expanded-fact-checks" namespace for the single best match.

    Returns a pandas Series (score, title, link) so results can be assigned
    to DataFrame columns directly via ``df['claim'].apply(...)``.

    NOTE(review): raises IndexError if the query returns zero matches —
    confirm the namespace is never empty before relying on this.
    """
    queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
    # dense-only query: hybrid index still needs a (zero-weight) sparse part
    empty_sparse = DataLoader.empty_sparse_vector()
    res = index.query(
        top_k=1,
        namespace="expanded-fact-checks",
        vector=queryembed,
        sparse_vector=empty_sparse,
        include_metadata=True
    )
    score = res['matches'][0]['score']
    title = res['matches'][0]['metadata']['text']
    link = res['matches'][0]['metadata']['claimReviewUrl']
    return pd.Series([score, title, link], index=['score', 'title', 'link'])
|
64 |
+
|
65 |
+
## Get score, title, link for each querytext in corpus
|
66 |
+
import time
|
67 |
+
from pinecone.grpc import PineconeGRPC
|
68 |
+
pc = PineconeGRPC(api_key="5faec954-a6c5-4af5-a577-89dbd2e4e5b0") # <-- make sure to set this)
|
69 |
+
index = pc.Index(
|
70 |
+
name="oc-hybrid-library-index",
|
71 |
+
pool_threads=50, # <-- make sure to set this
|
72 |
+
)
|
73 |
+
|
74 |
+
### TIMING
|
75 |
+
start_time = time.time()
|
76 |
+
|
77 |
+
df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index)) #send the claim column to be scored.
|
78 |
+
|
79 |
+
elapsed_time = time.time() - start_time
|
80 |
+
print(f"Time taken: {elapsed_time:.2f} seconds")
|
81 |
+
|
82 |
+
|
83 |
+
######## END TIMING
|
src/python/upload_library_hybrid-sparse.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
# Make the repo-local helper module importable when running from the repo root.
sys.path.append('src/python')
import DataLoader

# SECURITY(review): the Pinecone API key was hard-coded here. A committed key
# is leaked and must be rotated; read it from the environment instead.
pc = Pinecone(os.environ["PINECONE_API_KEY"])
# Rebuild the index from scratch: drop the old one, then create a serverless
# index sized for multilingual-e5-large embeddings (1024 dims).
pc.delete_index("oc-hybrid-library-index")

pc.create_index(
    name="oc-hybrid-library-index",
    dimension=1024,
    metric="dotproduct",  # dotproduct metric is required for hybrid dense+sparse queries
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
|
24 |
+
|
25 |
+
## Upsert Indicator Data
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
# Drop non-unique text values so each claim text is embedded/upserted once.
df = df.drop_duplicates(subset=['text'])

## get top three rows (debugging aid — uncomment for a quick smoke test)
#df = df.head(3)
# Build sparse (BM25) + dense embeddings for the 'text' column.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
# NOTE(review): metadata is built from df while the vectors live in newdf —
# this assumes create_sparse_embeds keeps the rows aligned 1:1 with df; verify.
metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where sparse_values is an empty array.
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0 (deliberately left disabled here,
## unlike the expansive-claims section below).
#newdf = newdf[pd.Series(sparse_lengths) != 0]

# Upsert everything into a single "fact-checks" namespace (the earlier
# per-category namespace scheme was dropped).
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
# Batches of 400 stay under Pinecone's per-request limits.
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="fact-checks")
print("Upserted vectors")
|
56 |
+
|
57 |
+
|
58 |
+
################# Querying the index
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
corpus = df['text'].tolist()
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-library-index")

querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
query_sparse_vector = bm25.encode_documents(querytext)  # NOTE(review): computed but unused below
# BUG FIX: empty_sparse_vector lives in DataLoader; the bare call raised
# NameError (it is called as DataLoader.empty_sparse_vector() elsewhere).
empty_sparse = DataLoader.empty_sparse_vector()

query_response = index.query(
    top_k=5,
    namespace="immigration",
    vector=queryembed,
    sparse_vector=empty_sparse,  # dense-only query: sparse component is empty
    include_metadata=True
)
# Bare expression: only useful when pasted into a REPL for inspection.
query_response
|
77 |
+
|
78 |
+
## UPLOAD Expansive LLM's
df = pd.read_csv('data/expansive_claims_library_expanded.csv')
df['text'] = df['ExpandedClaim']
## get top three rows (debugging aid — uncomment for a quick smoke test)
#df = df.head(3)
# Build sparse (BM25) + dense embeddings for the 'text' column.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
# NOTE(review): metadata is built from df while the vectors live in newdf —
# this assumes create_sparse_embeds keeps the rows aligned 1:1 with df; verify.
metadata = df[['Narrative', 'Model', 'Policy']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where sparse_values is an empty array.
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0.
# Use a positional numpy mask: the previous pd.Series mask aligned on newdf's
# index, which silently misfilters if newdf lacks a default RangeIndex.
newdf = newdf[np.array(sparse_lengths) != 0]

vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
# Batches of 400 stay under Pinecone's per-request limits.
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="narratives")
print("Upserted vectors")
|