## Script to update the fact-check data for the HuggingFace API
using OstreaCultura
using CSV, DataFrames

# Alias used throughout the script
const OC = OstreaCultura

# Get the latest data to embed 
#include("../scripts/google_fact_check_api.jl")
include("scripts/google_fact_check_api.jl")

# Load the latest data (uncomment to refresh from the API first)
#get_latest_fact_checks()
fc, errors = load_fact_check_json()
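# fc: DataFrame of fact-check records with a :text column; errors: records that
# failed to load (assumed from how the return values are used below)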

## Embed with MiniEncoder
#fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# Drop rows with missing text
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # takes ~12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed_mini.csv", fc)

## Embed with MTR5-encoder
fc = fc[.!ismissing.(fc.text), :]
# Drop rows where text is the empty string
fc = fc[fc.text .!= "", :]
fc_embed = OC.maxi_embed.(fc.text)
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed_maxi.csv", fc)


## Embed the narratives library with MTR5-encoder
narrs = CSV.read("data/expansive_claims_library_expanded_embed.csv", DataFrame)
# Embed the expanded claim text
narrs.text = narrs.ExpandedClaim
# Drop rows with missing text
narrs = narrs[.!ismissing.(narrs.text), :]
narratives_embed = OC.maxi_embed.(narrs.text) # runs in seconds
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
# Compress the fact-check data
OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
# Delete the uncompressed original
rm("data/fact_check_latest_embed_maxi.csv")