## Script to update the fact-check data for the HuggingFace API.
#
# Steps, in order:
#   1. Load the latest fact-check records.
#   2. Embed their text with `mini_embed` and write the result to CSV.
#   3. Embed their text with `maxi_embed` and write the result to CSV.
#   4. Embed the expanded claims library with `maxi_embed` and write it to CSV.
#   5. Compress the maxi fact-check CSV and delete the uncompressed file.
#
# NOTE(review): all paths are relative; presumably this is run from the
# repository root — confirm.
# NOTE(review): `CSV` and `DataFrame` are used below without an explicit
# `using`; presumably they are brought into scope by OstreaCultura or by the
# included script — confirm.
using OstreaCultura

# Get the latest data to embed.
# Alternative include path (when running from a sub-directory):
#include("../scripts/google_fact_check_api.jl")
include("scripts/google_fact_check_api.jl")

# The maxi fact-check CSV is written, then compressed, then deleted; keeping the
# path in one place guarantees the three steps refer to the same file.
const FC_MAXI_CSV = "data/fact_check_latest_embed_maxi.csv"

# Latest data.
# Uncomment to refresh the data before loading it:
#get_latest_fact_checks()
# `errors` is returned alongside the data but is not used further in this script.
fc, errors = load_fact_check_json()

## Embed with MiniEncoder
# Alternative source for `fc`:
#fc = CSV.read("data/fact_check_latest.csv", DataFrame)
# Drop rows whose text is missing.
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # author's timing note: 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed_mini.csv", fc)

## Embed with MTR5-encoder
# Already applied above when the script runs top to bottom; harmless to repeat.
fc = fc[.!ismissing.(fc.text), :]
# Drop rows where text == "".
fc = fc[fc.text .!= "", :]
fc_embed = OstreaCultura.maxi_embed.(fc.text)
fc.Embeddings = fc_embed # overwrites the mini embeddings stored above
CSV.write(FC_MAXI_CSV, fc)

## Embed the expanded claims library with the same encoder
narrs = CSV.read("data/expansive_claims_library_expanded_embed.csv", DataFrame)
# Use the expanded claim as the text to embed, then drop rows where it is missing.
narrs.text = narrs.ExpandedClaim
narrs = narrs[.!ismissing.(narrs.text), :]
# The package is referenced by its full name throughout: the original used an
# `OC` alias here that this script never defines.
narratives_embed = OstreaCultura.maxi_embed.(narrs.text) # author's timing note: seconds to run
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)

## Compress the fact-check data
OstreaCultura.compress_csv(FC_MAXI_CSV, "data/fc_latest_maxi_compr")
# Delete the original (uncompressed) file.
rm(FC_MAXI_CSV)