Spaces:

stefanjwojcik
/

misinfo_detection_app

Running

File size: 14,395 Bytes

48bb68b

### PineCone Embed and I/O Functions 

"""
# This dataset matches the example data from DataLoader.py
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
"""
function example_data()
    # Two toy records: a 4-element embedding, an id, and a `genre` label each.
    embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]
    ids = ["vec1", "vec2"]
    genres = ["drama", "action"]
    return DataFrame(; Embeddings=embeddings, id=ids, genre=genres)
end

"""
df= OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
"""
function pd_to_df(df_pd)
    # Build a Julia DataFrame column by column from the `.values` of each
    # column listed in `df_pd.columns`.
    converted = DataFrame()
    for colname in df_pd.columns
        column = getproperty(df_pd, colname)
        converted[!, colname] = column.values
    end
    return converted
end

"""
Available functions 
pc.create_index - see below 
pc.delete_index: pc.delete_index(index_name)
"""
function create_pinecone_context()
    # The key comes from the PINECONE_API_KEY environment variable and is
    # handed to `DataLoader.Pinecone` as the `api_key` keyword.
    return DataLoader.Pinecone(; api_key=ENV["PINECONE_API_KEY"])
end

"""
# Context for inference endpoints 
"""
function create_inf_pinecone_context()
    # Same constructor as `create_pinecone_context`, but the key from
    # PINECONE_API_KEY is passed positionally here.
    api_key = ENV["PINECONE_API_KEY"]
    return DataLoader.Pinecone(api_key)
end

"""
pc = create_pinecone_context()
create_index("new-index", 4, "cosine", "aws", "us-east-1")
"""
function create_index(name, dimension, metric, cloud, region)
    # A fresh context is created on every call; the actual index creation is
    # delegated to `DataLoader.create_index`, whose result is returned.
    client = create_pinecone_context()
    return DataLoader.create_index(client, name, dimension, metric, cloud, region)
end

"""
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(out, "test-index", "test-namespace")

df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(model, df, 96, "text")
test_embeds_min = test_embeds.head(10)
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100)

"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    # Open the named index, then let `DataLoader.chunk_df_and_upsert` push
    # `df` into `namespace` in pieces of `chunk_size`.
    target = create_pinecone_context().Index(indexname)
    return DataLoader.chunk_df_and_upsert(
        target, df; namespace=namespace, chunk_size=chunk_size
    )
end

"""
## How to query data using an existing embedding 
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
vector = mydf.Embeddings[1]
top_k = 5
include_values = true
OC.query_data("test-index", "test-namespace", vector, top_k, include_values)
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    # Open the named index, run the query through `DataLoader.query_data`, and
    # return the response after calling its `to_dict()` method.
    target = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data(target, namespace, vector, top_k, include_values)
    return response.to_dict()
end

"""
## How to query data using an existing hybrid embedding

import OstreaCultura as OC; using DataFrames
querytext = "drama"
dense = OC.embed_query(querytext)
top_k = 5
include_values = true
include_metadata = true
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense, OC.DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata)

"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    # Open the named index and query it with both the dense and the sparse
    # vector; the response is returned after calling its `to_dict()` method.
    target = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data_with_sparse(
        target,
        namespace,
        dense,
        sparse;
        top_k=top_k,
        include_values=include_values,
        include_metadata=include_metadata,
    )
    return response.to_dict()
end

"""
## Querying function for GGWP - using updated hybrid vector 
import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "expanded-fact-checks"
OC.search(claim, indexname, ocmodel, include_values=false, include_metadata=false)
res = OC.search(claim, indexname, ocmodel)
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    # Embed the claim, then query with that dense vector and the sparse vector
    # from `DataLoader.empty_sparse_vector()`; `ocmodel` is used as the namespace.
    dense_vector = embed_query(claim)
    sparse_vector = DataLoader.empty_sparse_vector()
    return query_data_with_sparse(
        indexname, ocmodel, dense_vector, sparse_vector, top_k, include_values, include_metadata
    )
end

"""
    unicodebarplot(x, y, title="Query Matches")

Draw a bar plot of `x` against `y` with `UnicodePlots.barplot`, using `title` as the
plot title, and return the plot.
"""
unicodebarplot(x, y, title="Query Matches") = UnicodePlots.barplot(x, y; title=title)

"""
    searchresult_to_unicodeplot(searchresult)

Plot the scores of a search result as a bar chart labelled with each match's text.

`searchresult["matches"]` must be a collection whose entries provide `"score"` and
`"metadata"` => `"text"`. Labels longer than 41 characters are cut to their first 41
characters and suffixed with `"..."`.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    labels = [m["metadata"]["text"] for m in matches]
    # `first(s, n)` takes n *characters*. Slicing with `s[1:41]` uses byte
    # indices and throws a StringIndexError on multi-byte (non-ASCII) text.
    shortened = [length(s) > 41 ? first(s, 41) * "..." : s for s in labels]
    return unicodebarplot(shortened, scores)
end

"""
## Search and plot the results

import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "immigration"
OC.searchplot(claim, indexname, ocmodel)
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    # Run `search` with the given options and hand the result straight to the
    # plotting helper.
    return searchresult_to_unicodeplot(
        search(
            claim,
            indexname,
            ocmodel;
            top_k=top_k,
            include_values=include_values,
            include_metadata=include_metadata,
        ),
    )
end

"""
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")

using CSV, DataFrames
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(model, Pandas.DataFrame(tdat), 96, "text")
"""
function multi_embeddings(model, data, chunk_size, textcol)
    # Uses the inference context (see `create_inf_pinecone_context`) and leaves
    # the chunking and embedding to `DataLoader.chunk_and_embed`.
    client = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(client, model, data, chunk_size, textcol)
end

"""
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    # Convert the Julia table first, then resolve the optional settings
    # (`model`, `chunk_size`, `textcol`) with their defaults.
    converted = df_to_pd(data)
    model = get(kwargs, :model, "multilingual-e5-large")
    chunk_size = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    # The positional method creates the inference context and calls
    # `DataLoader.chunk_and_embed` with exactly these arguments.
    return multi_embeddings(model, converted, chunk_size, textcol)
end

"""
## Julia DataFrame to pandas DataFrame
"""
# Thin wrapper: the conversion itself is done by `pdataframe`.
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)

"""
    embed_query(querytext; kwargs...)

Embed a single piece of text and return its embedding.

`querytext` is placed in a one-row table (`id = "vec1"`, `text = querytext`), the table
is passed to `multi_embeddings`, and the first entry of the resulting `Embeddings`
column is returned.

# Keywords
All keywords (for example `model` or `chunk_size`) are forwarded to `multi_embeddings`.
`textcol` is always `"text"`, because that is the column this function creates.
"""
function embed_query(querytext; kwargs...)
    query_table = DataFrame(; id="vec1", text=querytext)
    # The explicit `textcol` comes last so it overrides any caller-supplied value.
    embedded = multi_embeddings(query_table; kwargs..., textcol="text")
    return embedded.Embeddings[1]
end

"""
## Query with a vector of embeddings
import OstreaCultura as OC
vector = rand(1024)
indexname = "test-index"
namespace = "test-namespace"
vecresults = OC.query_w_vector(vector, indexname, namespace)
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    # Optional settings, with the same defaults as before.
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)

    index = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data(index, namespace, vector, top_k, include_values)
    matches = response.to_dict()["matches"]

    # Set the stored vectors aside before they are stripped from the matches.
    values_vector = include_values ? [m["values"] for m in matches] : nothing

    # Remove "values" from every match so it does not become a DataFrame column
    # while the rows are assembled.
    foreach(m -> delete!(m, "values"), matches)

    # One single-match table per entry, stacked in order.
    out = foldl((acc, m) -> vcat(acc, DataFrame(m)), matches; init=DataFrame())

    # Re-attach the vectors as a column when they were requested.
    if include_values
        out[:, "values"] = values_vector
    end
    return out
end

"""
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
vector = OC.embed_query("drama")
queryresults = OC.query_w_vector(vector, indexname, namespace, top_k=5, include_values=false)
### now, fetch the underlying data 
#fetched_data = OC.fetch_data(queryresults.id, indexname, namespace)
index = pc.Index(indexname)
resultfetch = OC.DataLoader.fetch_data(index, queryresults.id, namespace).to_dict()
OC.parse_fetched_results(resultfetch)
"""
function parse_fetched_results(resultfetch)
    # Turn a fetch response into a DataFrame: the "metadata" of each entry under
    # `resultfetch["vectors"]` is converted with `DataFrame(...)` and stacked, then
    # an `id` column holding the entry keys is added.
    # Returns an empty `DataFrame()` (and logs a message) when there are no entries.
    vectors = resultfetch["vectors"]
    if isempty(vectors)
        @info "No data found"
        return DataFrame()
    end

    ids = collect(keys(vectors))
    out = DataFrame()
    for id in ids
        # `cols=:union` accepts entries whose metadata keys differ (gaps become
        # `missing`) and matches a plain `vcat` when the columns agree, so no
        # try/catch fallback is needed.
        out = vcat(out, DataFrame(vectors[id]["metadata"]); cols=:union)
    end
    # `ids` was iterated in this same order, so it lines up with the stacked
    # tables as long as each metadata entry contributes one row.
    out[!, :id] = ids
    return out
end

"""
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, indexname, namespace)
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    # Fetch the records for `ids` from `namespace` of the named index, at most
    # `chunk_size` ids per request, and return all parsed results in one DataFrame.
    # An empty `ids` yields an empty `DataFrame()`.
    # Throws ArgumentError when `chunk_size` is not positive (previously this either
    # raised an InexactError or silently fetched nothing).
    chunk_size > 0 ||
        throw(ArgumentError("chunk_size must be positive, got $chunk_size"))

    index = create_pinecone_context().Index(indexname)
    total = length(ids)
    result_out = DataFrame()
    for start in 1:chunk_size:total
        stop = min(start + chunk_size - 1, total)
        resultfetch = DataLoader.fetch_data(index, ids[start:stop], namespace).to_dict()
        # Chunks can carry different metadata columns; `cols=:union` keeps them
        # all instead of failing on the mismatch.
        result_out = vcat(result_out, parse_fetched_results(resultfetch); cols=:union)
    end
    return result_out
end

"""
## FINAL Query function - embeds, queries, and fetches data
import OstreaCultura as OC
querytext = "drama"
indexname = "test-index"
namespace = "test-namespace"
OC.query(querytext, indexname, namespace)
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    # Optional settings forwarded to the vector query.
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)

    # Step 1: embed the text and look up the nearest matches.
    embedding = embed_query(querytext)
    matches = query_w_vector(
        embedding, indexname, namespace; top_k=top_k, include_values=include_values
    )

    # Step 2: fetch the stored records for those matches and join them on `id`.
    records = fetch_data(matches.id, indexname, namespace)
    return innerjoin(matches, records; on=:id)
end

"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)

Return the rows of `claim_results` whose score is strictly greater than the score the
same `id` has in `counterclaim_results`.

Both inputs need `id` and `score` columns. In the result the scores appear as
`claim_score` and `counterclaim_score`; an `id` that is absent from
`counterclaim_results` gets a `counterclaim_score` of `0.0`.

The inputs are not modified, so the function can be called repeatedly on the same
data frames.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Rename on copies (not in place) so the callers' tables keep their `score` column.
    claims = DataFrames.rename(claim_results, :score => :claim_score)
    counterclaims = DataFrames.rename(counterclaim_results, :score => :counterclaim_score)
    # Left join: every claim row is kept, matched or not.
    joined = leftjoin(claims, counterclaims; on=:id)
    # Unmatched claims have no counterclaim score; treat that as 0.
    joined.counterclaim_score = coalesce.(joined.counterclaim_score, 0.0)
    return joined[joined.claim_score .> joined.counterclaim_score, :]
end

"""
## Query with claims and counterclaims 
import OstreaCultura as OC

claim = "Climate change is a hoax"
counterclaim = "Climate change is real"
indexname = "test-index"
namespace = "test-namespace"
hi = OC.query_claims(claim, counterclaim, indexname, namespace)
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000)  # how many matches each initial query asks for

    # Embed both statements before running either query.
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)

    # Both lookups share the same settings; the stored vectors are not needed.
    lookup(vec) = query_w_vector(vec, indexname, namespace; top_k=top_k, include_values=false)
    claim_matches = lookup(claim_vector)
    counterclaim_matches = lookup(counterclaim_vector)

    # Keep ids that score higher for the claim than for the counterclaim,
    # then keep only those above the threshold.
    closer = filter_claims_closer_to_counterclaims(claim_matches, counterclaim_matches)
    strong = closer[closer.claim_score .> threshold, :]

    if size(strong, 1) != 0
        # Fetch the stored records and attach them to the scores by id.
        records = fetch_data(strong.id, indexname, namespace)
        return innerjoin(strong, records; on=:id)
    end
    @info "No claims were above the threshold"
    return DataFrame()
end


"""
## Classify a claim against the existing misinformation library 
import OstreaCultura as OC

## Example 1 
claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

## Example 2 
claim = "it's cool to be trans these days" 
counterclaim = ""
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

## Example 3
claim = "No existe racismo contra las personas negras" 
counterclaim = "Racism is a systemic issue that affects people of color"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    # Returns `(resulting_data, counterclaim_score)`:
    #   * `resulting_data` - fetched records for the matches of `claim` whose score is
    #     above `threshold`, with that score in a `scores` column;
    #   * `counterclaim_score` - score of the first match for `counterclaim`, or 0.0
    #     when `counterclaim` is empty or its query returns no matches.
    # Keywords: `threshold` (default 0.8), `top_k` (default 10, used for both queries).
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10)

    claim_vector = embed_query(claim)

    counterclaim_score = 0.0
    if !isempty(counterclaim)
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(
            counterclaim_vector, indexname, namespace; top_k=top_k, include_values=false
        )
        # An empty result has no `score` column, so check before indexing.
        if size(counterclaim_results, 1) > 0
            counterclaim_score = counterclaim_results.score[1]
        end
    end

    claim_results = query_w_vector(
        claim_vector, indexname, namespace; top_k=top_k, include_values=false
    )
    # No matches at all: same shape as when nothing passes the threshold.
    if size(claim_results, 1) == 0
        return DataFrame(; scores=Float64[]), counterclaim_score
    end

    claim_results = claim_results[claim_results.score .> threshold, :]
    resulting_data = fetch_data(claim_results.id, indexname, namespace)

    if "id" in names(resulting_data)
        # Fetched rows come back in the fetch response's key order, not in the
        # order of `claim_results`, so look each score up by id instead of
        # assigning the score column positionally.
        score_by_id = Dict(zip(claim_results.id, claim_results.score))
        resulting_data.scores = [score_by_id[id] for id in resulting_data.id]
    else
        # Nothing was fetched (no columns): keep the original behavior of
        # returning just the scores.
        resulting_data.scores = claim_results.score
    end
    return resulting_data, counterclaim_score
end

"""
    generate_sparse_model(; path="data/random_300k.csv", textcol="text")

Read the CSV at `path` with `DataLoader.pd.read_csv`, take column `textcol` as a list of
documents, and pass it to `DataLoader.encode_documents`.

Returns the two values produced by `encode_documents` as `(vector, bm25)`.

# Keywords
- `path`: CSV file to read; defaults to the previously hard-coded location.
- `textcol`: name of the text column; defaults to `"text"`.
"""
function generate_sparse_model(; path="data/random_300k.csv", textcol="text")
    df = DataLoader.pd.read_csv(path)
    corpus = df[textcol].tolist()
    # Use `DataLoader` directly, as every other function in this file does;
    # `OC` is only an alias callers create with `import OstreaCultura as OC`.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end