misinfo_detection_app / src /PyPineCone.jl
stefanjwojcik's picture
Upload 24 files
48bb68b verified
### PineCone Embed and I/O Functions
"""
    example_data()

Return a small two-row `DataFrame` with the columns `Embeddings`, `id` and `genre`.

According to the original note, this dataset matches the example data from
DataLoader.py.

# Examples
```julia
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
```
"""
function example_data()
    embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]
    ids = ["vec1", "vec2"]
    genres = ["drama", "action"]
    return DataFrame(; Embeddings=embeddings, id=ids, genre=genres)
end
"""
    pd_to_df(df_pd)

Copy every column of `df_pd` into a new Julia `DataFrame`.

The function walks `df_pd.columns` and stores the `.values` of each column under
the same name, so `df_pd` is expected to expose `columns` and per-column `values`
(as the pandas frame in the example below does).

# Examples
```julia
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
```
"""
function pd_to_df(df_pd)
    converted = DataFrame()
    for colname in df_pd.columns
        column = getproperty(df_pd, colname)
        converted[!, colname] = column.values
    end
    return converted
end
"""
    create_pinecone_context()

Create a Pinecone client through `DataLoader.Pinecone`, passing the value of the
`PINECONE_API_KEY` environment variable as the `api_key` keyword.

Functions the original notes list as available on the returned object `pc`:
- `pc.create_index` - see `create_index`
- `pc.delete_index`: `pc.delete_index(index_name)`

# Throws
- `KeyError` if `PINECONE_API_KEY` is not set in `ENV`.
"""
function create_pinecone_context()
    return DataLoader.Pinecone(; api_key=ENV["PINECONE_API_KEY"])
end
"""
    create_inf_pinecone_context()

Create the Pinecone context used for the inference endpoints.

The value of the `PINECONE_API_KEY` environment variable is passed to
`DataLoader.Pinecone` as its first positional argument.

# Throws
- `KeyError` if `PINECONE_API_KEY` is not set in `ENV`.
"""
function create_inf_pinecone_context()
    api_key = ENV["PINECONE_API_KEY"]
    return DataLoader.Pinecone(api_key)
end
"""
    create_index(name, dimension, metric, cloud, region)

Create a Pinecone index by forwarding all arguments, together with a freshly
created Pinecone context, to `DataLoader.create_index`. Returns whatever that
call returns.

# Examples
```julia
pc = create_pinecone_context()
create_index("new-index", 4, "cosine", "aws", "us-east-1")
```
"""
function create_index(name, dimension, metric, cloud, region)
    client = create_pinecone_context()
    return DataLoader.create_index(client, name, dimension, metric, cloud, region)
end
"""
    upsert_data(df, indexname, namespace; chunk_size=1000)

Upsert the rows of `df` into the Pinecone index `indexname` under `namespace`.

The index handle is obtained from a new Pinecone context and the work is done by
`DataLoader.chunk_df_and_upsert`, which receives `namespace` and `chunk_size` as
keywords. Per the original notes, `Id` and `Embeddings` are required columns in
the data frame.

# Examples
```julia
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(out, "test-index", "test-namespace")

df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(model, df, 96, "text")
test_embeds_min = test_embeds.head(10)
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100)
```
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    index = create_pinecone_context().Index(indexname)
    return DataLoader.chunk_df_and_upsert(
        index, df; namespace=namespace, chunk_size=chunk_size
    )
end
"""
    query_data(indexname, namespace, vector, top_k, include_values)

Query the index `indexname` in `namespace` with an existing embedding `vector`.

The call is delegated to `DataLoader.query_data` and the response is returned
after calling `.to_dict()` on it.

# Examples
```julia
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
vector = mydf.Embeddings[1]
top_k = 5
include_values = true
OC.query_data("test-index", "test-namespace", vector, top_k, include_values)
```
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    index = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data(index, namespace, vector, top_k, include_values)
    return response.to_dict()
end
"""
    query_data_with_sparse(indexname, namespace, dense, sparse, top_k,
                           include_values, include_metadata)

Query the index `indexname` in `namespace` with an existing hybrid embedding, i.e.
a `dense` vector plus a `sparse` vector.

The call is delegated to `DataLoader.query_data_with_sparse` (with `top_k`,
`include_values` and `include_metadata` passed as keywords) and the response is
returned after calling `.to_dict()` on it.

# Examples
```julia
import OstreaCultura as OC; using DataFrames
querytext = "drama"
dense = OC.embed_query(querytext)
top_k = 5
include_values = true
include_metadata = true
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense, OC.DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata)
```
"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    index = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data_with_sparse(
        index,
        namespace,
        dense,
        sparse;
        top_k=top_k,
        include_values=include_values,
        include_metadata=include_metadata,
    )
    return response.to_dict()
end
"""
    search(claim, indexname, ocmodel; top_k=5, include_values=true,
           include_metadata=true)

Querying function for GGWP, using the updated hybrid vector.

`claim` is embedded with `embed_query`, paired with
`DataLoader.empty_sparse_vector()`, and sent to `query_data_with_sparse`, using
`ocmodel` as the namespace. Returns that function's result.

# Examples
```julia
import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "expanded-fact-checks"
OC.search(claim, indexname, ocmodel, include_values=false, include_metadata=false)
res = OC.search(claim, indexname, ocmodel)
```
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    dense = embed_query(claim)
    sparse = DataLoader.empty_sparse_vector()
    return query_data_with_sparse(
        indexname, ocmodel, dense, sparse, top_k, include_values, include_metadata
    )
end
"""
    unicodebarplot(x, y, title = "Query Matches")

Draw a bar plot of `x` against `y` with `UnicodePlots.barplot`, using `title` as
the plot title, and return the result of that call.
"""
unicodebarplot(x, y, title = "Query Matches") = UnicodePlots.barplot(x, y; title=title)
"""
    searchresult_to_unicodeplot(searchresult)

Plot the scores of a search result as a bar chart labelled with the matched text.

`searchresult["matches"]` must be a collection whose elements provide `"score"`
and `"metadata"` => `"text"`. Labels longer than 41 characters are cut to their
first 41 characters followed by `"..."`. The plot is produced by
`unicodebarplot` and its result is returned.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    text = [m["metadata"]["text"] for m in matches]
    # `length` counts characters, so the cut must count characters as well:
    # `t[1:41]` indexes bytes and throws a StringIndexError when byte 41 falls
    # inside a multi-byte UTF-8 character (accented or non-Latin text).
    text_to_show = [length(t) > 41 ? first(t, 41) * "..." : t for t in text]
    return unicodebarplot(text_to_show, scores)
end
"""
    searchplot(claim, indexname, ocmodel; top_k=5, include_values=true,
               include_metadata=true)

Search and plot the results: run `search` with the given arguments and hand the
result to `searchresult_to_unicodeplot`.

# Examples
```julia
import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "immigration"
OC.searchplot(claim, indexname, ocmodel)
```
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    return searchresult_to_unicodeplot(
        search(
            claim,
            indexname,
            ocmodel;
            top_k=top_k,
            include_values=include_values,
            include_metadata=include_metadata,
        ),
    )
end
"""
    multi_embeddings(model, data, chunk_size, textcol)

Embed `data` by calling `DataLoader.chunk_and_embed` with a new inference
Pinecone context, the `model` name, `chunk_size` and the text column name
`textcol`. Returns whatever that call returns.

# Examples
```julia
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")

using CSV, DataFrames
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(model, Pandas.DataFrame(tdat), 96, "text")
```
"""
function multi_embeddings(model, data, chunk_size, textcol)
    inference_context = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(inference_context, model, data, chunk_size, textcol)
end
"""
    multi_embeddings(data::DataFrames.DataFrame; kwargs...)

Embed a Julia `DataFrame`: convert it with `df_to_pd`, then embed it exactly as
the four-argument `multi_embeddings` method does.

# Keywords
- `model`: defaults to `"multilingual-e5-large"`.
- `chunk_size`: defaults to `96`.
- `textcol`: defaults to `"text"`.

Any other keyword is accepted and ignored.

# Examples
```julia
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
```
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    pandas_data = df_to_pd(data)
    model = get(kwargs, :model, "multilingual-e5-large")
    chunk_size = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    # Same context creation and `chunk_and_embed` call as the positional method.
    return multi_embeddings(model, pandas_data, chunk_size, textcol)
end
"""
    df_to_pd(df::DataFrames.DataFrame)

Convert a Julia `DataFrame` to a pandas DataFrame by passing it to `pdataframe`
(defined outside this section).
"""
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)
"""
    embed_query(querytext; kwargs...)

Embed a single query text and return its embedding.

The text is placed in a `DataFrame` with the columns `id` (`"vec1"`) and `text`,
embedded with `multi_embeddings`, and the first entry of the resulting
`Embeddings` column is returned.

# Keywords
Keywords are forwarded to `multi_embeddings(::DataFrame; kwargs...)`, so `model`
and `chunk_size` can be overridden. `textcol` is always `"text"` because that is
the column this function creates.
"""
function embed_query(querytext; kwargs...)
    query_df = DataFrame(id = "vec1", text = querytext)
    # Previously the keywords were accepted but dropped. They are forwarded now;
    # `textcol` comes last so that it wins over any caller-supplied value and
    # always names the column built above.
    embedded = multi_embeddings(query_df; kwargs..., textcol = "text")
    return embedded.Embeddings[1]
end
"""
    query_w_vector(vector, indexname, namespace; kwargs...)

Query the index `indexname` in `namespace` with a vector of embeddings and return
the matches as a `DataFrame`.

Each entry of the response's `"matches"` becomes one `DataFrame` built from that
match with its `"values"` key removed; these are concatenated with `vcat`. When
`include_values` is true, the removed values are added back as a `"values"`
column.

# Keywords
- `top_k`: defaults to `5`.
- `include_values`: defaults to `true`.

# Examples
```julia
import OstreaCultura as OC
vector = rand(1024)
indexname = "test-index"
namespace = "test-namespace"
vecresults = OC.query_w_vector(vector, indexname, namespace)
```
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    index = create_pinecone_context().Index(indexname)
    response = DataLoader.query_data(index, namespace, vector, top_k, include_values).to_dict()
    matches = response["matches"]
    # Capture the embeddings first; they are stripped from the matches just below.
    values_vector = if include_values
        [m["values"] for m in matches]
    else
        [missing for _ in matches]
    end
    # Remove "values" from every match so it is not turned into DataFrame rows.
    foreach(m -> delete!(m, "values"), matches)
    # Append one frame per match, starting from an empty DataFrame.
    out = foldl((acc, m) -> vcat(acc, DataFrame(m)), matches; init=DataFrame())
    # Re-attach the embeddings as a single column when they were requested.
    if include_values
        out[:, "values"] = values_vector
    end
    return out
end
"""
    parse_fetched_results(resultfetch)

Turn a fetch response into a `DataFrame` of metadata with an `id` column.

For every key of `resultfetch["vectors"]` the entry's `"metadata"` is converted
to a `DataFrame`. The pieces are stacked with `cols=:union`, so a column that is
absent from some entries is filled with `missing`. The keys are stored in the
`id` column. When `resultfetch["vectors"]` is empty, `"No data found"` is logged
and an empty `DataFrame` is returned.

# Examples
```julia
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
vector = OC.embed_query("drama")
queryresults = OC.query_w_vector(vector, indexname, namespace, top_k=5, include_values=false)
### now, fetch the underlying data
#fetched_data = OC.fetch_data(queryresults.id, indexname, namespace)
index = pc.Index(indexname)
resultfetch = OC.DataLoader.fetch_data(index, queryresults.id, namespace).to_dict()
OC.parse_fetched_results(resultfetch)
```
"""
function parse_fetched_results(resultfetch)
    vectors = resultfetch["vectors"]
    if length(vectors) == 0
        @info "No data found"
        return DataFrame()
    end
    ids = collect(keys(vectors))
    # One frame per id, in the same order as `ids`, so the `id` column lines up.
    frames = [DataFrame(vectors[id]["metadata"]) for id in ids]
    # `cols=:union` gives the same result as a plain `vcat` when all column sets
    # agree and fills gaps with `missing` otherwise. This replaces the former
    # try/catch fallback, which retried after *any* exception, and avoids
    # re-copying the accumulated frame on every iteration.
    out = reduce(vcat, frames; cols=:union)
    out[!, :id] = ids
    return out
end
"""
    fetch_data(ids, indexname, namespace; chunk_size=900)

Fetch the records for `ids` from the index `indexname` in `namespace` and return
them as one `DataFrame`.

`ids` is processed in consecutive chunks of at most `chunk_size` elements. Each
chunk is fetched with `DataLoader.fetch_data`, parsed with
`parse_fetched_results`, and appended to the result. An empty `ids` yields an
empty `DataFrame`.

# Examples
```julia
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, indexname, namespace)
```
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    result_out = DataFrame()
    nchunks = ceil(Int, length(ids) / chunk_size)
    for i in 1:nchunks
        lo = (i - 1) * chunk_size + 1
        hi = min(i * chunk_size, length(ids))
        resultfetch = DataLoader.fetch_data(index, ids[lo:hi], namespace).to_dict()
        # Chunks may carry different metadata columns. `cols=:union` keeps them all
        # (filling with `missing`) instead of throwing, which matches how
        # `parse_fetched_results` treats differing columns within one chunk.
        result_out = vcat(result_out, parse_fetched_results(resultfetch); cols=:union)
    end
    return result_out
end
"""
    query(querytext::String, indexname::String, namespace::String; kwargs...)

FINAL query function - embeds, queries, and fetches data.

`querytext` is embedded with `embed_query`, the index is queried with
`query_w_vector`, the matched ids are fetched with `fetch_data`, and the two
frames are inner-joined on `id`.

# Keywords
- `top_k`: defaults to `5`.
- `include_values`: defaults to `true`.

# Examples
```julia
import OstreaCultura as OC
querytext = "drama"
indexname = "test-index"
namespace = "test-namespace"
OC.query(querytext, indexname, namespace)
```
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    matches = query_w_vector(
        embed_query(querytext),
        indexname,
        namespace;
        top_k=top_k,
        include_values=include_values,
    )
    # Look up the stored records behind the matched ids, then combine on id.
    details = fetch_data(matches.id, indexname, namespace)
    return innerjoin(matches, details; on=:id)
end
"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)

Return the rows of `claim_results` whose score is strictly greater than the score
of the same `id` in `counterclaim_results`.

Both inputs need the columns `id` and `score`. The scores appear in the result as
`claim_score` and `counterclaim_score`; an `id` that is absent from
`counterclaim_results` gets a `counterclaim_score` of `0.0`. The inputs are not
modified.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Rename the score columns on copies: renaming in place would silently change
    # the caller's frames and make a second call on the same frames fail.
    claims = rename!(copy(claim_results), :score => :claim_score)
    counterclaims = rename!(copy(counterclaim_results), :score => :counterclaim_score)
    # Left join: every claim row is kept, matched or not.
    df = leftjoin(claims, counterclaims, on=:id)
    # Unmatched claims have no counterclaim score; treat it as 0.
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    # Keep only results where the claim score is greater than the counterclaim score
    return df[df.claim_score .> df.counterclaim_score, :]
end
"""
    query_claims(claim::String, counterclaim::String, indexname::String,
                 namespace::String; kwargs...)

Query with claims and counterclaims.

Both texts are embedded and queried against `indexname`/`namespace`. Results are
reduced with `filter_claims_closer_to_counterclaims`, then to rows whose
`claim_score` exceeds `threshold`. The remaining ids are fetched with
`fetch_data` and inner-joined with the scores on `id`. If no row is above the
threshold, a message is logged and an empty `DataFrame` is returned.

# Keywords
- `threshold`: defaults to `0.8`.
- `top_k`: number of results requested in the initial queries; defaults to `5000`.

# Examples
```julia
import OstreaCultura as OC
claim = "Climate change is a hoax"
counterclaim = "Climate change is real"
indexname = "test-index"
namespace = "test-namespace"
hi = OC.query_claims(claim, counterclaim, indexname, namespace)
```
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000)
    # Embed both texts before running either query.
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)
    claim_hits = query_w_vector(
        claim_vector, indexname, namespace; top_k=top_k, include_values=false
    )
    counter_hits = query_w_vector(
        counterclaim_vector, indexname, namespace; top_k=top_k, include_values=false
    )
    # Keep ids that score higher for the claim than for the counterclaim ...
    closer = filter_claims_closer_to_counterclaims(claim_hits, counter_hits)
    # ... and, of those, only the ones above the threshold.
    kept = closer[closer.claim_score .> threshold, :]
    if size(kept, 1) == 0
        @info "No claims were above the threshold"
        return DataFrame()
    end
    fetched = fetch_data(kept.id, indexname, namespace)
    return innerjoin(kept, fetched; on=:id)
end
"""
    classify_claim(claim::String, counterclaim::String, indexname::String,
                   namespace::String; kwargs...)

Classify a claim against the existing misinformation library.

Returns a tuple `(resulting_data, counterclaim_score)`:
- `resulting_data`: the records fetched for the claim matches whose score exceeds
  `threshold`, with a `scores` column holding each record's match score (looked
  up by `id`). It has only an empty `scores` column when the claim query returns
  no matches.
- `counterclaim_score`: the `score` of the first row returned for the
  counterclaim query, or `0.0` when `counterclaim` is `""` or that query returns
  nothing.

# Keywords
- `threshold`: defaults to `0.8`.
- `top_k`: number of results requested in the initial queries; defaults to `10`.

# Examples
```julia
import OstreaCultura as OC
## Example 1
claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)
## Example 2
claim = "it's cool to be trans these days"
counterclaim = ""
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)
## Example 3
claim = "No existe racismo contra las personas negras"
counterclaim = "Racism is a systemic issue that affects people of color"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)
```
"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10) # top_k for the initial query
    # Get embeddings
    claim_vector = embed_query(claim)
    counterclaim_score = 0.0
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
        # A query without matches yields a frame without a `score` column; keep the
        # 0.0 default in that case instead of throwing.
        if size(counterclaim_results, 1) > 0
            counterclaim_score = counterclaim_results.score[1]
        end
    end
    # Query the embeddings
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    if size(claim_results, 1) == 0
        # No matches at all: same shape as the "nothing above threshold" result.
        return DataFrame(scores = Float64[]), counterclaim_score
    end
    # Filter to scores above the threshold
    claim_results = claim_results[claim_results.score .> threshold, :]
    ## now, fetch the data
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    if size(resulting_data, 2) == 0
        # Nothing was fetched, so there are no rows to align; keep the original
        # result shape, a frame holding only the scores.
        resulting_data.scores = claim_results.score
    else
        # Fetched rows come back in the key order of the fetch response, not in
        # match order, and ids missing from the response have no row. Look every
        # score up by id rather than assigning the score column positionally.
        score_by_id = Dict(zip(claim_results.id, claim_results.score))
        resulting_data.scores = [score_by_id[id] for id in resulting_data.id]
    end
    return resulting_data, counterclaim_score
end
"""
    generate_sparse_model(csv_path = "data/random_300k.csv")

Read the CSV at `csv_path` with `DataLoader.pd.read_csv`, take its `text` column
as a list, and pass that corpus to `DataLoader.encode_documents`.

Returns the two values produced by `encode_documents`, named `vector` and `bm25`
here.
"""
function generate_sparse_model(csv_path = "data/random_300k.csv")
    df = DataLoader.pd.read_csv(csv_path)
    corpus = df["text"].tolist()
    # Use `DataLoader` directly like the rest of this file; `OC` is only the alias
    # callers give the package when importing it.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end