Spaces:
Sleeping
Sleeping
### Pinecone Embed and I/O Functions
"""
    example_data()

Build a tiny two-row `DataFrame` mirroring the sample data in `DataLoader.py`.

# Examples
```
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
```
"""
function example_data()
    return DataFrame(
        Embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
        id = ["vec1", "vec2"],
        genre = ["drama", "action"],
    )
end
"""
    pd_to_df(df_pd)

Convert a pandas DataFrame into a Julia `DataFrame`, copying column by column.

# Examples
```
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
```
"""
function pd_to_df(df_pd)
    out = DataFrame()
    for colname in df_pd.columns
        out[!, colname] = getproperty(df_pd, colname).values
    end
    return out
end
"""
    create_pinecone_context()

Return a Pinecone client authenticated via the `PINECONE_API_KEY`
environment variable.

The returned client exposes, among others:
- `pc.create_index` (see the `create_index` wrapper below)
- `pc.delete_index(index_name)`
"""
function create_pinecone_context()
    return DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
end
"""
    create_inf_pinecone_context()

Return a Pinecone client for inference endpoints, authenticated via the
`PINECONE_API_KEY` environment variable.
"""
function create_inf_pinecone_context()
    # Pass the key by keyword for consistency with `create_pinecone_context`
    # (previously it was passed positionally).
    pc = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
    return pc
end
"""
    create_index(name, dimension, metric, cloud, region)

Create a new Pinecone index.

# Examples
```
create_index("new-index", 4, "cosine", "aws", "us-east-1")
```
"""
function create_index(name, dimension, metric, cloud, region)
    client = create_pinecone_context()
    return DataLoader.create_index(client, name, dimension, metric, cloud, region)
end
"""
    upsert_data(df, indexname, namespace; chunk_size=1000)

Upsert a DataFrame into a Pinecone index in chunks of `chunk_size` rows.
`Id` and `Embeddings` are required columns in `df`.

# Examples
```
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
OC.upsert_data(out, "test-index", "test-namespace")
OC.upsert_data(out, "test-index", "indicator-test-namespace", chunk_size=100)
```
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    client = create_pinecone_context()
    target = client.Index(indexname)
    return DataLoader.chunk_df_and_upsert(target, df, namespace=namespace, chunk_size=chunk_size)
end
"""
    query_data(indexname, namespace, vector, top_k, include_values)

Query a Pinecone index with an existing dense embedding and return the raw
response as a `Dict`.

# Examples
```
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
OC.query_data("test-index", "test-namespace", mydf.Embeddings[1], 5, true)
```
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    return DataLoader.query_data(idx, namespace, vector, top_k, include_values).to_dict()
end
"""
    query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)

Hybrid query: send both a dense and a sparse vector to a Pinecone index and
return the raw response as a `Dict`.

# Examples
```
import OstreaCultura as OC
dense = OC.embed_query("drama")
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense,
    OC.DataLoader.empty_sparse_vector(), 5, true, true)
```
"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    response = DataLoader.query_data_with_sparse(idx, namespace, dense, sparse,
        top_k=top_k, include_values=include_values, include_metadata=include_metadata)
    return response.to_dict()
end
"""
    search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)

Embed `claim` and run a hybrid query (dense vector plus an empty sparse
vector) against the `ocmodel` namespace of `indexname`.

# Examples
```
import OstreaCultura as OC
res = OC.search("drama", "oc-hybrid-library-index", "expanded-fact-checks")
```
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    densevec = embed_query(claim)
    return query_data_with_sparse(indexname, ocmodel, densevec,
        DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata)
end
"""
    unicodebarplot(x, y, title="Query Matches")

Render a terminal bar plot of values `y` against labels `x` via UnicodePlots.
"""
unicodebarplot(x, y, title = "Query Matches") = UnicodePlots.barplot(x, y, title=title)
"""
    searchresult_to_unicodeplot(searchresult)

Plot the match scores from a `search` result as a terminal bar chart,
labelling each bar with its (truncated) metadata text.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    text = [m["metadata"]["text"] for m in matches]
    # Truncate labels to 41 characters. `first(x, 41)` counts characters, so
    # it is safe for multi-byte UTF-8 text where the old `x[1:41]` byte
    # slicing could throw a StringIndexError mid-character.
    text_to_show = [length(x) > 41 ? first(x, 41) * "..." : x for x in text]
    return unicodebarplot(text_to_show, scores)
end
"""
    searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)

Run `search` and render the matches as a terminal bar plot.

# Examples
```
import OstreaCultura as OC
OC.searchplot("drama", "oc-hybrid-library-index", "immigration")
```
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    result = search(claim, indexname, ocmodel, top_k=top_k,
        include_values=include_values, include_metadata=include_metadata)
    return searchresult_to_unicodeplot(result)
end
"""
    multi_embeddings(model, data, chunk_size, textcol)

Embed the `textcol` column of a pandas DataFrame in chunks of `chunk_size`
rows using a Pinecone inference model.

# Examples
```
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
```
"""
function multi_embeddings(model, data, chunk_size, textcol)
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, data, chunk_size, textcol)
end
"""
    multi_embeddings(data::DataFrames.DataFrame; model="multilingual-e5-large", chunk_size=96, textcol="text")

Convert a Julia `DataFrame` to pandas and embed its text column with a
Pinecone inference model.

# Examples
```
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
```
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    pdata = df_to_pd(data)
    model = get(kwargs, :model, "multilingual-e5-large")
    chunksize = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, pdata, chunksize, textcol)
end
"""
    df_to_pd(df::DataFrames.DataFrame)

Convert a Julia `DataFrame` into a pandas DataFrame.
"""
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)
"""
    embed_query(querytext; kwargs...)

Embed a single query string and return its dense embedding vector.

Keyword arguments (e.g. `model`, `chunk_size`, `textcol`) are forwarded to
`multi_embeddings`; previously they were accepted but silently ignored.
"""
function embed_query(querytext; kwargs...)
    firstdf = DataFrame(id = "vec1", text = querytext)
    firstdf = multi_embeddings(firstdf; kwargs...)
    return firstdf.Embeddings[1]
end
"""
    query_w_vector(vector, indexname, namespace; top_k=5, include_values=true)

Query a Pinecone index with a raw embedding vector and return the matches as
a `DataFrame` (one row per match). When `include_values` is true, the stored
embeddings are attached as a `values` column.

# Examples
```
import OstreaCultura as OC
vector = rand(1024)
vecresults = OC.query_w_vector(vector, "test-index", "test-namespace")
```
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    results = DataLoader.query_data(idx, namespace, vector, top_k, include_values).to_dict()
    matches = results["matches"]
    # Pull the raw embeddings out first so the "values" key never becomes a
    # per-row column during DataFrame construction.
    embeddings = include_values ? [m["values"] for m in matches] :
        [missing for m in matches]
    for m in matches
        delete!(m, "values")
    end
    table = DataFrame()
    for m in matches
        table = vcat(table, DataFrame(m))
    end
    if include_values
        table[:, "values"] = embeddings
    end
    return table
end
"""
    parse_fetched_results(resultfetch)

Convert a Pinecone fetch response (as a `Dict`) into a `DataFrame` of the
metadata for each returned vector, with the vector ids in an `id` column.
Returns an empty `DataFrame` (and logs a message) when nothing was fetched.

# Examples
```
import OstreaCultura as OC
pc = OC.create_pinecone_context()
index = pc.Index("test-index")
resultfetch = OC.DataLoader.fetch_data(index, ["vec1"], "test-namespace").to_dict()
OC.parse_fetched_results(resultfetch)
```
"""
function parse_fetched_results(resultfetch)
    vectors = resultfetch["vectors"]
    if isempty(vectors)
        # Previously this message was a bare string expression (a no-op).
        @info "No data found"
        return DataFrame()
    end
    ids = collect(keys(vectors))
    metadata = [vectors[id]["metadata"] for id in ids]
    out = DataFrame()
    for md in metadata
        # cols=:union tolerates rows whose metadata keys differ (absent
        # columns are filled with `missing`), replacing the old
        # try-plain-vcat / catch-with-union control flow.
        out = vcat(out, DataFrame(md), cols=:union)
    end
    out[!, :id] = ids
    return out
end
"""
    fetch_data(ids, indexname, namespace; chunk_size=900)

Fetch the records for `ids` from a Pinecone index, batching requests in
chunks of `chunk_size`, and return the combined metadata as a `DataFrame`.

# Examples
```
import OstreaCultura as OC
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, "test-index", "test-namespace")
```
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    combined = DataFrame()
    nchunks = ceil(Int, length(ids) / chunk_size)
    for c in 1:nchunks
        lo = (c - 1) * chunk_size + 1
        hi = min(c * chunk_size, length(ids))
        raw = DataLoader.fetch_data(idx, ids[lo:hi], namespace).to_dict()
        combined = vcat(combined, parse_fetched_results(raw))
    end
    return combined
end
"""
    query(querytext, indexname, namespace; top_k=5, include_values=true)

Full query pipeline: embed `querytext`, query the index, fetch the
underlying records, and return the match scores joined with their data on id.

# Examples
```
import OstreaCultura as OC
OC.query("drama", "test-index", "test-namespace")
```
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    embedding = embed_query(querytext)
    matches = query_w_vector(embedding, indexname, namespace, top_k=top_k, include_values=include_values)
    # Fetch the stored records for the matched ids and line them up by id.
    records = fetch_data(matches.id, indexname, namespace)
    return innerjoin(matches, records, on=:id)
end
"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)

Keep only the rows whose claim score strictly exceeds their counterclaim
score, joined by `:id`. Ids absent from `counterclaim_results` are treated
as having a counterclaim score of 0.0.

Returns a new `DataFrame` with `:claim_score` and `:counterclaim_score`
columns. Unlike the previous version, the input DataFrames are not mutated.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Non-mutating renames: this function's name carries no `!`, so it
    # should not modify its arguments (the old `rename!` did).
    claims = rename(claim_results, :score => :claim_score)
    counters = rename(counterclaim_results, :score => :counterclaim_score)
    # Left join keeps every claim, even those with no counterclaim match.
    df = leftjoin(claims, counters, on=:id)
    # A missing counterclaim score means "no competing match" -> 0.0.
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    # Keep only rows where the claim outranks the counterclaim.
    return df[df.claim_score .> df.counterclaim_score, :]
end
"""
    query_claims(claim, counterclaim, indexname, namespace; threshold=0.8, top_k=5000)

Query with a claim/counterclaim pair and return the fetched records whose
claim similarity both beats the counterclaim similarity and exceeds
`threshold`. Returns an empty `DataFrame` when nothing qualifies.

# Examples
```
import OstreaCultura as OC
claim = "Climate change is a hoax"
counterclaim = "Climate change is real"
hi = OC.query_claims(claim, counterclaim, "test-index", "test-namespace")
```
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000) # top_k for the initial query
    # Embed both sides of the claim pair.
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)
    # Query both embeddings against the same namespace.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
    # Keep ids that score higher for the claim than the counterclaim.
    allscores = filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Filter to scores above the threshold.
    allscores = allscores[allscores.claim_score .> threshold, :]
    if size(allscores, 1) == 0
        # Previously this message was a bare string expression (a no-op);
        # log it so callers can see why the result is empty.
        @info "No claims were above the threshold"
        return DataFrame()
    else
        # Fetch the underlying records and merge them with the scores on id.
        resulting_data = fetch_data(allscores.id, indexname, namespace)
        return innerjoin(allscores, resulting_data, on=:id)
    end
end
"""
    classify_claim(claim, counterclaim, indexname, namespace; threshold=0.8, top_k=10)

Classify a claim against the existing misinformation library. Returns a
tuple `(resulting_data, counterclaim_score)`: the fetched records whose
claim similarity exceeds `threshold` (with their scores in a `scores`
column), and the top similarity score for `counterclaim` (0.0 when
`counterclaim` is empty).

# Examples
```
import OstreaCultura as OC
claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
hi, counterscore = OC.classify_claim(claim, counterclaim, "ostreacultura-v1", "modified-misinfo-library")
```
"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10) # top_k for the initial query
    claim_vector = embed_query(claim)
    # An empty counterclaim skips the counter-query entirely.
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
        counterclaim_score = counterclaim_results.score[1]
    else
        counterclaim_score = 0.0
    end
    # Query the claim embedding and keep matches above the threshold.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    claim_results = claim_results[claim_results.score .> threshold, :]
    if size(claim_results, 1) == 0
        return DataFrame(), counterclaim_score
    end
    # Fetch the underlying records for the surviving ids.
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    # Attach scores by joining on id instead of assigning positionally:
    # `fetch_data` builds its rows from Dict keys, so its row order is not
    # guaranteed to match `claim_results` (and rows can be dropped when an
    # id is not found), which made the old positional assignment wrong.
    scoretable = DataFrame(id = claim_results.id, scores = claim_results.score)
    resulting_data = innerjoin(resulting_data, scoretable, on=:id)
    return resulting_data, counterclaim_score
end
"""
    generate_sparse_model()

Fit a BM25 sparse encoder over the bundled `data/random_300k.csv` corpus and
return `(vector, bm25)` as produced by `DataLoader.encode_documents`.
"""
function generate_sparse_model()
    df = DataLoader.pd.read_csv("data/random_300k.csv")
    corpus = df["text"].tolist()
    # Call DataLoader directly, matching every other function in this file:
    # referring to the enclosing module as `OC.` from inside itself fails
    # unless the package imports itself.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end