|
|
|
|
|
""" |
|
# This dataset matches the example data from DataLoader.py |
|
import OstreaCultura as OC |
|
hi = OC.example_data() |
|
hi = OC.df_to_pd(hi) |
|
OC.DataLoader.create_vectors_from_df(hi) |
|
""" |
|
function example_data()
    # Two toy records mirroring the sample data in DataLoader.py:
    # a 4-dimensional embedding plus an id and a genre tag per row.
    return DataFrame(
        Embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
        id = ["vec1", "vec2"],
        genre = ["drama", "action"],
    )
end
|
|
|
""" |
|
df= OC.DataLoader.pd.read_csv("data/Indicator_Test.csv") |
|
df_julia = OC.pd_to_df(df) |
|
""" |
|
function pd_to_df(df_pd)
    # Copy each pandas column (via its underlying `.values` array) into a
    # fresh Julia DataFrame, preserving the original column order.
    out = DataFrame()
    for colname in df_pd.columns
        out[!, colname] = getproperty(df_pd, colname).values
    end
    return out
end
|
|
|
""" |
|
Available functions |
|
pc.create_index - see below |
|
pc.delete_index: pc.delete_index(index_name) |
|
""" |
|
# Control-plane Pinecone client; reads the key from the PINECONE_API_KEY env var.
create_pinecone_context() = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
|
|
|
""" |
|
# Context for inference endpoints |
|
""" |
|
# Inference-endpoint Pinecone client; note the key is passed positionally here.
create_inf_pinecone_context() = DataLoader.Pinecone(ENV["PINECONE_API_KEY"])
|
|
|
""" |
|
pc = create_pinecone_context() |
|
create_index("new-index", 4, "cosine", "aws", "us-east-1") |
|
""" |
|
function create_index(name, dimension, metric, cloud, region)
    # Build a control-plane client, then delegate index creation to DataLoader.
    ctx = create_pinecone_context()
    return DataLoader.create_index(ctx, name, dimension, metric, cloud, region)
end
|
|
|
""" |
|
import OstreaCultura as OC |
|
df = OC.DataLoader.pd.read_csv("data/climate_test.csv") |
|
model = "multilingual-e5-large" |
|
out = OC.multi_embeddings(model, df, 96, "text") |
|
# Id and Embeddings are required columns in the DataFrame |
|
OC.upsert_data(out, "test-index", "test-namespace") |
|
|
|
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv") |
|
model = "multilingual-e5-large" |
|
test_embeds = OC.multi_embeddings(model, df, 96, "text") |
|
test_embeds_min = test_embeds.head(10) |
|
# Id and Embeddings are required columns in the DataFrame |
|
OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100) |
|
|
|
""" |
|
function upsert_data(df, indexname, namespace; chunk_size=1000)
    # Resolve the target index, then upsert rows in chunks so large
    # DataFrames stay within Pinecone's per-request limits.
    ctx = create_pinecone_context()
    target = ctx.Index(indexname)
    return DataLoader.chunk_df_and_upsert(target, df, namespace=namespace, chunk_size=chunk_size)
end
|
|
|
""" |
|
## How to query data using an existing embedding |
|
import OstreaCultura as OC; using DataFrames |
|
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"]) |
|
mydf = OC.multi_embeddings(mydf) |
|
vector = mydf.Embeddings[1] |
|
top_k = 5 |
|
include_values = true |
|
OC.query_data("test-index", "test-namespace", vector, top_k, include_values) |
|
""" |
|
function query_data(indexname, namespace, vector, top_k, include_values)
    # Run a dense-vector similarity query and return the raw response
    # converted to a plain dictionary.
    ctx = create_pinecone_context()
    target = ctx.Index(indexname)
    response = DataLoader.query_data(target, namespace, vector, top_k, include_values)
    return response.to_dict()
end
|
|
|
""" |
|
## How to query data using an existing hybrid embedding |
|
|
|
import OstreaCultura as OC; using DataFrames |
|
querytext = "drama" |
|
dense = OC.embed_query(querytext) |
|
top_k = 5 |
|
include_values = true |
|
include_metadata = true |
|
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense, OC.DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata) |
|
|
|
""" |
|
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    # Hybrid (dense + sparse) similarity query; the response is returned as a Dict.
    ctx = create_pinecone_context()
    target = ctx.Index(indexname)
    response = DataLoader.query_data_with_sparse(target, namespace, dense, sparse,
        top_k=top_k, include_values=include_values, include_metadata=include_metadata)
    return response.to_dict()
end
|
|
|
""" |
|
## Querying function for GGWP - using updated hybrid vector |
|
import OstreaCultura as OC |
|
claim = "drama" |
|
indexname = "oc-hybrid-library-index" |
|
ocmodel = "expanded-fact-checks" |
|
OC.search(claim, indexname, ocmodel, include_values=false, include_metadata=false) |
|
res = OC.search(claim, indexname, ocmodel) |
|
""" |
|
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    # Embed the claim, then run a hybrid query with an empty sparse vector —
    # effectively a dense-only search against the `ocmodel` namespace.
    densevec = embed_query(claim)
    emptysparse = DataLoader.empty_sparse_vector()
    return query_data_with_sparse(indexname, ocmodel, densevec, emptysparse,
        top_k, include_values, include_metadata)
end
|
|
|
# Thin wrapper around UnicodePlots.barplot with a default title.
unicodebarplot(x, y, title = "Query Matches") = UnicodePlots.barplot(x, y, title=title)
|
|
|
function searchresult_to_unicodeplot(searchresult)
    # Render the matches of a `search` result as a bar plot: one bar per
    # match, labelled by its (possibly truncated) text, sized by its score.
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    labels = [m["metadata"]["text"] for m in matches]

    # Truncate long labels to 41 characters. FIX: use `first(s, 41)` rather
    # than `s[1:41]` — Julia `String`s are byte-indexed, so slicing at a
    # fixed index can throw a StringIndexError on multi-byte (non-ASCII)
    # text even when `length(s) > 41` holds.
    shortened = [length(s) > 41 ? first(s, 41) * "..." : s for s in labels]
    return unicodebarplot(shortened, scores)
end
|
|
|
""" |
|
## Search and plot the results |
|
|
|
import OstreaCultura as OC |
|
claim = "drama" |
|
indexname = "oc-hybrid-library-index" |
|
ocmodel = "immigration" |
|
OC.searchplot(claim, indexname, ocmodel) |
|
""" |
|
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    # Convenience wrapper: run `search`, then plot the resulting matches.
    result = search(claim, indexname, ocmodel,
        top_k=top_k, include_values=include_values, include_metadata=include_metadata)
    return searchresult_to_unicodeplot(result)
end
|
|
|
""" |
|
import OstreaCultura as OC |
|
df = OC.DataLoader.pd.read_csv("data/climate_test.csv") |
|
model = "multilingual-e5-large" |
|
out = OC.multi_embeddings(model, df, 96, "text") |
|
|
|
using CSV, DataFrames |
|
tdat = CSV.read("data/climate_test.csv", DataFrame) |
|
OC.multi_embeddings(model, Pandas.DataFrame(tdat), 96, "text") |
|
""" |
|
function multi_embeddings(model, data, chunk_size, textcol)
    # Embed the `textcol` column of a pandas DataFrame in chunks via the
    # Pinecone inference endpoint for `model`.
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, data, chunk_size, textcol)
end
|
|
|
""" |
|
using CSV, DataFrames |
|
import OstreaCultura as OC |
|
tdat = CSV.read("data/climate_test.csv", DataFrame) |
|
OC.multi_embeddings(tdat) |
|
""" |
|
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    # Julia-DataFrame front end: convert to pandas, read optional settings
    # from kwargs (:model, :chunk_size, :textcol), then embed in chunks.
    pdata = df_to_pd(data)
    textcol = get(kwargs, :textcol, "text")
    chunk_size = get(kwargs, :chunk_size, 96)
    model = get(kwargs, :model, "multilingual-e5-large")
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, pdata, chunk_size, textcol)
end
|
|
|
""" |
|
## Julia DataFrame to pandas DataFrame |
|
""" |
|
# Convert a Julia DataFrame into its pandas equivalent.
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)
|
|
|
function embed_query(querytext; kwargs...)
    # Embed a single piece of text and return its dense embedding vector.
    #
    # FIX: keyword arguments (e.g. model=..., chunk_size=...) were previously
    # accepted but silently ignored; they are now forwarded to
    # `multi_embeddings(::DataFrames.DataFrame; kwargs...)`, which reads them.
    onerow = DataFrame(id = "vec1", text = querytext)
    embedded = multi_embeddings(onerow; kwargs...)
    return embedded.Embeddings[1]
end
|
|
|
""" |
|
## Query with a vector of embeddings |
|
import OstreaCultura as OC |
|
vector = rand(1024) |
|
indexname = "test-index" |
|
namespace = "test-namespace" |
|
vecresults = OC.query_w_vector(vector, indexname, namespace) |
|
""" |
|
function query_w_vector(vector, indexname, namespace; kwargs...)
    # Dense-vector query returning the matches as a DataFrame.
    # Optional kwargs: :top_k (default 5) and :include_values (default true).
    k = get(kwargs, :top_k, 5)
    with_values = get(kwargs, :include_values, true)

    ctx = create_pinecone_context()
    target = ctx.Index(indexname)
    queryresults = DataLoader.query_data(target, namespace, vector, k, with_values).to_dict()
    matches = queryresults["matches"]

    # Capture the embedding vectors (or missings) before tabulating: the
    # "values" entries do not fit the row-wise DataFrame construction below.
    stored = with_values ? [m["values"] for m in matches] : [missing for _ in matches]

    # Drop the "values" key so each match dict maps cleanly onto one row.
    foreach(m -> delete!(m, "values"), matches)

    table = DataFrame()
    for m in matches
        table = vcat(table, DataFrame(m))
    end

    # Re-attach the vectors as a single list-valued column.
    if with_values
        table[:, "values"] = stored
    end

    return table
end
|
|
|
""" |
|
import OstreaCultura as OC |
|
indexname = "test-index" |
|
namespace = "test-namespace" |
|
pc = OC.create_pinecone_context() |
|
vector = OC.embed_query("drama") |
|
queryresults = OC.query_w_vector(vector, indexname, namespace, top_k=5, include_values=false) |
|
### now, fetch the underlying data |
|
#fetched_data = OC.fetch_data(queryresults.id, indexname, namespace) |
|
index = pc.Index(indexname) |
|
resultfetch = OC.DataLoader.fetch_data(index, queryresults.id, namespace).to_dict() |
|
OC.parse_fetched_results(resultfetch) |
|
""" |
|
function parse_fetched_results(resultfetch)
    # Flatten a Pinecone fetch response into a DataFrame with one row per
    # vector's metadata plus an `id` column. Returns an empty DataFrame
    # (after an @info message) when nothing was fetched.
    vectors = resultfetch["vectors"]
    if length(vectors) > 0
        ids = collect(keys(vectors))

        out = DataFrame()
        for id in ids
            # FIX: the original used plain `vcat` and fell back to
            # `cols=:union` inside a catch block — try/catch as control flow.
            # `cols=:union` is safe unconditionally: identical result when
            # schemas match, missing-filled columns when they differ.
            out = vcat(out, DataFrame(vectors[id]["metadata"]), cols=:union)
        end

        # `ids` was iterated in the same order as the metadata rows above,
        # so this column stays aligned with the rows.
        out[!, :id] = ids
        return out
    else
        @info "No data found"
        return DataFrame()
    end
end
|
|
|
""" |
|
import OstreaCultura as OC |
|
indexname = "test-index" |
|
namespace = "test-namespace" |
|
pc = OC.create_pinecone_context() |
|
index = pc.Index(indexname) |
|
ids = ["OSJeL7", "3TxWTNpPn"] |
|
query_results_as_dataframe = OC.fetch_data(ids, indexname, namespace) |
|
""" |
|
function fetch_data(ids, indexname, namespace; chunk_size=900)
    # Fetch records by id in chunks (Pinecone caps fetch sizes), parse each
    # response, and stack the results into a single DataFrame.
    ctx = create_pinecone_context()
    target = ctx.Index(indexname)

    combined = DataFrame()
    for chunk in Iterators.partition(ids, chunk_size)
        raw = DataLoader.fetch_data(target, collect(chunk), namespace).to_dict()
        combined = vcat(combined, parse_fetched_results(raw))
    end
    return combined
end
|
|
|
""" |
|
## FINAL Query function - embeds, queries, and fetches data |
|
import OstreaCultura as OC |
|
querytext = "drama" |
|
indexname = "test-index" |
|
namespace = "test-namespace" |
|
OC.query(querytext, indexname, namespace) |
|
""" |
|
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    # End-to-end search: embed the text, query the index, fetch the matched
    # records' metadata, and join everything on :id.
    k = get(kwargs, :top_k, 5)
    with_values = get(kwargs, :include_values, true)

    vector = embed_query(querytext)
    hits = query_w_vector(vector, indexname, namespace, top_k=k, include_values=with_values)
    records = fetch_data(hits.id, indexname, namespace)

    return innerjoin(hits, records, on=:id)
end
|
|
|
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Keep only claim matches whose similarity score beats the score of the
    # same id in the counterclaim results.
    # NOTE: mutates both inputs by renaming their :score columns in place.
    rename!(claim_results, :score => :claim_score)
    rename!(counterclaim_results, :score => :counterclaim_score)

    joined = leftjoin(claim_results, counterclaim_results, on=:id)

    # Ids with no counterclaim match count as a counterclaim score of 0.0.
    joined.counterclaim_score = coalesce.(joined.counterclaim_score, 0.0)

    return joined[joined.claim_score .> joined.counterclaim_score, :]
end
|
|
|
""" |
|
## Query with claims and counterclaims |
|
import OstreaCultura as OC |
|
|
|
claim = "Climate change is a hoax" |
|
counterclaim = "Climate change is real" |
|
indexname = "test-index" |
|
namespace = "test-namespace" |
|
hi = OC.query_claims(claim, counterclaim, indexname, namespace) |
|
""" |
|
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    # Query with a claim and a counterclaim, keep the ids that are closer to
    # the claim and above `threshold`, then fetch and join their metadata.
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000)

    hits = query_w_vector(embed_query(claim), indexname, namespace,
        top_k=top_k, include_values=false)
    counterhits = query_w_vector(embed_query(counterclaim), indexname, namespace,
        top_k=top_k, include_values=false)

    kept = filter_claims_closer_to_counterclaims(hits, counterhits)
    kept = kept[kept.claim_score .> threshold, :]

    # Guard clause: nothing survived the threshold.
    if size(kept, 1) == 0
        @info "No claims were above the threshold"
        return DataFrame()
    end

    fetched = fetch_data(kept.id, indexname, namespace)
    return innerjoin(kept, fetched, on=:id)
end
|
|
|
|
|
""" |
|
## Classify a claim against the existing misinformation library |
|
import OstreaCultura as OC |
|
|
|
## Example 1 |
|
claim = "There is a lot of dispute about whether the Holocaust happened" |
|
counterclaim = "The Holocaust is a well-documented historical event" |
|
indexname = "ostreacultura-v1" |
|
namespace = "modified-misinfo-library" |
|
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) |
|
|
|
## Example 2 |
|
claim = "it's cool to be trans these days" |
|
counterclaim = "" |
|
indexname = "ostreacultura-v1" |
|
namespace = "modified-misinfo-library" |
|
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) |
|
|
|
## Example 3 |
|
claim = "No existe racismo contra las personas negras" |
|
counterclaim = "Racism is a systemic issue that affects people of color" |
|
indexname = "ostreacultura-v1" |
|
namespace = "modified-misinfo-library" |
|
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) |
|
|
|
""" |
|
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    # Classify a claim against the misinformation library: return the fetched
    # metadata of matches above `threshold` (with their scores) plus the top
    # counterclaim score. Optional kwargs: :threshold (0.8), :top_k (10).
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10)

    claim_vector = embed_query(claim)

    # Score of the single best counterclaim match; 0.0 when no counterclaim
    # is given or (FIX) when its query returns no matches — previously
    # `.score[1]` would throw on an empty result.
    counterclaim_score = 0.0
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace,
            top_k=top_k, include_values=false)
        if size(counterclaim_results, 1) > 0
            counterclaim_score = counterclaim_results.score[1]
        end
    end

    claim_results = query_w_vector(claim_vector, indexname, namespace,
        top_k=top_k, include_values=false)
    claim_results = claim_results[claim_results.score .> threshold, :]

    resulting_data = fetch_data(claim_results.id, indexname, namespace)

    # FIX: `parse_fetched_results` orders rows by Dict-key iteration, so the
    # fetched rows are NOT in `claim_results` order. The original assigned
    # scores positionally (`resulting_data.scores = claim_results.score`),
    # silently misaligning score and row; match by :id instead.
    scoremap = select(claim_results, :id, :score => :scores)
    resulting_data = innerjoin(resulting_data, scoremap, on=:id)

    return resulting_data, counterclaim_score
end
|
|
|
"""
    generate_sparse_model(path="data/random_300k.csv")

Fit a BM25 sparse encoder over the `text` column of the CSV at `path` and
return `(vector, bm25)` as produced by `DataLoader.encode_documents`.
"""
function generate_sparse_model(path::AbstractString="data/random_300k.csv")
    df = DataLoader.pd.read_csv(path)
    corpus = df["text"].tolist()
    # FIX: was `OC.DataLoader.encode_documents` — `OC` is the caller-side
    # import alias and is not defined inside this module; every other
    # function here calls `DataLoader.` directly.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end