### PineCone Embed and I/O Functions

"""
    example_data()

Return a two-row `DataFrame` with `Embeddings`, `id` and `genre` columns.
This dataset matches the example data from DataLoader.py.

# Examples
```julia
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
```
"""
function example_data()
    return DataFrame(;
        Embeddings=[[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
        id=["vec1", "vec2"],
        genre=["drama", "action"],
    )
end

"""
    pd_to_df(df_pd)

Build a Julia `DataFrame` holding one column per entry of `df_pd.columns`, each
filled from the `.values` of the matching property of `df_pd`.

# Examples
```julia
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
```
"""
function pd_to_df(df_pd)
    df = DataFrame()
    for col in df_pd.columns
        # NOTE(review): columns are read through property access, so a column whose
        # name collides with an existing property of `df_pd` would presumably not
        # resolve to the column -- confirm against the data you load.
        df[!, col] = getproperty(df_pd, col).values
    end
    return df
end

"""
    create_pinecone_context()

Create a Pinecone client through `DataLoader.Pinecone`, passing the
`PINECONE_API_KEY` environment variable as the `api_key` keyword.

Available functions on the returned object:
- `pc.create_index` - see `create_index` below
- `pc.delete_index`: `pc.delete_index(index_name)`
"""
function create_pinecone_context()
    return DataLoader.Pinecone(; api_key=ENV["PINECONE_API_KEY"])
end

"""
    create_inf_pinecone_context()

Create the Pinecone client used as context for inference endpoints. The
`PINECONE_API_KEY` environment variable is passed positionally.
"""
function create_inf_pinecone_context()
    return DataLoader.Pinecone(ENV["PINECONE_API_KEY"])
end

"""
    create_index(name, dimension, metric, cloud, region)

Create a Pinecone index by forwarding all arguments, together with a fresh client
from `create_pinecone_context`, to `DataLoader.create_index`.

# Examples
```julia
pc = create_pinecone_context()
create_index("new-index", 4, "cosine", "aws", "us-east-1")
```
"""
function create_index(name, dimension, metric, cloud, region)
    pc = create_pinecone_context()
    return DataLoader.create_index(pc, name, dimension, metric, cloud, region)
end

"""
    upsert_data(df, indexname, namespace; chunk_size=1000)

Upsert the rows of `df` into `namespace` of the index `indexname` by delegating to
`DataLoader.chunk_df_and_upsert`. `Id` and `Embeddings` are required columns in
the DataFrame.

# Examples
```julia
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(out, "test-index", "test-namespace")

df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
model = "multilingual-e5-large"
test_embeds = OC.multi_embeddings(model, df, 96, "text")
test_embeds_min = test_embeds.head(10)
# Id and Embeddings are required columns in the DataFrame
OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100)
```
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    return DataLoader.chunk_df_and_upsert(
        index, df; namespace=namespace, chunk_size=chunk_size
    )
end

"""
    query_data(indexname, namespace, vector, top_k, include_values)

Query `namespace` of the index `indexname` with an existing embedding `vector`
and return the `.to_dict()` form of the `DataLoader.query_data` response.

# Examples
```julia
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
vector = mydf.Embeddings[1]
top_k = 5
include_values = true
OC.query_data("test-index", "test-namespace", vector, top_k, include_values)
```
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    response = DataLoader.query_data(index, namespace, vector, top_k, include_values)
    return response.to_dict()
end

"""
    query_data_with_sparse(indexname, namespace, dense, sparse, top_k,
                           include_values, include_metadata)

Query `namespace` of the index `indexname` with an existing hybrid embedding
(`dense` plus `sparse`) and return the `.to_dict()` form of the
`DataLoader.query_data_with_sparse` response.

# Examples
```julia
import OstreaCultura as OC; using DataFrames
querytext = "drama"
dense = OC.embed_query(querytext)
top_k = 5
include_values = true
include_metadata = true
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense,
    OC.DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata)
```
"""
function query_data_with_sparse(
    indexname, namespace, dense, sparse, top_k, include_values, include_metadata
)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    response = DataLoader.query_data_with_sparse(
        index,
        namespace,
        dense,
        sparse;
        top_k=top_k,
        include_values=include_values,
        include_metadata=include_metadata,
    )
    return response.to_dict()
end

"""
    search(claim, indexname, ocmodel; top_k=5, include_values=true,
           include_metadata=true)

Querying function for GGWP - using the updated hybrid vector. Embeds `claim` with
`embed_query` and queries namespace `ocmodel` of `indexname` through
`query_data_with_sparse`, using `DataLoader.empty_sparse_vector()` as the sparse
part.

# Examples
```julia
import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "expanded-fact-checks"
OC.search(claim, indexname, ocmodel, include_values=false, include_metadata=false)
res = OC.search(claim, indexname, ocmodel)
```
"""
function search(
    claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true
)
    dense = embed_query(claim)
    return query_data_with_sparse(
        indexname,
        ocmodel,
        dense,
        DataLoader.empty_sparse_vector(),
        top_k,
        include_values,
        include_metadata,
    )
end

"""
    unicodebarplot(x, y, title="Query Matches")

Draw a `UnicodePlots.barplot` of `y` against the labels `x` with the given title.
"""
function unicodebarplot(x, y, title="Query Matches")
    return UnicodePlots.barplot(x, y; title=title)
end

# Shorten `text` to at most `maxchars` characters, appending "..." when it was cut.
# `first` counts characters, so multi-byte (non-ASCII) text is cut safely, unlike
# byte-range indexing such as `text[1:maxchars]`.
function _truncate_label(text, maxchars)
    return length(text) > maxchars ? first(text, maxchars) * "..." : text
end

"""
    searchresult_to_unicodeplot(searchresult)

Plot the `"score"` of every entry of `searchresult["matches"]` as a bar chart
labelled with that entry's `["metadata"]["text"]`, reduced to 41 characters.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    labels = [_truncate_label(m["metadata"]["text"], 41) for m in matches]
    return unicodebarplot(labels, scores)
end

"""
    searchplot(claim, indexname, ocmodel; top_k=5, include_values=true,
               include_metadata=true)

Search and plot the results: runs `search` and passes its result to
`searchresult_to_unicodeplot`.

# Examples
```julia
import OstreaCultura as OC
claim = "drama"
indexname = "oc-hybrid-library-index"
ocmodel = "immigration"
OC.searchplot(claim, indexname, ocmodel)
```
"""
function searchplot(
    claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true
)
    searchresult = search(
        claim,
        indexname,
        ocmodel;
        top_k=top_k,
        include_values=include_values,
        include_metadata=include_metadata,
    )
    return searchresult_to_unicodeplot(searchresult)
end

"""
    multi_embeddings(model, data, chunk_size, textcol)

Embed the `textcol` column of `data` with `model` by calling
`DataLoader.chunk_and_embed` with an inference client from
`create_inf_pinecone_context`.

# Examples
```julia
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")

using CSV, DataFrames
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(model, Pandas.DataFrame(tdat), 96, "text")
```
"""
function multi_embeddings(model, data, chunk_size, textcol)
    pc = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(pc, model, data, chunk_size, textcol)
end

"""
    multi_embeddings(data::DataFrames.DataFrame; model="multilingual-e5-large",
                     chunk_size=96, textcol="text", kwargs...)

Convert a Julia `DataFrame` with `df_to_pd` and embed it through the four-argument
method. Any additional keyword arguments are accepted and ignored.

# Examples
```julia
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
```
"""
function multi_embeddings(
    data::DataFrames.DataFrame;
    model="multilingual-e5-large",
    chunk_size=96,
    textcol="text",
    kwargs...
)
    return multi_embeddings(model, df_to_pd(data), chunk_size, textcol)
end

"""
    df_to_pd(df::DataFrames.DataFrame)

Julia DataFrame to pandas DataFrame: returns `pdataframe(df)`.
"""
function df_to_pd(df::DataFrames.DataFrame)
    return pdataframe(df)
end

"""
    embed_query(querytext; kwargs...)

Embed a single query text and return the first entry of the resulting
`Embeddings` column. The text is wrapped in a one-row `DataFrame` with
`id = "vec1"` and embedded with the default settings of `multi_embeddings`.

NOTE(review): `kwargs` are accepted but not forwarded to `multi_embeddings`; they
currently have no effect.
"""
function embed_query(querytext; kwargs...)
    firstdf = DataFrame(; id="vec1", text=querytext)
    embedded = multi_embeddings(firstdf)
    return embedded.Embeddings[1]
end

"""
    query_w_vector(vector, indexname, namespace; top_k=5, include_values=true,
                   kwargs...)

Query with a vector of embeddings and return the matches as a `DataFrame`, one
row per match. The `"values"` entry is removed from every match before the frame
is built and, when `include_values` is true, re-attached afterwards as a single
`values` column. Returns an empty `DataFrame` when there are no matches. Any
additional keyword arguments are accepted and ignored.

# Examples
```julia
import OstreaCultura as OC
vector = rand(1024)
indexname = "test-index"
namespace = "test-namespace"
vecresults = OC.query_w_vector(vector, indexname, namespace)
```
"""
function query_w_vector(
    vector, indexname, namespace; top_k=5, include_values=true, kwargs...
)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    response = DataLoader.query_data(index, namespace, vector, top_k, include_values)
    matches = response.to_dict()["matches"]

    # Capture the embeddings before they are dropped from the match dictionaries.
    values_vector = include_values ? [m["values"] for m in matches] : nothing

    # Drop the "values" key from each dict so it doesn't get added to the DataFrame
    # (a vector-valued entry would otherwise be expanded into many rows).
    for m in matches
        delete!(m, "values")
    end

    # One concatenation instead of growing the frame match by match.
    out = isempty(matches) ? DataFrame() : reduce(vcat, [DataFrame(m) for m in matches])

    if include_values
        out[!, "values"] = values_vector
    end
    return out
end

"""
    parse_fetched_results(resultfetch)

Turn a fetch response (already converted with `.to_dict()`) into a `DataFrame`
with one block of rows per fetched vector, built from each vector's `"metadata"`,
plus an `id` column holding the keys of `resultfetch["vectors"]`. Metadata entries
with differing keys are combined column-wise (`cols=:union`), leaving `missing`
where a key is absent. Logs "No data found" and returns an empty `DataFrame` when
no vectors were fetched.

# Examples
```julia
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
vector = OC.embed_query("drama")
queryresults = OC.query_w_vector(vector, indexname, namespace, top_k=5, include_values=false)
### now, fetch the underlying data
#fetched_data = OC.fetch_data(queryresults.id, indexname, namespace)
index = pc.Index(indexname)
resultfetch = OC.DataLoader.fetch_data(index, queryresults.id, namespace).to_dict()
OC.parse_fetched_results(resultfetch)
```
"""
function parse_fetched_results(resultfetch)
    vectors = resultfetch["vectors"]
    if isempty(vectors)
        @info "No data found"
        return DataFrame()
    end

    ids = collect(keys(vectors))
    ## Create a DataFrame from the metadata of every fetched vector
    frames = [DataFrame(vectors[id]["metadata"]) for id in ids]
    # `cols=:union` gives the same result as a strict vcat when all column sets
    # agree, so no try/catch fallback is needed.
    out = reduce(vcat, frames; cols=:union)
    out[!, :id] = ids
    return out
end

"""
    fetch_data(ids, indexname, namespace; chunk_size=900)

Fetch the vectors with the given `ids` from `namespace` of `indexname`, at most
`chunk_size` ids per request, and return their parsed metadata (see
`parse_fetched_results`) as one `DataFrame`. Chunks with differing metadata
columns are combined with `cols=:union`.

# Throws
- `ArgumentError` if `chunk_size` is not positive.

# Examples
```julia
import OstreaCultura as OC
indexname = "test-index"
namespace = "test-namespace"
pc = OC.create_pinecone_context()
index = pc.Index(indexname)
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, indexname, namespace)
```
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    chunk_size > 0 ||
        throw(ArgumentError("chunk_size must be positive, got $(chunk_size)"))

    pc = create_pinecone_context()
    index = pc.Index(indexname)
    result_out = DataFrame()
    for start in 1:chunk_size:length(ids)
        stop = min(start + chunk_size - 1, length(ids))
        resultfetch = DataLoader.fetch_data(index, ids[start:stop], namespace).to_dict()
        result_out = vcat(result_out, parse_fetched_results(resultfetch); cols=:union)
    end
    return result_out
end

"""
    query(querytext::String, indexname::String, namespace::String;
          top_k=5, include_values=true, kwargs...)

FINAL Query function - embeds, queries, and fetches data. The query matches and
the fetched metadata are inner-joined on `id`; clashing column names are made
unique. Returns an empty `DataFrame` when there are no matches or nothing could
be fetched. Any additional keyword arguments are accepted and ignored.

# Examples
```julia
import OstreaCultura as OC
querytext = "drama"
indexname = "test-index"
namespace = "test-namespace"
OC.query(querytext, indexname, namespace)
```
"""
function query(
    querytext::String,
    indexname::String,
    namespace::String;
    top_k=5,
    include_values=true,
    kwargs...
)
    vector = embed_query(querytext)
    queryresults = query_w_vector(
        vector, indexname, namespace; top_k=top_k, include_values=include_values
    )
    # Without matches there is no `id` column to fetch or join on.
    isempty(queryresults) && return DataFrame()

    ### now, fetch the underlying data
    fetched_data = fetch_data(queryresults.id, indexname, namespace)
    isempty(fetched_data) && return DataFrame()

    # join the two dataframes on id
    return innerjoin(queryresults, fetched_data; on=:id, makeunique=true)
end

"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)

Return the rows of `claim_results` whose score is strictly greater than the score
the same `id` received in `counterclaim_results`. Ids absent from
`counterclaim_results` are treated as having a counterclaim score of `0.0`.

The result carries the columns of `claim_results` with `score` renamed to
`claim_score`, plus `counterclaim_score`. Neither input is modified. When
`claim_results` is empty, an empty frame with the columns `id`, `claim_score`
and `counterclaim_score` is returned.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    if isempty(claim_results)
        return DataFrame(; id=String[], claim_score=Float64[], counterclaim_score=Float64[])
    end

    # Rename scores to avoid conflicts; work on copies so the caller's frames keep
    # their `score` column.
    claims = rename!(copy(claim_results), :score => :claim_score)
    counters = if isempty(counterclaim_results)
        DataFrame(; id=similar(claims.id, 0), counterclaim_score=Float64[])
    else
        # Only the id and score are needed from the counterclaim side.
        rename!(counterclaim_results[:, [:id, :score]], :score => :counterclaim_score)
    end

    # Left join: every claim row is kept, matched or not
    df = leftjoin(claims, counters; on=:id)
    # Fill missing values with 0
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    # Keep only results where the claim score is greater than the counterclaim score
    return df[df.claim_score .> df.counterclaim_score, :]
end

"""
    query_claims(claim::String, counterclaim::String, indexname::String,
                 namespace::String; threshold=0.8, top_k=5000, kwargs...)

Query with claims and counterclaims. Both texts are embedded and queried with
`top_k` matches each; an id is kept when its claim score exceeds both its
counterclaim score and `threshold`. The surviving ids are fetched and joined with
their scores on `id`. Returns an empty `DataFrame` when nothing qualifies. Any
additional keyword arguments are accepted and ignored.

# Examples
```julia
import OstreaCultura as OC
claim = "Climate change is a hoax"
counterclaim = "Climate change is real"
indexname = "test-index"
namespace = "test-namespace"
hi = OC.query_claims(claim, counterclaim, indexname, namespace)
```
"""
function query_claims(
    claim::String,
    counterclaim::String,
    indexname::String,
    namespace::String;
    threshold=0.8,
    top_k=5000, # top_k for the initial query
    kwargs...
)
    # Get embeddings
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)

    # Query the embeddings
    claim_results = query_w_vector(
        claim_vector, indexname, namespace; top_k=top_k, include_values=false
    )
    counterclaim_results = query_w_vector(
        counterclaim_vector, indexname, namespace; top_k=top_k, include_values=false
    )

    # If a given id has a greater score for the claim than the counterclaim, keep it
    allscores = filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Filter to scores above the threshold
    allscores = allscores[allscores.claim_score .> threshold, :]
    if size(allscores, 1) == 0
        @info "No claims were above the threshold"
        return DataFrame()
    end

    ## now, fetch the data
    resulting_data = fetch_data(allscores.id, indexname, namespace)
    isempty(resulting_data) && return DataFrame()

    # merge the data on id
    return innerjoin(allscores, resulting_data; on=:id, makeunique=true)
end

"""
    classify_claim(claim::String, counterclaim::String, indexname::String,
                   namespace::String; threshold=0.8, top_k=10, kwargs...)

Classify a claim against the existing misinformation library. Returns a tuple
`(resulting_data, counterclaim_score)`:

- `resulting_data`: fetched metadata of the claim matches scoring above
  `threshold`, with a `scores` column holding each row's match score (looked up by
  `id`). It is an empty frame with only a `scores` column when nothing qualifies.
- `counterclaim_score`: the `score` of the first row returned for `counterclaim`
  (presumably the best match -- confirm the ordering of the query response), or
  `0.0` when `counterclaim` is empty or has no matches.

Any additional keyword arguments are accepted and ignored.

# Examples
```julia
import OstreaCultura as OC
## Example 1
claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

## Example 2
claim = "it's cool to be trans these days"
counterclaim = ""
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

## Example 3
claim = "No existe racismo contra las personas negras"
counterclaim = "Racism is a systemic issue that affects people of color"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)
```
"""
function classify_claim(
    claim::String,
    counterclaim::String,
    indexname::String,
    namespace::String;
    threshold=0.8,
    top_k=10, # top_k for the initial query
    kwargs...
)
    # Get embeddings
    claim_vector = embed_query(claim)

    counterclaim_score = 0.0
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(
            counterclaim_vector, indexname, namespace; top_k=top_k, include_values=false
        )
        # An empty result has no `score` column to read from.
        if !isempty(counterclaim_results)
            counterclaim_score = counterclaim_results.score[1]
        end
    end

    # Query the embeddings
    claim_results = query_w_vector(
        claim_vector, indexname, namespace; top_k=top_k, include_values=false
    )
    isempty(claim_results) && return DataFrame(; scores=Float64[]), counterclaim_score

    # Filter to scores above the threshold
    claim_results = claim_results[claim_results.score .> threshold, :]

    ## now, fetch the data
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    isempty(resulting_data) && return DataFrame(; scores=Float64[]), counterclaim_score

    # Fetched rows come back in the key order of the fetch response, not in query
    # order, so attach each score by id rather than by position.
    score_by_id = Dict(zip(claim_results.id, claim_results.score))
    resulting_data.scores = [get(score_by_id, id, missing) for id in resulting_data.id]
    return resulting_data, counterclaim_score
end

"""
    generate_sparse_model()

Read `data/random_300k.csv`, take its `text` column as the corpus and pass it to
`DataLoader.encode_documents`. Returns the resulting `(vector, bm25)` pair.
"""
function generate_sparse_model()
    df = DataLoader.pd.read_csv("data/random_300k.csv")
    corpus = df["text"].tolist()
    # `DataLoader` is referenced directly, like everywhere else in this file; the
    # `OC` alias only exists in caller-side examples.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end