Spaces:
Sleeping
Sleeping
### Pinecone Embed and I/O Functions
"""
    example_data()

Build a tiny two-row `DataFrame` mirroring the sample data in `DataLoader.py`.

# Examples
```
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
```
"""
function example_data()
    return DataFrame(
        Embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
        id = ["vec1", "vec2"],
        genre = ["drama", "action"],
    )
end
"""
    pd_to_df(df_pd)

Convert a pandas DataFrame into a Julia `DataFrame`, copying column by column.

# Examples
```
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
```
"""
function pd_to_df(df_pd)
    out = DataFrame()
    for colname in df_pd.columns
        out[!, colname] = getproperty(df_pd, colname).values
    end
    return out
end
"""
    create_pinecone_context()

Return a Pinecone client authenticated via the `PINECONE_API_KEY`
environment variable.

The returned client exposes, among others:
- `pc.create_index` (see the `create_index` wrapper below)
- `pc.delete_index(index_name)`
"""
function create_pinecone_context()
    return DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
end
"""
    create_inf_pinecone_context()

Return a Pinecone client for inference endpoints, authenticated via the
`PINECONE_API_KEY` environment variable.
"""
function create_inf_pinecone_context()
    # Pass the key by keyword for consistency with `create_pinecone_context`
    # (previously it was passed positionally).
    pc = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
    return pc
end
"""
    create_index(name, dimension, metric, cloud, region)

Create a new Pinecone index.

# Examples
```
create_index("new-index", 4, "cosine", "aws", "us-east-1")
```
"""
function create_index(name, dimension, metric, cloud, region)
    client = create_pinecone_context()
    return DataLoader.create_index(client, name, dimension, metric, cloud, region)
end
"""
    upsert_data(df, indexname, namespace; chunk_size=1000)

Upsert a DataFrame into a Pinecone index in chunks of `chunk_size` rows.
`Id` and `Embeddings` are required columns in `df`.

# Examples
```
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
OC.upsert_data(out, "test-index", "test-namespace")
OC.upsert_data(out, "test-index", "indicator-test-namespace", chunk_size=100)
```
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    client = create_pinecone_context()
    target = client.Index(indexname)
    return DataLoader.chunk_df_and_upsert(target, df, namespace=namespace, chunk_size=chunk_size)
end
"""
    query_data(indexname, namespace, vector, top_k, include_values)

Query a Pinecone index with an existing dense embedding and return the raw
response as a `Dict`.

# Examples
```
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
OC.query_data("test-index", "test-namespace", mydf.Embeddings[1], 5, true)
```
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    return DataLoader.query_data(idx, namespace, vector, top_k, include_values).to_dict()
end
"""
    query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)

Hybrid query: send both a dense and a sparse vector to a Pinecone index and
return the raw response as a `Dict`.

# Examples
```
import OstreaCultura as OC
dense = OC.embed_query("drama")
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense,
    OC.DataLoader.empty_sparse_vector(), 5, true, true)
```
"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    response = DataLoader.query_data_with_sparse(idx, namespace, dense, sparse,
        top_k=top_k, include_values=include_values, include_metadata=include_metadata)
    return response.to_dict()
end
"""
    search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)

Embed `claim` and run a hybrid query (dense vector plus an empty sparse
vector) against the `ocmodel` namespace of `indexname`.

# Examples
```
import OstreaCultura as OC
res = OC.search("drama", "oc-hybrid-library-index", "expanded-fact-checks")
```
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    densevec = embed_query(claim)
    return query_data_with_sparse(indexname, ocmodel, densevec,
        DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata)
end
"""
    unicodebarplot(x, y, title="Query Matches")

Render a terminal bar plot of values `y` against labels `x` via UnicodePlots.
"""
unicodebarplot(x, y, title = "Query Matches") = UnicodePlots.barplot(x, y, title=title)
"""
    searchresult_to_unicodeplot(searchresult)

Plot the match scores from a `search` result as a terminal bar chart,
labelling each bar with its (truncated) metadata text.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    text = [m["metadata"]["text"] for m in matches]
    # Truncate labels to 41 characters. `first(x, 41)` counts characters, so
    # it is safe for multi-byte UTF-8 text where the old `x[1:41]` byte
    # slicing could throw a StringIndexError mid-character.
    text_to_show = [length(x) > 41 ? first(x, 41) * "..." : x for x in text]
    return unicodebarplot(text_to_show, scores)
end
"""
    searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)

Run `search` and render the matches as a terminal bar plot.

# Examples
```
import OstreaCultura as OC
OC.searchplot("drama", "oc-hybrid-library-index", "immigration")
```
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    result = search(claim, indexname, ocmodel, top_k=top_k,
        include_values=include_values, include_metadata=include_metadata)
    return searchresult_to_unicodeplot(result)
end
"""
    multi_embeddings(model, data, chunk_size, textcol)

Embed the `textcol` column of a pandas DataFrame in chunks of `chunk_size`
rows using a Pinecone inference model.

# Examples
```
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
```
"""
function multi_embeddings(model, data, chunk_size, textcol)
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, data, chunk_size, textcol)
end
"""
    multi_embeddings(data::DataFrames.DataFrame; model="multilingual-e5-large", chunk_size=96, textcol="text")

Convert a Julia `DataFrame` to pandas and embed its text column with a
Pinecone inference model.

# Examples
```
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
```
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    pdata = df_to_pd(data)
    model = get(kwargs, :model, "multilingual-e5-large")
    chunksize = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, pdata, chunksize, textcol)
end
"""
    df_to_pd(df::DataFrames.DataFrame)

Convert a Julia `DataFrame` into a pandas DataFrame.
"""
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)
"""
    embed_query(querytext; kwargs...)

Embed a single query string and return its dense embedding vector.

Keyword arguments (e.g. `model`, `chunk_size`, `textcol`) are forwarded to
`multi_embeddings`; previously they were accepted but silently ignored.
"""
function embed_query(querytext; kwargs...)
    firstdf = DataFrame(id = "vec1", text = querytext)
    firstdf = multi_embeddings(firstdf; kwargs...)
    return firstdf.Embeddings[1]
end
"""
    query_w_vector(vector, indexname, namespace; top_k=5, include_values=true)

Query a Pinecone index with a raw embedding vector and return the matches as
a `DataFrame` (one row per match). When `include_values` is true, the stored
embeddings are attached as a `values` column.

# Examples
```
import OstreaCultura as OC
vector = rand(1024)
vecresults = OC.query_w_vector(vector, "test-index", "test-namespace")
```
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    results = DataLoader.query_data(idx, namespace, vector, top_k, include_values).to_dict()
    matches = results["matches"]
    # Pull the raw embeddings out first so the "values" key never becomes a
    # per-row column during DataFrame construction.
    embeddings = include_values ? [m["values"] for m in matches] :
        [missing for m in matches]
    for m in matches
        delete!(m, "values")
    end
    table = DataFrame()
    for m in matches
        table = vcat(table, DataFrame(m))
    end
    if include_values
        table[:, "values"] = embeddings
    end
    return table
end
"""
    parse_fetched_results(resultfetch)

Convert a Pinecone fetch response (as a `Dict`) into a `DataFrame` of the
metadata for each returned vector, with the vector ids in an `id` column.
Returns an empty `DataFrame` (and logs a message) when nothing was fetched.

# Examples
```
import OstreaCultura as OC
pc = OC.create_pinecone_context()
index = pc.Index("test-index")
resultfetch = OC.DataLoader.fetch_data(index, ["vec1"], "test-namespace").to_dict()
OC.parse_fetched_results(resultfetch)
```
"""
function parse_fetched_results(resultfetch)
    vectors = resultfetch["vectors"]
    if isempty(vectors)
        # Previously this message was a bare string expression (a no-op).
        @info "No data found"
        return DataFrame()
    end
    ids = collect(keys(vectors))
    metadata = [vectors[id]["metadata"] for id in ids]
    out = DataFrame()
    for md in metadata
        # cols=:union tolerates rows whose metadata keys differ (absent
        # columns are filled with `missing`), replacing the old
        # try-plain-vcat / catch-with-union control flow.
        out = vcat(out, DataFrame(md), cols=:union)
    end
    out[!, :id] = ids
    return out
end
"""
    fetch_data(ids, indexname, namespace; chunk_size=900)

Fetch the records for `ids` from a Pinecone index, batching requests in
chunks of `chunk_size`, and return the combined metadata as a `DataFrame`.

# Examples
```
import OstreaCultura as OC
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, "test-index", "test-namespace")
```
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    client = create_pinecone_context()
    idx = client.Index(indexname)
    combined = DataFrame()
    nchunks = ceil(Int, length(ids) / chunk_size)
    for c in 1:nchunks
        lo = (c - 1) * chunk_size + 1
        hi = min(c * chunk_size, length(ids))
        raw = DataLoader.fetch_data(idx, ids[lo:hi], namespace).to_dict()
        combined = vcat(combined, parse_fetched_results(raw))
    end
    return combined
end
"""
    query(querytext, indexname, namespace; top_k=5, include_values=true)

Full query pipeline: embed `querytext`, query the index, fetch the
underlying records, and return the match scores joined with their data on id.

# Examples
```
import OstreaCultura as OC
OC.query("drama", "test-index", "test-namespace")
```
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    embedding = embed_query(querytext)
    matches = query_w_vector(embedding, indexname, namespace, top_k=top_k, include_values=include_values)
    # Fetch the stored records for the matched ids and line them up by id.
    records = fetch_data(matches.id, indexname, namespace)
    return innerjoin(matches, records, on=:id)
end
"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)

Keep only the rows whose claim score strictly exceeds their counterclaim
score, joined by `:id`. Ids absent from `counterclaim_results` are treated
as having a counterclaim score of 0.0.

Returns a new `DataFrame` with `:claim_score` and `:counterclaim_score`
columns. Unlike the previous version, the input DataFrames are not mutated.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Non-mutating renames: this function's name carries no `!`, so it
    # should not modify its arguments (the old `rename!` did).
    claims = rename(claim_results, :score => :claim_score)
    counters = rename(counterclaim_results, :score => :counterclaim_score)
    # Left join keeps every claim, even those with no counterclaim match.
    df = leftjoin(claims, counters, on=:id)
    # A missing counterclaim score means "no competing match" -> 0.0.
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    # Keep only rows where the claim outranks the counterclaim.
    return df[df.claim_score .> df.counterclaim_score, :]
end
"""
    query_claims(claim, counterclaim, indexname, namespace; threshold=0.8, top_k=5000)

Query with a claim/counterclaim pair and return the fetched records whose
claim similarity both beats the counterclaim similarity and exceeds
`threshold`. Returns an empty `DataFrame` when nothing qualifies.

# Examples
```
import OstreaCultura as OC
claim = "Climate change is a hoax"
counterclaim = "Climate change is real"
hi = OC.query_claims(claim, counterclaim, "test-index", "test-namespace")
```
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000) # top_k for the initial query
    # Embed both sides of the claim pair.
    claim_vector = embed_query(claim)
    counterclaim_vector = embed_query(counterclaim)
    # Query both embeddings against the same namespace.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
    # Keep ids that score higher for the claim than the counterclaim.
    allscores = filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Filter to scores above the threshold.
    allscores = allscores[allscores.claim_score .> threshold, :]
    if size(allscores, 1) == 0
        # Previously this message was a bare string expression (a no-op);
        # log it so callers can see why the result is empty.
        @info "No claims were above the threshold"
        return DataFrame()
    else
        # Fetch the underlying records and merge them with the scores on id.
        resulting_data = fetch_data(allscores.id, indexname, namespace)
        return innerjoin(allscores, resulting_data, on=:id)
    end
end
"""
    classify_claim(claim, counterclaim, indexname, namespace; threshold=0.8, top_k=10)

Classify a claim against the existing misinformation library. Returns a
tuple `(resulting_data, counterclaim_score)`: the fetched records whose
claim similarity exceeds `threshold` (with their scores in a `scores`
column), and the top similarity score for `counterclaim` (0.0 when
`counterclaim` is empty).

# Examples
```
import OstreaCultura as OC
claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
hi, counterscore = OC.classify_claim(claim, counterclaim, "ostreacultura-v1", "modified-misinfo-library")
```
"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10) # top_k for the initial query
    claim_vector = embed_query(claim)
    # An empty counterclaim skips the counter-query entirely.
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
        counterclaim_score = counterclaim_results.score[1]
    else
        counterclaim_score = 0.0
    end
    # Query the claim embedding and keep matches above the threshold.
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    claim_results = claim_results[claim_results.score .> threshold, :]
    if size(claim_results, 1) == 0
        return DataFrame(), counterclaim_score
    end
    # Fetch the underlying records for the surviving ids.
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    # Attach scores by joining on id instead of assigning positionally:
    # `fetch_data` builds its rows from Dict keys, so its row order is not
    # guaranteed to match `claim_results` (and rows can be dropped when an
    # id is not found), which made the old positional assignment wrong.
    scoretable = DataFrame(id = claim_results.id, scores = claim_results.score)
    resulting_data = innerjoin(resulting_data, scoretable, on=:id)
    return resulting_data, counterclaim_score
end
"""
    generate_sparse_model()

Fit a BM25 sparse encoder over the bundled `data/random_300k.csv` corpus and
return `(vector, bm25)` as produced by `DataLoader.encode_documents`.
"""
function generate_sparse_model()
    df = DataLoader.pd.read_csv("data/random_300k.csv")
    corpus = df["text"].tolist()
    # Call DataLoader directly, matching every other function in this file:
    # referring to the enclosing module as `OC.` from inside itself fails
    # unless the package imports itself.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end