File size: 14,395 Bytes
48bb68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
### Pinecone Embed and I/O Functions

"""
Return a tiny two-row example DataFrame (`Embeddings`, `id`, `genre`)
matching the example data from DataLoader.py.

# Example
import OstreaCultura as OC
hi = OC.example_data()
hi = OC.df_to_pd(hi)
OC.DataLoader.create_vectors_from_df(hi)
"""
function example_data()
    return DataFrame(
        Embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
        id = ["vec1", "vec2"],
        genre = ["drama", "action"],
    )
end

"""
Convert a pandas DataFrame (Python object) into a Julia `DataFrame`,
column by column.

# Example
df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
df_julia = OC.pd_to_df(df)
"""
function pd_to_df(df_pd)
    out = DataFrame()
    for colname in df_pd.columns
        out[!, colname] = getproperty(df_pd, colname).values
    end
    return out
end

"""
Build a Pinecone client from the `PINECONE_API_KEY` environment variable.

Available functions:
pc.create_index - see below
pc.delete_index: pc.delete_index(index_name)
"""
function create_pinecone_context()
    return DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"])
end

"""
Build a Pinecone client for inference endpoints (API key passed
positionally rather than by keyword).
"""
function create_inf_pinecone_context()
    return DataLoader.Pinecone(ENV["PINECONE_API_KEY"])
end

"""
Create a Pinecone index.

# Example
pc = create_pinecone_context()
create_index("new-index", 4, "cosine", "aws", "us-east-1")
"""
function create_index(name, dimension, metric, cloud, region)
    ctx = create_pinecone_context()
    return DataLoader.create_index(ctx, name, dimension, metric, cloud, region)
end

"""
Upsert `df` into the Pinecone index `indexname` under `namespace`,
`chunk_size` rows at a time. `Id` and `Embeddings` are required columns
in the DataFrame.

# Examples
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
model = "multilingual-e5-large"
out = OC.multi_embeddings(model, df, 96, "text")
OC.upsert_data(out, "test-index", "test-namespace")

df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv")
test_embeds = OC.multi_embeddings(model, df, 96, "text")
test_embeds_min = test_embeds.head(10)
OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100)
"""
function upsert_data(df, indexname, namespace; chunk_size=1000)
    ctx = create_pinecone_context()
    idx = ctx.Index(indexname)
    return DataLoader.chunk_df_and_upsert(idx, df, namespace=namespace, chunk_size=chunk_size)
end

"""
Query an index with an existing dense embedding; return the raw Pinecone
response converted to a Dict.

# Example
import OstreaCultura as OC; using DataFrames
mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"])
mydf = OC.multi_embeddings(mydf)
OC.query_data("test-index", "test-namespace", mydf.Embeddings[1], 5, true)
"""
function query_data(indexname, namespace, vector, top_k, include_values)
    ctx = create_pinecone_context()
    idx = ctx.Index(indexname)
    response = DataLoader.query_data(idx, namespace, vector, top_k, include_values)
    return response.to_dict()
end

"""
Query an index with a hybrid (dense + sparse) embedding; return the raw
Pinecone response converted to a Dict.

# Example
import OstreaCultura as OC; using DataFrames
dense = OC.embed_query("drama")
OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense,
    OC.DataLoader.empty_sparse_vector(), 5, true, true)
"""
function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata)
    ctx = create_pinecone_context()
    idx = ctx.Index(indexname)
    response = DataLoader.query_data_with_sparse(idx, namespace, dense, sparse,
                                                 top_k=top_k,
                                                 include_values=include_values,
                                                 include_metadata=include_metadata)
    return response.to_dict()
end

"""
Embed `claim` and run a hybrid query against `indexname`/`ocmodel`
(with an empty sparse vector).

# Example
import OstreaCultura as OC
res = OC.search("drama", "oc-hybrid-library-index", "expanded-fact-checks")
OC.search("drama", "oc-hybrid-library-index", "expanded-fact-checks",
          include_values=false, include_metadata=false)
"""
function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    densevec = embed_query(claim)
    return query_data_with_sparse(indexname, ocmodel, densevec,
                                  DataLoader.empty_sparse_vector(),
                                  top_k, include_values, include_metadata)
end

"""
Thin wrapper over `UnicodePlots.barplot` with a default title.
"""
function unicodebarplot(x, y, title = "Query Matches")
    return UnicodePlots.barplot(x, y, title=title)
end

"""
Render a search result Dict (with a "matches" key) as a unicode bar plot
of match scores, labeled by (truncated) metadata text.

Requires the query to have been run with metadata included (each match
must carry `["metadata"]["text"]`).
"""
function searchresult_to_unicodeplot(searchresult)
    scores = [m["score"] for m in searchresult["matches"]]
    text = [m["metadata"]["text"] for m in searchresult["matches"]]
    # Truncate labels to 41 characters. `first(x, 41)` is used instead of
    # `x[1:41]`: Julia string indexing is byte-based, so `x[1:41]` can throw
    # a StringIndexError on multi-byte (non-ASCII) text.
    text_to_show = [length(x) > 41 ? first(x, 41) * "..." : x for x in text]
    return unicodebarplot(text_to_show, scores)
end

"""
Run `search` and display the matches as a unicode bar plot.

# Example
import OstreaCultura as OC
OC.searchplot("drama", "oc-hybrid-library-index", "immigration")
"""
function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true)
    result = search(claim, indexname, ocmodel,
                    top_k=top_k,
                    include_values=include_values,
                    include_metadata=include_metadata)
    return searchresult_to_unicodeplot(result)
end

"""
Embed the `textcol` column of a pandas DataFrame `data` via Pinecone
inference, `chunk_size` rows at a time.

# Example
import OstreaCultura as OC
df = OC.DataLoader.pd.read_csv("data/climate_test.csv")
out = OC.multi_embeddings("multilingual-e5-large", df, 96, "text")
"""
function multi_embeddings(model, data, chunk_size, textcol)
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, data, chunk_size, textcol)
end

"""
Embed a Julia DataFrame: converts to pandas first, then delegates to the
Pinecone inference endpoint. Keywords: `model` (default
"multilingual-e5-large"), `chunk_size` (default 96), `textcol` (default "text").

# Example
using CSV, DataFrames
import OstreaCultura as OC
tdat = CSV.read("data/climate_test.csv", DataFrame)
OC.multi_embeddings(tdat)
"""
function multi_embeddings(data::DataFrames.DataFrame; kwargs...)
    pdf = df_to_pd(data)
    model = get(kwargs, :model, "multilingual-e5-large")
    chunk_size = get(kwargs, :chunk_size, 96)
    textcol = get(kwargs, :textcol, "text")
    ctx = create_inf_pinecone_context()
    return DataLoader.chunk_and_embed(ctx, model, pdf, chunk_size, textcol)
end

"""
Convert a Julia `DataFrames.DataFrame` into a pandas DataFrame.
"""
df_to_pd(df::DataFrames.DataFrame) = pdataframe(df)

"""
Embed a single query string and return its dense embedding vector.

Keyword arguments (e.g. `model`, `chunk_size`, `textcol`) are forwarded
to `multi_embeddings`; previously they were accepted but silently ignored.
"""
function embed_query(querytext; kwargs...)
    qdf = DataFrame(id = "vec1", text = querytext)
    embedded = multi_embeddings(qdf; kwargs...)
    return embedded.Embeddings[1]
end

"""
Query an index with an embedding vector and return the matches as a
DataFrame (one row per match). Keywords: `top_k` (default 5),
`include_values` (default true — adds a `values` column with each
match's embedding, or `missing`s when false).

# Example
import OstreaCultura as OC
vecresults = OC.query_w_vector(rand(1024), "test-index", "test-namespace")
"""
function query_w_vector(vector, indexname, namespace; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    pc = create_pinecone_context()
    index = pc.Index(indexname)
    queryresults = DataLoader.query_data(index, namespace, vector, top_k, include_values).to_dict()
    matches = queryresults["matches"]
    # Capture the embeddings before stripping them out of the match dicts.
    if include_values
        values_vector = [m["values"] for m in matches]
    else
        values_vector = [missing for _ in matches]
    end
    # Drop "values" so it doesn't become a (huge) DataFrame column below.
    for m in matches
        delete!(m, "values")
    end
    # Single reduce instead of repeated vcat inside a loop — avoids the
    # accidental O(n^2) copying of the growing DataFrame.
    out = reduce(vcat, (DataFrame(m) for m in matches); init=DataFrame())
    if include_values
        out[:, "values"] = values_vector
    end
    return out
end

"""
Turn a Pinecone fetch response Dict into a DataFrame of metadata, one
row per fetched vector, with an `:id` column appended. Returns an empty
DataFrame (with an @info message) when nothing was fetched.

# Example
index = pc.Index(indexname)
resultfetch = OC.DataLoader.fetch_data(index, ids, namespace).to_dict()
OC.parse_fetched_results(resultfetch)
"""
function parse_fetched_results(resultfetch)
    vectors = resultfetch["vectors"]
    if length(vectors) == 0
        @info "No data found"
        return DataFrame()
    end
    ids = collect(keys(vectors))
    # One metadata dict per fetched id, in the same order as `ids`.
    metadata = [vectors[id]["metadata"] for id in ids]
    # `cols=:union` fills columns absent from some rows with `missing`.
    # It behaves like plain vcat when columns match, so it replaces the
    # previous try/catch-as-control-flow over the two cases.
    out = reduce((a, b) -> vcat(a, b, cols=:union),
                 (DataFrame(m) for m in metadata); init=DataFrame())
    out[!, :id] = ids
    return out
end

"""
Fetch vectors by id from `indexname`/`namespace`, in chunks of
`chunk_size` ids, and return the concatenated metadata DataFrame.

# Example
import OstreaCultura as OC
ids = ["OSJeL7", "3TxWTNpPn"]
query_results_as_dataframe = OC.fetch_data(ids, "test-index", "test-namespace")
"""
function fetch_data(ids, indexname, namespace; chunk_size=900)
    ctx = create_pinecone_context()
    idx = ctx.Index(indexname)
    out = DataFrame()
    nchunks = ceil(Int, length(ids) / chunk_size)
    for k in 1:nchunks
        lo = (k - 1) * chunk_size + 1
        hi = min(k * chunk_size, length(ids))
        fetched = DataLoader.fetch_data(idx, ids[lo:hi], namespace).to_dict()
        out = vcat(out, parse_fetched_results(fetched))
    end
    return out
end

"""
FINAL query function — embeds `querytext`, queries the index, fetches
the underlying metadata, and returns the two joined on `:id`.
Keywords: `top_k` (default 5), `include_values` (default true).

# Example
import OstreaCultura as OC
OC.query("drama", "test-index", "test-namespace")
"""
function query(querytext::String, indexname::String, namespace::String; kwargs...)
    top_k = get(kwargs, :top_k, 5)
    include_values = get(kwargs, :include_values, true)
    hits = query_w_vector(embed_query(querytext), indexname, namespace,
                          top_k=top_k, include_values=include_values)
    # Fetch the underlying data for the matched ids, then merge on id.
    fetched = fetch_data(hits.id, indexname, namespace)
    return innerjoin(hits, fetched, on=:id)
end

"""
Keep only rows whose claim score strictly beats their counterclaim score.

The two result frames are joined on `:id` with a left join, so claims
with no counterclaim match are kept (their counterclaim score defaults
to 0.0). Returns a DataFrame with `:claim_score` and `:counterclaim_score`
columns.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    # Non-mutating `rename` (not `rename!`): this function's name carries
    # no `!`, so it must not modify the caller's DataFrames in place.
    claims = rename(claim_results, :score => :claim_score)
    counters = rename(counterclaim_results, :score => :counterclaim_score)
    # Left join keeps claims that have no counterclaim hit at all.
    df = leftjoin(claims, counters, on=:id)
    # Missing counterclaim scores (unmatched ids) count as 0.0.
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    return df[df.claim_score .> df.counterclaim_score, :]
end

"""
Query with a claim and a counterclaim: keep matches that score closer to
the claim than the counterclaim and above `threshold` (default 0.8),
then fetch and join the underlying data. `top_k` (default 5000) bounds
the initial queries.

# Example
import OstreaCultura as OC
hi = OC.query_claims("Climate change is a hoax", "Climate change is real",
                     "test-index", "test-namespace")
"""
function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 5000) # top_k for the initial query
    # Embed and query both sides.
    claim_hits = query_w_vector(embed_query(claim), indexname, namespace,
                                top_k=top_k, include_values=false)
    counter_hits = query_w_vector(embed_query(counterclaim), indexname, namespace,
                                  top_k=top_k, include_values=false)
    # Keep ids scoring higher for the claim than for the counterclaim,
    # then apply the absolute threshold.
    scored = filter_claims_closer_to_counterclaims(claim_hits, counter_hits)
    scored = scored[scored.claim_score .> threshold, :]
    if size(scored, 1) == 0
        @info "No claims were above the threshold"
        return DataFrame()
    end
    # Fetch the underlying metadata and merge it back on id.
    fetched = fetch_data(scored.id, indexname, namespace)
    return innerjoin(scored, fetched, on=:id)
end


"""
Classify a claim against the existing misinformation library.

Returns `(resulting_data, counterclaim_score)`: the fetched metadata for
matches whose score exceeds `threshold` (default 0.8, with a `scores`
column added), and the top counterclaim match score (0.0 when
`counterclaim` is empty or has no matches). `top_k` (default 10) bounds
the initial queries.

# Examples
import OstreaCultura as OC

claim = "There is a lot of dispute about whether the Holocaust happened"
counterclaim = "The Holocaust is a well-documented historical event"
indexname = "ostreacultura-v1"
namespace = "modified-misinfo-library"
hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace)

# An empty counterclaim is allowed:
hi, counterscore = OC.classify_claim("it's cool to be trans these days", "", indexname, namespace)
"""
function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...)
    threshold = get(kwargs, :threshold, 0.8)
    top_k = get(kwargs, :top_k, 10) # top_k for the initial query
    claim_vector = embed_query(claim)
    if counterclaim != ""
        counterclaim_vector = embed_query(counterclaim)
        counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false)
        # Guard against zero matches: indexing `score[1]` on an empty
        # result used to raise a BoundsError.
        counterclaim_score = size(counterclaim_results, 1) > 0 ? counterclaim_results.score[1] : 0.0
    else
        counterclaim_score = 0.0
    end
    claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false)
    # Zero matches yields a column-less DataFrame, so filtering on
    # `.score` below would error — return empty early.
    if size(claim_results, 1) == 0
        @info "No matches found for the claim"
        return DataFrame(), counterclaim_score
    end
    # Filter to scores above the threshold, then fetch the underlying data.
    claim_results = claim_results[claim_results.score .> threshold, :]
    resulting_data = fetch_data(claim_results.id, indexname, namespace)
    # NOTE(review): assumes fetch_data returns one row per queried id in
    # the same order, so the score column lines up — confirm for ids
    # missing from the index.
    resulting_data.scores = claim_results.score
    return resulting_data, counterclaim_score
end

"""
Fit a BM25 sparse encoder over a text corpus read from `path` (a CSV
with a "text" column; default preserves the original hard-coded file).

Returns `(vector, bm25)` as produced by `DataLoader.encode_documents`.
"""
function generate_sparse_model(path="data/random_300k.csv")
    df = DataLoader.pd.read_csv(path)
    corpus = df["text"].tolist()
    # Call DataLoader directly: the previous `OC.DataLoader` referenced the
    # module through its external import alias, which is undefined here.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end