File size: 2,293 Bytes
0d178f3
 
 
 
 
 
 
 
5ca010f
 
 
5282225
 
5f0aebe
0d178f3
d600cc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e8c6a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c1e5f1
1ad2b61
985d5cf
8e0e05d
 
2a323dd
f637ac1
d743f2c
 
f3539ef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import ast
import os

import numpy as np
import openai
import pandas as pd
import streamlit as st
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity

input_datapath = "fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(input_datapath, index_col=0)

#os.environ["OPENAI_API_KEY"] = st.secrets("OPENAI_API_KEY")
#openai.api_key = st.secrets("OPENAI_API_KEY")
st.title("Semantic Search")


#adding another column having the summary as title and the actual text as content
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)


# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


encoding = tiktoken.get_encoding(embedding_encoding)
top_n = 500
# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)


datafile_path = "fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(eval).apply(np.array)

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    product_embedding = get_embedding(
        product_description,
        engine="text-embedding-ada-002"
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "")
        .str.replace("; Content:", ": ")
    )

    product = (
                df.sort_values("similarity", ascending=False)
        .head(n)
        .ProductId
    )

    if pprint:
        for r in range(n):
          idx = results.index[r]
          print("Product : ",product[idx])
          print(results[idx])
          print()
    return results,product


prompt = st.text_input("What do you want to search for? : ","pizza")

    
top_n = st.number_input("How many results do you want to see? : ", min_value = 1)
results,product = search_reviews(df, prompt, top_n)
if st.button("Search Reviews"):
    st.write(product,results)