# Spaces status banner captured together with this file: "Runtime error".
import gzip
import math
import os
import re
import time
from collections import Counter
from heapq import nlargest
from string import punctuation

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import spacy
import tiktoken
import torch
from openai.embeddings_utils import get_embedding, cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
# Load the article table that `search` ranks. The code below reads its
# `content`, `summary_html`, `embedding`, `title`, `url` and `keywords` columns.
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# 'entire_data.pkl' must only ever come from a trusted source.
df = pd.read_pickle("entire_data.pkl")

# Sentence-embedding model used to encode incoming queries.
# NOTE(review): presumably the same model that produced `df.embedding`;
# confirm, otherwise the similarity scores are meaningless.
model = SentenceTransformer("all-mpnet-base-v2")
# Matches anything that looks like an HTML/XML tag: "<", the shortest possible
# run of characters (not crossing a newline), then ">". Compiled once at import
# time because the function is applied to every row of the table.
_HTML_TAG_RE = re.compile(r"<.*?>")


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Only the tags themselves are dropped; the text between them is kept and
    HTML entities (``&amp;`` etc.) are left untouched.

    Args:
        text: The string to clean. ``None`` and float ``NaN`` (how pandas
            represents a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for a missing value.

    Raises:
        TypeError: If ``text`` is neither a string nor a missing value.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    return _HTML_TAG_RE.sub("", text)
# Strip HTML markup from the text columns once at start-up, so that the
# summaries returned by `search` are plain text.
df["content"] = df["content"].apply(remove_html_tags)
df["summary_html"] = df["summary_html"].apply(remove_html_tags)
def search(query):
    """Return the articles whose embeddings are most similar to ``query``.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, against every vector in ``df.embedding``. The ten
    best-scoring rows are kept, rows sharing the same
    ``(title, url, keywords, summary_html)`` are collapsed to their best
    score, and the result is returned best-first.

    The similarity is computed with NumPy on a per-call basis: the shared
    ``df`` is never modified, and no embedding width is hard-coded.

    Args:
        query: Free-text search string.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"url"``, ``"score"``
        (the cosine similarity rendered as a string), ``"summary"`` (first
        200 characters of ``summary_html``) and ``"keywords"``. Empty when
        the table has no rows.
    """
    n = 10  # number of top-scoring rows kept before de-duplication

    if df.empty:
        return []

    query_vec = np.asarray(model.encode(query), dtype=float).ravel()
    # assumes every df.embedding entry is a 1-D vector of the same length as
    # the query embedding -- TODO confirm against how the pickle was built
    doc_matrix = np.vstack(
        [np.asarray(vec, dtype=float).ravel() for vec in df["embedding"]]
    )

    # Cosine similarity of every document against the query in one shot.
    dots = doc_matrix @ query_vec
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero-length vector has no direction; score it 0 instead of NaN.
    scores = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Positions of the n best scores, best first.
    top = np.argsort(-scores, kind="stable")[:n]
    # .assign works on a copy of the n selected rows, not on the global df.
    ranked = df.iloc[top].assign(similarity=scores[top])

    # NOTE(review): groupby needs hashable key values and drops rows with a
    # missing key; verify `keywords` is stored as a string, not a list.
    best = (
        ranked.groupby(["title", "url", "keywords", "summary_html"])["similarity"]
        .max()
        .reset_index()
        .sort_values("similarity", ascending=False)
    )

    return [
        {
            "Title": row.title,
            "url": row.url,
            "score": str(float(row.similarity)),
            "summary": row.summary_html[:200],
            "keywords": row.keywords,
        }
        for row in best.itertuples(index=False)
    ]
def greet(query):
    """Interface callback: run `search` for ``query`` and return its results.

    Args:
        query: Free-text search string typed by the user.

    Returns:
        Whatever `search` returns for ``query``.
    """
    return search(query)
# Sample queries offered in the UI; each inner list holds the value for the
# interface's single text input.
examples = [
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["Articles by Noam Chomsky on US Politics"],
    ["The importance of values and reflection"],
]
# Build the web UI: one multi-line text box in, the JSON result list out.
# `gr.Textbox` replaces the `gr.inputs.Textbox` namespace, which was deprecated
# in Gradio 3 and removed in Gradio 4.
demo = gr.Interface(
    fn=greet,
    title="cicero-semantic-search",
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="json",
    examples=examples,
)
demo.launch()