"""Gradio app exposing semantic search over a pickled article corpus.

At import time the script loads ``entire_data.pkl`` into a DataFrame, strips
HTML tags from its ``content`` and ``summary_html`` columns, loads the
``all-mpnet-base-v2`` sentence-transformer, and launches a Gradio interface
whose handler ranks rows by cosine similarity between the query embedding and
each row's ``embedding`` value.
"""

import gzip
import os
import re
import time
from collections import Counter
from heapq import nlargest
from string import punctuation

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import spacy
import tiktoken
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

try:
    from openai.embeddings_utils import get_embedding, cosine_similarity
except ImportError:
    # openai>=1.0 no longer ships ``embeddings_utils``; fall back to the same
    # formula so the app keeps working regardless of the installed version.
    def cosine_similarity(a, b):
        """Return the cosine similarity of *a* and *b* (dot / product of norms)."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Compiled once: non-greedy match of anything between '<' and '>'.
_HTML_TAG_RE = re.compile(r"<.*?>")

# Columns that identify one article in the search output.
_RESULT_COLUMNS = ["title", "url", "keywords", "summary_html"]

# Number of summary characters returned per hit.
_SUMMARY_CHARS = 200

# Load the pickled corpus back into the dataframe ``df``.
# NOTE(review): unpickling can execute arbitrary code -- only ever point this
# at a trusted, locally produced file.
df = pd.read_pickle('entire_data.pkl')

model = SentenceTransformer('all-mpnet-base-v2')


def remove_html_tags(text):
    """Return *text* with every ``<...>`` span removed."""
    return _HTML_TAG_RE.sub('', text)


df['content'] = df.content.apply(remove_html_tags)
df['summary_html'] = df.summary_html.apply(remove_html_tags)


def search(query, n=10):
    """Return the best-matching articles for *query*.

    Every row of ``df`` is scored by cosine similarity between its
    ``embedding`` and the encoded query. The ``n`` highest-scoring rows are
    kept, rows sharing the same title/url/keywords/summary are collapsed to
    their maximum score, and the remainder is returned best-first. Because
    de-duplication happens after the top-``n`` cut, fewer than ``n`` entries
    may be returned.

    Args:
        query: Free-text search string.
        n: Number of top-scoring rows considered before de-duplication.

    Returns:
        A list of dicts with keys ``Title``, ``url``, ``score`` (string),
        ``summary`` (first 200 characters of the summary) and ``keywords``.
        A blank query yields an empty list.
    """
    if not query or not query.strip():
        return []

    # Column vector; -1 infers the embedding width instead of hard-coding it.
    query_column = model.encode(query).reshape(-1, 1)

    # Each score stays a one-element array (vector . column vector), exactly
    # as before, so the stringified ``score`` below is unchanged.
    scores = df.embedding.apply(lambda emb: cosine_similarity(emb, query_column))

    # Score into a per-request frame rather than writing a column into the
    # shared module-level ``df``; concurrent requests must not share state.
    scored = df[_RESULT_COLUMNS].assign(similarity=scores)
    top_rows = scored.sort_values("similarity", ascending=False).head(n)

    # results = results[['title','url','keywords','summary_html']].drop_duplicates()
    best_per_article = top_rows.groupby(_RESULT_COLUMNS).similarity.max()
    results = pd.DataFrame(best_per_article).reset_index()
    results = results.sort_values("similarity", ascending=False)

    return [
        {
            "Title": title,
            "url": url,
            "score": str(similarity[0]),
            "summary": summary[:_SUMMARY_CHARS],
            "keywords": keywords,
        }
        for title, url, keywords, summary, similarity in zip(
            results.title,
            results.url,
            results.keywords,
            results.summary_html,
            results.similarity,
        )
    ]


def greet(query):
    """Gradio handler: run the semantic search and return its JSON-able hits."""
    return search(query)


examples = [
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["Articles by Noam Chomsky on US Politics"],
    ["The importance of values and reflection"],
]

# ``gr.inputs`` is the deprecated component namespace; prefer the top-level
# component when the installed Gradio provides it, otherwise fall back.
_Textbox = getattr(gr, "Textbox", None)
if _Textbox is None:
    _Textbox = gr.inputs.Textbox

demo = gr.Interface(
    fn=greet,
    title="cicero-semantic-search",
    inputs=_Textbox(lines=5, label="what would you like to learn about?"),
    outputs="json",
    examples=examples,
)

demo.launch()