# Page residue captured with the file (kept as comments so the module parses):
# hamza50's picture
# Update app.py
# 0b992b5
import gradio as gr
import pandas as pd
import tiktoken
import pandas as pd
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import nltk
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import re
from openai.embeddings_utils import get_embedding, cosine_similarity
import os
# Load the pickled DataFrame that the rest of this script searches over.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load
# entire_data.pkl when it comes from a trusted source.
df = pd.read_pickle('entire_data.pkl')  # restore the DataFrame stored in entire_data.pkl into df
# Sentence-embedding model; search() uses it to encode the user's query.
model = SentenceTransformer('all-mpnet-base-v2')
# Compiled once at import time instead of on every call. DOTALL lets "." cross
# line breaks, so a tag whose attributes wrap onto the next line is matched too.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    A span runs from a ``<`` to the nearest following ``>``, including when
    the two are on different lines. This is a plain pattern match, not an
    HTML parser: entities such as ``&amp;`` are left untouched, and a stray
    ``<`` in ordinary text is treated as the start of a tag.

    Args:
        text: The string to clean.

    Returns:
        The input with all matched spans deleted; text between tags is kept.
    """
    return _HTML_TAG_RE.sub('', text)
# Strip markup from the two text columns once at start-up, so later code
# works with plain text. The function is passed directly; no wrapper needed.
df['content'] = df['content'].apply(remove_html_tags)
df['summary_html'] = df['summary_html'].apply(remove_html_tags)
def search(query, n=10):
    """Return the entries of ``df`` whose embeddings best match ``query``.

    The query is encoded with the module-level ``model`` and compared by
    cosine similarity against every vector in ``df.embedding``. The ``n``
    best-scoring rows are kept, rows sharing the same title / url /
    keywords / summary are collapsed to their highest score, and the
    remaining rows are returned best-first.

    Args:
        query: Free-text search string.
        n: Number of top-scoring rows to keep before duplicates are
            collapsed, so fewer than ``n`` results may be returned.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"url"``, ``"score"``
        (the similarity rendered as a string), ``"summary"`` (the first 200
        characters of ``summary_html``) and ``"keywords"``.
    """
    # Flatten instead of reshaping to a fixed 768 rows: the comparison then
    # works for any embedding width and yields a scalar score per row.
    query_embedding = np.ravel(model.encode(query))

    # assumes every df.embedding entry holds a vector of the same length as
    # the model output — TODO confirm against how the pickle was built.
    # Scores live in a local array rather than a column written onto the
    # shared df, so overlapping requests cannot overwrite each other's scores.
    scores = np.asarray([
        cosine_similarity(np.ravel(embedding), query_embedding)
        for embedding in df.embedding
    ])

    # Positions of the n highest scores (negated for a descending order).
    top_positions = np.argsort(-scores, kind="stable")[:n]
    top = df.iloc[top_positions].copy()
    top["similarity"] = scores[top_positions]

    # Several rows can describe the same entry; keep one row per entry with
    # its best score, then restore best-first order (groupby sorts by key).
    grouped = (
        top.groupby(['title', 'url', 'keywords', 'summary_html'])["similarity"]
        .max()
        .reset_index()
        .sort_values("similarity", ascending=False)
    )

    return [
        {
            "Title": title,
            "url": url,
            "score": str(score),
            "summary": summary[:200],
            "keywords": keywords,
        }
        for title, url, keywords, summary, score in zip(
            grouped["title"],
            grouped["url"],
            grouped["keywords"],
            grouped["summary_html"],
            grouped["similarity"].to_numpy(),
        )
    ]
def greet(query):
    """Gradio callback: return the ``search`` results for ``query`` unchanged."""
    return search(query)
# Sample queries offered in the UI. Each prompt is wrapped in its own
# one-element list, i.e. one row per example with a single input value.
examples = [
    [prompt]
    for prompt in (
        "Climate Change Challenges in Europe",
        "Philosophy in the world of Minimalism",
        "Hate Speech vs Freedom of Speech",
        "Articles by Noam Chomsky on US Politics",
        "The importance of values and reflection",
    )
]
# Build the web UI: one multi-line text input, passed to greet(), with the
# returned list shown as JSON and the sample queries offered as examples.
demo = gr.Interface(
    fn=greet,
    title="cicero-semantic-search",
    # Use the top-level Textbox component; the legacy gr.inputs namespace is
    # deprecated in Gradio 3 and absent from later releases.
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="json",
    examples=examples,
)
# Start the server as soon as the script runs, as the original did.
demo.launch()