# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
import plotly.express as px

# OpenAI credentials are read from the environment so they never live in the source.
openai.organization = os.environ.get('ORGANIZATION')
openai.api_key = os.environ.get('API_KEY')

# spaCy pipeline, used only to split the input text into sentences.
nlp = spacy.load("en_core_web_sm")

# The following text inside one of this categories: Entertainment, Business, Politics
# This dull recreation of the animated film doesn’t strive for anything more than what was contained in the original version of this film and actually delivers less.
# Category: Entertainment


def text_to_sentences(text):
    """Split *text* into sentences with the spaCy pipeline.

    Args:
        text: Raw input text.

    Returns:
        A list with the text of every sentence spaCy detects, in document order.
    """
    doc = nlp(text)
    return [sentence.text for sentence in doc.sents]


def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval=1.5, verbose=True):
    """Request the embedding of *text* from the OpenAI Embedding endpoint.

    Args:
        text: Text to embed.
        engine: Name of the OpenAI embedding engine to use.
        interval: Seconds to sleep before every request (throttles the calls).
        verbose: When true, print the text being embedded.

    Returns:
        The embedding found at ``response['data'][0]['embedding']``.
    """
    if verbose:
        print(f'Calculating embedding for {text}...')

    # Pause before each request so consecutive calls are spaced out.
    time.sleep(interval)
    response = openai.Embedding.create(
        input=text,
        engine=engine
    )
    return response['data'][0]['embedding']


def _cosine_similarity_matrix(sources, targets):
    """Return the pairwise cosine similarities between two sets of row vectors.

    Element ``[i, j]`` of the result is the cosine similarity between
    ``sources[i]`` and ``targets[j]``.
    """
    sources = np.asarray(sources, dtype=float)
    targets = np.asarray(targets, dtype=float)
    norms = np.outer(
        np.linalg.norm(sources, axis=1),
        np.linalg.norm(targets, axis=1),
    )
    return (sources @ targets.T) / norms


def gpt3_zero_shot_classification(text, threshold):
    """Score *text* against the reference phrases and summarise the best match per tag.

    Every sentence of *text* is embedded and compared (cosine similarity) with
    the embedding of every reference phrase in ``df_phrases``. For each tag the
    highest similarity reached by any sentence/phrase pair is reported.

    Args:
        text: Text to classify.
        threshold: Similarity threshold expressed as a percentage (0-100), as
            produced by the UI slider. It is converted to a 0-1 fraction.

    Returns:
        A tuple ``(res, fig, details)`` where ``res`` maps each tag to its
        maximum similarity, ``fig`` is a Plotly bar chart of those maxima with
        the threshold drawn as a dotted line, and ``details`` is a DataFrame
        holding the three most similar sentence/phrase pairs of every tag.

    Raises:
        ValueError: If no sentence can be extracted from *text*.
    """
    # NOTE(review): `df_phrases` is not defined anywhere in this file; it must
    # exist at module level before this function is called. From its use here
    # it needs the columns 'embedding', 'example', 'category' and 'label'.
    sentences = text_to_sentences(text)
    if not sentences:
        raise ValueError("No sentences found in the input text.")

    threshold = threshold / 100

    sentence_vectors = np.array(
        [calculate_embeddings_with_gpt3(sentence) for sentence in sentences],
        dtype=float,
    )
    phrase_vectors = np.array(df_phrases["embedding"].tolist(), dtype=float)

    # similarities[i, j]: sentence i versus reference phrase j.
    similarities = _cosine_similarity_matrix(sentence_vectors, phrase_vectors)
    n_sentences, n_phrases = similarities.shape

    # One row per (sentence, phrase) pair, sentence-major, built in a single
    # step rather than by appending row by row.
    df_results = pd.DataFrame({
        'line': np.repeat(np.arange(1, n_sentences + 1), n_phrases),
        'sentence': np.repeat(np.array(sentences, dtype=object), n_phrases),
        'phrase': np.tile(df_phrases["example"].to_numpy(), n_sentences),
        'category': np.tile(df_phrases["category"].to_numpy(), n_sentences),
        'tag': np.tile(df_phrases["label"].to_numpy(), n_sentences),
        'similarity': similarities.ravel(),
    })

    # Best similarity reached for each tag, flagged against the threshold.
    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
    df_summary['ok'] = df_summary['similarity'] > threshold

    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={
            True: px.colors.qualitative.Plotly[2],
            False: px.colors.qualitative.Set2[7]
        },
        text='similarity',
        text_auto='.3f',
        labels={'tag': 'Category', 'similarity': 'Similarity'},
        title=f"{text[:200]}..."
    )
    fig.add_shape(  # add a horizontal "target" line
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    fig.update_yaxes(range=[0, 1])

    # Three most similar sentence/phrase pairs for every tag.
    details = (
        df_results
        .drop(labels='line', axis=1)
        .sort_values(['tag', 'similarity'], ascending=[True, False])
        .groupby('tag')
        .head(3)
        .reset_index(drop=True)
    )

    res = df_summary['similarity'].to_dict()
    return res, fig, details


# Gradio UI
with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
    gr.Markdown("# GPT-3 Zero shot classification app")
    with gr.Row():
        context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
    with gr.Row():
        threshold = gr.Slider(0, 100, 80, label="Similarity threshold (%)")
    btn = gr.Button(value="Analyze!", variant="primary")
    with gr.Row():
        label = gr.Label()
        plot = gr.Plot()
    with gr.Row():
        grid = gr.Dataframe(wrap=True)

    btn.click(fn=gpt3_zero_shot_classification, inputs=[context, threshold], outputs=[label, plot, grid])

    # The second value of each example feeds the threshold slider, so it must
    # be a number (the slider default is used here).
    # TODO: replace the empty placeholder texts with real sample inputs.
    gr.Examples(
        [
            ["", 80],
            ["", 80],
            ["", 80],
            ["", 80]
        ],
        [context, threshold],
        outputs=[label, plot, grid],
        fn=gpt3_zero_shot_classification,
        # Never pre-compute examples: doing so would call the OpenAI API at start-up.
        cache_examples=False
    )

demo.launch(debug=True)