Spaces:

sergiomar73
/

nlp-gpt3-zero-shot-classification-app

Sleeping

File size: 5,467 Bytes

# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
import plotly.express as px

# OpenAI credentials are read from the environment; a missing variable yields None.
openai.organization = os.getenv('ORGANIZATION')
openai.api_key = os.getenv('API_KEY')

# Load the spaCy pipeline once at import time; text_to_sentences() reuses it on every call.
nlp = spacy.load("en_core_web_sm")

# Classify the following text into one of these categories: Entertainment, Business, Politics
# This dull recreation of the animated film doesn’t strive for anything more than what was contained in the original version of this film and actually delivers less.
# Category: Entertainment

def text_to_sentences(text):
  """Split *text* into sentences with the module-level spaCy pipeline ``nlp``.

  Returns a list holding the text of each sentence span, in document order.
  """
  return [span.text for span in nlp(text).sents]
 
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval = 1.5, verbose=True):
  """Request an embedding for *text* through ``openai.Embedding.create``.

  Args:
    text: Value sent as the ``input`` of the embedding request.
    engine: Engine name forwarded to the API call.
    interval: Seconds to sleep before the request is issued.
    verbose: When true, print a progress line before the request.

  Returns:
    The ``embedding`` entry of the first item in the response's ``data`` list.
  """
  if verbose:
    print(f'Calculating embedding for {text}...')
  # Pause before every request (presumably to stay under the API rate limit).
  time.sleep(interval)
  first_item = openai.Embedding.create(input=text, engine=engine)['data'][0]
  return first_item['embedding']
 
def gpt3_zero_shot_classification(text, labels, threshold=None):
  """Score each sentence of *text* against the reference phrases and summarise per tag.

  The text is split with ``text_to_sentences``, every sentence is embedded with
  ``calculate_embeddings_with_gpt3``, and the cosine similarity between each
  sentence embedding and each row of the module-level ``df_phrases`` table is
  computed. ``df_phrases`` must provide the columns ``embedding``, ``example``,
  ``category`` and ``label``; its rows are used in table order.

  NOTE(review): ``df_phrases`` is not defined in the visible part of this file;
  until it is provided the call fails with ``NameError``.

  Args:
    text: Text to analyse.
    labels: Not used for scoring. The Gradio click handler in this file passes
      the threshold slider value in this position, so a numeric value here is
      taken as the threshold when ``threshold`` is not given.
      NOTE(review): confirm whether this argument is meant to carry the
      candidate labels instead.
    threshold: Similarity cut-off as a percentage (0-100). When omitted, a
      numeric ``labels`` value is used, otherwise 80 (the slider default).

  Returns:
    A tuple ``(res, fig, details)``:
      res: dict mapping each tag to its highest similarity.
      fig: Plotly bar chart of those maxima with the threshold drawn as a line.
      details: DataFrame with the three most similar rows for each tag.
  """
  # The original body read `threshold` before assigning it, which always raised
  # UnboundLocalError; resolve it explicitly instead.
  if threshold is None:
    slider_like = isinstance(labels, (int, float)) and not isinstance(labels, bool)
    threshold = labels if slider_like else 80
  threshold = threshold / 100

  # Read the reference table before embedding anything, so a missing table
  # fails before any API request is made.
  targets = np.array([np.array(vector) for vector in df_phrases["embedding"]])
  target_norms = np.linalg.norm(targets, axis=1)
  phrase_rows = list(zip(
    df_phrases["example"],
    df_phrases["category"],
    df_phrases["label"],
  ))

  # One record per (sentence, phrase) pair. Rows are collected in a list and
  # the frame is built once: DataFrame.append was removed in pandas 2.0.
  records = []
  for line, sentence in enumerate(text_to_sentences(text), 1):
    source = np.array(calculate_embeddings_with_gpt3(sentence))
    cosines = np.dot(targets, source) / (target_norms * np.linalg.norm(source))
    # Iterate over however many phrases the table holds instead of assuming 64.
    for (example, category, tag), similarity in zip(phrase_rows, cosines.flatten()):
      records.append({
        'line': line,
        'sentence': sentence,
        'phrase': example,
        'category': category,
        'tag': tag,
        'similarity': float(similarity),
      })
  df_results = pd.DataFrame(
    records,
    columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'],
  )

  # Best similarity per tag, flagged against the threshold.
  df_summary = df_results.groupby('tag')['similarity'].max().to_frame()
  df_summary['ok'] = df_summary['similarity'] > threshold

  fig = px.bar(
    df_summary,
    y='similarity',
    color='ok',
    color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
    text='similarity',
    text_auto='.3f',
    labels={'tag': 'Category', 'similarity': 'Similarity'},
    title = f"{text[:200]}..."
  )
  fig.add_shape( # add a horizontal "target" line
    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
    x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
  )
  fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
  fig.update_yaxes(range=[0, 1])

  # Three most similar rows for every tag, without the line number column.
  details = (
    df_results
    .drop(labels='line', axis=1)
    .sort_values(['tag', 'similarity'], ascending=[True, False])
    .groupby('tag')
    .head(3)
    .reset_index(drop=True)
  )

  res = df_summary['similarity'].to_dict()

  return res, fig, details

# Gradio UI
#
# Layout, top to bottom: title, context textbox, threshold slider, "Analyze!"
# button, a row with the label and plot outputs, and a dataframe output.

with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
  gr.Markdown(f"# GPT-3 Zero shot classification app")
  with gr.Row():
    context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")   
  with gr.Row():
    # Slider from 0 to 100 with an initial value of 80.
    threshold = gr.Slider(0, 100, 80)
  btn = gr.Button(value="Analyze!", variant="primary")
  with gr.Row():
    label = gr.Label()
    plot = gr.Plot()
  with gr.Row():
    grid = gr.Dataframe(wrap=True)
  # The callback receives the textbox value and the slider value, in that order,
  # and its three return values fill the label, plot and dataframe outputs.
  # NOTE(review): the slider value lands in the callback's second positional
  # parameter, which the function names `labels` - confirm the intended wiring.
  btn.click(fn=gpt3_zero_shot_classification, inputs=[context,threshold], outputs=[label,plot,grid])
  # NOTE(review): every example row has an empty context, and its second column
  # (bound to the numeric `threshold` slider) holds a comma-separated label
  # string - this looks like it was written for a labels textbox; confirm.
  gr.Examples(
    [
      [ "", "Entertainment, Business, Politics" ],
      [ "", "Entertainment, Business, Politics" ],
      [ "", "Entertainment, Business, Politics" ],
      [ "", "Entertainment, Business, Politics" ]
    ],
    [context, threshold],
    fn=gpt3_zero_shot_classification
  )

# Start the app when the module is executed.
demo.launch(debug=True)