Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from sklearn.metrics.pairwise import euclidean_distances
|
| 6 |
+
|
| 7 |
+
# Abstract table with precomputed embeddings, read once at start-up.
# find_similar_texts reads the columns 'abstract', 'patent no' and
# 'embedding_<model name>' from it.
text_embeddings = pd.read_parquet('text_embeddings_abstract_all.parquet')

# Every selectable Sentence-Transformer, keyed by the name offered in the
# drop-down. Models are instantiated in the listed order.
model_options = {
    model_id: SentenceTransformer(model_id)
    for model_id in (
        'all-MiniLM-L6-v2',
        'intfloat/e5-large-v2',
        'intfloat/e5-small-v2',
        'thenlper/gte-large',
        'avsolatorio/GIST-large-Embedding-v0',
    )
}

# Module-level aliases for the individual model instances.
model_all_Mini = model_options['all-MiniLM-L6-v2']
model_e5_large_v2 = model_options['intfloat/e5-large-v2']
model_e5_small_v2 = model_options['intfloat/e5-small-v2']
model_gte_large = model_options['thenlper/gte-large']
model_GIST_large = model_options['avsolatorio/GIST-large-Embedding-v0']
|
| 25 |
+
|
| 26 |
+
# Main function for the Gradio interface
|
| 27 |
+
def find_similar_texts(model_name, input_text, *, embeddings=None, models=None,
                       gate_model_name='all-MiniLM-L6-v2',
                       max_gate_distance=1.05, top_k=5):
    """Return the abstracts closest to ``input_text`` as display text.

    The query is first embedded with the gate model. Only when at least one
    abstract lies closer than ``max_gate_distance`` (Euclidean) under that
    model is the search carried out with the model named by ``model_name``.

    Args:
        model_name: Key into ``models``; the matching embedding column is
            ``'embedding_' + model_name``.
        input_text: Free-text description to search for.
        embeddings: DataFrame with the columns ``'abstract'``, ``'patent no'``
            and one ``'embedding_<name>'`` column per model. Defaults to the
            module-level ``text_embeddings``.
        models: Mapping of model name to an object with an ``encode(text)``
            method. Defaults to the module-level ``model_options``.
        gate_model_name: Model used for the "is anything close?" pre-check.
        max_gate_distance: Distance below which the pre-check passes.
        top_k: Number of abstracts to return.

    Returns:
        The formatted top matches, a "no match" message, or a prompt to pick
        a model when ``model_name`` is not a key of ``models``.
    """
    no_match = "It seems there is no patent abstract close to your description."

    # Resolved at call time so the module-level table and models stay the
    # defaults without being required when the function is defined.
    if embeddings is None:
        embeddings = text_embeddings
    if models is None:
        models = model_options

    # The UI can deliver None or blank text; there is nothing to search for.
    if not input_text or not input_text.strip() or embeddings.empty:
        return no_match

    def distances_for(name):
        # Euclidean distance from the query to every stored embedding.
        query = np.asarray(models[name].encode(input_text), dtype=float).reshape(1, -1)
        matrix = np.vstack(list(embeddings['embedding_' + name]))
        return np.linalg.norm(matrix - query, axis=1)

    gate_distances = distances_for(gate_model_name)
    if not (gate_distances < max_gate_distance).any():
        return no_match

    # No model selected (None) or an unknown name: answer with a prompt
    # instead of raising KeyError.
    if model_name not in models:
        return "Please choose a Sentence-Transformer model first."

    # Reuse the gate result when the gate model is also the selected model.
    if model_name == gate_model_name:
        distances = gate_distances
    else:
        distances = distances_for(model_name)

    # Rank by position instead of writing a distance column into the shared
    # DataFrame, so concurrent requests cannot overwrite each other's results.
    nearest = embeddings.iloc[np.argsort(distances, kind='stable')[:top_k]]
    return '\n\n'.join(
        f"Patent No: {patent_no}\nAbstract: {abstract}\n"
        for patent_no, abstract in zip(nearest['patent no'], nearest['abstract'])
    )
|
| 47 |
+
|
| 48 |
+
# Create Gradio interface using Blocks

# Markdown shown below the search controls. Kept at column 0 so that no
# line can be rendered as an indented code block.
DESCRIPTION_MD = """
### Description
This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of patent abstracts in the field of machine learning and AI.

- 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
- 'intfloat/e5-large-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
- 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
- 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
- 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).

The patents can be viewed at [Espacenet](https://worldwide.espacenet.com/?locale=en_EP), the free online service by the European Patent Office.

Please note: The data used in this demo contains only a very limited subset of patent abstracts and is intended only for demonstration purposes. It is far from covering all patents or their complete data.
"""

with gr.Blocks() as demo:
    gr.Markdown("## Sentence-Transformer based Patent-Abstract Search")
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(model_options.keys()),
                # Preselect the first model so a search never starts with
                # no model chosen.
                value=next(iter(model_options)),
                label="Choose Sentence-Transformer",
            )
            text_input = gr.Textbox(lines=2, placeholder="input_text", label="input_text (your description)")
            submit_button = gr.Button("search")

        with gr.Column():
            output = gr.Textbox(label="top 5 patent abstracts (if available)")

    gr.Markdown(DESCRIPTION_MD)

    # The same search runs on button click, on model change and on pressing
    # Enter in the text box.
    for trigger in (submit_button.click, model_selector.change, text_input.submit):
        trigger(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

demo.launch()
|