Spaces:
Sleeping
Sleeping
import os | |
import gradio as gr | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModel | |
# Environment setup.
# Resolve the cache directory once so that the directory we create and the
# directory handed to both from_pretrained() calls are always the same path.
# (Reading os.environ["HF_HOME"] directly would raise KeyError whenever the
# variable is unset, even though the fallback directory was just created.)
cache_dir = os.environ.get("HF_HOME", "./hf_cache")
os.makedirs(cache_dir, exist_ok=True)

# Fail fast at import time when no access token is configured.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise EnvironmentError(":x: Environment variable HF_TOKEN is not set.")

# Model and tokenizer loading.
# Both objects come from the same Hub repository; keep the id in one place.
MODEL_ID = "nomic-ai/nomic-embed-text-v1.5"

# NOTE(review): trust_remote_code=True lets the Hub repository supply custom
# model code that runs locally -- confirm this repository/revision is trusted.
text_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=hf_token,
    cache_dir=cache_dir,
)
text_model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=hf_token,
    cache_dir=cache_dir,
)
# Embedding function
def get_text_embeddings(text):
    """Convert input text into a dense embedding using the Nomic embedding model.

    The text is tokenized (with padding and truncation enabled), passed
    through the module-level ``text_model``, and the final hidden states are
    mean-pooled over the token axis into a single vector. These embeddings
    are used to query Qdrant for semantically relevant document chunks.

    Args:
        text: Input handed to ``text_tokenizer``. NOTE(review): the Gradio
            callback in this file passes a single string; if a batch is
            passed, only the first item's embedding is returned.

    Returns:
        A NumPy array containing the pooled embedding of the first sequence.
    """
    # Imported here so the block does not depend on a file-level import that
    # the module does not currently have; the import is cached after first use.
    import torch

    inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        outputs = text_model(**inputs)
        token_vectors = outputs.last_hidden_state

        # Mask-aware mean pooling: padded positions contribute nothing to the
        # sum and are excluded from the divisor. With no padding present the
        # mask is all ones and this equals a plain mean over the token axis.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        summed = (token_vectors * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against /0
        embeddings = summed / token_counts

    return embeddings[0].detach().numpy()
# Gradio callback
def embed_text_interface(text):
    """Embed ``text`` and return the resulting vector rendered as a string.

    Thin adapter between the Gradio textbox widgets (which exchange strings)
    and ``get_text_embeddings``.
    """
    return str(get_text_embeddings(text))
# Gradio UI: build the input/output widgets first, then wire them to the
# embedding callback.
text_input = gr.Textbox(label="Enter text to embed", lines=5)
vector_output = gr.Textbox(label="Embedding vector")

interface = gr.Interface(
    fn=embed_text_interface,
    inputs=text_input,
    outputs=vector_output,
    title="Text Embedding with Nomic AI",
    description="Enter some text, and get its embedding vector using Nomic's embedding model.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()