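# Streamlit app: compares a (possibly long) first text against a second text
# using Gemini text-embedding-004 embeddings and cosine similarity.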
import streamlit as st
import google.generativeai as genai
import numpy as np

# Configure Gemini API
genai.configure(api_key=st.secrets["GEMINI_API_KEY"])

st.title("Text Embedding Similarity Test")

def split_into_chunks(text, chunk_size=500):
    """Split text into consecutive chunks of at most chunk_size characters."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
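
# Note: chunking is by raw character count rather than by tokens or sentences,
# so a chunk boundary can fall mid-word.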

def get_embedding(text):
    """Get embedding for a single text chunk."""
    return genai.embed_content(
        model="models/text-embedding-004",
        content=text
    )['embedding']
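
# embed_content returns a dict; the vector itself lives under the 'embedding' key
# (768 floats by default for text-embedding-004).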

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
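
# Quick sanity check of the helpers outside the Streamlit UI -- a sketch that
# assumes GEMINI_API_KEY is already configured above:
#   emb_a = get_embedding("The cat sat on the mat.")
#   emb_b = get_embedding("A cat is resting on a rug.")
#   cosine_similarity(emb_a, emb_b)  # higher for semantically similar texts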

# Text input areas
col1, col2 = st.columns(2)
with col1:
    input_text1 = st.text_area("Enter your first text:",
                               height=200,
                               placeholder="Type or paste your first text here...")
with col2:
    input_text2 = st.text_area("Enter text to compare:",
                               height=200,
                               placeholder="Type or paste text to compare...")

if st.button("Run Similarity Test"):
    if not input_text1.strip() or not input_text2.strip():
        st.warning("Please enter text in both input fields.")
    else:
        with st.spinner("Analyzing texts..."):
            try:
                # Process first text into chunks
                chunks = split_into_chunks(input_text1)
                if len(chunks) > 1:
                    st.info(f"Split first text into {len(chunks)} chunks")

                # Generate embeddings for all chunks
                embeddings = [get_embedding(chunk) for chunk in chunks]

                # Generate embedding for comparison text
                compare_embedding = get_embedding(input_text2)

                # Calculate similarities
                similarities = [cosine_similarity(emb, compare_embedding) for emb in embeddings]
                max_score = max(similarities)
                max_index = similarities.index(max_score)

                # Display results
                st.subheader("Similarity Results")
                st.write(f"**Highest similarity score:** {max_score:.4f}")

                st.subheader("🧩 Most Similar Chunk")
                st.write(chunks[max_index])

                st.subheader("All Chunk Similarities")
                for i, (chunk, score) in enumerate(zip(chunks, similarities)):
                    st.write(f"Chunk {i+1} ({len(chunk)} chars): {score:.4f}")
                    st.expander(f"View chunk {i+1}").write(chunk)
            except Exception as e:
                st.error(f"Error processing texts: {str(e)}")