Spaces:
Running
Running
File size: 2,884 Bytes
f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf f9b4a02 e4f69cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import streamlit as st
import google.generativeai as genai
import numpy as np
# Authenticate the Gemini client with the key kept in Streamlit's secrets
# store, so the credential never has to appear in the source.
gemini_api_key = st.secrets["GEMINI_API_KEY"]
genai.configure(api_key=gemini_api_key)

# Page heading shown above the input widgets.
st.title("Text Embedding Similarity Test")
def split_into_chunks(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings which, joined in order, reproduce ``text``.
        Every chunk has exactly ``chunk_size`` characters except possibly the
        last one. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step makes range() empty, which would silently discard the
    # whole text, and a zero step makes range() fail with an unrelated
    # message. Reject both up front with a clear error.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def get_embedding(text):
    """Request an embedding for one piece of text.

    Sends ``text`` to ``genai.embed_content`` using the
    ``models/text-embedding-004`` model and returns the ``'embedding'``
    entry of the response.
    """
    response = genai.embed_content(
        content=text,
        model="models/text-embedding-004",
    )
    return response["embedding"]
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like of numbers accepted by NumPy).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined; 0.0 is returned in that case instead of ``nan``.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-norm vector would make this a 0/0 division, producing nan and a
    # RuntimeWarning; nan then breaks any max()/ranking done by the caller.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product
# Two side-by-side text boxes: the left text is the one that gets chunked,
# the right text is what every chunk is compared against.
left_col, right_col = st.columns(2)
with left_col:
    input_text1 = st.text_area(
        "Enter your first text:",
        height=200,
        placeholder="Type or paste your first text here...",
    )
with right_col:
    input_text2 = st.text_area(
        "Enter text to compare:",
        height=200,
        placeholder="Type or paste text to compare...",
    )

if st.button("Run Similarity Test"):
    # Both boxes must contain something other than whitespace.
    if input_text1.strip() and input_text2.strip():
        with st.spinner("Analyzing texts..."):
            try:
                # Break the first text into chunks and report when it
                # actually needed splitting.
                chunks = split_into_chunks(input_text1)
                chunk_count = len(chunks)
                if chunk_count > 1:
                    st.info(f"Split first text into {chunk_count} chunks")

                # One embedding per chunk, then one for the comparison text.
                embeddings = []
                for chunk in chunks:
                    embeddings.append(get_embedding(chunk))
                compare_embedding = get_embedding(input_text2)

                # Score every chunk against the comparison text.
                similarities = []
                for chunk_embedding in embeddings:
                    similarities.append(
                        cosine_similarity(chunk_embedding, compare_embedding)
                    )

                # First chunk with the highest score wins ties.
                max_index, max_score = max(
                    enumerate(similarities), key=lambda pair: pair[1]
                )

                # Summary: best score and the chunk that produced it.
                st.subheader("π Similarity Results")
                st.write(f"**Highest similarity score:** {max_score:.4f}")
                st.subheader("π§© Most Similar Chunk")
                st.write(chunks[max_index])

                # Per-chunk breakdown, each with an expander holding its text.
                st.subheader("π All Chunk Similarities")
                numbered = enumerate(zip(chunks, similarities), start=1)
                for number, (chunk, score) in numbered:
                    st.write(f"Chunk {number} ({len(chunk)} chars): {score:.4f}")
                    st.expander(f"View chunk {number}").write(chunk)
            except Exception as e:
                # UI boundary: surface any failure (API, network, math) to
                # the user instead of crashing the app.
                st.error(f"Error processing texts: {str(e)}")
    else:
        st.warning("Please enter text in both input fields.")