# Ryan - update - fd06f0b
"""
Similarity metrics for text comparison
"""
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def calculate_cosine_similarity(text1, text2):
    """
    Calculate cosine similarity between two texts using TF-IDF vectorization.

    The two texts are vectorized together with a fresh ``TfidfVectorizer``
    (so the vocabulary and IDF weights come from just this pair), and the
    cosine of the two resulting row vectors is returned.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        float: Cosine similarity score between 0 and 1. If the score cannot
        be computed (for example, the vectorizer finds no tokens in either
        text), the error is printed and 0.0 is returned instead of raising.
    """
    vectorizer = TfidfVectorizer()
    try:
        # Fit on both texts at once so they share one vocabulary.
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        # Row slices keep the 2-D shape that cosine_similarity expects.
        similarity = float(
            cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        )
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to
        # "no similarity" so callers always receive a number.
        print(f"Error calculating cosine similarity: {e}")
        return 0.0
    # Floating-point rounding in the normalised dot product can land a hair
    # outside the documented range (e.g. 1.0000000000000002 for identical
    # texts), so clamp to [0, 1].
    return min(1.0, max(0.0, similarity))
def calculate_jaccard_similarity(text1, text2):
    """
    Calculate Jaccard similarity between two texts (word-level).

    Each text is lower-cased and split on whitespace; the score is the size
    of the intersection of the two word sets divided by the size of their
    union. Punctuation is not stripped, so "word," and "word" are distinct.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        float: Jaccard similarity score between 0 and 1. Two texts that both
        contain no words are treated as identical and score 1.0.
    """
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    union = words1 | words2
    if not union:
        # Both texts are empty (or whitespace only): identical by convention.
        return 1.0

    # The union is non-empty here, so the division cannot fail.
    return len(words1 & words2) / len(union)
def calculate_semantic_similarity(
    text1, text2, *, cosine_weight=0.7, jaccard_weight=0.3
):
    """
    Calculate pseudo-semantic similarity by comparing word overlap patterns.

    This is a simplified approach that doesn't use embedding models like
    Word2Vec or BERT: the score is a weighted sum of the TF-IDF cosine
    similarity and the word-level Jaccard similarity of the two texts.

    Args:
        text1 (str): First text
        text2 (str): Second text
        cosine_weight (float): Weight applied to the cosine similarity.
            Defaults to 0.7, favouring cosine over Jaccard.
        jaccard_weight (float): Weight applied to the Jaccard similarity.
            Defaults to 0.3.

    Returns:
        float: Weighted similarity score. It stays within 0 and 1 as long as
        the weights are non-negative and sum to 1 (true for the defaults).
    """
    # In a real app, a proper semantic model would replace this blend.
    cosine = calculate_cosine_similarity(text1, text2)
    jaccard = calculate_jaccard_similarity(text1, text2)
    return cosine_weight * cosine + jaccard_weight * jaccard
def calculate_similarity(text1, text2, metrics=None):
    """
    Compute the requested similarity scores for a pair of texts.

    Args:
        text1 (str): First text
        text2 (str): Second text
        metrics (list): Display names of the metrics to compute. Recognised
            names are "Cosine Similarity", "Jaccard Similarity" and
            "Semantic Similarity"; when omitted, all three are computed.
            Names that are not recognised are ignored.

    Returns:
        dict: Maps "cosine_similarity", "jaccard_similarity" and
        "semantic_similarity" to their scores, for the selected metrics only.
    """
    selected = metrics
    if selected is None:
        selected = ["Cosine Similarity", "Jaccard Similarity", "Semantic Similarity"]

    # (display name, result key, deferred computation) in output order.
    # The lambdas delay each calculation until its metric is known to be
    # selected.
    available = (
        (
            "Cosine Similarity",
            "cosine_similarity",
            lambda: calculate_cosine_similarity(text1, text2),
        ),
        (
            "Jaccard Similarity",
            "jaccard_similarity",
            lambda: calculate_jaccard_similarity(text1, text2),
        ),
        (
            "Semantic Similarity",
            "semantic_similarity",
            lambda: calculate_semantic_similarity(text1, text2),
        ),
    )

    scores = {}
    for label, key, compute in available:
        if label in selected:
            scores[key] = compute()
    return scores