""" Similarity metrics for text comparison """ import numpy as np from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity def calculate_cosine_similarity(text1, text2): """ Calculate cosine similarity between two texts using TF-IDF vectorization. Args: text1 (str): First text text2 (str): Second text Returns: float: Cosine similarity score between 0 and 1 """ # Create TF-IDF vectorizer vectorizer = TfidfVectorizer() try: # Transform texts into TF-IDF vectors tfidf_matrix = vectorizer.fit_transform([text1, text2]) # Calculate cosine similarity similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] return float(similarity) except Exception as e: print(f"Error calculating cosine similarity: {e}") return 0.0 def calculate_jaccard_similarity(text1, text2): """ Calculate Jaccard similarity between two texts (word-level). Args: text1 (str): First text text2 (str): Second text Returns: float: Jaccard similarity score between 0 and 1 """ # Convert to sets of words words1 = set(text1.lower().split()) words2 = set(text2.lower().split()) # Calculate Jaccard similarity if not words1 and not words2: return 1.0 # If both are empty, they're identical try: intersection = len(words1.intersection(words2)) union = len(words1.union(words2)) return intersection / union except Exception as e: print(f"Error calculating Jaccard similarity: {e}") return 0.0 def calculate_semantic_similarity(text1, text2): """ Calculate pseudo-semantic similarity by comparing word overlap patterns. This is a simplified approach that doesn't use embedding models like Word2Vec or BERT. Args: text1 (str): First text text2 (str): Second text Returns: float: Semantic similarity score between 0 and 1 """ # For now, this is a weighted combination of cosine and Jaccard similarity # In a real app, you'd use a proper semantic model cosine = calculate_cosine_similarity(text1, text2) jaccard = calculate_jaccard_similarity(text1, text2) # Weight more towards cosine similarity return 0.7 * cosine + 0.3 * jaccard def calculate_similarity(text1, text2, metrics=None): """ Calculate various similarity metrics between two texts. Args: text1 (str): First text text2 (str): Second text metrics (list): List of metrics to calculate Returns: dict: Dictionary of similarity scores """ if metrics is None: metrics = ["Cosine Similarity", "Jaccard Similarity", "Semantic Similarity"] results = {} if "Cosine Similarity" in metrics: results["cosine_similarity"] = calculate_cosine_similarity(text1, text2) if "Jaccard Similarity" in metrics: results["jaccard_similarity"] = calculate_jaccard_similarity(text1, text2) if "Semantic Similarity" in metrics: results["semantic_similarity"] = calculate_semantic_similarity(text1, text2) return results