Spaces:
Running
Running
File size: 7,897 Bytes
5301c48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
"""Example: Using Starfish Embeddings for Data Generation
This example demonstrates how to use FAISS and SentenceTransformers
for embedding-enhanced data generation and deduplication.
"""
import asyncio
import sys
import os
# Make the "src" directory one level above this file importable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from starfish.embedding import EmbeddingManager, SimilarityChecker, DataDeduplicator
from starfish.data_gen_template.core import data_gen_template
async def basic_embedding_example():
    """Walk through the core EmbeddingManager calls and print each result.

    Builds an ``EmbeddingManager``, adds a small list of sample questions
    (including one exact repeat and one paraphrase), runs a top-3 similarity
    search for a query, asks the manager for duplicate groups, and finally
    prints whatever ``get_stats()`` reports.

    Returns:
        None. All results are written to stdout.
    """
    print("๐ฎ Basic Embedding Example")
    print("=" * 50)

    # Initialize embedding manager
    embedding_manager = EmbeddingManager(
        model_name="all-MiniLM-L6-v2",
        similarity_threshold=0.85,
    )

    # Sample texts to embed
    texts = [
        "What is machine learning?",
        "How does artificial intelligence work?",
        "What are neural networks?",
        "Explain deep learning concepts",
        "What is supervised learning?",
        "What is machine learning?",  # Duplicate
        "How do neural networks function?",  # Similar to "What are neural networks?"
    ]
    print(f"๐ Processing {len(texts)} sample texts...")

    # Add texts to the index
    indices = embedding_manager.add_texts(texts)
    print(f"✅ Added {len(indices)} texts to the embedding index")

    # Search for similar texts
    query = "Tell me about AI and ML"
    similar_items = embedding_manager.search_similar(query, k=3)
    print(f"\n๐ Search results for: '{query}'")
    for item in similar_items:
        print(f" Similarity: {item['similarity']:.3f} | Text: {item['text']}")

    # Find duplicates; each group is used below as a collection of
    # positions into ``texts``.
    duplicate_groups = embedding_manager.find_duplicates(texts)
    print(f"\n๐ Found {len(duplicate_groups)} groups of duplicates:")
    for group_number, group in enumerate(duplicate_groups, start=1):
        print(f" Group {group_number}: {[texts[idx] for idx in group]}")

    print(f"\n๐ Index Stats: {embedding_manager.get_stats()}")
async def similarity_checker_example():
    """Demonstrate filtering, diversity metrics and subset selection.

    Creates a ``SimilarityChecker`` and runs three calls over the same small
    list of question/answer dicts: ``filter_similar_items``,
    ``check_diversity_batch`` and ``suggest_diverse_subset``. The outcome of
    each call is printed.

    Returns:
        None. All results are written to stdout.
    """
    print("\n๐ฏ Similarity Checker Example")
    print("=" * 50)

    similarity_checker = SimilarityChecker(similarity_threshold=0.8)

    # Sample data items
    data_items = [
        {"question": "What is Python?", "answer": "Python is a programming language"},
        {"question": "How to learn coding?", "answer": "Start with basic concepts"},
        {"question": "What is programming?", "answer": "Programming is writing code"},
        # Similar to first
        {"question": "What is Python programming?", "answer": "Python is a popular language"},
    ]
    print(f"๐ Analyzing {len(data_items)} data items...")

    # Filter similar items; the second element of the returned pair is not
    # needed for this demo.
    filtered_items, _duplicate_groups = similarity_checker.filter_similar_items(data_items)
    print(f"✅ Filtered to {len(filtered_items)} unique items")

    # Check diversity metrics
    diversity_metrics = similarity_checker.check_diversity_batch(data_items)
    print(f"๐ Diversity Score: {diversity_metrics['diversity_score']:.3f}")
    print(f"๐ Average Similarity: {diversity_metrics['avg_similarity']:.3f}")

    # Suggest diverse subset; the heading reuses the requested size so the
    # two cannot drift apart.
    target_size = 2
    diverse_subset = similarity_checker.suggest_diverse_subset(data_items, target_size=target_size)
    print(f"\n๐ฒ Diverse subset ({target_size} items):")
    for item in diverse_subset:
        print(f" Q: {item['question']}")
async def deduplicator_example():
    """Show duplicate analysis followed by comprehensive deduplication.

    A ``DataDeduplicator`` is first asked to analyze a five-record sample
    (one record repeated verbatim, one paraphrased) and then to deduplicate
    it. The analysis counts, the deduplication report and the surviving
    records are printed.

    Returns:
        None. All results are written to stdout.
    """
    print("\n๐ง Data Deduplicator Example")
    print("=" * 50)

    dedup = DataDeduplicator(similarity_threshold=0.9)

    # Sample dataset with duplicates
    records = [
        {"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.8},
        {"id": "2", "text": "Deep learning uses neural networks", "quality_score": 0.9},
        # Exact duplicate
        {"id": "1", "text": "Machine learning is a subset of AI", "quality_score": 0.7},
        # Semantic duplicate
        {"id": "3", "text": "ML is part of artificial intelligence", "quality_score": 0.95},
        {"id": "4", "text": "Natural language processing handles text", "quality_score": 0.85},
    ]
    record_count = len(records)
    print(f"๐ Analyzing dataset with {record_count} items...")

    # Analyze duplicates without removing
    findings = dedup.analyze_duplicates(records)
    print("๐ Analysis Results:")
    exact_count = findings["exact_duplicates"]["count"]
    print(f" Exact duplicates: {exact_count}")
    semantic_count = findings["semantic_duplicates"]["count"]
    print(f" Semantic duplicates: {semantic_count}")
    diversity = findings["diversity_metrics"]["diversity_score"]
    print(f" Diversity score: {diversity:.3f}")

    # Perform comprehensive deduplication
    survivors, summary = dedup.deduplicate_comprehensive(records)
    print("\nโจ Deduplication Results:")
    print(" Original: {} items".format(summary["original_count"]))
    print(" Final: {} items".format(summary["final_count"]))
    print(" Reduction: {:.1f}%".format(summary["reduction_percentage"]))

    print("\n๐ Clean dataset:")
    for record in survivors:
        record_id = record["id"]
        score = record.get("quality_score", "N/A")
        preview = record["text"][:50]
        print(f" ID: {record_id} | Score: {score} | Text: {preview}...")
async def template_usage_example():
    """Run the embedding-enhanced generation template and preview its output.

    Prints the list of registered templates, fetches
    ``starfish/generate_with_embeddings``, awaits ``template.run`` with a
    small demo configuration, and prints the first three results.

    This step is best-effort: any exception raised while listing, fetching
    or running the template is caught and reported on stdout instead of
    propagating, so the other examples are not affected.

    Returns:
        None. All results are written to stdout.
    """
    print("\n๐ Embedding-Enhanced Template Example")
    print("=" * 50)

    try:
        # Get the embedding template
        print(data_gen_template.list())
        template = data_gen_template.get("starfish/generate_with_embeddings")

        # Configuration for generation
        config = {
            "num_records": 5,  # Small number for demo
            "user_instruction": "Generate educational Q&A about data science",
            "topics": ["statistics", "data visualization", "machine learning"],
            "generation_model_name": "openai/gpt-4o-mini",
            "embedding_config": {
                "model_name": "all-MiniLM-L6-v2",
                "similarity_threshold": 0.8,
                "enable_deduplication": True,
                "enable_diversity_check": True,
                "min_diversity_score": 0.2,
            },
        }

        print("โ๏ธ Generating diverse dataset with embedding quality control...")
        results = await template.run(**config)
        print(f"\n✅ Generated {len(results)} high-quality items:")

        for item_number, item in enumerate(results[:3], start=1):  # Show first 3
            print(f"\n Item {item_number}:")
            print(f" Q: {item.get('question', 'N/A')}")
            print(f" A: {item.get('answer', 'N/A')[:100]}...")
            if "_metadata" in item:
                # Only apply the float format to real numbers: formatting the
                # "N/A" fallback with ":.3f" raises ValueError and would abort
                # the whole preview via the except below.
                score = item["_metadata"].get("diversity_score")
                if isinstance(score, (int, float)):
                    score_text = f"{score:.3f}"
                else:
                    score_text = "N/A"
                print(f" Diversity: {score_text}")
    except Exception as e:
        # Deliberately broad: this demo step must not take down the script.
        print(f"โ ๏ธ Template example failed: {e}")
        print(" (This might be due to missing API keys or dependencies)")
async def main():
    """Run all examples.

    Awaits the four example coroutines one after another and prints a
    closing summary with next steps. An ``ImportError`` raised while the
    examples run is reported together with install instructions; any other
    exception is reported with a generic hint. Neither is re-raised.

    Returns:
        None. All output goes to stdout.
    """
    print("๐ Starfish Embedding System Examples")
    print("=" * 60)

    examples = (
        basic_embedding_example,
        similarity_checker_example,
        deduplicator_example,
        template_usage_example,
    )

    try:
        # Sequential on purpose: each example prints its own section.
        for example in examples:
            await example()

        print("\n" + "=" * 60)
        print("✅ All examples completed successfully!")
        print("\n๐ก Next steps:")
        print(" 1. Install dependencies: poetry install")
        print(" 2. Set API keys in .env.local")
        print(" 3. Try the embedding template in your projects")
    except ImportError as e:
        print(f"โ Import error: {e}")
        print("๐ก Make sure to install dependencies:")
        print(" poetry install")
        print(" # or")
        print(" pip install faiss-cpu sentence-transformers")
    except Exception as e:
        # Top-level boundary of the demo script: report and exit normally.
        print(f"โ Error running examples: {e}")
        print("๐ก Check your Python environment and dependencies")
if __name__ == "__main__":
    # Only start the event loop when executed as a script, not on import.
    asyncio.run(main())
|