import os

from autotrain import logger


MODEL_CARD = """
---
library_name: sentence-transformers
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- autotrain{base_model}
widget:
- source_sentence: 'search_query: i love autotrain'
  sentences:
  - 'search_query: huggingface auto train'
  - 'search_query: hugging face auto train'
  - 'search_query: i love autotrain'
pipeline_tag: sentence-similarity{dataset_tag}
---

# Model Trained Using AutoTrain

- Problem type: Sentence Transformers

## Validation Metrics
{validation_metrics}

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.

```python
from sentence_transformers import SentenceTransformer
# Download from the Hugging Face Hub
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
'search_query: autotrain',
'search_query: auto train',
'search_query: i love autotrain',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
```
"""


def process_columns(data, config):
    """
    Processes and renames columns in the dataset based on the trainer type specified in the configuration.

    Args:
        data (Dataset): The dataset containing the columns to be processed.
        config (Config): Configuration object containing the trainer type and column names.

    Returns:
        Dataset: The dataset with renamed columns as per the trainer type.

    Raises:
        ValueError: If the trainer type specified in the configuration is invalid.

    Trainer Types and Corresponding Columns:
        - "pair": Renames columns to "anchor" and "positive".
        - "pair_class": Renames columns to "premise", "hypothesis", and "label".
        - "pair_score": Renames columns to "sentence1", "sentence2", and "score".
        - "triplet": Renames columns to "anchor", "positive", and "negative".
        - "qa": Renames columns to "query" and "answer".
    """
    # trainers: pair, pair_class, pair_score, triplet, qa
    # pair: anchor, positive
    # pair_class: premise, hypothesis, label
    # pair_score: sentence1, sentence2, score
    # triplet: anchor, positive, negative
    # qa: query, answer
    if config.trainer == "pair":
        if not (config.sentence1_column == "anchor" and config.sentence1_column in data.column_names):
            data = data.rename_column(config.sentence1_column, "anchor")
        if not (config.sentence2_column == "positive" and config.sentence2_column in data.column_names):
            data = data.rename_column(config.sentence2_column, "positive")
    elif config.trainer == "pair_class":
        if not (config.sentence1_column == "premise" and config.sentence1_column in data.column_names):
            data = data.rename_column(config.sentence1_column, "premise")
        if not (config.sentence2_column == "hypothesis" and config.sentence2_column in data.column_names):
            data = data.rename_column(config.sentence2_column, "hypothesis")
        if not (config.target_column == "label" and config.target_column in data.column_names):
            data = data.rename_column(config.target_column, "label")
    elif config.trainer == "pair_score":
        if not (config.sentence1_column == "sentence1" and config.sentence1_column in data.column_names):
            data = data.rename_column(config.sentence1_column, "sentence1")
        if not (config.sentence2_column == "sentence2" and config.sentence2_column in data.column_names):
            data = data.rename_column(config.sentence2_column, "sentence2")
        if not (config.target_column == "score" and config.target_column in data.column_names):
            data = data.rename_column(config.target_column, "score")
    elif config.trainer == "triplet":
        if not (config.sentence1_column == "anchor" and config.sentence1_column in data.column_names):
            data = data.rename_column(config.sentence1_column, "anchor")
        if not (config.sentence2_column == "positive" and config.sentence2_column in data.column_names):
            data = data.rename_column(config.sentence2_column, "positive")
        if not (config.sentence3_column == "negative" and config.sentence3_column in data.column_names):
            data = data.rename_column(config.sentence3_column, "negative")
    elif config.trainer == "qa":
        if not (config.sentence1_column == "query" and config.sentence1_column in data.column_names):
            data = data.rename_column(config.sentence1_column, "query")
        if not (config.sentence2_column == "answer" and config.sentence2_column in data.column_names):
            data = data.rename_column(config.sentence2_column, "answer")
    else:
        raise ValueError(f"Invalid trainer: {config.trainer}")
    return data
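

# Illustrative sketch (not part of the original module): the commented example below
# shows how process_columns is expected to rename columns for the "pair" trainer.
# The SimpleNamespace config and the column names ("query", "doc") are assumptions
# used purely for demonstration.
#
#   from types import SimpleNamespace
#   from datasets import Dataset
#
#   demo_data = Dataset.from_dict({"query": ["hello", "hi"], "doc": ["greeting", "salute"]})
#   demo_config = SimpleNamespace(trainer="pair", sentence1_column="query", sentence2_column="doc")
#   demo_data = process_columns(demo_data, demo_config)
#   print(demo_data.column_names)  # ["anchor", "positive"]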


def create_model_card(config, trainer):
    """
    Generates a model card string based on the provided configuration and trainer.

    Args:
        config (object): Configuration object containing model and dataset details.
        trainer (object): Trainer object used to evaluate the model.

    Returns:
        str: A formatted model card string containing dataset information, validation metrics, and base model details.
    """
    if config.valid_split is not None:
        eval_scores = trainer.evaluate()
        logger.info(eval_scores)
        eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items()]
        eval_scores = "\n\n".join(eval_scores)
    else:
        eval_scores = "No validation metrics available"

    if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path):
        dataset_tag = ""
    else:
        dataset_tag = f"\ndatasets:\n- {config.data_path}"

    if os.path.isdir(config.model):
        base_model = ""
    else:
        base_model = f"\nbase_model: {config.model}"

    model_card = MODEL_CARD.format(
        dataset_tag=dataset_tag,
        validation_metrics=eval_scores,
        base_model=base_model,
    )
    return model_card
return model_card
|