|
import re |
|
import unicodedata |
|
from typing import Tuple |
|
|
|
import fasttext |
|
from transformers import LlamaTokenizerFast |
|
|
|
|
|
def fasttext_preprocess_func(content: str, tokenizer: LlamaTokenizerFast) -> str:
    """Normalize text into the single-line, token-level format fastText expects.

    Steps: collapse excessive blank lines, lowercase, strip diacritics via
    NFKD decomposition, re-segment with the LLM tokenizer (one space between
    subword tokens), then escape control whitespace so the whole sample fits
    on one fastText input line.

    Args:
        content (str): Raw text to normalize.
        tokenizer (LlamaTokenizerFast): Tokenizer used to split the text into
            subword tokens.

    Returns:
        str: Processed normalized content.
    """
    # Collapse runs of 3+ newlines down to a single blank line.
    content = re.sub(r'\n{3,}', '\n\n', content)

    content = content.lower()

    # Drop combining marks (Unicode category 'Mn') after NFKD decomposition,
    # i.e. remove diacritics so e.g. accented and plain letters match.
    content = ''.join(
        c for c in unicodedata.normalize('NFKD', content)
        if unicodedata.category(c) != 'Mn')

    # Re-segment into subword tokens separated by single spaces so fastText
    # sees the same token boundaries as the LLM tokenizer. Decoding one id at
    # a time (rather than the whole sequence) keeps every token boundary.
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    content = ' '.join(tokenizer.decode([token_id]) for token_id in token_ids)

    # fastText treats '\n' as a sample separator: escape control whitespace to
    # literal backslash sequences (same output as the former
    # re.sub(r'\n', '\\\\n', ...) calls), then squeeze repeated spaces.
    content = content.replace('\n', '\\n')
    content = content.replace('\r', '\\r')
    content = content.replace('\t', '\\t')
    content = re.sub(r' +', ' ', content)
    return content.strip()
|
|
|
|
|
def fasttext_infer(norm_content: str, fasttext_model: fasttext.FastText) -> Tuple[str, float]:
    """Classify one normalized sample with a fastText model.

    Args:
        norm_content (str): Single-line text already normalized by
            ``fasttext_preprocess_func``.
        fasttext_model (fasttext.FastText): Loaded fastText classifier.

    Returns:
        Tuple[str, float]: The winning label (e.g. ``"__label__neg"``) and the
        positive-class probability.
    """
    labels, probs = fasttext_model.predict(norm_content)
    label = labels[0]
    # Clamp: fastText can report probabilities marginally above 1.0.
    score = min(probs.tolist()[0], 1)
    # Always report the positive-class probability, regardless of which
    # label actually won.
    return label, (1 - score) if label == "__label__neg" else score
|
|
|
|
|
def _run_demo(tag: str, content: str, tokenizer: LlamaTokenizerFast, model_path: str) -> None:
    """Load one fastText classifier, score *content*, and print a labelled report."""
    model = fasttext.load_model(model_path)
    norm_content = fasttext_preprocess_func(content, tokenizer)
    pred_label, pred_score = fasttext_infer(norm_content, model)

    print("-" * 100)
    print(f"{tag} content: {content}")
    print()
    print(f"Normalized content: {norm_content}")
    print()
    print(f" - Pred label: {pred_label}")
    print(f" - Pred score: {pred_score}")
    print("-" * 100)


def main():
    """Demo entry point: score one English and one Chinese sample with the
    corresponding Ultra-FineWeb quality classifiers."""
    # NOTE(review): paths are relative to the working directory — the local
    # tokenizer and the .bin classifiers must exist before running.
    tokenizer = LlamaTokenizerFast.from_pretrained("local_tokenizer")

    content_en = "Data quality has become a key factor in enhancing model performance with the rapid development of large language models (LLMs). Model-driven data filtering has increasingly become a primary approach for acquiring high-quality data. However, it still faces two main challenges: (1) the lack of an efficient data verification strategy makes it difficult to provide timely feedback on data quality; and (2) the selection of seed data for training classifiers lacks clear criteria and relies heavily on human expertise, introducing a degree of subjectivity. To address the first challenge, we introduce an efficient verification strategy that enables rapid evaluation of the impact of data on LLM training with minimal computational cost. To tackle the second challenge, we build upon the assumption that high-quality seed data is beneficial for LLM training, and by integrating the proposed verification strategy, we optimize the selection of positive and negative samples and propose an efficient data filtering pipeline. This pipeline not only improves filtering efficiency, classifier quality, and robustness, but also significantly reduces experimental and inference costs. In addition, to efficiently filter high-quality data, we employ a lightweight classifier based on fastText, and successfully apply the filtering pipeline to two widely-used pre-training corpora, FineWeb and Chinese FineWeb datasets, resulting in the creation of the higher-quality Ultra-FineWeb dataset. Ultra-FineWeb contains approximately 1 trillion (T) English tokens and 120 billion (B) Chinese tokens. Empirical results demonstrate that the LLMs trained on Ultra-FineWeb exhibit significant performance improvements across multiple benchmark tasks, validating the effectiveness of our pipeline in enhancing both data quality and training efficiency."
    _run_demo("English", content_en, tokenizer, "classifiers/ultra_fineweb_en.bin")

    content_zh = "随着大语言模型(Large Language Model,LLM)的快速发展,数据质量已成为提升模型性能的关键因素之一。模型驱动的数据筛选方法逐渐成为获取高质量数据的主要手段之一。然而,当前仍面临两个核心挑战:(1)缺乏高效的数据验证机制,难以及时反馈数据训练效果;(2)分类器对种子数据的选择缺乏明确标准,过度依赖人工经验,存在一定主观性。为应对第一个问题,我们提出了一种高效验证策略(Efficient Verification Strategy),可在较小计算开销下快速评估数据对 LLM 效果的影响。针对第二个问题,我们基于“种子数据对 LLM 训练应具正向增益”的假设,进一步结合所提出的验证策略,优化了正负样本选择方式,构建出一套高效的数据筛选流程,不仅提升了筛选效率与分类器的质量和鲁棒性,也有效降低了实验与推理成本。此外,我们采用基于 fastText 的轻量级分类器以减少推理开销,并将筛选流程成功应用于 FineWeb 与 Chinese FineWeb 数据集,构建出质量更高的 UltraFineWeb 数据集,包含 1 万亿(1 Trillion)英文和 1200 亿(120 Billion)中文词元( token)。基于 UltraFineWeb 数据集训练的模型在多个基准任务上实现了显著性能提升,验证了所提出筛选方法在提升数据质量与训练效率方面的有效性。"
    _run_demo("Chinese", content_zh, tokenizer, "classifiers/ultra_fineweb_zh.bin")


if __name__ == "__main__":
    main()