yulan-team
/

code-classifier

Text Classification

fastText

English

code

Model card Files Files and versions Community

IvanHU committed on Mar 27

Commit

5c3100a

verified ·

1 Parent(s): 3ce9833

Update README.md

Browse files

Files changed (1) hide show

README.md +148 -0

README.md CHANGED Viewed

@@ -13,6 +13,154 @@ tags:
 We use `code-classifier` to retrieve code-related content from `fineweb-edu`, `dclm`, ... to upsample code-related content.
 ## Related resources

 We use `code-classifier` to retrieve code-related content from `fineweb-edu`, `dclm`, ... to upsample code-related content.
+```python
+import json
+import os
+import time
+from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
+from time import sleep
+import fasttext
+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+from tqdm import tqdm
+def print_error(value):
+    print("error: ", value)
+def data_process(index, file, saved_dir):
+    try:
+        model_path = "code_classifier.bin"
+        model = fasttext.load_model(model_path)
+        # saved_dir: fineweb-edu/data/CC...-code
+        filename = file.split('/')[-1].replace('.parquet', '.jsonl')
+        path90 = os.path.join(saved_dir, "09_10", filename)
+        if os.path.exists(path90):
+            print("exist", path90, flush=True)
+            return
+        # avoid reading at the same time
+        sleep(index * 3)
+        os.makedirs(saved_dir, exist_ok=True)
+        label_list = []
+        s67_list = []
+        s78_list = []
+        s89_list = []
+        s90_list = []
+        st = time.time()
+        print("reading parquet", file, flush=True)
+        df = pd.read_parquet(file)
+        ed = time.time()
+        print("read parquet time: ", ed - st, flush=True)
+        for _, row_orginal in tqdm(
+                df.iterrows(),
+                total=len(df),
+                position=index,
+                desc=filename,
+        ):
+            row = row_orginal.to_dict()
+            text = row['text'].encode('unicode_escape').decode('utf-8')
+            pred = model.predict(text)
+            label, score = pred[0][0], pred[1][0]
+            label_list.append(pred)
+            if label == '__label__positive':
+                if 0.6 <= score < 0.7:
+                    s67_list.append(row)
+                elif 0.7 <= score < 0.8:
+                    s78_list.append(row)
+                elif 0.8 <= score < 0.9:
+                    s89_list.append(row)
+                elif 0.9 <= score <= 1.0:
+                    s90_list.append(row)
+                else:
+                    continue
+    except Exception as e:
+        print_error(e)
+        return None
+    os.makedirs(os.path.join(saved_dir, "labeled"), exist_ok=True)
+    print("writing to file", flush=True)
+    with open(
+            os.path.join(saved_dir, "labeled",
+                         filename.replace('.jsonl', '.txt')), 'w') as f:
+        f.write("\n".join(str(pred) for pred in label_list))
+    for dir_name in ["06_07", "07_08", "08_09", "09_10"]:
+        os.makedirs(os.path.join(saved_dir, dir_name), exist_ok=True)
+    with open(os.path.join(saved_dir, "06_07", filename), 'w') as f:
+        f.write("\n".join(json.dumps(line_now) for line_now in s67_list))
+    with open(os.path.join(saved_dir, "07_08", filename), 'w') as f:
+        f.write("\n".join(json.dumps(line_now) for line_now in s78_list))
+    with open(os.path.join(saved_dir, "08_09", filename), 'w') as f:
+        f.write("\n".join(json.dumps(line_now) for line_now in s89_list))
+    with open(os.path.join(saved_dir, "09_10", filename), 'w') as f:
+        f.write("\n".join(json.dumps(line_now) for line_now in s90_list))
+    return None
+if __name__ == '__main__':
+    num_process = 8
+    start_time = time.time()
+    file_paths = []
+    base = "fineweb-edu"
+    for file_name in [
+            'CC-MAIN-2020-16', 'CC-MAIN-2022-05', 'CC-MAIN-2022-40',
+            'CC-MAIN-2020-24', 'CC-MAIN-2020-34', 'CC-MAIN-2021-39',
+            'CC-MAIN-2020-29', 'CC-MAIN-2023-23', 'CC-MAIN-2022-49',
+            'CC-MAIN-2021-43', 'CC-MAIN-2023-14', 'CC-MAIN-2020-50',
+            'CC-MAIN-2021-25', 'CC-MAIN-2021-10', 'CC-MAIN-2021-49',
+            'CC-MAIN-2020-40', 'CC-MAIN-2020-45', 'CC-MAIN-2022-21',
+            'CC-MAIN-2021-04', 'CC-MAIN-2023-40', 'CC-MAIN-2021-17',
+            'CC-MAIN-2021-21', 'CC-MAIN-2022-27', 'CC-MAIN-2022-33',
+            'CC-MAIN-2023-06', 'CC-MAIN-2023-50', 'CC-MAIN-2020-05',
+            'CC-MAIN-2021-31','CC-MAIN-2020-10'
+    ]:
+        print("Walking:", file_name)
+        original_file_path = base + file_name
+        code_dir = original_file_path + "-code"
+        for root, dirs, files in os.walk(original_file_path):
+            for file in files:
+                if file.endswith(".parquet"):  # 只处理Parquet文件
+                    file_path = os.path.abspath(os.path.join(root, file))
+                    saved_dir = code_dir + "/" + file_path.split("/")[-1][:-8]
+                    file_paths.append((file_path, saved_dir))
+    print("total file paths", len(file_paths))
+    num_process = min(num_process, len(file_paths))
+    print("num_process", num_process)
+    futures = []
+    with ProcessPoolExecutor(num_process) as executor:
+        for index, (file_path, saved_dir) in enumerate(file_paths):
+            futures.append(
+                executor.submit(data_process, index % num_process, file_path,
+                                saved_dir))
+        done, not_done = wait(futures, return_when=ALL_COMPLETED)
+    end_time = time.time()
+    # 计算并打印所用时间
+    elapsed_time = end_time - start_time
+    print(f"Time taken: {elapsed_time} seconds")
+    print("=" * 100)
+```
 ## Related resources