IvanHU committed on
Commit 5c3100a · verified · 1 Parent(s): 3ce9833

Update README.md

Files changed (1): README.md +148 -0
README.md CHANGED
@@ -13,6 +13,154 @@ tags:
 
  We use `code-classifier` to retrieve code-related content from `fineweb-edu`, `dclm`, ... so that it can be upsampled.
 
+ ```python
+ import json
+ import os
+ import time
+ from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
+ from time import sleep
+
+ import fasttext
+ import numpy as np
+ import pandas as pd
+ import pyarrow.parquet as pq
+ from tqdm import tqdm
+
+
+ def print_error(value):
+     print("error: ", value)
+
+
+ def data_process(index, file, saved_dir):
+     """Classify every document in one parquet shard and bucket the positives by score."""
+     try:
+         model_path = "code_classifier.bin"
+         model = fasttext.load_model(model_path)
+
+         # saved_dir: fineweb-edu/data/CC...-code
+         filename = file.split('/')[-1].replace('.parquet', '.jsonl')
+         # the 09_10 file is written last, so its existence means this shard is already done
+         path90 = os.path.join(saved_dir, "09_10", filename)
+         if os.path.exists(path90):
+             print("exist", path90, flush=True)
+             return
+
+         # stagger the workers to avoid reading many parquet files at the same time
+         sleep(index * 3)
+         os.makedirs(saved_dir, exist_ok=True)
+         label_list = []
+         s67_list = []
+         s78_list = []
+         s89_list = []
+         s90_list = []
+
+         st = time.time()
+         print("reading parquet", file, flush=True)
+         df = pd.read_parquet(file)
+         ed = time.time()
+         print("read parquet time: ", ed - st, flush=True)
+         for _, row_original in tqdm(
+                 df.iterrows(),
+                 total=len(df),
+                 position=index,
+                 desc=filename,
+         ):
+             row = row_original.to_dict()
+             # fastText predict() expects a single line, so escape newlines etc.
+             text = row['text'].encode('unicode_escape').decode('utf-8')
+
+             pred = model.predict(text)
+             label, score = pred[0][0], pred[1][0]
+             label_list.append(pred)
+             # bucket positive (code-related) documents by classifier confidence
+             if label == '__label__positive':
+                 if 0.6 <= score < 0.7:
+                     s67_list.append(row)
+                 elif 0.7 <= score < 0.8:
+                     s78_list.append(row)
+                 elif 0.8 <= score < 0.9:
+                     s89_list.append(row)
+                 elif 0.9 <= score <= 1.0:
+                     s90_list.append(row)
+             else:
+                 continue
+     except Exception as e:
+         print_error(e)
+         return None
+
+     os.makedirs(os.path.join(saved_dir, "labeled"), exist_ok=True)
+
+     print("writing to file", flush=True)
+
+     # raw predictions for every document, one per line
+     with open(
+             os.path.join(saved_dir, "labeled",
+                          filename.replace('.jsonl', '.txt')), 'w') as f:
+         f.write("\n".join(str(pred) for pred in label_list))
+
+     # one JSONL file per confidence bucket
+     for dir_name in ["06_07", "07_08", "08_09", "09_10"]:
+         os.makedirs(os.path.join(saved_dir, dir_name), exist_ok=True)
+
+     with open(os.path.join(saved_dir, "06_07", filename), 'w') as f:
+         f.write("\n".join(json.dumps(line_now) for line_now in s67_list))
+
+     with open(os.path.join(saved_dir, "07_08", filename), 'w') as f:
+         f.write("\n".join(json.dumps(line_now) for line_now in s78_list))
+
+     with open(os.path.join(saved_dir, "08_09", filename), 'w') as f:
+         f.write("\n".join(json.dumps(line_now) for line_now in s89_list))
+
+     with open(os.path.join(saved_dir, "09_10", filename), 'w') as f:
+         f.write("\n".join(json.dumps(line_now) for line_now in s90_list))
+
+     return None
+
+
+ if __name__ == '__main__':
+     num_process = 8
+     start_time = time.time()
+     file_paths = []
+     base = "fineweb-edu"
+
+     for file_name in [
+             'CC-MAIN-2020-16', 'CC-MAIN-2022-05', 'CC-MAIN-2022-40',
+             'CC-MAIN-2020-24', 'CC-MAIN-2020-34', 'CC-MAIN-2021-39',
+             'CC-MAIN-2020-29', 'CC-MAIN-2023-23', 'CC-MAIN-2022-49',
+             'CC-MAIN-2021-43', 'CC-MAIN-2023-14', 'CC-MAIN-2020-50',
+             'CC-MAIN-2021-25', 'CC-MAIN-2021-10', 'CC-MAIN-2021-49',
+             'CC-MAIN-2020-40', 'CC-MAIN-2020-45', 'CC-MAIN-2022-21',
+             'CC-MAIN-2021-04', 'CC-MAIN-2023-40', 'CC-MAIN-2021-17',
+             'CC-MAIN-2021-21', 'CC-MAIN-2022-27', 'CC-MAIN-2022-33',
+             'CC-MAIN-2023-06', 'CC-MAIN-2023-50', 'CC-MAIN-2020-05',
+             'CC-MAIN-2021-31', 'CC-MAIN-2020-10'
+     ]:
+         print("Walking:", file_name)
+         # dump directory under `base` (adjust the join to your local layout)
+         original_file_path = os.path.join(base, file_name)
+         code_dir = original_file_path + "-code"
+         for root, dirs, files in os.walk(original_file_path):
+             for file in files:
+                 if file.endswith(".parquet"):  # only process parquet files
+                     file_path = os.path.abspath(os.path.join(root, file))
+                     # one output directory per shard, named after the parquet file
+                     saved_dir = code_dir + "/" + file_path.split("/")[-1][:-8]
+                     file_paths.append((file_path, saved_dir))
+
+     print("total file paths", len(file_paths))
+     num_process = min(num_process, len(file_paths))
+     print("num_process", num_process)
+
+     futures = []
+     with ProcessPoolExecutor(num_process) as executor:
+         for index, (file_path, saved_dir) in enumerate(file_paths):
+             futures.append(
+                 executor.submit(data_process, index % num_process, file_path,
+                                 saved_dir))
+         done, not_done = wait(futures, return_when=ALL_COMPLETED)
+
+     end_time = time.time()
+
+     # compute and print the elapsed time
+     elapsed_time = end_time - start_time
+     print(f"Time taken: {elapsed_time} seconds")
+     print("=" * 100)
+ ```
+
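+ The script above writes, for every shard, one JSONL file per confidence bucket (`06_07`, `07_08`, `08_09`, `09_10`) plus the raw predictions under `labeled/`, so the higher-confidence buckets can be upsampled selectively.
+
+ For a quick check on a single document, a minimal sketch using the same calls as the script (and assuming `code_classifier.bin` has been downloaded to the working directory) looks like this:
+
+ ```python
+ import fasttext
+
+ # load the fastText classifier (same file as used in the script above)
+ model = fasttext.load_model("code_classifier.bin")
+
+ # illustrative document; replace with your own text
+ text = "def quicksort(arr):\n    if len(arr) <= 1:\n        return arr"
+ # predict() works on a single line, so escape newlines first
+ text = text.encode("unicode_escape").decode("utf-8")
+
+ labels, scores = model.predict(text)
+ label, score = labels[0], scores[0]
+
+ # '__label__positive' with a high score means the document is code-related
+ print(label, score)
+ if label == "__label__positive" and score >= 0.9:
+     print("would land in the highest-confidence (09_10) bucket")
+ ```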
 
  ## Related resources