chore: adapt path and training hparams
This commit is contained in:
@@ -23,16 +23,18 @@ Megatron's IndexedDatasetBuilder directly.
 
 Usage:
 
-python /apps/yi/model_training/scripts/convert_phase_to_megatron.py \
-    --input-dir /apps/yi/model_training/data/phase1 \
-    --output-dir /ssd/yi/converted_data/megatron_phase1 \
-    --megatron-dir /apps/yi/model_training/Megatron-LM \
-    --tokenizer-model /apps/yi/model_training/data/tokenizer \
+python /ssd1/yi/pretrain_kaiyuan2b/scripts/convert_phase_to_megatron.py \
+    --input-dir /ssd1/yi/data/phase1 \
+    --output-dir /ssd1/yi/converted_data/phase1 \
+    --megatron-dir /ssd1/yi/pretrain_kaiyuan2b/Megatron-LM \
+    --tokenizer-model /ssd1/yi/data/tokenizer \
     --text-key text \
-    --num-shards 4 \
-    --workers-per-shard 16 \
-    --start 100 \
-    --end 220
+    --num-shards 16 \
+    --workers-per-shard 12 \
+    --batch-size 16384 \
+    --chunksize 128 \
+    --start 0 \
+    --end 210
 """
 
 _TOKENIZER = None
@@ -214,8 +216,8 @@ def parse_args():
     parser.add_argument("--output-prefix-prefix", default="phase1")
     parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.")
     parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1))
-    parser.add_argument("--batch-size", type=int, default=8192, help="Parquet record batch size.")
-    parser.add_argument("--chunksize", type=int, default=64, help="Tokenizer pool imap chunk size.")
+    parser.add_argument("--batch-size", type=int, default=16384, help="Parquet record batch size.")
+    parser.add_argument("--chunksize", type=int, default=128, help="Tokenizer pool imap chunk size.")
     parser.add_argument("--log-interval", type=int, default=10000)
     parser.add_argument("--start", type=int, default=0)
     parser.add_argument("--end", type=int, default=None)
Reference in New Issue
Block a user