Files
pretrain_kaiyuan2b/scripts/inspect_parquet.py
2026-05-06 15:06:07 +08:00

41 lines
854 B
Python

import argparse
from pathlib import Path
import pyarrow.parquet as pq
import pyarrow.compute as pc
"""
Inspect the organization of kaiyuan-pretraining data parquets.
Usage:
python /apps/yi/scripts/inspect_parquet.py \
--dir /apps/yi/kaiyuan_pretraining/phase1 \
--limit-files 2
"""
import pyarrow.parquet as pq
# File inspected when no --dir is supplied (keeps the script's historical
# zero-argument behaviour).
DEFAULT_PARQUET_PATH = "/apps/yi/kaiyuan_pretraining/phase1/part-00000.zstd.parquet"

# Maximum number of characters of a string field shown in the preview.
PREVIEW_CHARS = 200


def format_field(name, value, max_chars=PREVIEW_CHARS):
    """Return a one-line, size-bounded preview of a single field.

    Args:
        name: Column name.
        value: Python value of that column in the previewed row.
        max_chars: Number of leading characters kept for string values.

    Returns:
        For strings, the first ``max_chars`` characters followed by ``...``
        and the full length; for every other value, only its Python type
        (so large nested values are never dumped to the terminal).
    """
    if isinstance(value, str):
        return f"{name}: {value[:max_chars]}... (len={len(value)})"
    return f"{name}: type={type(value)}"


def resolve_targets(directory, limit_files):
    """Return the list of parquet files to inspect.

    Args:
        directory: Directory to scan for ``*.parquet`` files, or ``None`` to
            fall back to ``DEFAULT_PARQUET_PATH``.
        limit_files: Maximum number of files to return, or ``None`` for all.

    Returns:
        A list of ``Path`` objects, sorted by name when scanning a directory.
    """
    if directory is None:
        return [Path(DEFAULT_PARQUET_PATH)]
    candidates = sorted(Path(directory).glob("*.parquet"))
    if limit_files is not None:
        candidates = candidates[:limit_files]
    return candidates


def inspect_file(path):
    """Print the schema, column names and a bounded first-row preview.

    Args:
        path: Location of the parquet file to inspect.
    """
    pf = pq.ParquetFile(str(path))
    print(f"=== File: {path} ===")
    print("=== Schema ===")
    print(pf.schema)
    print("\n=== Columns ===")
    print(pf.schema.names)

    # Stream a single-row batch rather than materializing the whole first
    # row group; this also copes with files that contain no row groups.
    first_batch = next(pf.iter_batches(batch_size=1), None)

    print("\n=== First row (safe preview) ===")
    if first_batch is None or first_batch.num_rows == 0:
        print("(file contains no rows)")
        return
    for name, values in first_batch.to_pydict().items():
        print(format_field(name, values[0]))


def main(argv=None):
    """Parse command-line arguments and inspect the selected parquet files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Inspect the organization of parquet files.",
    )
    parser.add_argument(
        "--dir",
        dest="directory",
        default=None,
        help="directory containing *.parquet files "
             "(default: inspect the built-in sample file)",
    )
    parser.add_argument(
        "--limit-files",
        type=int,
        default=None,
        help="inspect at most this many files from --dir (default: all)",
    )
    args = parser.parse_args(argv)

    if args.limit_files is not None and args.limit_files < 1:
        parser.error("--limit-files must be a positive integer")

    targets = resolve_targets(args.directory, args.limit_files)
    if not targets:
        parser.error(f"no .parquet files found in {args.directory}")

    for index, target in enumerate(targets):
        if index:
            print()  # blank line between consecutive file reports
        inspect_file(target)


if __name__ == "__main__":
    main()