r"""Inspect the organization of kaiyuan-pretraining data parquets.

For each selected file this prints the Parquet schema, the column names and a
truncated preview of the first row.

Usage:
    python /apps/yi/scripts/inspect_parquet.py \
        --dir /apps/yi/kaiyuan_pretraining/phase1 \
        --limit-files 2

When --dir is omitted, the single file at DEFAULT_FILE is inspected.
"""
import argparse
from pathlib import Path
from typing import List, Optional, Sequence

import pyarrow.compute as pc  # noqa: F401 - currently unused
import pyarrow.parquet as pq

# File inspected when no --dir is given on the command line.
DEFAULT_FILE = Path("/apps/yi/kaiyuan_pretraining/phase1/part-00000.zstd.parquet")

# Maximum number of characters of a string cell shown in the preview.
PREVIEW_CHARS = 200


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line options (``argv=None`` means ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description="Print the schema and a first-row preview of parquet files.",
    )
    parser.add_argument(
        "--dir",
        type=Path,
        default=None,
        help="directory whose *.parquet files are inspected "
        f"(default: inspect only {DEFAULT_FILE})",
    )
    parser.add_argument(
        "--limit-files",
        type=int,
        default=None,
        help="inspect at most this many files from --dir (default: all)",
    )
    args = parser.parse_args(argv)
    if args.limit_files is not None and args.limit_files < 0:
        # A negative slice bound would silently drop files from the end.
        parser.error("--limit-files must be non-negative")
    return args


def _select_files(directory: Optional[Path], limit: Optional[int]) -> List[Path]:
    """Return the files to inspect, in sorted order, honouring *limit*."""
    if directory is None:
        return [DEFAULT_FILE]
    return sorted(directory.glob("*.parquet"))[:limit]


def _print_first_row(pf: "pq.ParquetFile") -> None:
    """Print a truncated preview of the first row of *pf*, if it has one."""
    print("\n=== First row (safe preview) ===")
    if pf.metadata.num_rows == 0:
        print("(file contains no rows)")
        return

    # Stream one-row batches instead of loading a whole row group as a table;
    # only the very first non-empty batch is needed.
    for batch in pf.iter_batches(batch_size=1):
        if batch.num_rows == 0:
            continue
        for name, values in batch.to_pydict().items():
            val = values[0]
            if isinstance(val, str):
                print(f"{name}: {val[:PREVIEW_CHARS]}... (len={len(val)})")
            else:
                # Non-string cells may be large or nested; show only the type.
                print(f"{name}: type={type(val)}")
        return


def inspect_file(path: Path) -> None:
    """Print the schema, the column names and a first-row preview of *path*."""
    pf = pq.ParquetFile(path)

    print(f"=== File: {path} ===")
    print("=== Schema ===")
    print(pf.schema)

    print("\n=== Columns ===")
    print(pf.schema.names)

    _print_first_row(pf)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Entry point: inspect the files selected by the command-line options."""
    args = _parse_args(argv)
    files = _select_files(args.dir, args.limit_files)
    if not files:
        raise SystemExit(f"No .parquet files found in {args.dir}")

    for index, path in enumerate(files):
        if index:
            print()  # blank line between consecutive file reports
        inspect_file(path)


if __name__ == "__main__":
    main()