# Recovered from a file-viewer listing (original file: 41 lines, 854 B, Python).
"""
Inspect the organization of kaiyuan-pretraining data parquets.

Usage:
    python /apps/yi/scripts/inspect_parquet.py \
        --dir /apps/yi/kaiyuan_pretraining/phase1 \
        --limit-files 2
"""

import argparse
import sys
from pathlib import Path
from typing import Any, Optional, Sequence

import pyarrow.compute as pc
import pyarrow.parquet as pq

# File inspected when --dir is not given (the path this script originally
# hard-coded), so running with no arguments behaves as before.
DEFAULT_PATH = "/apps/yi/kaiyuan_pretraining/phase1/part-00000.zstd.parquet"

# Maximum number of characters of a string value shown in the row preview.
PREVIEW_CHARS = 200


def format_value(name: str, value: Any, max_chars: int = PREVIEW_CHARS) -> str:
    """Return a one-line, size-bounded description of a single cell.

    Args:
        name: Column name the value belongs to.
        value: The Python object obtained for that column's first row.
        max_chars: Maximum number of characters of a string value to show.

    Returns:
        For strings, the first ``max_chars`` characters followed by the total
        length; an ellipsis is added only when the value was actually
        truncated. For any other object, only its type is reported so that
        large nested values are never dumped to the terminal.
    """
    if isinstance(value, str):
        ellipsis = "..." if len(value) > max_chars else ""
        return f"{name}: {value[:max_chars]}{ellipsis} (len={len(value)})"
    return f"{name}: type={type(value)}"


def inspect_file(path: str) -> None:
    """Print the schema, column names and a first-row preview of one file.

    Args:
        path: Location of the parquet file to open.
    """
    pf = pq.ParquetFile(path)

    print("=== Schema ===")
    print(pf.schema)

    print("\n=== Columns ===")
    print(pf.schema.names)

    print("\n=== First row (safe preview) ===")
    # Stream a single-row batch rather than loading the whole first row
    # group: only one row is displayed, and a row group can be very large.
    first_batch = next(iter(pf.iter_batches(batch_size=1)), None)
    if first_batch is None or first_batch.num_rows == 0:
        # Nothing to index into; report it instead of raising IndexError.
        print("(file contains no rows)")
        return

    for name, values in first_batch.to_pydict().items():
        print(format_value(name, values[0]))


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``dir`` (``Path`` or ``None``) and ``limit_files``
        (positive ``int``).
    """
    parser = argparse.ArgumentParser(
        description="Inspect the organization of kaiyuan-pretraining data parquets.",
    )
    parser.add_argument(
        "--dir",
        type=Path,
        default=None,
        help=f"directory containing *.parquet files (default: inspect {DEFAULT_PATH})",
    )
    parser.add_argument(
        "--limit-files",
        type=int,
        default=1,
        help="maximum number of files from --dir to inspect (default: 1)",
    )
    args = parser.parse_args(argv)
    if args.limit_files < 1:
        parser.error("--limit-files must be at least 1")
    return args


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the inspection and return a process exit status.

    Without ``--dir`` the single default file is inspected. With ``--dir``
    the first ``--limit-files`` parquet files of that directory, in sorted
    name order, are inspected one after another.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when ``--dir`` contains no parquet files.
    """
    args = parse_args(argv)

    if args.dir is None:
        inspect_file(DEFAULT_PATH)
        return 0

    # Sort so that the selection is deterministic across runs.
    files = sorted(args.dir.glob("*.parquet"))[: args.limit_files]
    if not files:
        print(f"No .parquet files found in {args.dir}", file=sys.stderr)
        return 1

    for index, file_path in enumerate(files):
        if index:
            print()  # blank line between consecutive file reports
        print(f"=== File: {file_path} ===")
        inspect_file(str(file_path))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())