Initial Commit
This commit is contained in:
41
scripts/inspect_parquet.py
Normal file
41
scripts/inspect_parquet.py
Normal file
@@ -0,0 +1,41 @@
|
||||
r"""Inspect the organization of kaiyuan-pretraining data parquets.

For every inspected file the script prints the Parquet schema, the column
names, and a truncated preview of the first row.

Usage:
    python /apps/yi/scripts/inspect_parquet.py \
        --dir /apps/yi/kaiyuan_pretraining/phase1 \
        --limit-files 2

When ``--dir`` is omitted, the single file named by ``DEFAULT_PATH`` is
inspected.
"""

import argparse
from pathlib import Path

import pyarrow.compute as pc  # noqa: F401 - unused below; retained from the original import list
import pyarrow.parquet as pq

# File inspected when no --dir is given on the command line.
DEFAULT_PATH = "/apps/yi/kaiyuan_pretraining/phase1/part-00000.zstd.parquet"

# Maximum number of characters of a string value shown in the row preview.
PREVIEW_CHARS = 200


def _positive_int(text):
    """Parse *text* as an integer >= 1 for argparse, rejecting anything else."""
    value = int(text)  # a ValueError here is reported by argparse as a usage error
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {text!r}")
    return value


def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``dir`` (``Path`` or ``None``) and ``limit_files`` (int).
    """
    parser = argparse.ArgumentParser(
        description="Print the schema, columns and first row of parquet files.",
    )
    parser.add_argument(
        "--dir",
        type=Path,
        default=None,
        help="directory containing *.parquet files (default: inspect DEFAULT_PATH only)",
    )
    parser.add_argument(
        "--limit-files",
        type=_positive_int,
        default=1,
        help="maximum number of files from --dir to inspect (default: 1)",
    )
    return parser.parse_args(argv)


def _select_paths(directory, limit_files):
    """Return the files to inspect: DEFAULT_PATH, or the first N parquets in *directory*."""
    if directory is None:
        return [Path(DEFAULT_PATH)]
    if not directory.is_dir():
        raise SystemExit(f"not a directory: {directory}")
    # Sort so that repeated runs inspect the same files in the same order.
    candidates = sorted(directory.glob("*.parquet"))
    if not candidates:
        raise SystemExit(f"no .parquet files found in {directory}")
    return candidates[:limit_files]


def inspect_file(path):
    """Print the schema, column names and a first-row preview of one parquet file.

    Args:
        path: Location of the parquet file (``str`` or ``Path``).
    """
    pf = pq.ParquetFile(str(path))

    print("=== Schema ===")
    print(pf.schema)

    print("\n=== Columns ===")
    print(pf.schema.names)

    print("\n=== First row (safe preview) ===")

    # Read only a very small batch: stream a single row instead of loading
    # the whole first row group just to look at one record.
    first_batch = next(
        (batch for batch in pf.iter_batches(batch_size=1) if batch.num_rows),
        None,
    )
    if first_batch is None:
        print("(file contains no rows)")
        return

    for k, v in first_batch.to_pydict().items():
        val = v[0]
        if isinstance(val, str):
            # Long text columns are truncated; the full length is still reported.
            print(f"{k}: {val[:PREVIEW_CHARS]}... (len={len(val)})")
        else:
            print(f"{k}: type={type(val)}")


def main(argv=None):
    """Run the inspection for the files selected by the command line."""
    args = parse_args(argv)
    for path in _select_paths(args.dir, args.limit_files):
        if args.dir is not None:
            # Only label files in directory mode so the default run's output
            # stays the same as before.
            print(f"\n##### {path} #####")
        inspect_file(path)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user