Skip to content
Previous commit
Next commit
Update src/llamafactory/data/tokenized_parquet.py
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
  • Loading branch information
AbdulmalikDS and gemini-code-assist[bot] authored Oct 26, 2025
commit 112647003af4c74889ad803fda40542c1d9c921e
3 changes: 1 addition & 2 deletions src/llamafactory/data/tokenized_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
def _iter_parquet_rows(paths: list[str], ids_key: str, mask_key: Optional[str]) -> Iterable[dict[str, Any]]:
r"""Iterate over rows from multiple Parquet files, yielding pre-tokenized samples."""
for path in paths:
with open(path, "rb") as f:
pf = pq.ParquetFile(f)
with pq.ParquetFile(path) as pf:
for i in range(pf.num_row_groups):
table: pa.Table = pf.read_row_group(i)
ids_col = table[ids_key]
Expand Down