diff --git a/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkReader/WholeChunkReader.php b/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkReader/WholeChunkReader.php index 8078981fe..a4ec28c7f 100644 --- a/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkReader/WholeChunkReader.php +++ b/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkReader/WholeChunkReader.php @@ -90,6 +90,10 @@ public function read(ColumnChunk $columnChunk, FlatColumn $column, SourceStream $dictionary, $pageStream )); + + if ($dictionary === null) { + $header = $this->readHeader($pageStream); + } } \fclose($pageStream); diff --git a/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkViewer/WholeChunkViewer.php b/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkViewer/WholeChunkViewer.php index 5b6623a83..acd94fefd 100644 --- a/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkViewer/WholeChunkViewer.php +++ b/src/lib/parquet/src/Flow/Parquet/ParquetFile/ColumnChunkViewer/WholeChunkViewer.php @@ -51,6 +51,8 @@ public function view(ColumnChunk $columnChunk, FlatColumn $column, SourceStream break; } + \fseek($pageStream, \ftell($pageStream) + $dataHeader->compressedPageSize()); + yield $dataHeader; } diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/Fixtures/multiple_pages.parquet b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/Fixtures/multiple_pages.parquet new file mode 100644 index 000000000..92a47c654 Binary files /dev/null and b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/Fixtures/multiple_pages.parquet differ diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/ReaderTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/ReaderTest.php index 9aa22fc25..21a9d1366 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/ReaderTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/ReaderTest.php @@ -4,11 +4,36 @@ namespace Flow\Parquet\Tests\Integration\IO; -use Flow\Parquet\Reader; +use Flow\Parquet\ParquetFile\{ColumnPageHeader}; +use Flow\Parquet\{Reader}; use PHPUnit\Framework\TestCase; final class ReaderTest extends TestCase { + public function test_reading_columns_with_multiple_data_pages() : void + { + // File generated with https://gist.github.com/norberttech/325df9166bbdb33e18dffa94c1a033c4 + $reader = new Reader(); + $file = $reader->read(__DIR__ . '/Fixtures/multiple_pages.parquet'); + + $rows = 0; + + foreach ($file->values() as $row) { + foreach ($row as $column => $value) { + self::assertNotNull($value); + } + $rows++; + } + + $headers = \iterator_to_array($file->pageHeaders()); + + self::assertCount(79, $headers); + self::assertSame(128, $headers[0]->pageHeader->dataValuesCount()); + self::assertSame(16, $headers[78]->pageHeader->dataValuesCount()); + self::assertSame(10_000, \array_sum(\array_map(static fn (ColumnPageHeader $header) => $header->pageHeader->dataValuesCount(), $headers))); + self::assertSame(10_000, $rows); + } + public function test_reading_required_columns() : void { // File generated with https://gist.github.com/norberttech/01322f61dca77cfde5161e31e94463ef