Skip to content

Commit

Permalink
Fixed parquet reader when reading multiple data pages (#1186)
Browse files: browse the repository at this point in the history
  • Loading branch information
norberttech authored Aug 12, 2024
1 parent 987fa64 commit 5ecf972
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ public function read(ColumnChunk $columnChunk, FlatColumn $column, SourceStream
$dictionary,
$pageStream
));

if ($dictionary === null) {
$header = $this->readHeader($pageStream);
}
}

\fclose($pageStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ public function view(ColumnChunk $columnChunk, FlatColumn $column, SourceStream
break;
}

\fseek($pageStream, \ftell($pageStream) + $dataHeader->compressedPageSize());

yield $dataHeader;
}

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,36 @@

namespace Flow\Parquet\Tests\Integration\IO;

use Flow\Parquet\Reader;
use Flow\Parquet\ParquetFile\{ColumnPageHeader};
use Flow\Parquet\{Reader};
use PHPUnit\Framework\TestCase;

final class ReaderTest extends TestCase
{
/**
 * Reads the `multiple_pages.parquet` fixture and checks both the decoded
 * rows and the page headers exposed by the reader.
 *
 * Expectations pinned by this test: every value yielded by the file is
 * non-null, 10 000 rows are produced, 79 page headers are reported, the
 * first header announces 128 values, the last one 16, and the per-header
 * value counts add up to 10 000.
 */
public function test_reading_columns_with_multiple_data_pages() : void
{
    // File generated with https://gist.github.com/norberttech/325df9166bbdb33e18dffa94c1a033c4
    $parquetFile = (new Reader())->read(__DIR__ . '/Fixtures/multiple_pages.parquet');

    // Consume all rows first; the row total is asserted at the very end.
    $rowCount = 0;

    foreach ($parquetFile->values() as $record) {
        foreach ($record as $cell) {
            self::assertNotNull($cell);
        }

        ++$rowCount;
    }

    $pageHeaders = \iterator_to_array($parquetFile->pageHeaders());

    self::assertCount(79, $pageHeaders);
    self::assertSame(128, $pageHeaders[0]->pageHeader->dataValuesCount());
    self::assertSame(16, $pageHeaders[78]->pageHeader->dataValuesCount());

    // Number of values announced by each page header, in header order.
    $valuesPerPage = \array_map(
        static fn (ColumnPageHeader $columnPageHeader) => $columnPageHeader->pageHeader->dataValuesCount(),
        $pageHeaders
    );

    self::assertSame(10_000, \array_sum($valuesPerPage));
    self::assertSame(10_000, $rowCount);
}

public function test_reading_required_columns() : void
{
// File generated with https://gist.github.com/norberttech/01322f61dca77cfde5161e31e94463ef
Expand Down

0 comments on commit 5ecf972

Please sign in to comment.