Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/generated/core_configuration.html
Original file line number Diff line number Diff line change
Expand Up @@ -1325,6 +1325,12 @@
<td>Integer</td>
<td>The parallelism of scanning manifest files, default value is the size of cpu processor. Note: Scale-up this parameter will increase memory usage while scanning manifest files. We can consider downsize it when we encounter an out of memory exception while scanning</td>
</tr>
<tr>
<td><h5>scan.bucket</h5></td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>Specify a single bucket to scan. This option filters manifest entries and only plans splits for the given bucket. It is only supported for fixed-bucket primary key tables (bucket &gt; 0). It cannot be used with postpone bucket tables.</td>
</tr>
<tr>
<td><h5>scan.max-splits-per-task</h5></td>
<td style="word-wrap: break-word;">10</td>
Expand Down
14 changes: 14 additions & 0 deletions paimon-api/src/main/java/org/apache/paimon/CoreOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,16 @@ public InlineElement getDescription() {
"Max split size should be cached for one task while scanning. "
+ "If splits size cached in enumerator are greater than tasks size multiply by this value, scanner will pause scanning.");

/**
 * Option {@code scan.bucket}: restricts a scan to a single bucket id. The option has no default
 * value, so it only takes effect when it is set explicitly.
 */
public static final ConfigOption<Integer> SCAN_BUCKET =
        key("scan.bucket")
                .intType()
                .noDefaultValue()
                .withDescription(
                        "Specify a single bucket to scan. "
                                + "This option filters manifest entries and only plans splits "
                                + "for the given bucket. "
                                + "It is only supported for fixed-bucket primary key tables (bucket > 0). "
                                + "It cannot be used with postpone bucket tables.");

@Immutable
public static final ConfigOption<MergeEngine> MERGE_ENGINE =
key("merge-engine")
Expand Down Expand Up @@ -3573,6 +3583,10 @@ public Integer scanManifestParallelism() {
return options.get(SCAN_MANIFEST_PARALLELISM);
}

/**
 * Returns the bucket id configured through {@link #SCAN_BUCKET}.
 *
 * <p>The option declares no default value and callers check the result against {@code null}
 * before using it, so treat the return value as nullable.
 *
 * @return the configured bucket id, possibly {@code null}
 */
public Integer scanBucket() {
    final Integer configuredBucket = options.get(SCAN_BUCKET);
    return configuredBucket;
}

/**
 * Returns the value configured for {@link #STREAMING_READ_SNAPSHOT_DELAY}.
 *
 * @return the configured streaming read snapshot delay
 */
public Duration streamingReadDelay() {
    final Duration snapshotDelay = options.get(STREAMING_READ_SNAPSHOT_DELAY);
    return snapshotDelay;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,10 @@ public DataTableScan newScan() {
coreOptions(),
newSnapshotReader(),
catalogEnvironment.tableQueryAuth(coreOptions()));
Integer scanBucket = coreOptions().scanBucket();
if (scanBucket != null) {
scan.withBucket(scanBucket);
}
if (coreOptions().dataEvolutionEnabled()) {
return new DataEvolutionBatchScan(this, scan);
}
Expand All @@ -304,15 +308,21 @@ public DataTableScan newScan() {

/**
 * Creates a streaming scan over this table.
 *
 * <p>When the {@code scan.bucket} option is set (i.e. {@code coreOptions().scanBucket()} is not
 * {@code null}), the scan is restricted to that bucket through {@code withBucket} before it is
 * returned.
 *
 * @return the configured streaming scan
 */
@Override
public StreamDataTableScan newStreamScan() {
    DataTableStreamScan scan =
            new DataTableStreamScan(
                    tableSchema,
                    coreOptions(),
                    newSnapshotReader(),
                    snapshotManager(),
                    changelogManager(),
                    supportStreamingReadOverwrite(),
                    catalogEnvironment.tableQueryAuth(coreOptions()),
                    !tableSchema.primaryKeys().isEmpty());
    // The option has no default value, so null means "no bucket restriction requested".
    Integer scanBucket = coreOptions().scanBucket();
    if (scanBucket != null) {
        scan.withBucket(scanBucket);
    }
    return scan;
}

protected abstract SplitGenerator splitGenerator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.paimon.partition.PartitionPredicate;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.schema.TableSchema;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.source.snapshot.CompactedStartingScanner;
import org.apache.paimon.table.source.snapshot.ContinuousCompactorStartingScanner;
import org.apache.paimon.table.source.snapshot.ContinuousFromSnapshotFullStartingScanner;
Expand Down Expand Up @@ -119,10 +120,48 @@ public InnerTableScan withFilter(Predicate predicate) {

/**
 * Restricts this scan to a single bucket.
 *
 * <p>The bucket id is validated against the table schema and options first; only when the
 * validation passes is the bucket forwarded to the snapshot reader.
 *
 * <p>NOTE(review): the validation runs for every caller of this method, not only for the
 * {@code scan.bucket} option path - confirm that no existing caller relies on calling this on
 * a non-primary-key or non-fixed-bucket table.
 *
 * @param bucket the bucket id to scan
 * @return this scan, for chaining
 */
@Override
public AbstractDataTableScan withBucket(int bucket) {
    // Fail fast before the reader is touched.
    validateScanBucketOption(this.schema, this.options, bucket);
    this.snapshotReader.withBucket(bucket);
    return this;
}

/**
 * Validates a bucket id requested for a bucket scan (see {@link CoreOptions#SCAN_BUCKET}).
 *
 * <p>Checks are applied in this order, and the first failing one determines the error:
 *
 * <ol>
 *   <li>the table must declare primary keys;
 *   <li>the table's {@code bucket} option must map to {@link BucketMode#HASH_FIXED};
 *   <li>the bucket id must lie within the table's bucket range.
 * </ol>
 *
 * @param schema schema of the table being scanned
 * @param coreOptions options of the table being scanned
 * @param bucket the requested bucket id
 */
static void validateScanBucketOption(TableSchema schema, CoreOptions coreOptions, int bucket) {
    boolean hasPrimaryKeys = !schema.primaryKeys().isEmpty();
    checkArgument(hasPrimaryKeys, "Bucket scan is only supported for primary key tables.");

    BucketMode tableBucketMode = bucketModeFromOption(coreOptions.bucket());
    checkArgument(
            tableBucketMode == BucketMode.HASH_FIXED,
            "Bucket scan is only supported for fixed-bucket tables, but got bucket mode %s.",
            tableBucketMode);

    validateFixedBucketRange(coreOptions, bucket);
}

/**
 * Checks that {@code bucket} is a valid bucket id for the table's configured bucket number,
 * i.e. that {@code 0 <= bucket < coreOptions.bucket()} and that the bucket number itself is
 * positive.
 *
 * @param coreOptions options of the table being scanned
 * @param bucket the requested bucket id
 */
private static void validateFixedBucketRange(CoreOptions coreOptions, int bucket) {
    // Lower bound of the requested id.
    checkArgument(bucket >= 0, "Bucket id must be non-negative, but is %s.", bucket);

    // The table itself must have a positive, fixed number of buckets.
    final int tableBucketCount = coreOptions.bucket();
    checkArgument(
            tableBucketCount > 0,
            "Bucket scan is only supported for tables with bucket > 0, but got bucket %s.",
            tableBucketCount);

    // Upper bound of the requested id (bucket ids are zero-based).
    checkArgument(
            bucket < tableBucketCount,
            "Bucket id %s must be less than table bucket number %s.",
            bucket,
            tableBucketCount);
}

/**
 * Maps the raw value of the table's {@code bucket} option to a {@link BucketMode}.
 *
 * <p>{@code -2} maps to {@link BucketMode#POSTPONE_MODE} and {@code -1} to {@link
 * BucketMode#HASH_DYNAMIC}. Every other value, including {@code 0} and values below {@code
 * -2}, maps to {@link BucketMode#HASH_FIXED}; this method does not check that such a value is
 * a usable bucket number.
 *
 * @param bucketOption the raw {@code bucket} option value
 * @return the bucket mode derived from the option value
 */
private static BucketMode bucketModeFromOption(int bucketOption) {
    if (bucketOption == -2) {
        return BucketMode.POSTPONE_MODE;
    }
    if (bucketOption == -1) {
        return BucketMode.HASH_DYNAMIC;
    }
    return BucketMode.HASH_FIXED;
}

@Override
public AbstractDataTableScan withBucketFilter(Filter<Integer> bucketFilter) {
snapshotReader.withBucketFilter(bucketFilter);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.table.source;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.catalog.Catalog;
import org.apache.paimon.catalog.FileSystemCatalog;
import org.apache.paimon.catalog.Identifier;
import org.apache.paimon.data.GenericRow;
import org.apache.paimon.fs.Path;
import org.apache.paimon.fs.local.LocalFileIO;
import org.apache.paimon.schema.Schema;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.StreamTableCommit;
import org.apache.paimon.table.sink.StreamTableWrite;
import org.apache.paimon.types.DataTypes;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

/**
 * Tests for {@link CoreOptions#SCAN_BUCKET}.
 *
 * <p>The tests cover two areas: rejection of invalid configurations (out-of-range bucket ids,
 * non-fixed bucket modes, tables without primary keys) and split planning being limited to the
 * requested bucket when the option is valid.
 */
public class ScanBucketTest {

    @TempDir java.nio.file.Path tempDir;

    // ------------------------------------------------------------------------
    //  Validation of the bucket id passed to withBucket(...)
    // ------------------------------------------------------------------------

    @Test
    public void testWithBucketRejectsOutOfRangeBucketId() throws Exception {
        FileStoreTable table = createTable("4", true);
        assertThatThrownBy(() -> table.newScan().withBucket(5))
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("Bucket id 5 must be less than table bucket number 4");
    }

    @Test
    public void testWithBucketRejectsBucketIdEqualToBucketNumber() throws Exception {
        // Bucket ids are zero-based, so the bucket number itself is the first invalid id.
        FileStoreTable table = createTable("4", true);
        assertThatThrownBy(() -> table.newScan().withBucket(4))
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("Bucket id 4 must be less than table bucket number 4");
    }

    @Test
    public void testWithBucketRejectsNegativeBucketId() throws Exception {
        FileStoreTable table = createTable("4", true);
        assertThatThrownBy(() -> table.newScan().withBucket(-1))
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("Bucket id must be non-negative");
    }

    // ------------------------------------------------------------------------
    //  Validation of the scan.bucket option
    // ------------------------------------------------------------------------

    @Test
    public void testScanBucketOptionRejectsOutOfRangeBucketId() throws Exception {
        FileStoreTable table = createTableWithScanBucket("4", true, "5");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("Bucket id 5 must be less than table bucket number 4");
    }

    @Test
    public void testScanBucketOptionRejectsDynamicBucketTable() throws Exception {
        FileStoreTable table = createTableWithScanBucket("-1", true, "0");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("fixed-bucket tables");
    }

    @Test
    public void testScanBucketOptionRejectsPostponeBucketTable() throws Exception {
        FileStoreTable table = createTableWithScanBucket("-2", true, "0");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("fixed-bucket tables");
    }

    @Test
    public void testScanBucketOptionRejectsBucketUnawareTable() throws Exception {
        // No primary key, so the primary-key check fails before the bucket mode is inspected.
        FileStoreTable table = createBucketUnawareTableWithScanBucket("0");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("primary key tables");
    }

    @Test
    public void testScanBucketOptionRejectsTableWithoutPrimaryKey() throws Exception {
        FileStoreTable table = createAppendOnlyTableWithScanBucket("4", "0");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("primary key tables");
    }

    @Test
    public void testScanBucketOptionRejectsDirectTableScanForDynamicBucketTable() throws Exception {
        // NOTE(review): this exercises exactly the same path as
        // testScanBucketOptionRejectsDynamicBucketTable; consider dropping one of the two or
        // pointing this one at a different entry point.
        FileStoreTable table = createTableWithScanBucket("-1", true, "0");
        assertThatThrownBy(() -> table.newScan().plan())
                .isInstanceOf(IllegalArgumentException.class)
                .hasMessageContaining("fixed-bucket tables");
    }

    // ------------------------------------------------------------------------
    //  Planning with a valid scan.bucket option
    // ------------------------------------------------------------------------

    @Test
    public void testScanBucketOptionViaDirectTableScan() throws Exception {
        FileStoreTable table = createTable("4", true);
        writeRows(table, 1, 2, 3, 4, 5, 6, 7, 8);

        // Sanity check: without the option the data spans more than one bucket.
        assertThat(extractBuckets(table.newScan().plan().splits()).size()).isGreaterThan(1);

        Map<String, String> options = new HashMap<>(table.options());
        options.put(CoreOptions.SCAN_BUCKET.key(), "0");
        FileStoreTable tableWithScanBucket = table.copy(options);
        assertThat(extractBuckets(tableWithScanBucket.newScan().plan().splits()))
                .containsExactly(0);
    }

    @Test
    public void testScanBucketOptionViaTableCopy() throws Exception {
        FileStoreTable table = createTable("4", true);
        writeRows(table, 1, 2, 3, 4, 5, 6, 7, 8);

        // Only the dynamic option is passed to copy(...), not the full option map.
        Map<String, String> options = new HashMap<>();
        options.put(CoreOptions.SCAN_BUCKET.key(), "0");
        FileStoreTable tableWithScanBucket = table.copy(options);

        assertThat(extractBuckets(table.newScan().plan().splits()).size()).isGreaterThan(1);
        assertThat(extractBuckets(tableWithScanBucket.newScan().plan().splits()))
                .containsExactly(0);
    }

    @Test
    public void testScanBucketOptionViaReadBuilder() throws Exception {
        FileStoreTable table = createTableWithScanBucket("4", true, "0");
        writeRows(table, 1, 2, 3, 4, 5, 6, 7, 8);

        assertThat(extractBuckets(table.newReadBuilder().newScan().plan().splits()))
                .containsExactly(0);
    }

    // ------------------------------------------------------------------------
    //  Helpers
    // ------------------------------------------------------------------------

    /** Returns the distinct bucket ids of the given splits in ascending order. */
    private static List<Integer> extractBuckets(List<Split> splits) {
        return splits.stream()
                .map(split -> ((DataSplit) split).bucket())
                .distinct()
                .sorted()
                .collect(Collectors.toList());
    }

    /**
     * Writes one row {@code (id, id)} per given id and commits them in a single commit.
     *
     * <p>The writer and committer are closed even when writing or committing fails.
     */
    private void writeRows(FileStoreTable table, int... ids) throws Exception {
        try (StreamTableWrite write = table.newWrite("test");
                StreamTableCommit commit = table.newCommit("test")) {
            for (int id : ids) {
                write.write(GenericRow.of(id, id));
            }
            commit.commit(0, write.prepareCommit(true, 0));
        }
    }

    /** Creates a table with the given {@code bucket} option and no {@code scan.bucket}. */
    private FileStoreTable createTable(String bucket, boolean withPrimaryKey) throws Exception {
        Map<String, String> options = new HashMap<>();
        options.put(CoreOptions.BUCKET.key(), bucket);
        return createTable(options, withPrimaryKey);
    }

    /** Creates a table with the given {@code bucket} and {@code scan.bucket} options. */
    private FileStoreTable createTableWithScanBucket(
            String bucket, boolean withPrimaryKey, String scanBucket) throws Exception {
        return createTable(scanBucketOptions(bucket, scanBucket), withPrimaryKey);
    }

    /** Creates a table without primary key and with {@code bucket = -1}. */
    private FileStoreTable createBucketUnawareTableWithScanBucket(String scanBucket)
            throws Exception {
        return createTable(scanBucketOptions("-1", scanBucket), false);
    }

    /** Creates a table without primary key, bucketed on {@code id} via the bucket-key option. */
    private FileStoreTable createAppendOnlyTableWithScanBucket(String bucket, String scanBucket)
            throws Exception {
        Map<String, String> options = scanBucketOptions(bucket, scanBucket);
        options.put(CoreOptions.BUCKET_KEY.key(), "id");
        return createTable(options, false);
    }

    /** Builds a mutable option map holding the {@code bucket} and {@code scan.bucket} options. */
    private static Map<String, String> scanBucketOptions(String bucket, String scanBucket) {
        Map<String, String> options = new HashMap<>();
        options.put(CoreOptions.BUCKET.key(), bucket);
        options.put(CoreOptions.SCAN_BUCKET.key(), scanBucket);
        return options;
    }

    /**
     * Creates a table with two INT columns {@code id} and {@code v}, the given options and,
     * optionally, {@code id} as primary key.
     */
    private FileStoreTable createTable(Map<String, String> options, boolean withPrimaryKey)
            throws Exception {
        Schema.Builder schemaBuilder =
                Schema.newBuilder().column("id", DataTypes.INT()).column("v", DataTypes.INT());
        if (withPrimaryKey) {
            schemaBuilder.primaryKey("id");
        }
        return createTable(schemaBuilder.options(options).build());
    }

    /** Creates the table {@code default.test_bucket} in a file system catalog under tempDir. */
    private FileStoreTable createTable(Schema schema) throws Exception {
        Catalog catalog = new FileSystemCatalog(LocalFileIO.create(), new Path(tempDir.toString()));
        catalog.createDatabase("default", true);
        Identifier identifier = Identifier.create("default", "test_bucket");
        catalog.createTable(identifier, schema, false);
        return (FileStoreTable) catalog.getTable(identifier);
    }
}
Loading
Loading