Commit f61388b

This splits the logic that structures channel data into blob collections out into a separate class, and refactors the main nested loop into separate methods within that new class. The collections produced by the new class should be identical to those produced before the change.
1 parent 5d11892 commit f61388b

8 files changed: +309 −230 lines changed
src/main/java/net/snowflake/ingest/streaming/internal/BlobDataBuilder.java

+155 −0

@@ -0,0 +1,155 @@
package net.snowflake.ingest.streaming.internal;

import net.snowflake.ingest.utils.Logging;
import net.snowflake.ingest.utils.ParameterProvider;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import static net.snowflake.ingest.utils.Constants.MAX_BLOB_SIZE_IN_BYTES;

/**
 * Responsible for accepting data from channels and collating it into collections that will be
 * used to build the actual blobs.
 *
 * <p>A chunk is represented as a list of channel data from a single table.
 *
 * <p>A blob is represented as a list of chunks that must share the same schema (but not
 * necessarily the same table).
 *
 * <p>This class returns a list of blobs.
 */
class BlobDataBuilder<T> {
  private static final Logging logger = new Logging(BlobDataBuilder.class);
  private final List<List<List<ChannelData<T>>>> allBlobs;
  private final ParameterProvider parameterProvider;
  private final String clientName;
  private List<List<ChannelData<T>>> currentBlob;
  private ChannelData<T> prevChannelData = null;
  private float totalCurrentBlobSizeInBytes = 0F;
  private float totalBufferSizeInBytes = 0F;

  public BlobDataBuilder(String clientName, ParameterProvider parameterProvider) {
    this.clientName = clientName;
    this.parameterProvider = parameterProvider;
    this.currentBlob = new ArrayList<>();
    this.allBlobs = new ArrayList<>();
  }

  public List<List<List<ChannelData<T>>>> getAllBlobData() {
    addCurrentBlob();
    return allBlobs;
  }

  public void appendDataForTable(
      Collection<? extends SnowflakeStreamingIngestChannelFlushable<T>> tableChannels) {
    List<ChannelData<T>> chunk = getChunkForTable(tableChannels);
    appendChunk(chunk);
  }

  private List<ChannelData<T>> getChunkForTable(
      Collection<? extends SnowflakeStreamingIngestChannelFlushable<T>> tableChannels) {
    List<ChannelData<T>> channelsDataPerTable = Collections.synchronizedList(new ArrayList<>());
    // Use parallel stream since getData could be the performance bottleneck when we have a
    // high number of channels
    tableChannels.parallelStream()
        .forEach(
            channel -> {
              if (channel.isValid()) {
                ChannelData<T> data = channel.getData();
                if (data != null) {
                  channelsDataPerTable.add(data);
                }
              }
            });
    return channelsDataPerTable;
  }

  private void appendChunk(List<ChannelData<T>> chunkData) {
    if (chunkData.isEmpty()) {
      return;
    }

    if (currentBlob.size() >= parameterProvider.getMaxChunksInBlob()) {
      // Create a new blob if the current one already contains max allowed number of chunks
      logger.logInfo(
          "Max allowed number of chunks in the current blob reached. chunkCount={}"
              + " maxChunkCount={}",
          currentBlob.size(),
          parameterProvider.getMaxChunksInBlob());

      addCurrentBlob();
    }

    int i, start = 0;
    for (i = 0; i < chunkData.size(); i++) {
      ChannelData<T> channelData = chunkData.get(i);
      if (prevChannelData != null
          && shouldStopProcessing(
              totalCurrentBlobSizeInBytes, totalBufferSizeInBytes, channelData, prevChannelData)) {
        logger.logInfo(
            "Creation of another blob is needed because of blob/chunk size limit or"
                + " different encryption ids or different schema, client={}, table={},"
                + " blobSize={}, chunkSize={}, nextChannelSize={}, encryptionId1={},"
                + " encryptionId2={}, schema1={}, schema2={}",
            clientName,
            channelData.getChannelContext().getTableName(),
            totalCurrentBlobSizeInBytes,
            totalBufferSizeInBytes,
            channelData.getBufferSize(),
            channelData.getChannelContext().getEncryptionKeyId(),
            prevChannelData.getChannelContext().getEncryptionKeyId(),
            channelData.getColumnEps().keySet(),
            prevChannelData.getColumnEps().keySet());

        if (i != start) {
          currentBlob.add(chunkData.subList(start, i));
          start = i;
        }

        addCurrentBlob();
      }

      totalCurrentBlobSizeInBytes += channelData.getBufferSize();
      totalBufferSizeInBytes += channelData.getBufferSize();
      prevChannelData = channelData;
    }

    if (i != start) {
      currentBlob.add(chunkData.subList(start, i));
    }
  }

  private void addCurrentBlob() {
    if (!currentBlob.isEmpty()) {
      allBlobs.add(currentBlob);
      currentBlob = new ArrayList<>();
    }
    totalBufferSizeInBytes = 0;
    totalCurrentBlobSizeInBytes = 0;
  }

  /**
   * Check whether we should stop merging more channels into the same chunk; we need to stop in a
   * few cases:
   *
   * <p>When the blob size is larger than a certain threshold
   *
   * <p>When the chunk size is larger than a certain threshold
   *
   * <p>When the encryption key ids are not the same
   *
   * <p>When the schemas are not the same
   */
  private boolean shouldStopProcessing(
      float totalBufferSizeInBytes,
      float totalBufferSizePerTableInBytes,
      ChannelData<T> current,
      ChannelData<T> prev) {
    return totalBufferSizeInBytes + current.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES
        || totalBufferSizePerTableInBytes + current.getBufferSize()
            > parameterProvider.getMaxChunkSizeInBytes()
        || !Objects.equals(
            current.getChannelContext().getEncryptionKeyId(),
            prev.getChannelContext().getEncryptionKeyId())
        || !current.getColumnEps().keySet().equals(prev.getColumnEps().keySet());
  }
}
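
For orientation, here is a minimal, hypothetical sketch of how a caller (for example, the flush service) might drive the new class. The buildBlobData helper below is illustrative and not part of this commit; appendDataForTable, getAllBlobData, and the ChannelCache.entrySet() view it relies on do all appear in these diffs.

// Hypothetical caller, for illustration only; it would need to live in the
// net.snowflake.ingest.streaming.internal package alongside BlobDataBuilder
// for the package-private types below to resolve.
import net.snowflake.ingest.utils.ParameterProvider;

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class BlobDataBuilderExample {
  static <T> List<List<List<ChannelData<T>>>> buildBlobData(
      String clientName, ParameterProvider parameterProvider, ChannelCache<T> channelCache) {
    BlobDataBuilder<T> builder = new BlobDataBuilder<>(clientName, parameterProvider);
    for (Map.Entry<String, ConcurrentHashMap<String, SnowflakeStreamingIngestChannelFlushable<T>>>
        entry : channelCache.entrySet()) {
      // One call per table: the builder decides where chunk and blob boundaries
      // fall based on size limits, schema, and encryption key id.
      builder.appendDataForTable(entry.getValue().values());
    }
    // Outer list: blobs; middle list: chunks per blob; inner list: channel data per chunk.
    return builder.getAllBlobData();
  }
}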

src/main/java/net/snowflake/ingest/streaming/internal/ChannelCache.java

+9 −9

@@ -23,7 +23,7 @@ class ChannelCache<T> {
   // Cache to hold all the valid channels, the key for the outer map is FullyQualifiedTableName and
   // the key for the inner map is ChannelName
   private final ConcurrentHashMap<
-          String, ConcurrentHashMap<String, SnowflakeStreamingIngestChannelInternal<T>>>
+          String, ConcurrentHashMap<String, SnowflakeStreamingIngestChannelFlushable<T>>>
       cache = new ConcurrentHashMap<>();

   /** Flush information for each table including last flush time and if flush is needed */
@@ -45,8 +45,8 @@ static class FlushInfo {
    *
    * @param channel
    */
-  void addChannel(SnowflakeStreamingIngestChannelInternal<T> channel) {
-    ConcurrentHashMap<String, SnowflakeStreamingIngestChannelInternal<T>> channels =
+  void addChannel(SnowflakeStreamingIngestChannelFlushable<T> channel) {
+    ConcurrentHashMap<String, SnowflakeStreamingIngestChannelFlushable<T>> channels =
         this.cache.computeIfAbsent(
             channel.getFullyQualifiedTableName(), v -> new ConcurrentHashMap<>());

@@ -55,7 +55,7 @@ void addChannel(SnowflakeStreamingIngestChannelInternal<T> channel) {
     this.tableFlushInfo.putIfAbsent(
         channel.getFullyQualifiedTableName(), new FlushInfo(System.currentTimeMillis(), false));

-    SnowflakeStreamingIngestChannelInternal<T> oldChannel =
+    SnowflakeStreamingIngestChannelFlushable<T> oldChannel =
         channels.put(channel.getName(), channel);
     // Invalidate old channel if it exists to block new inserts and return error to users earlier
     if (oldChannel != null) {
@@ -136,7 +136,7 @@ void setNeedFlush(String fullyQualifiedTableName, boolean needFlush) {
   }

   /** Returns an immutable set view of the mappings contained in the channel cache. */
-  Set<Map.Entry<String, ConcurrentHashMap<String, SnowflakeStreamingIngestChannelInternal<T>>>>
+  Set<Map.Entry<String, ConcurrentHashMap<String, SnowflakeStreamingIngestChannelFlushable<T>>>>
       entrySet() {
     return Collections.unmodifiableSet(cache.entrySet());
   }
@@ -155,11 +155,11 @@ void closeAllChannels() {

   /** Remove a channel in the channel cache if the channel sequencer matches */
   // TODO: background cleaner to cleanup old stale channels that are not closed?
-  void removeChannelIfSequencersMatch(SnowflakeStreamingIngestChannelInternal<T> channel) {
+  void removeChannelIfSequencersMatch(SnowflakeStreamingIngestChannelFlushable<T> channel) {
     cache.computeIfPresent(
         channel.getFullyQualifiedTableName(),
         (k, v) -> {
-          SnowflakeStreamingIngestChannelInternal<T> channelInCache = v.get(channel.getName());
+          SnowflakeStreamingIngestChannelFlushable<T> channelInCache = v.get(channel.getName());
           // We need to compare the channel sequencer in case the old channel has already been
           // removed
           return channelInCache != null
@@ -180,10 +180,10 @@ void invalidateChannelIfSequencersMatch(
       Long channelSequencer,
       String invalidationCause) {
     String fullyQualifiedTableName = String.format("%s.%s.%s", dbName, schemaName, tableName);
-    ConcurrentHashMap<String, SnowflakeStreamingIngestChannelInternal<T>> channelsMapPerTable =
+    ConcurrentHashMap<String, SnowflakeStreamingIngestChannelFlushable<T>> channelsMapPerTable =
         cache.get(fullyQualifiedTableName);
     if (channelsMapPerTable != null) {
-      SnowflakeStreamingIngestChannelInternal<T> channel = channelsMapPerTable.get(channelName);
+      SnowflakeStreamingIngestChannelFlushable<T> channel = channelsMapPerTable.get(channelName);
       if (channel != null && channel.getChannelSequencer().equals(channelSequencer)) {
         channel.invalidate("invalidate with matched sequencer", invalidationCause);
       }
src/main/java/net/snowflake/ingest/streaming/internal/ChannelsStatusRequest.java

+1 −1

@@ -29,7 +29,7 @@ static class ChannelStatusRequestDTO {
     // Client Sequencer
     private final Long clientSequencer;

-    ChannelStatusRequestDTO(SnowflakeStreamingIngestChannelInternal channel) {
+    ChannelStatusRequestDTO(SnowflakeStreamingIngestChannelFlushable channel) {
       this.channelName = channel.getName();
       this.databaseName = channel.getDBName();
       this.schemaName = channel.getSchemaName();