diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java index d6469e209c67..bb9f0c6ea689 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java @@ -22,6 +22,7 @@ import com.google.auto.value.AutoValue; import com.solacesystems.jcsmp.BytesXMLMessage; +import com.solacesystems.jcsmp.DeliveryMode; import com.solacesystems.jcsmp.Destination; import com.solacesystems.jcsmp.JCSMPFactory; import com.solacesystems.jcsmp.Queue; @@ -31,18 +32,22 @@ import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionServiceFactory; +import org.apache.beam.sdk.io.solace.broker.GCPSecretSessionServiceFactory; import org.apache.beam.sdk.io.solace.broker.SempClientFactory; import org.apache.beam.sdk.io.solace.broker.SessionService; import org.apache.beam.sdk.io.solace.broker.SessionServiceFactory; import org.apache.beam.sdk.io.solace.data.Solace; import org.apache.beam.sdk.io.solace.data.Solace.SolaceRecordMapper; import org.apache.beam.sdk.io.solace.read.UnboundedSolaceSource; +import org.apache.beam.sdk.io.solace.write.SolaceOutput; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; @@ -181,10 +186,6 @@ * * } * - *
When reading from Solace, the user must use {@link @@ -198,6 +199,186 @@ *
For the authentication to the SEMP API ({@link Read#withSempClientFactory(SempClientFactory)}) * the connector provides {@link org.apache.beam.sdk.io.solace.broker.BasicAuthSempClientFactory} to * authenticate using the Basic Authentication. + * + *
To write to Solace, use {@link #write()} with a {@link PCollection The connector uses the Solace JCSMP API.
+ * The clients will write to a SMF
+ * topic to the port 55555 of the host. If you want to use a different port, specify it in the
+ * host property with the format "X.X.X.X:PORT".
+ *
+ * Once you have a {@link PCollection} of {@link Solace.Record}, you can write to Solace using:
+ *
+ * The above code snippet will write to the VPN named "default", using 4 clients per worker (VM
+ * in Dataflow), and a maximum of 20 workers/VMs for writing (default values). You can change the
+ * default VPN name by setting the required JCSMP property in the session factory (in this case,
+ * with {@link BasicAuthJcsmpSessionServiceFactory#vpnName()}), the number of clients per worker
+ * with {@link Write#withNumberOfClientsPerWorker(int)} and the number of parallel write clients
+ * using {@link Write#withMaxNumOfUsedWorkers(int)}.
+ *
+ * For instance, you can create a function like the following:
+ *
+ * The connector can write either direct or persistent messages. The default mode is DIRECT.
+ *
+ * The connector returns a {@link PCollection} of {@link Solace.PublishResult}, that you can use
+ * to get a confirmation of messages that have been published, or rejected, but only if it is
+ * publishing persistent messages.
+ *
+ * If you are publishing persistent messages, then you can have some feedback about whether the
+ * messages have been published, and some publishing latency metrics. If the message has been
+ * published, {@link Solace.PublishResult#getPublished()} will be true. If it is false, it means
+ * that the message could not be published, and {@link Solace.PublishResult#getError()} will contain
+ * more details about why the message could not be published. To get latency metrics as well as the
+ * results, set the property {@link Write#publishLatencyMetrics()}.
+ *
+ * This connector can work in two main modes: high latency or high throughput. The default mode
+ * favors high throughput over high latency. You can control this behavior with the methods {@link
+ * Write#withSubmissionMode(SubmissionMode)} and {@link Write#withWriterType(WriterType)}.
+ *
+ * The default mode works like the following options:
+ *
+ * {@link SubmissionMode#HIGHER_THROUGHPUT} and {@link WriterType#BATCHED} are the default
+ * values, and offer the higher possible throughput, and the lowest usage of resources in the runner
+ * side (due to the lower backpressure).
+ *
+ * This connector writes bundles of 50 messages, using a bulk publish JCSMP method. This will
+ * increase the latency, since a message needs to "wait" until 50 messages are accumulated, before
+ * they are submitted to Solace.
+ *
+ * For the lowest latency possible, use {@link SubmissionMode#LOWER_LATENCY} and {@link
+ * WriterType#STREAMING}.
+ *
+ * The streaming connector publishes each message individually, without holding up or batching
+ * before the message is sent to Solace. This will ensure the lowest possible latency, but it will
+ * offer a much lower throughput. The streaming connector does not use state & timers.
+ *
+ * Both connectors uses state & timers to control the level of parallelism. If you are using
+ * Cloud Dataflow, it is recommended that you enable Streaming Engine to use this
+ * connector.
+ *
+ * When writing to Solace, the user must use {@link
+ * Write#withSessionServiceFactory(SessionServiceFactory)} to create a JCSMP session.
+ *
+ * See {@link Write#withSessionServiceFactory(SessionServiceFactory)} for session authentication.
+ * The connector provides implementation of the {@link SessionServiceFactory} using basic
+ * authentication ({@link BasicAuthJcsmpSessionServiceFactory}), and another implementation using
+ * basic authentication but with a password stored as a secret in Google Cloud Secret Manager
+ * ({@link GCPSecretSessionServiceFactory})
+ *
+ * When the worker using the connector is created, the connector will attempt to connect to
+ * Solace.
+ *
+ * If the client cannot connect to Solace for whatever reason, the connector will retry the
+ * connections using the following strategy. There will be a maximum of 4 retries. The first retry
+ * will be attempted 1 second after the first connection attempt. Every subsequent retry will
+ * multiply that time by a factor of two, with a maximum of 10 seconds.
+ *
+ * If after those retries the client is still unable to connect to Solace, the connector will
+ * attempt to reconnect using the same strategy repeated for every single incoming message. If for
+ * some reason, there is a persistent issue that prevents the connection (e.g. client quota
+ * exhausted), you will need to stop your job manually, or the connector will keep retrying.
+ *
+ * This strategy is applied to all the remote calls sent to Solace, either to connect, pull
+ * messages, push messages, etc.
*/
@Internal
public class SolaceIO {
@@ -213,11 +394,13 @@ public class SolaceIO {
};
private static final boolean DEFAULT_DEDUPLICATE_RECORDS = false;
- // Part of the new write connector, documentation to be updated in upcoming pull requests
- public enum SubmissionMode {
- HIGHER_THROUGHPUT,
- LOWER_LATENCY
- }
+ public static final int DEFAULT_WRITER_MAX_NUMBER_OF_WORKERS = 20;
+ public static final int DEFAULT_WRITER_CLIENTS_PER_WORKER = 4;
+ public static final Boolean DEFAULT_WRITER_PUBLISH_LATENCY_METRICS = false;
+ public static final SubmissionMode DEFAULT_WRITER_SUBMISSION_MODE =
+ SubmissionMode.HIGHER_THROUGHPUT;
+ public static final DeliveryMode DEFAULT_WRITER_DELIVERY_MODE = DeliveryMode.DIRECT;
+ public static final WriterType DEFAULT_WRITER_TYPE = WriterType.BATCHED;
/** Get a {@link Topic} object from the topic name. */
static Topic topicFromName(String topicName) {
@@ -287,6 +470,24 @@ public static If you are using a custom data class, the format function should return a {@link
+ * Solace.Record} corresponding to your custom data class instance.
+ *
+ * If you are using this formatting function with dynamic destinations, you must ensure that
+ * you set the right value in the destination value of the {@link Solace.Record} messages.
+ */
+ public static The topic does not need to exist before launching the pipeline.
+ *
+ * This will write all records to the same topic, ignoring their destination field.
+ *
+ * Optional. If not specified, the connector will use dynamic destinations based on the
+ * destination field of {@link Solace.Record}.
+ */
+ public Write The queue must exist prior to launching the pipeline.
+ *
+ * This will write all records to the same queue, ignoring their destination field.
+ *
+ * Optional. If not specified, the connector will use dynamic destinations based on the
+ * destination field of {@link Solace.Record}.
+ */
+ public Write This is optional, the default value is 20.
+ *
+ * This is the maximum value that the job would use, but depending on the amount of data, the
+ * actual number of writers may be lower than this value. With the Dataflow runner, the
+ * connector will as maximum this number of VMs in the job (but the job itself may use more
+ * VMs).
+ *
+ * Set this number taking into account the limits in the number of clients in your Solace
+ * cluster, and the need for performance when writing to Solace (more workers will achieve
+ * higher throughput).
+ */
+ public Write This is optional, the default number is 4.
+ *
+ * The number of clients is per worker. If there are more than one worker, the number of
+ * clients will be multiplied by the number of workers. With the Dataflow runner, this will be
+ * the number of clients created per VM. The clients will be re-used across different threads in
+ * the same worker.
+ *
+ * Set this number in combination with {@link #withMaxNumOfUsedWorkers}, to ensure that the
+ * limit for number of clients in your Solace cluster is not exceeded.
+ *
+ * Normally, using a higher number of clients with fewer workers will achieve better
+ * throughput at a lower cost, since the workers are better utilized. A good rule of thumb to
+ * use is setting as many clients per worker as vCPUs the worker has.
+ */
+ public Write For more details, see https://docs.solace.com/API/API-Developer-Guide/Message-Delivery-Modes.htm
+ */
+ public Write Latency metrics are only available if {@link #withDeliveryMode(DeliveryMode)} is set to
+ * PERSISTENT. In that mode, latency is measured for each single message, as the time difference
+ * between the message creation and the reception of the publishing confirmation.
+ *
+ * For the batched writer, the creation time is set for every message in a batch shortly
+ * before the batch is submitted. So the latency is very close to the actual publishing latency,
+ * and it does not take into account the time spent waiting for the batch to be submitted.
+ *
+ * This is optional, the default value is false (don't publish latency metrics).
+ */
+ public Write For full details, please check https://docs.solace.com/API/API-Developer-Guide/Java-API-Best-Practices.htm.
+ *
+ * The Solace JCSMP client libraries can dispatch messages using two different modes:
+ *
+ * One of the modes dispatches messages directly from the same thread that is doing the rest
+ * of I/O work. This mode favors lower latency but lower throughput. Set this to LOWER_LATENCY
+ * to use that mode (MESSAGE_CALLBACK_ON_REACTOR set to True).
+ *
+ * The other mode uses a parallel thread to accumulate and dispatch messages. This mode
+ * favors higher throughput but also has higher latency. Set this to HIGHER_THROUGHPUT to use
+ * that mode. This is the default mode (MESSAGE_CALLBACK_ON_REACTOR set to False).
+ *
+ * This is optional, the default value is HIGHER_THROUGHPUT.
+ */
+ public Write The Solace writer can either use the JCSMP modes in streaming or batched.
+ *
+ * In streaming mode, the publishing latency will be lower, but the throughput will also be
+ * lower.
+ *
+ * With the batched mode, messages are accumulated until a batch size of 50 is reached, or 5
+ * seconds have elapsed since the first message in the batch was received. The 50 messages are
+ * sent to Solace in a single batch. This writer offers higher throughput but higher publishing
+ * latency, as messages can be held up for up to 5 seconds until they are published.
+ *
+ * Notice that this is the message publishing latency, not the end-to-end latency. For very
+ * large scale pipelines, you will probably prefer to use the HIGHER_THROUGHPUT mode, as with
+ * lower throughput messages will accumulate in the pipeline, and the end-to-end latency may
+ * actually be higher.
+ *
+ * This is optional, the default is the BATCHED writer.
+ */
+ public Write This provider should define the destination host where the broker is listening, and all
+ * the properties related to authentication (base auth, client certificate, etc.).
+ */
+ public Write This class provides a builder to create instances, but you will probably not need it. The
+ * write connector will create and return instances of {@link Solace.PublishResult}.
+ *
+ * If the message has been published, {@link Solace.PublishResult#getPublished()} will be true.
+ * If it is false, it means that the message could not be published, and {@link
+ * Solace.PublishResult#getError()} will contain more details about why the message could not be
+ * published.
+ */
+ @AutoValue
+ @DefaultSchema(AutoValueSchema.class)
+ public abstract static class PublishResult {
+ /** The message id of the message that was published. */
+ @SchemaFieldNumber("0")
+ public abstract String getMessageId();
+
+ /** Whether the message was published or not. */
+ @SchemaFieldNumber("1")
+ public abstract Boolean getPublished();
+
+ /**
+ * The publishing latency in milliseconds. This is the difference between the time the message
+ * was created, and the time the message was published. It is only available if the {@link
+ * CorrelationKey} class is used as correlation key of the messages.
+ */
+ @SchemaFieldNumber("2")
+ public abstract @Nullable Long getLatencyMilliseconds();
+
+ /** The error details if the message could not be published. */
+ @SchemaFieldNumber("3")
+ public abstract @Nullable String getError();
+
+ public static Builder builder() {
+ return new AutoValue_Solace_PublishResult.Builder();
+ }
+
+ @AutoValue.Builder
+ public abstract static class Builder {
+ public abstract Builder setMessageId(String messageId);
+
+ public abstract Builder setPublished(Boolean published);
+
+ public abstract Builder setLatencyMilliseconds(Long latencyMs);
+
+ public abstract Builder setError(String error);
+
+ public abstract PublishResult build();
+ }
+ }
+
+ /**
+ * The correlation key is an object that is passed back to the client during the event broker ack
+ * or nack.
+ *
+ * In the streaming writer is optionally used to calculate publish latencies, by calculating
+ * the time difference between the creation of the correlation key, and the time of the ack.
+ */
+ @AutoValue
+ @DefaultSchema(AutoValueSchema.class)
+ public abstract static class CorrelationKey {
+ @SchemaFieldNumber("0")
+ public abstract String getMessageId();
+
+ @SchemaFieldNumber("1")
+ public abstract long getPublishMonotonicMillis();
+
+ public static Builder builder() {
+ return new AutoValue_Solace_CorrelationKey.Builder();
+ }
+
+ @AutoValue.Builder
+ public abstract static class Builder {
+ public abstract Builder setMessageId(String messageId);
+
+ public abstract Builder setPublishMonotonicMillis(long millis);
+
+ public abstract CorrelationKey build();
+ }
+ }
+
/**
* A utility class for mapping {@link BytesXMLMessage} instances to {@link Solace.Record} objects.
* This simplifies the process of converting raw Solace messages into a format suitable for use
diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java
new file mode 100644
index 000000000000..6c37f879ae7f
--- /dev/null
+++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.solace.write;
+
+import java.util.Map;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.solace.SolaceIO;
+import org.apache.beam.sdk.io.solace.data.Solace;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PInput;
+import org.apache.beam.sdk.values.POutput;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
+import org.checkerframework.checker.nullness.qual.Nullable;
+
+/**
+ * The {@link SolaceIO.Write} transform's output return this type, containing both the successful
+ * publishes ({@link #getSuccessfulPublish()}) and the failed publishes ({@link
+ * #getFailedPublish()}).
+ *
+ * The streaming writer with DIRECT messages does not return anything, and the output {@link
+ * PCollection}s will be equal to null.
+ */
+public final class SolaceOutput implements POutput {
+ private final Pipeline pipeline;
+ private final TupleTagWriting to a static topic or queue
+ *
+ * {@code
+ * PCollection
+ *
+ * Writing to dynamic destinations
+ *
+ * To write to dynamic destinations, don't set the {@link Write#to(Solace.Queue)} or {@link
+ * Write#to(Solace.Topic)} property and make sure that all the {@link Solace.Record}s have their
+ * destination field set to either a topic or a queue. You can do this prior to calling the write
+ * connector, or by using a format function and {@link #write(SerializableFunction)}.
+ *
+ * {@code
+ * // Generate Record with different destinations
+ * SerializableFunction
+ *
+ * And then use the connector as follows:
+ *
+ * {@code
+ * // Ignore "to" method to use dynamic destinations
+ * SolaceOutput solaceResponses = msgs.apply("Write to Solace",
+ * SolaceIO.
+ *
+ * Direct and persistent messages, and latency metrics
+ *
+ * Throughput and latency
+ *
+ * {@code
+ * PCollection
+ *
+ * {@code
+ * PCollection
+ *
+ * Authentication
+ *
+ * Connector retries
+ *
+ *