Kafka topic partition and Spark executor mapping - java

I am using Spark Streaming with a Kafka topic. The topic is created with 5 partitions. All of my messages are published to the Kafka topic using the table name as the key.
Given this, I assume all messages for a given table should go to the same partition.
But I notice in the Spark logs that messages for the same table sometimes go to executor node-1 and sometimes to executor node-2.
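For reference, that assumption is sound: Kafka's default partitioner hashes the key bytes modulo the partition count, so a given key always lands in the same partition while the partition count is unchanged. A sketch of the rule, using the utility methods from org.apache.kafka.common.utils.Utils:
byte[] keyBytes = "tablename".getBytes(StandardCharsets.UTF_8); // the table-name key
int numPartitions = 5;
// same key bytes -> same partition, as long as the partition count doesn't change
int partition = Utils.toPositive(Utils.murmur2(keyBytes)) % numPartitions;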
I am running the code in yarn-cluster mode using the following command:
spark-submit --name DataProcessor --master yarn-cluster \
  --files /opt/ETL_JAR/executor-log4j-spark.xml,/opt/ETL_JAR/driver-log4j-spark.xml,/opt/ETL_JAR/application.properties \
  --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=driver-log4j-spark.xml" \
  --conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=executor-log4j-spark.xml" \
  --class com.test.DataProcessor /opt/ETL_JAR/etl-all-1.0.jar
This submission creates 1 driver (say on node-1) and 2 executors, on node-1 and node-2.
I don't want the node-1 and node-2 executors to read the same partition, but this is happening.
I also tried the following configuration to specify the consumer group, but it made no difference:
kafkaParams.put("group.id", "app1");
This is how we create the stream, using the createDirectStream method (not through ZooKeeper):
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);
kafkaParams.put("auto.offset.reset", "largest");
kafkaParams.put("group.id", "app1");
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
        jssc,
        String.class,
        String.class,
        StringDecoder.class,
        StringDecoder.class,
        kafkaParams,
        topicsSet
);
Complete Code:
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
public class DataProcessor2 implements Serializable {
private static final long serialVersionUID = 3071125481526170241L;
private static Logger log = LoggerFactory.getLogger("DataProcessor");
public static void main(String[] args) {
final String sparkCheckPointDir = ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR);
DataProcessorContextFactory3 factory = new DataProcessorContextFactory3();
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(sparkCheckPointDir, factory);
// Start the process
jssc.start();
jssc.awaitTermination();
}
}
class DataProcessorContextFactory3 implements JavaStreamingContextFactory, Serializable {
private static final long serialVersionUID = 6070911284191531450L;
private static Logger logger = LoggerFactory.getLogger(DataProcessorContextFactory3.class);
DataProcessorContextFactory3() {
}
@Override
public JavaStreamingContext create() {
logger.debug("creating new context..!");
final String brokers = ApplicationProperties.getProperty(Consts.KAFKA_BROKERS_NAME);
final String topic = ApplicationProperties.getProperty(Consts.KAFKA_TOPIC_NAME);
final String app = "app1";
final String offset = ApplicationProperties.getProperty(Consts.KAFKA_CONSUMER_OFFSET, "largest");
logger.debug("Data processing configuration. brokers={}, topic={}, app={}, offset={}", brokers, topic, app,
offset);
if (StringUtils.isBlank(brokers) || StringUtils.isBlank(topic) || StringUtils.isBlank(app)) {
System.err.println("Usage: DataProcessor <brokers> <topic>\n" + Consts.KAFKA_BROKERS_NAME
+ " is a list of one or more Kafka brokers separated by comma\n" + Consts.KAFKA_TOPIC_NAME
+ " is a kafka topic to consume from \n\n\n");
System.exit(1);
}
final String majorVersion = "1.0";
final String minorVersion = "3";
final String version = majorVersion + "." + minorVersion;
final String applicationName = "DataProcessor-" + topic + "-" + version;
// for dev environment
SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName(applicationName);
// for cluster environment
//SparkConf sparkConf = new SparkConf().setAppName(applicationName);
final long sparkBatchDuration = Long
.valueOf(ApplicationProperties.getProperty(Consts.SPARK_BATCH_DURATION, "10"));
final String sparkCheckPointDir = ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR);
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(sparkBatchDuration));
logger.debug("setting checkpoint directory={}", sparkCheckPointDir);
jssc.checkpoint(sparkCheckPointDir);
HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topic.split(",")));
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);
kafkaParams.put("auto.offset.reset", offset);
kafkaParams.put("group.id", "app1");
// @formatter:off
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
// @formatter:on
processRDD(messages, app);
return jssc;
}
private void processRDD(JavaPairInputDStream<String, String> messages, final String app) {
JavaDStream<MsgStruct> rdd = messages.map(new MessageProcessFunction());
rdd.foreachRDD(new Function<JavaRDD<MsgStruct>, Void>() {
private static final long serialVersionUID = 250647626267731218L;
@Override
public Void call(JavaRDD<MsgStruct> currentRdd) throws Exception {
if (!currentRdd.isEmpty()) {
logger.debug("Receive RDD. Create JobDispatcherFunction at HOST={}", FunctionUtil.getHostName());
currentRdd.foreachPartition(new VoidFunction<Iterator<MsgStruct>>() {
@Override
public void call(Iterator<MsgStruct> arg0) throws Exception {
while(arg0.hasNext()){
System.out.println(arg0.next().toString());
}
}
});
} else {
logger.debug("Current RDD is empty.");
}
return null;
}
});
}
public static class MessageProcessFunction implements Function<Tuple2<String, String>, MsgStruct> {
@Override
public MsgStruct call(Tuple2<String, String> data) throws Exception {
String message = data._2();
System.out.println("message:"+message);
return MsgStruct.parse(message);
}
}
public static class MsgStruct implements Serializable{
private String message;
public static MsgStruct parse(String msg){
MsgStruct m = new MsgStruct();
m.message = msg;
return m;
}
public String toString(){
return "content inside="+message;
}
}
}

According to the Spark Streaming + Kafka Integration Guide (Kafka broker version 0.10.0 or higher), you can specify an explicit mapping of partitions to hosts.
Assume you have two hosts (h1 and h2), and the Kafka topic topic-name has three partitions. The following code shows how to map a specific partition to a host in Java.
Map<TopicPartition, String> partitionMapToHost = new HashMap<>();
// partition 0 -> h1, partitions 1 and 2 -> h2
partitionMapToHost.put(new TopicPartition("topic-name", 0), "h1");
partitionMapToHost.put(new TopicPartition("topic-name", 1), "h2");
partitionMapToHost.put(new TopicPartition("topic-name", 2), "h2");
List<String> topicCollection = Arrays.asList("topic-name");
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "10.0.0.2:9092,10.0.0.3:9092");
kafkaParams.put("group.id", "group-id-name");
kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
JavaInputDStream<ConsumerRecord<String, String>> records = KafkaUtils.createDirectStream(
        jssc,
        LocationStrategies.PreferFixed(partitionMapToHost), // PreferFixed is the key
        ConsumerStrategies.<String, String>Subscribe(topicCollection, kafkaParams));
You can also use LocationStrategies.PreferConsistent(), which distributes partitions evenly across the available executors and consistently keeps a given partition on the same executor.
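A minimal sketch of the PreferConsistent variant, reusing the jssc, topicCollection, and kafkaParams from above:
JavaInputDStream<ConsumerRecord<String, String>> records = KafkaUtils.createDirectStream(
        jssc,
        LocationStrategies.PreferConsistent(), // spread partitions evenly over executors
        ConsumerStrategies.<String, String>Subscribe(topicCollection, kafkaParams));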

With the direct-stream approach, it is a correct assumption that messages sent to a Kafka partition will land in the same Spark partition.
What we cannot assume is that the same Spark worker will process each Spark partition every time. On each batch interval, a Spark task is created for the OffsetRange of each partition and sent to the cluster for processing, landing on some available worker.
What you are looking for is partition locality. The only partition locality the direct Kafka consumer supports is the Kafka host containing the offset range being processed, and only when your Spark and Kafka deployments are colocated; that's a deployment topology I don't see very often.
If your requirements dictate host locality, you should look into Apache Samza or Kafka Streams.
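For instance, in Kafka Streams each application instance consumes only the partitions assigned to it, so all records for one table name (one partition) are processed on one host. A minimal sketch, assuming String keys/values and a hypothetical topic and broker address:
Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "per-table-processor");   // hypothetical app id
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "broker-1:9092");      // hypothetical broker
props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
StreamsBuilder builder = new StreamsBuilder();
// Each instance processes only its assigned partitions, so all messages keyed
// by a given table name are handled by a single instance.
builder.<String, String>stream("my-topic")
        .foreach((tableName, message) -> System.out.println(tableName + " -> " + message));
new KafkaStreams(builder.build(), props).start();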

Related

Kafka Streams Twitter Wordcount - Count Value not Long after Serialization

I am running a Kafka Cluster Docker Compose on an AWS EC2 instance.
I want to receive all the tweets of a specific keyword and push them to Kafka. This works fine.
But I also want to count the most used words of those tweets.
This is the WordCount code:
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.StreamsBuilder;
import java.util.Arrays;
import java.util.Properties;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Produced;
import java.util.concurrent.CountDownLatch;
import static org.apache.kafka.streams.StreamsConfig.APPLICATION_ID_CONFIG;
import static org.apache.kafka.streams.StreamsConfig.BOOTSTRAP_SERVERS_CONFIG;
import static org.apache.kafka.streams.StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG;
import static org.apache.kafka.streams.StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG;
public class WordCount {
public static void main(String[] args) {
final StreamsBuilder builder = new StreamsBuilder();
final KStream<String, String> textLines = builder
.stream("test-topic");
textLines
.flatMapValues(value -> Arrays.asList(value.toLowerCase().split("\\W+")))
.groupBy((key, value) -> value)
.count(Materialized.as("WordCount"))
.toStream()
.to("test-output", Produced.with(Serdes.String(), Serdes.Long()));
final Topology topology = builder.build();
Properties props = new Properties();
props.put(APPLICATION_ID_CONFIG, "streams-word-count");
props.put(BOOTSTRAP_SERVERS_CONFIG, "ec2-ip:9092");
props.put(DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
props.put(DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
final KafkaStreams streams = new KafkaStreams(topology, props);
final CountDownLatch latch = new CountDownLatch(1);
Runtime.getRuntime().addShutdownHook(
new Thread("streams-shutdown-hook") {
@Override
public void run() {
streams.close();
latch.countDown();
}
});
try {
streams.start();
latch.await();
} catch (Throwable e) {
System.exit(1);
}
System.exit(0);
}
}
When I check the output topic in the Control Center, it looks like this:
[screenshot: Control Center table with Key and Value columns]
It looks like it's working as far as splitting the tweets into single words, but the count value isn't rendered as a Long, although that is what is specified in the code.
When I use the kafka-console-consumer to consume from this topic, it says:
"Size of data received by LongDeserializer is not 8"
The Control Center UI and the console consumer can only render UTF-8 data by default.
You'll need to explicitly pass LongDeserializer to the console consumer as the value deserializer.
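For example (assuming a local broker; the deserializers are passed as formatter properties):
kafka-console-consumer --bootstrap-server localhost:9092 --topic test-output \
    --from-beginning \
    --property print.key=true \
    --property key.deserializer=org.apache.kafka.common.serialization.StringDeserializer \
    --property value.deserializer=org.apache.kafka.common.serialization.LongDeserializer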
Alternatively, try a KTable instead:
final Serde<String> stringSerde = Serdes.String();
KStream<String, String> textLines = builder.stream("test-topic", Consumed.with(stringSerde, stringSerde));
KTable<String, Long> wordCounts = textLines
    .flatMapValues(value -> Arrays.asList(value.toLowerCase().split("\\W+")))
    .groupBy((key, value) -> value)
    .count();
wordCounts.toStream().to("test-output", Produced.with(Serdes.String(), Serdes.Long()));

Count number of message/event in Kafka stream at a periodic level

I have created a Kafka stream by consuming messages from a Kafka topic. I want to count the number of messages I receive at a 1-minute granularity.
So let's say I receive messages in the following way:
t1 -> message1
t1 -> message2
t1 -> message3
After 1 minute, I receive messages like this:
t2 -> message4
t2 -> message5
Let's say I have an integer variable count in my Java application. From the start of the application until the end of the first minute, this count value should be 3; at the end of the second minute it should become 2, because I received 3 messages in the first minute and 2 in the second.
My code so far:
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils;
import org.apache.kafka.common.serialization.Serde;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.ForeachAction;
import org.apache.kafka.streams.kstream.KStream;
import java.util.Properties;
public class CountMessage {
private static KafkaStreams kafkaStreams;
public static void main(String[] args) {
Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "my_first_count_2");
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "10.0.0.43:9092");
props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.Long().getClass());
props.put(StreamsConfig.DEFAULT_TIMESTAMP_EXTRACTOR_CLASS_CONFIG, MyTimestampExtractor.class);
final StreamsBuilder streamsBuilder = new StreamsBuilder();
// consuming stream
String kafkaTopic = "my_kafka_topic_2";
System.out.println("Starting the application");
KStream<String, String> myStream = streamsBuilder
.stream(kafkaTopic);
myStream.foreach(new ForeachAction<String, String>() {
@SneakyThrows
@Override
public void apply(String key, String value) {
System.out.println("key received = " + key + "---<<<" + value);
}
});
final Topology topology = streamsBuilder.build();
kafkaStreams = new KafkaStreams(topology, props);
kafkaStreams.start();
}
}
Not sure if you're tied to using Kafka Streams, but for what it's worth you can do this with ksqlDB:
SELECT TIMESTAMPTOSTRING(WINDOWSTART,'yyyy-MM-dd HH:mm:ss') AS TS,
COUNT(*) AS MSG_COUNT
FROM SRC_STREAM
WINDOW TUMBLING (SIZE 1 MINUTE)
GROUP BY 'X'
EMIT CHANGES;
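If you are tied to Kafka Streams, the equivalent is a count over a 1-minute tumbling window with a single constant grouping key. A minimal sketch reusing the streamsBuilder and kafkaTopic from the question, assuming String keys and values (plus the java.time.Duration and org.apache.kafka.streams.kstream imports):
KStream<String, String> input = streamsBuilder.stream(kafkaTopic,
        Consumed.with(Serdes.String(), Serdes.String()));
input
        // route every record to one constant key so the count covers the whole topic
        .groupBy((key, value) -> "X", Grouped.with(Serdes.String(), Serdes.String()))
        // 1-minute tumbling windows
        .windowedBy(TimeWindows.of(Duration.ofMinutes(1)))
        .count()
        .toStream()
        // one line per window with the number of messages seen in that window
        .foreach((windowedKey, count) ->
                System.out.println(windowedKey.window().startTime() + " -> " + count));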

Kafka Streams - Fields in the Custom object changing to null while doing Aggregation

I've written a simple Kafka Streams processor to:
Read messages as a stream from a topic with <K, V> as <String, String>
Convert the value in each message from a String to a custom object, <String, Object>, using the mapValues() method
Use a window function to aggregate the statistics of the objects over a particular time interval
Sample Message
{"coiRequestGuid":"xxxx","accountId":1122132,"companyName":"xxxx","existingPolicyCoverageLimit":1000000,"isChangeRequested":true,"newlyRequestedPolicyCoverageLimit":200000,"isNewRecipient":false,"newRecipientGuid":null,"existingRecipientId":11111,"recipientName":"xxxx","recipientEmail":"xxxxx"}
Here is my code
import com.da.app.data.model.PolicyChangeRequest;
import com.da.app.data.model.PolicyChangeRequestStats;
import com.da.app.data.serde.JsonDeserializer;
import com.da.app.data.serde.JsonSerializer;
import com.da.app.data.serde.WrapperSerde;
import com.da.app.system.util.ConfigUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.*;
import org.apache.kafka.streams.state.WindowStore;
import org.apache.log4j.Logger;
import org.json.JSONObject;
import java.time.Duration;
import java.util.Properties;
public class PolicyChangeReqStreamProcessor {
private static final Logger logger = Logger.getLogger(PolicyChangeReqStreamProcessor.class);
private static final String TOPIC_NAME = "stream-window-play1";
private static ObjectMapper mapper = new ObjectMapper();
private static Properties properties = ConfigUtil.loadProperty();
public static void main(String[] args) {
logger.info("Policy Limit Change Stats Generator");
Properties streamProperties = getStreamProperties();
StreamsBuilder streamsBuilder = new StreamsBuilder();
KStream<String, String> source = streamsBuilder.stream(TOPIC_NAME,
Consumed.with(Serdes.String(), Serdes.String()));
source
.filter((key, value) -> isValidEvent(value))
//Converting the request json to PolicyChangeRequest object
.mapValues(PolicyChangeReqStreamProcessor::convertPolicyChangeReqJsonToObj)
//Mapping all events to a single key in order to group all the events
.map((key, value) -> new KeyValue<>("key", value))
// Grouping by key
.groupByKey(Grouped.with(Serdes.String(), new PolicyChangeRequestSerde()))
//Creating a Tumbling window of 5 secs (for Testing)
.windowedBy(TimeWindows.of(Duration.ofSeconds(5)).advanceBy(Duration.ofSeconds(5)))
// Aggregating the PolicyChangeRequest events to a
// PolicyChangeRequestStats object
.<PolicyChangeRequestStats>aggregate(PolicyChangeRequestStats::new,
(k, v, policyStats) -> policyStats.add(v),
Materialized.<String, PolicyChangeRequestStats, WindowStore<Bytes, byte[]>>as
("policy-change-aggregates")
.withValueSerde(new PolicyChangeRequestStatsSerde()))
//Converting KTable to KStream
.toStream()
.foreach((key, value) -> logger.info(key.window().startTime() + "----" + key.window().endTime() + " :: " + value));
KafkaStreams kafkaStreams = new KafkaStreams(streamsBuilder.build(), streamProperties);
logger.info("Started the stream");
kafkaStreams.start();
Runtime.getRuntime().addShutdownHook(new Thread(kafkaStreams::close));
}
private static PolicyChangeRequest convertPolicyChangeReqJsonToObj(String policyChangeReq) {
JSONObject policyChangeReqJson = new JSONObject(policyChangeReq);
PolicyChangeRequest policyChangeRequest = new PolicyChangeRequest(policyChangeReqJson);
// return mapper.readValue(value, PolicyChangeRequest.class);
return policyChangeRequest;
}
private static boolean isValidEvent(String value) {
//TODO: Message Validation
return true;
}
private static Properties getStreamProperties() {
Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "policy-change-stats-gen");
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, properties.getProperty("kafka.bootstrap.servers"));
props.put(StreamsConfig.CLIENT_ID_CONFIG, "stream-window-play1");
props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
return props;
}
public static final class PolicyChangeRequestStatsSerde extends WrapperSerde<PolicyChangeRequestStats> {
PolicyChangeRequestStatsSerde() {
super(new JsonSerializer<>(), new JsonDeserializer<>(PolicyChangeRequestStats.class));
}
}
public static final class PolicyChangeRequestSerde extends WrapperSerde<PolicyChangeRequest> {
PolicyChangeRequestSerde() {
super(new JsonSerializer<>(), new JsonDeserializer<>(PolicyChangeRequest.class));
}
}
}
isValidEvent - returns true.
.mapValues(PolicyChangeReqStreamProcessor::convertPolicyChangeReqJsonToObj) - this converts the incoming JSON string to a PolicyChangeRequest object.
Up to the operation map((key, value) -> new KeyValue<>("key", value)), the custom PolicyChangeRequest object is fine and matches the incoming message (I've tested this by printing the stream there).
But after going through the groupBy and aggregate operations, the custom object changes to:
PolicyChangeRequest{coiRequestGuid='null', accountId='null', companyName='null', existingPolicyCoverageLimit=null, isChangeRequested=null, newlyRequestedPolicyCoverageLimit=null, isNewRecipient=null, newRecipientGuid='null', existingRecipientId='null', recipientName='null', recipientEmail='null'}
I found the above value by putting a log statement inside the policyStats.add(v) method that I call inside aggregate.
The add method is in the PolicyChangeRequestStats class
public PolicyChangeRequestStats add(PolicyChangeRequest policyChangeRequest) {
System.out.println("Incoming req: " + policyChangeRequest);
//Incrementing the Policy limit change request count
this.policyLimitChangeRequests++;
//Adding the Increased policy limit coverage to the existing increasedPolicyLimitCoverage
this.increasedPolicyLimitCoverage +=
(policyChangeRequest.getNewlyRequestedPolicyCoverageLimit() -
policyChangeRequest.getExistingPolicyCoverageLimit());
return this;
}
I get a NullPointerException on the line computing policyChangeRequest.getNewlyRequestedPolicyCoverageLimit() - policyChangeRequest.getExistingPolicyCoverageLimit(), because those values are null in the PolicyChangeRequest object.
I've provided valid Serde classes for the key and value when doing the grouping: .groupByKey(Grouped.with(Serdes.String(), new PolicyChangeRequestSerde())).
For serialization and deserialization I used Gson.
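For reference, a Gson-backed JsonDeserializer of the shape used above typically looks like the sketch below; the actual com.da.app.data.serde classes aren't shown in this post, and older kafka-clients versions also require configure() and close() overrides. Note that Gson only populates fields whose names match the JSON keys; a mismatch silently yields nulls.
// Sketch of a Gson-based Kafka Deserializer
// (imports assumed: org.apache.kafka.common.serialization.Deserializer, com.google.gson.Gson).
public class JsonDeserializer<T> implements Deserializer<T> {
    private final Gson gson = new Gson();
    private final Class<T> type;
    public JsonDeserializer(Class<T> type) {
        this.type = type;
    }
    @Override
    public T deserialize(String topic, byte[] data) {
        if (data == null) {
            return null;
        }
        return gson.fromJson(new String(data, java.nio.charset.StandardCharsets.UTF_8), type);
    }
}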
But I can't get the PolicyChangeRequest object back as it was before it went into the grouping operation.
I'm new to Kafka Streams, and I'm not sure whether I've missed something or whether my approach is correct.
Can anyone guide me here?

kafka java consumer not reading data

I am trying to write a simple Java Kafka consumer to read data, using code similar to https://github.com/bkimminich/apache-kafka-book-examples/blob/master/src/test/kafka/consumer/SimpleHLConsumer.java.
It looks like my app is able to connect, but it is not fetching any data. Please suggest.
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
//import scala.util.parsing.json.JSONObject
import scala.util.parsing.json.JSONObject;
public class SimpleHLConsumer {
private final ConsumerConnector consumer;
private final String topic;
public SimpleHLConsumer(String zookeeper, String groupId, String topic) {
Properties props = new Properties();
props.put("zookeeper.connect", zookeeper);
props.put("group.id", groupId);
// props.put("zookeeper.session.timeout.ms", "5000");
// props.put("zookeeper.sync.time.ms", "250");
// props.put("auto.commit.interval.ms", "1000");
consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(props));
this.topic = topic;
}
public void testConsumer() {
Map<String, Integer> topicCount = new HashMap<>();
topicCount.put(topic, 1);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerStreams = consumer.createMessageStreams(topicCount);
System.out.println(consumerStreams);
List<KafkaStream<byte[], byte[]>> streams = consumerStreams.get(topic);
System.out.println(streams);
System.out.println(consumer);
for (final KafkaStream stream : streams) {
ConsumerIterator<byte[], byte[]> it = stream.iterator();
System.out.println("for loop");
System.out.println(it);
System.out.println("Message from Single Topic: " + new String(it.next().message()));
//System.out.println("Message from Single Topic: " + new String(it.message()));
while (it.hasNext()) {
System.out.println("in While");
System.out.println("Message from Single Topic: " + new String(it.next().message()));
}
}
// if (consumer != null) {
// consumer.shutdown();
// }
}
public static void main(String[] args) {
String topic = "test";
SimpleHLConsumer simpleHLConsumer = new SimpleHLConsumer("localhost:2181", "testgroup", topic);
simpleHLConsumer.testConsumer();
}
}
Here is the output I see in Eclipse. It does seem to connect to ZooKeeper, but it just hangs there and does not display any messages at all.
log4j:WARN No appenders could be found for logger (kafka.utils.VerifiableProperties).
log4j:WARN Please initialize the log4j system properly.
SLF4J: The requested version 1.6 by your slf4j binding is not compatible with [1.5.5, 1.5.6]
SLF4J: See http://www.slf4j.org/codes.html#version_mismatch for further details.
{test=[testgroup kafka stream]}
[testgroup kafka stream]
kafka.javaapi.consumer.ZookeeperConsumerConnector@6200f9cb
for loop
The consumer iterator's hasNext() is a blocking call. It will block indefinitely if no new message is available for consumption.
To verify this, change your code to:
// Comment 2 lines below
// System.out.println(it);
// System.out.println("Message from Single Topic: " + new String(it.next().message()));
// Line below is blocking. Your code will hang till next message in topic.
// Add new message in topic using producer, message will appear in console
while (it.hasNext()) {
A better way is to execute the consuming code in a separate thread. Use consumer.timeout.ms to specify a time in ms after which the consumer will throw a ConsumerTimeoutException:
// keepRunningThread is flag to control when to exit consumer loop
while(keepRunningThread)
{
try
{
if(it.hasNext())
{
System.out.println(new String(it.next().message()));
}
}
catch(ConsumerTimeoutException ex)
{
// Timeout exception waiting for kafka message
// Wait for 5 (or t) seconds before checking for message again
Thread.sleep(5000);
}
}
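For the ConsumerTimeoutException to be thrown at all, consumer.timeout.ms must be set in the consumer Properties (by default the old consumer blocks forever). A sketch with a 1-second timeout; pick whatever value fits:
props.put("consumer.timeout.ms", "1000");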

Spark Streaming Kafka Consumer

I'm attempting to set up a Spark Streaming simple app that will read messages from a Kafka topic.
After much work I am at this stage but get the exceptions shown below.
Code:
public static void main(String[] args) throws Exception {
String brokers = "my.kafka.broker" + ":" + "6667";
String topics = "MyKafkaTopic";
// Create context with a 2 seconds batch interval
SparkConf sparkConf = new SparkConf().setAppName("StreamingE")
.setMaster("local[1]")
;
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", brokers);
System.out.println("Brokers: " + brokers);
// Create direct kafka stream with brokers and topics
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
System.out.println("Message received: " + messages);
// Start the computation
jssc.start();
jssc.awaitTermination();
}
Which throws:
[WARNING]
java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run(ExecJavaMojo.java:293)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:161)
at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:542)
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:601)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:600)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:624)
at com.ncr.dataplatform.api.StreamingE.main(StreamingE.java:66)
Out of desperation, I tried connecting to ZooKeeper instead:
String brokers = "my.kafka.zookeeper" + ":" + "2181";
String topics = "MyKafkaTopic";
But that throws:
[WARNING]
java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run(ExecJavaMojo.java:293)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: java.io.EOFException: Received -1 when reading from channel, socket has likely been closed.
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at scala.util.Either.fold(Either.scala:97)
at org.apache.spark.streaming.kafka.KafkaCluster$.checkErrors(KafkaCluster.scala:365)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:222)
at org.apache.spark.streaming.kafka.KafkaUtils$.createDirectStream(KafkaUtils.scala:484)
at org.apache.spark.streaming.kafka.KafkaUtils$.createDirectStream(KafkaUtils.scala:607)
at org.apache.spark.streaming.kafka.KafkaUtils.createDirectStream(KafkaUtils.scala)
at com.ncr.dataplatform.api.StreamingE.main(StreamingE.java:53)
The relevant dependencies are:
<properties>
<spark.version>1.6.2</spark.version>
<kafka.version>0.8.2.1</kafka.version>
</properties>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>${kafka.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.10</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<version>${spark.version}</version>
</dependency>
I'd like to ask:
Should I be connecting to the Kafka broker or to the ZooKeeper servers?
What am I doing wrong in my code that leaves me unable to connect/listen to incoming messages?
Caused by: java.lang.IllegalArgumentException: requirement failed: No
output operations registered, so nothing to execute
The way Spark works is that most of its transformations are lazy. When you want a graph to execute, you need to register an output operation. Output operations come in the form of foreachRDD, print, saveAsTextFiles, and others.
Instead of using println, call DStream.print():
// Create direct kafka stream with brokers and topics
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
messages.print();
// Start the computation
jssc.start();
jssc.awaitTermination();
Regarding Kafka, metadata.broker.list needs to contain the addresses of your Kafka broker nodes. There is a separate key, zookeeper.connect, for providing ZooKeeper's address.
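A minimal sketch of the distinction, reusing the hostnames from the question:
kafkaParams.put("metadata.broker.list", "my.kafka.broker:6667"); // Kafka brokers, not ZooKeeper
// ZooKeeper belongs under a separate key, if some component needs it:
// kafkaParams.put("zookeeper.connect", "my.kafka.zookeeper:2181");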
import static org.apache.spark.streaming.kafka.KafkaUtils.createStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Seconds;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableMap;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Properties;
import kafka.serializer.StringDecoder;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import scala.Tuple2;
public class KafkaKerberosReader {
// Spark information
private static SparkConf conf;
private static String appName = "KafkaKerberosReader";
private static JavaStreamingContext context;
private static final Logger logger = LoggerFactory.getLogger(KafkaKerberosReader.class.getSimpleName());
// Kafka information
private static String zkQuorum = "";
private static String kfkQuorum = "";
private static String group = "";
private static Integer threads = 1;
private static Map<String, String> kafkaParams = new HashMap<String, String>();
public static void loadProps() {
Properties prop = new Properties();
try {
logger.info("------------------------------loadProps");
InputStream input = new FileInputStream("config.properties");
prop.load(input);
System.out.println("loadProps loaded:" + prop);
appName = prop.getProperty("app.name");
autoOffsetReset = prop.getProperty("auto.offset.reset");
secProtocol = prop.getProperty("security.protocol");
kfkQuorum = bServers = prop.getProperty("bootstrap.servers");
zkQuorum = zServers = prop.getProperty("zookeeper.connect");
group = kGroupId = prop.getProperty("group.id");
kKeyTabFile = prop.getProperty("kerberos.keytabfile");
kJaas = prop.getProperty("kerberos.jaas");
kTopic = prop.getProperty("kafka.topic");
kPrincipal = prop.getProperty("kerberos.principal");
logger.info("loadProps:Props:zk:" + zServers + ",issecure:" + secProtocol + ",autoOffsetReset:"
+ autoOffsetReset + ",bServers:" + bServers + ",kJaas:" + kJaas + ",keytab:" + kKeyTabFile
+ ", kTopic:" + kTopic + ", kPrincipal" + kPrincipal);
if (kPrincipal != null && kKeyTabFile != null) {
logger.info("---------------------Logging into Kerberos");
org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
conf.set("hadoop.security.authentication", "Kerberos");
UserGroupInformation.setConfiguration(conf);
UserGroupInformation.loginUserFromKeytabAndReturnUGI(kPrincipal, kKeyTabFile);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
logger.info("------------------------------main:START");
loadProps();
// Configure the application
configureSpark();
// Create the context
context = createContext(kTopic);
// Stop the application
context.start();
context.awaitTermination();
logger.info("main:END");
}
/**
* ----------------------------------------------- | This is the kernel of
* the spark application | -----------------------------------------------
*
*/
private static JavaStreamingContext createContext(String topic) {
logger.info("-------------------------------------------------------");
logger.info("| Starting: {} |", appName);
logger.info("| kafkaParams: |", kafkaParams);
logger.info("-------------------------------------------------------");
// Create the spark streaming context
context = new JavaStreamingContext(conf, Seconds.apply(5));
// Read from a Kerberized Kafka
JavaPairReceiverInputDStream<String, String> kafkaStream = createStream(context, zkQuorum, "Default",
ImmutableMap.of(topic, threads), StorageLevel.MEMORY_AND_DISK_SER());
kafkaStream.print();
JavaDStream<String> lines = kafkaStream.map(new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = 1L;
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
lines.print();
// kafkaStream.map(message -> message._2.toLowerCase()).print();
logger.info("-------------------------------------------------------");
logger.info("| Finished: {} |", appName);
logger.info("-------------------------------------------------------");
return context;
}
/**
* Create a SparkConf and configure it.
*
*/
private static void configureSpark() {
logger.info("------------------------------Initializing '%s'.", appName);
conf = new SparkConf().setAppName(appName);
if (group != null && group.trim().length() != 0) {
kafkaParams.put("group.id", group);
}
kafkaParams.put("auto.offset.reset", autoOffsetReset);
kafkaParams.put("security.protocol", secProtocol);
kafkaParams.put("bootstrap.servers", kfkQuorum);
kafkaParams.put("zookeeper.connect", zkQuorum);
logger.info(">- Configuration done with the follow properties:");
logger.info(conf.toDebugString());
}
static String autoOffsetReset, secProtocol, bServers, zServers, kGroupId, kKeyTabFile, kJaas, kTopic, kPrincipal;
}
Properties:
app.name=KafkaKerberosReader
auto.offset.reset=smallest
security.protocol=PLAINTEXTSASL
bootstrap.servers=sandbox.hortonworks.com:6667
zookeeper.connect=sandbox.hortonworks.com:2181
group.id=Default
kafka.topic=ifinboundprecint
#kerberos.keytabfile=/etc/hello.keytab
#kerberos.jaas=/etc/kafka/conf/kafka_client_jaas.conf
#kerberos.principal=hello@EXAMPLE.COM
Calling:
spark-submit --master yarn --deploy-mode client --num-executors 3 \
  --executor-memory 500M --executor-cores 3 \
  --class com.my.spark.KafkaKerberosReader \
  ~/SparkStreamKafkaTest-1.0-SNAPSHOT.jar
