Cannot resolve symbol JavaSparkSessionSingleton - java

I am new to Spark Streaming. What I am trying to achieve is to read JSON string data from Kafka, store it in a DStream, and convert it to a Dataset so that I can load it into Elasticsearch. I am using part of the code from this post.
This is the actual code:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Duration;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.spark.api.java.function.Function;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
public class SparkConsumer {
public static void main(String[] args) throws InterruptedException {
SparkConf conf = new SparkConf().setAppName("readKafkajson").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
// TODO: processing pipeline
Map<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "localhost:9092");
Set<String> topics = Collections.singleton("kafkajson");
JavaPairInputDStream<String, String> directKafkaStream =
KafkaUtils.createDirectStream(ssc, String.class, String.class, StringDecoder.class,
StringDecoder.class, kafkaParams, topics);
JavaDStream<String> json = directKafkaStream.map(new Function<Tuple2<String,String>, String>() {
public String call(Tuple2<String,String> message) throws Exception {
System.out.println(message._2());
return message._2();
}
});
System.out.println(" json is 0------ 0"+ json);
json.foreachRDD(rdd -> {
rdd.foreach(
record -> System.out.println(record));
});
//Create JavaRDD<Row>
json.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
@Override
public Row call(String msg) {
Row row = RowFactory.create(msg);
return row;
}
});
//Create Schema
StructType schema = DataTypes.createStructType(new StructField[] {DataTypes.createStructField("Message", DataTypes.StringType, true)});
//Get Spark 2.0 session
SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
msgDataFrame.show();
}
});
ssc.start();
ssc.awaitTermination();
}
}
I am getting an error saying cannot resolve symbol JavaSparkSessionSingleton.
I am using Spark 2.0.1 and my Maven dependencies look like this:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.11</artifactId>
<version>1.6.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.1</version>
</dependency>
I am not sure what I am missing. Any help is appreciated.

The official Spark doc leads you to create a singleton class to hold your session; add it at the bottom of your file:
class JavaSparkSessionSingleton {
private static transient SparkSession instance = null;
public static SparkSession getInstance(SparkConf sparkConf) {
if (instance == null) {
instance = SparkSession
.builder()
.config(sparkConf)
.getOrCreate();
}
return instance;
}
}
This sample is from the Spark docs; the complete example is here: https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java
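Once the symbol resolves, one further note for this use case: since the Kafka payload is JSON, you can let Spark parse it into columns instead of wrapping the whole string in a single "Message" field. A minimal sketch, assuming Spark 2.0.x (where DataFrameReader.json(JavaRDD<String>) is available) and reusing the json DStream and imports from the question:
json.foreachRDD(rdd -> {
    if (!rdd.isEmpty()) {
        // Reuse (or lazily create) the shared session for this micro-batch
        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
        // Let Spark infer a schema from the JSON strings in this batch
        Dataset<Row> parsed = spark.read().json(rdd);
        parsed.show();
    }
});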

Related

JavaInputDStream is not working

I am getting the following error:
Exception in thread "main" java.lang.AbstractMethodError
at org.apache.spark.internal.Logging$class.initializeLogIfNecessary(Logging.scala:99)
at org.apache.spark.streaming.kafka010.KafkaUtils$.initializeLogIfNecessary(KafkaUtils.scala:40)
at org.apache.spark.internal.Logging$class.log(Logging.scala:46)
at org.apache.spark.streaming.kafka010.KafkaUtils$.log(KafkaUtils.scala:40)
at org.apache.spark.internal.Logging$class.logWarning(Logging.scala:66)
at org.apache.spark.streaming.kafka010.KafkaUtils$.logWarning(KafkaUtils.scala:40)
at org.apache.spark.streaming.kafka010.KafkaUtils$.fixKafkaParams(KafkaUtils.scala:157)
at org.apache.spark.streaming.kafka010.DirectKafkaInputDStream.<init>(DirectKafkaInputDStream.scala:65)
at org.apache.spark.streaming.kafka010.KafkaUtils$.createDirectStream(KafkaUtils.scala:126)
at org.apache.spark.streaming.kafka010.KafkaUtils$.createDirectStream(KafkaUtils.scala:149)
at org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream(KafkaUtils.scala)
at com.spark.kafka.JavaDirectKafkaWordCount.main(JavaDirectKafkaWordCount.java:50)
18/05/29 18:05:43 INFO SparkContext: Invoking stop() from shutdown hook
18/05/29 18:05:43 INFO SparkUI: Stopped Spark web UI at
I am writing a simple Kafka-Spark Streaming program in Eclipse to consume messages from a Kafka broker using Spark Streaming. The code works fine up to the JavaInputDStream; after that I get the error above. Every import resolves fine. Can someone help with this?
Code snippet:
package com.spark.kafka;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import scala.Tuple2;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.Durations;
public final class JavaDirectKafkaWordCount {
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) throws Exception {
String brokers = "localhost:9092";
String topics = "sparktestone";
SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount").setMaster("local[*]");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", brokers);
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(jssc,
LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicsSet, kafkaParams));
JavaDStream<String> lines = messages.map(ConsumerRecord::value);
JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());
JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
.reduceByKey((i1, i2) -> i1 + i2);
wordCounts.print();
jssc.start();
jssc.awaitTermination();
}
}
In the POM I have added the respective dependencies:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.10</artifactId>
<version>2.0.0</version>
</dependency>
I just had the same issue; in Spark 2.3 that method is abstract.
Conclusion: use Spark 2.2.
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.0</version>
<scope>provided</scope>
</dependency>

Issue in extracting 54K entries into a Dataset and writing it into a CSV file in Apache Spark

I need to write a large Dataset to a CSV file. Below is my sample code:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PropertiesLoaderUtils;
import org.apache.spark.sql.api.java.UDF3;
import org.apache.spark.sql.api.java.UDF2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.spark.sql.Dataset;
import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lower;
public class TestUdf3{
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "F:\\JAVA\\winutils");
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
SQLContext sqlContext = new SQLContext(sc);
List<Row> manufactuerSynonymData = new ArrayList<Row>();
try{
SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
HashMap<String, String> options = new HashMap<String, String>();
options.put("header", "true");
options.put("path", "D:\\xls\\Source25K.csv"); //Load source CSV file
Dataset<Row> SourcePropertSet = sqlContext.load("com.databricks.spark.csv", options) ;
Resource resource = new ClassPathResource("/ActaulManufacturerSynonym.properties");
Properties allProperties = PropertiesLoaderUtils.loadProperties(resource);
StructType schemaManufactuerSynonymDictionary = new StructType(new StructField[] {new StructField("ManufacturerSynonymSource", DataTypes.StringType, false, Metadata.empty()), new StructField("ManufacturerSynonymTarget", DataTypes.StringType, false, Metadata.empty()) });
Set<String> setuniqueManufacturerEntries=allProperties.stringPropertyNames();
Row individualRowEntry;
for (String individualManufacturerEntry : setuniqueManufacturerEntries) {
individualRowEntry=RowFactory.create(individualManufacturerEntry,allProperties.getProperty(individualManufacturerEntry));
manufactuerSynonymData.add(individualRowEntry);
}
Dataset<Row> SynonaymList = spark.createDataFrame(manufactuerSynonymData, schemaManufactuerSynonymDictionary).withColumn("ManufacturerSynonymSource", lower(col("ManufacturerSynonymSource")));
SynonaymList.show(90,false);
UDF2<String, String, Boolean> contains = new UDF2<String, String, Boolean>() {
private static final long serialVersionUID = -5239951370238629896L;
@Override
public Boolean call(String t1, String t2) throws Exception {
return t1.matches(t2);
}
};
spark.udf().register("contains", contains, DataTypes.BooleanType);
UDF3<String, String, String, String> replaceWithTerm = new UDF3<String, String, String, String>() {
private static final long serialVersionUID = -2882956931420910207L;
@Override
public String call(String t1, String t2, String t3) throws Exception {
return t1 .replaceAll(t2,t3);
}
};
spark.udf().register("replaceWithTerm", replaceWithTerm, DataTypes.StringType);
Dataset<Row> joined = SourcePropertSet.join(SynonaymList, callUDF("contains", SourcePropertSet.col("manufacturersource"), SynonaymList.col("ManufacturerSynonymSource"))).withColumn("ManufacturerSource", callUDF("replaceWithTerm",SourcePropertSet.col("manufacturersource"),SynonaymList.col("ManufacturerSynonymSource"), SynonaymList.col("ManufacturerSynonymTarget")));
joined.show(54000);
joined.repartition(1).select("*").write().format("com.databricks.spark.csv").option("delimiter", ",")
.option("header", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("nullValue", "")
.save("D:\\xls\\synonym.csv");
}
catch(Exception e){
e.printStackTrace();
}
}
}
In the above code, rather than displaying the output in the console using the statement:
joined.show(54000,false);
I need to write it into a CSV file directly.
It gives me runtime exceptions:
1. save("D:\xls\synonym.csv");
org.apache.spark.SparkException: Job aborted.
Caused by:
org.apache.spark.SparkException: Job aborted due to stage failure:
Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0
in stage 3.0 (TID 3, localhost, executor driver):
org.apache.spark.SparkException: Failed to execute user defined
function($anonfun$apply$2: (string, string) => boolean)
2. return t1.matches(t2);
java.lang.NullPointerException
Caused by:
org.apache.spark.SparkException: Failed to execute user defined
function($anonfun$apply$2: (string, string) => boolean)
Can anybody suggest how to write a large Dataset to a CSV file?
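Looking at the stack trace, the NullPointerException is thrown from t1.matches(t2) inside the contains UDF, which fails whenever one of the joined column values is null. A minimal null-safe sketch of that UDF (an assumption about the cause, with the rest of the pipeline unchanged); the same guard would apply to replaceWithTerm:
UDF2<String, String, Boolean> contains = new UDF2<String, String, Boolean>() {
    private static final long serialVersionUID = -5239951370238629896L;
    @Override
    public Boolean call(String t1, String t2) throws Exception {
        // Guard against null column values before calling matches()
        return t1 != null && t2 != null && t1.matches(t2);
    }
};
spark.udf().register("contains", contains, DataTypes.BooleanType);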

How to convert JavaPairInputDStream into DataSet/DataFrame in Spark

I am trying to receive streaming data from Kafka. In this process I am able to receive and store the streaming data in a JavaPairInputDStream. Now I need to analyze this data without storing it in any database, so I want to convert this JavaPairInputDStream to a Dataset or DataFrame.
What I tried so far is:
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalog.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.AbstractJavaDStreamLike;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
//Streaming Working Code
public class KafkaToSparkStreaming
{
public static void main(String arr[]) throws InterruptedException
{
SparkConf conf = new SparkConf();
conf.set("spark.app.name", "SparkReceiver"); //The name of application. This will appear in the UI and in log data.
//conf.set("spark.ui.port", "7077"); //Port for application's dashboard, which shows memory and workload data.
conf.set("dynamicAllocation.enabled","false"); //Which scales the number of executors registered with this application up and down based on the workload
//conf.set("spark.cassandra.connection.host", "localhost"); //Cassandra Host Adddress/IP
conf.set("spark.serializer","org.apache.spark.serializer.KryoSerializer"); //For serializing objects that will be sent over the network or need to be cached in serialized form.
//conf.setMaster("local");
conf.set("spark.streaming.stopGracefullyOnShutdown", "true");
JavaSparkContext sc = new JavaSparkContext(conf);
// Create the context with 2 seconds batch size
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
Map<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("zookeeper.connect", "localhost:2181"); //Make all kafka data for this cluster appear under a particular path.
kafkaParams.put("group.id", "testgroup"); //String that uniquely identifies the group of consumer processes to which this consumer belongs
kafkaParams.put("metadata.broker.list", "localhost:9092"); //Producer can find a one or more Brokers to determine the Leader for each topic.
kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder"); //Serializer to use when preparing the message for transmission to the Broker.
kafkaParams.put("request.required.acks", "1"); //Producer to require an acknowledgement from the Broker that the message was received.
Set<String> topics = Collections.singleton("ny-2008.csv");
//Create an input DStream for Receiving data from socket
JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams, topics);
//System.out.println(directKafkaStream);
directKafkaStream.print();
}
}
Here is the complete working code using Spark 2.0.
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
public class KafkaToSparkStreaming {
public static void main(String arr[]) throws InterruptedException
{
SparkConf conf = new SparkConf();
conf.set("spark.app.name", "SparkReceiver"); //The name of application. This will appear in the UI and in log data.
//conf.set("spark.ui.port", "7077"); //Port for application's dashboard, which shows memory and workload data.
conf.set("dynamicAllocation.enabled","false"); //Which scales the number of executors registered with this application up and down based on the workload
//conf.set("spark.cassandra.connection.host", "localhost"); //Cassandra Host Adddress/IP
conf.set("spark.serializer","org.apache.spark.serializer.KryoSerializer"); //For serializing objects that will be sent over the network or need to be cached in serialized form.
conf.setMaster("local");
conf.set("spark.streaming.stopGracefullyOnShutdown", "true");
JavaSparkContext sc = new JavaSparkContext(conf);
// Create the context with 2 seconds batch size
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
Map<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("zookeeper.connect", "localhost:2181"); //Make all kafka data for this cluster appear under a particular path.
kafkaParams.put("group.id", "testgroup"); //String that uniquely identifies the group of consumer processes to which this consumer belongs
kafkaParams.put("metadata.broker.list", "localhost:9092"); //Producer can find a one or more Brokers to determine the Leader for each topic.
kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder"); //Serializer to use when preparing the message for transmission to the Broker.
kafkaParams.put("request.required.acks", "1"); //Producer to require an acknowledgement from the Broker that the message was received.
Set<String> topics = Collections.singleton("ny-2008.csv");
//Create an input DStream for Receiving data from socket
JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams, topics);
//Create JavaDStream<String>
JavaDStream<String> msgDataStream = directKafkaStream.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
//Create JavaRDD<Row>
msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
@Override
public Row call(String msg) {
Row row = RowFactory.create(msg);
return row;
}
});
//Create Schema
StructType schema = DataTypes.createStructType(new StructField[] {DataTypes.createStructField("Message", DataTypes.StringType, true)});
//Get Spark 2.0 session
SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
msgDataFrame.show();
}
});
ssc.start();
ssc.awaitTermination();
}
}
class JavaSparkSessionSingleton {
private static transient SparkSession instance = null;
public static SparkSession getInstance(SparkConf sparkConf) {
if (instance == null) {
instance = SparkSession
.builder()
.config(sparkConf)
.getOrCreate();
}
return instance;
}
}
Technically a DStream is a sequence of RDDs; you won't convert the DStream itself to a DataFrame, instead you convert each RDD to a DataFrame/Dataset as below (Scala code; please convert it to Java for your case):
stream.foreachRDD { rdd =>
  import spark.implicits._  // assumes a SparkSession named spark is in scope
  val dataFrame = rdd.map { case (key, value) => (key, value) }.toDF("key", "value")
}
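A rough Java equivalent of that per-RDD conversion, sketched against the key/value stream and the JavaSparkSessionSingleton from the full example above (assuming the same imports are in scope):
directKafkaStream.foreachRDD(rdd -> {
    // Turn each (key, value) pair of the micro-batch into a Row
    JavaRDD<Row> rows = rdd.map(tuple -> RowFactory.create(tuple._1(), tuple._2()));
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("key", DataTypes.StringType, true),
        DataTypes.createStructField("value", DataTypes.StringType, true)
    });
    SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
    spark.createDataFrame(rows, schema).show();
});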

Kafka Storm integration with java. kafka.api.OffsetRequest.DefaultClientId()Ljava/lang/String; Error

I am new to Kafka and Storm. I was trying to implement a Java example which integrates Kafka and Storm. I found an example online. I am trying to run the Java program in the Eclipse IDE. I am not using Maven.
I have storm-kafka-0.10.0.jar, kafka-0.6.jar, scala-library-2.10.3.jar and storm-core-0.10.0.jar as external jars.
Here is my java code.
KafkaStormSample.java
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;
import java.util.UUID;
import backtype.storm.spout.SchemeAsMultiScheme;
import storm.kafka.ZkHosts;
import storm.kafka.BrokerHosts;
import storm.kafka.SpoutConfig;
import storm.kafka.KafkaSpout;
import storm.kafka.StringScheme;
public class KafkaStormSample {
public static void main(String[] args) throws Exception{
Config config = new Config();
config.setDebug(true);
config.put(Config.TOPOLOGY_MAX_SPOUT_PENDING, 1);
String zkConnString = "localhost:2181";
String topic = "my-first-topic";
BrokerHosts hosts = new ZkHosts(zkConnString);
SpoutConfig kafkaSpoutConfig = new SpoutConfig (hosts, topic, "/" + topic,
UUID.randomUUID().toString());
kafkaSpoutConfig.bufferSizeBytes = 1024 * 1024 * 4;
kafkaSpoutConfig.fetchSizeBytes = 1024 * 1024 * 4;
//kafkaSpoutConfig.forceFromStart = true;
kafkaSpoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("kafka-spout", new KafkaSpout(kafkaSpoutConfig));
//builder.setBolt("word-spitter", new SplitBolt()).shuffleGrouping("kafka-spout");
builder.setBolt("word-counter", new CountBolt()).shuffleGrouping("word-spitter");
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("KafkaStormSample", config, builder.createTopology());
Thread.sleep(10000);
cluster.shutdown();
}
}
CountBolt.java
import java.util.Map;
import java.util.HashMap;
import backtype.storm.tuple.Tuple;
import backtype.storm.task.OutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.IRichBolt;
import backtype.storm.task.TopologyContext;
public class CountBolt implements IRichBolt{
Map<String, Integer> counters;
private OutputCollector collector;
@Override
public void prepare(Map stormConf, TopologyContext context,
OutputCollector collector) {
this.counters = new HashMap<String, Integer>();
this.collector = collector;
}
@Override
public void execute(Tuple input) {
String str = input.getString(0);
if(!counters.containsKey(str)){
counters.put(str, 1);
}else {
Integer c = counters.get(str) +1;
counters.put(str, c);
}
collector.ack(input);
}
@Override
public void cleanup() {
for(Map.Entry<String, Integer> entry:counters.entrySet()){
System.out.println(entry.getKey()+" : " + entry.getValue());
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
@Override
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
When I try to run KafkaStormSample.java I keep getting the error below.
Exception in thread "main" java.lang.NoSuchMethodError: kafka.api.OffsetRequest.DefaultClientId()Ljava/lang/String;
at storm.kafka.KafkaConfig.<init>(KafkaConfig.java:43)
at storm.kafka.SpoutConfig.<init>(SpoutConfig.java:40)
at KafkaStormSample.main(KafkaStormSample.java:23)
I made sure I have all the required jars, but I still think I am missing one.
Any help would be appreciated.
Thanks !
I don't know much about those systems, but it looks to me like a library version mismatch.
One of the libraries (Storm in this case) was compiled against a different version of Kafka, one where that method is defined. Check your dependencies.
This is one of the reasons dependency management systems are helpful.
Update:
Their documentation provides this setup for Maven:
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.8.1.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
</dependency>
It seems your Kafka version is too old.

Can I write a Kafka consumer in the Java API to consume messages produced by a Python producer

I have producer code written in Python which fetches tweets from Twitter. I have created a topic named twitter_test.
When I use kafka-console-consumer I can see that there are a lot of tweets in that topic.
But when I try to consume these messages from a Java consumer, it does not fetch any data.
Below is my consumer code.
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Properties;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
public class avro_twitter {
public static void main(String[] args) throws IOException {
Properties props = new Properties();
props.put("bootstrap.servers", "10.16.111.12:9092");
props.put("group.id", "groupid");
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
props.put("auto.offset.reset", "earliest");
props.put("schema.registry.url", "10.16.111.12:8081");
String topic = "twitter_test";
KafkaConsumer<String, GenericRecord> consumer = new KafkaConsumer<String, GenericRecord>(props);
consumer.subscribe(Collections.singletonList(topic));
System.out.println("Reading topic:" + topic);
while (true) {
ConsumerRecords<String, GenericRecord> records = consumer.poll(1000);
for (ConsumerRecord<String, GenericRecord> record: records) {
String authid=record.value().get(1).toString();
String screen_name=record.value().get(1).toString();
String description=record.value().get(2).toString();
System.out.println(authid);
}
}
}
}
Any help would be really appreciated
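One thing worth checking (an assumption, since the Python producer code isn't shown): the consumer above uses Confluent's KafkaAvroDeserializer, which only returns records if the producer actually wrote Avro through the schema registry. If the Python side sends plain JSON/UTF-8 strings, a plain-string consumer with a fresh group id (the group id below is just a placeholder) should already print the messages:
Properties props = new Properties();
props.put("bootstrap.servers", "10.16.111.12:9092");
props.put("group.id", "twitter_test_string_reader"); // placeholder group id so auto.offset.reset applies
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("auto.offset.reset", "earliest");

KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(props);
consumer.subscribe(Collections.singletonList("twitter_test"));
while (true) {
    for (ConsumerRecord<String, String> record : consumer.poll(1000)) {
        System.out.println(record.value());
    }
}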
