I am trying to create a simple application that consumes a Kafka message, applies a CQL transform, and publishes the result back to Kafka. Below is the code:
JAVA: 1.8
Flink: 1.13
Scala: 2.11
flink-siddhi: 2.11-0.2.2-SNAPSHOT
I am using library: https://github.com/haoch/flink-siddhi
Input JSON sent to Kafka:
{
"awsS3":{
"ResourceType":"aws.S3",
"Details":{
"Name":"crossplane-test",
"CreationDate":"2020-08-17T11:28:05+00:00"
},
"AccessBlock":{
"PublicAccessBlockConfiguration":{
"BlockPublicAcls":true,
"IgnorePublicAcls":true,
"BlockPublicPolicy":true,
"RestrictPublicBuckets":true
}
},
"Location":{
"LocationConstraint":"us-west-2"
}
}
}
Main class:
public class S3SidhiApp {
public static void main(String[] args) {
internalStreamSiddhiApp.start();
//kafkaStreamApp.start();
}
}
App class:
package flinksidhi.app;
import com.google.gson.JsonObject;
import flinksidhi.event.s3.source.S3EventSource;
import io.siddhi.core.SiddhiManager;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.siddhi.SiddhiCEP;
import org.json.JSONObject;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import static flinksidhi.app.connector.Consumers.createInputMessageConsumer;
import static flinksidhi.app.connector.Producer.*;
public class internalStreamSiddhiApp {
private static final String inputTopic = "EVENT_STREAM_INPUT";
private static final String outputTopic = "EVENT_STREAM_OUTPUT";
private static final String consumerGroup = "EVENT_STREAM1";
private static final String kafkaAddress = "localhost:9092";
private static final String zkAddress = "localhost:2181";
private static final String S3_CQL1 = "from inputStream select * insert into temp";
private static final String S3_CQL = "from inputStream select json:toObject(awsS3) as obj insert into temp;" +
"from temp select json:getString(obj,'$.awsS3.ResourceType') as affected_resource_type," +
"json:getString(obj,'$.awsS3.Details.Name') as affected_resource_name," +
"json:getString(obj,'$.awsS3.Encryption.ServerSideEncryptionConfiguration') as encryption," +
"json:getString(obj,'$.awsS3.Encryption.ServerSideEncryptionConfiguration.Rules[0].ApplyServerSideEncryptionByDefault.SSEAlgorithm') as algorithm insert into temp2; " +
"from temp2 select affected_resource_name,affected_resource_type, " +
"ifThenElse(encryption == ' ','Fail','Pass') as state," +
"ifThenElse(encryption != ' ' and algorithm == 'aws:kms','None','Critical') as severity insert into outputStream";
public static void start(){
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//DataStream<String> inputS = env.addSource(new S3EventSource());
//Flink kafka stream consumer
FlinkKafkaConsumer<String> flinkKafkaConsumer =
createInputMessageConsumer(inputTopic, kafkaAddress,zkAddress, consumerGroup);
//Add Data stream source -- flink consumer
DataStream<String> inputS = env.addSource(flinkKafkaConsumer);
SiddhiCEP cep = SiddhiCEP.getSiddhiEnvironment(env);
cep.registerExtension("json:toObject", io.siddhi.extension.execution.json.function.ToJSONObjectFunctionExtension.class);
cep.registerExtension( "json:getString", io.siddhi.extension.execution.json.function.GetStringJSONFunctionExtension.class);
cep.registerStream("inputStream", inputS, "awsS3");
inputS.print();
System.out.println(cep.getDataStreamSchemas());
//json functions need the Siddhi extension jars to be present at runtime.
DataStream<Map<String,Object>> output = cep
.from("inputStream")
.cql(S3_CQL1)
.returnAsMap("temp");
//Flink kafka stream Producer
FlinkKafkaProducer<Map<String, Object>> flinkKafkaProducer =
createMapProducer(env,outputTopic, kafkaAddress);
//Add Data stream sink -- flink producer
output.addSink(flinkKafkaProducer);
output.print();
try {
env.execute();
} catch (Exception e) {
e.printStackTrace();
}
}
}
Consumer class:
package flinksidhi.app.connector;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.json.JSONObject;
import java.util.Properties;
public class Consumers {
public static FlinkKafkaConsumer<String> createInputMessageConsumer(String topic, String kafkaAddress, String zookeeprAddr, String kafkaGroup ) {
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", kafkaAddress);
properties.setProperty("zookeeper.connect", zookeeprAddr);
properties.setProperty("group.id",kafkaGroup);
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<String>(
topic,new SimpleStringSchema(),properties);
return consumer;
}
}
Producer class:
package flinksidhi.app.connector;
import flinksidhi.app.util.ConvertJavaMapToJson;
import org.apache.flink.api.common.serialization.SerializationSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchema;
import org.json.JSONObject;
import java.util.Map;
public class Producer {
public static FlinkKafkaProducer<Tuple2> createStringProducer(StreamExecutionEnvironment env, String topic, String kafkaAddress) {
return new FlinkKafkaProducer<Tuple2>(kafkaAddress, topic, new AverageSerializer());
}
public static FlinkKafkaProducer<Map<String,Object>> createMapProducer(StreamExecutionEnvironment env, String topic, String kafkaAddress) {
return new FlinkKafkaProducer<Map<String,Object>>(kafkaAddress, topic, new SerializationSchema<Map<String, Object>>() {
@Override
public void open(InitializationContext context) throws Exception {
}
@Override
public byte[] serialize(Map<String, Object> stringObjectMap) {
String json = ConvertJavaMapToJson.convert(stringObjectMap);
return json.getBytes();
}
});
}
}
I have tried many things, but the code where the CQL is invoked is never called, and it doesn't even give any error, so I'm not sure where it is going wrong.
The same thing works if I create an internal stream source and use the same input JSON, returning it as a string.
Initial guess: if you are using event time, are you sure you have defined watermarks correctly? As stated in the docs:
(...) an incoming element is initially put in a buffer where elements are sorted in ascending order based on their timestamp, and when a watermark arrives, all the elements in this buffer with timestamps smaller than that of the watermark are processed (...)
If this doesn't help, I would suggest decomposing/simplifying the job to a bare minimum, for example just a source operator and some naive sink printing/logging elements. If that works, start adding back operators one by one. You could also start by simplifying your CEP pattern as much as possible.
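A bare-minimum version of that job, reusing the consumer factory and constants from the question, could look roughly like this (an untested sketch to illustrate the idea):
// Minimal check: Kafka source straight into a print sink, no Siddhi/CEP involved.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
FlinkKafkaConsumer<String> consumer =
        createInputMessageConsumer(inputTopic, kafkaAddress, zkAddress, consumerGroup);
env.addSource(consumer).print();
env.execute("bare-minimum-kafka-print");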
First of all, thanks a lot @Piotr Nowojski. Just because of your small pointer about event time, which never came to my mind no matter how many times I pondered over it, I found the problem. While debugging the two cases:
With the internal data source, where processing succeeded, I identified while debugging the flow that it was processing a watermark after processing the data, but it did not strike me that it was somehow managing the event time of the data implicitly.
With Kafka as the data source, while debugging I could see very clearly that it was not processing any watermark in the flow, but it did not occur to me that this was happening because event time and watermarks were not handled properly.
Just adding a single line of code to the application fixed it, which I understood from the Flink code snippet below:
* @deprecated In Flink 1.12 the default stream time characteristic has been changed to {@link
* TimeCharacteristic#EventTime}, thus you don't need to call this method for enabling
* event-time support anymore. Explicitly using processing-time windows and timers works in
* event-time mode. If you need to disable watermarks, please use {@link
* ExecutionConfig#setAutoWatermarkInterval(long)}. If you are using {@link
* TimeCharacteristic#IngestionTime}, please manually set an appropriate {@link
* WatermarkStrategy}. If you are using generic "time window" operations (for example {@link
* org.apache.flink.streaming.api.datastream.KeyedStream#timeWindow(org.apache.flink.streaming.api.windowing.time.Time)}
* that change behaviour based on the time characteristic, please use equivalent operations
* that explicitly specify processing time or event time.
*/
I got to know that by default Flink considers event time, and for that, watermarks need to be handled properly, which I didn't do, so I added the line below to set the time characteristic of the Flink execution environment:
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
and kaboom ... it started working. While this call is deprecated and needs some other configuration, it was a great pointer, it helped me a lot, and I solved the issue.
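(Side note: since setStreamTimeCharacteristic is deprecated in 1.13, my understanding is that the non-deprecated equivalent is to keep the default time characteristic and attach an explicit WatermarkStrategy to the Kafka source. A rough sketch, where extractEventTimestamp is only a placeholder:)
// Sketch: attach a WatermarkStrategy to the Kafka consumer instead of the deprecated call.
flinkKafkaConsumer.assignTimestampsAndWatermarks(WatermarkStrategy.noWatermarks());
// or, if event-time processing is actually needed:
// flinkKafkaConsumer.assignTimestampsAndWatermarks(
//         WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(5))
//                 .withTimestampAssigner((event, ts) -> extractEventTimestamp(event)));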
Thanks again @Piotr Nowojski
I am a beginner who just started developing a pulsar-client application with Spring Boot.
I learned the basics through the Pulsar docs and the GitHub repo, but I got stuck testing batch transmission of messages from the pulsar-client producer.
In particular, I want to send JsonArray data in batches, but I keep getting a JsonArray.getAsInt error.
Please take a look at my code and tell me what's wrong.
package com.refactorizando.example.pulsar.producer;
import static java.util.stream.Collectors.toList;
import com.refactorizando.example.pulsar.config.PulsarConfiguration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.IntStream;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import net.sf.json.JSONArray;
import org.apache.pulsar.client.api.CompressionType;
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.MessageId;
import org.apache.pulsar.client.api.Producer;
import org.apache.pulsar.client.api.PulsarClient;
import org.apache.pulsar.client.api.PulsarClientException;
import org.apache.pulsar.client.impl.schema.JSONSchema;
import org.apache.pulsar.shade.com.google.gson.JsonArray;
import org.apache.pulsar.shade.com.google.gson.JsonElement;
import org.apache.pulsar.shade.com.google.gson.JsonObject;
import org.apache.pulsar.shade.com.google.gson.JsonParser;
import org.springframework.context.annotation.Bean;
import org.springframework.stereotype.Component;
@Component
@RequiredArgsConstructor
@Slf4j
public class PulsarProducer {
private static final String TOPIC_NAME = "Json_Test";
private final PulsarClient client;
@Bean(name = "producer")
public void producer() throws PulsarClientException {
// batching
Producer<JsonArray> producer = client.newProducer(JSONSchema.of(JsonArray.class))
.topic(TOPIC_NAME)
.batchingMaxPublishDelay(60, TimeUnit.SECONDS)
.batchingMaxMessages(2)
.enableBatching(true)
.compressionType(CompressionType.LZ4)
.create();
String data = "{'users': [{'userId': 1,'firstName': 'AAAAA'},{'userId': 2,'firstName': 'BBBB'},{'userId': 3,'firstName': 'CCCCC'},{'userId': 4,'firstName': 'DDDDD'},{'userId': 5,'firstName': 'EEEEE'}]}";
JsonElement element = JsonParser.parseString(data);
JsonObject obj = element.getAsJsonObject();
JsonArray arr = obj.getAsJsonArray("users");
try {
producer.send(arr);
} catch (Exception e) {
log.error("Error sending mesasage");
e.printStackTrace();
}
producer.close();
}
}
I'm still a beginner developer, so I couldn't find this on Stack Overflow because I wasn't sure what to search for. If you know anything related to it, please leave a link and I'll delete the question.
Thanks for reading my question and have a nice day!
I tried several things, such as converting to a JsonObject and sending, or converting to a String and sending, but the same error came out.
cho,
Welcome to Pulsar and Spring Pulsar! I believe there are a few things to cover to fully answer your question.
Spring Pulsar Usage
In your example you are crafting a Producer directly from the PulsarClient. There is absolutely nothing wrong with using that API directly. However, if you want to use Spring Pulsar, the recommended approach for sending messages in a Spring Boot app is via the auto-configured PulsarTemplate (or ReactivePulsarTemplate if using Reactive). It simplifies usage and allows configuring the template/producer via configuration properties. For example, instead of building up a Producer and then calling Producer.send(), you would inject the PulsarTemplate and use it as follows:
pulsarTemplate.newMessage(foo)
.withTopic("Json_Test")
.withSchema(Schema.JSON(Foo.class))
.withProducerCustomizer((producerBuilder) -> {
producerBuilder
.batchingMaxPublishDelay(60, TimeUnit.SECONDS)
.batchingMaxMessages(2)
.enableBatching(true)
.compressionType(CompressionType.LZ4);
})
.send();
Furthermore, you can replace the builder calls with configuration properties like:
spring:
pulsar:
producer:
batching-enabled: true
batching-max-publish-delay: 60s
batching-max-messages: 2
compression-type: lz4
and then your code becomes:
pulsarTemplate.newMessage(foo)
.withTopic("Json_Test")
.withSchema(Schema.JSON(Foo.class))
.send();
NOTE: I replaced the JSON array with Foo for simplicity.
Schemas
In Pulsar, the Schema knows how to de/serialize the data. The built-in Pulsar Schema.JSON uses the Jackson JSON lib by default to de/serialize the data. This requires that the data can be handled by the Jackson ObjectMapper.readValue/writeValue methods. It handles POJOs really well, but it does not handle the JSON impl you are using.
I noticed the latest json-lib is 2.4, (AFAICT) has 9 CVEs against it, and was last released in 2010. If I had to use a JSON-level API for my data, I would pick a more contemporary and well-supported lib such as Jackson or Gson.
I switched your sample to use Jackson ArrayNode and it worked well. I did have to replace the single quotes in your data string with escaped double quotes, as Jackson by default does not accept single-quoted data. Here is the reworked sample app using Jackson ArrayNode:
@SpringBootApplication
public class HyunginChoSpringPulsarUserApp {
public static void main(String[] args) {
SpringApplication.run(HyunginChoSpringPulsarUserApp.class, args);
}
@Bean
ApplicationRunner sendDataOnStartup(PulsarTemplate<ArrayNode> pulsarTemplate) {
return (args) -> {
String data2 = "{\"users\": [{\"userId\": 1,\"firstName\": \"AAAAA\"},{\"userId\": 2,\"firstName\": \"BBBB\"},{\"userId\": 3,\"firstName\": \"CCCCC\"},{\"userId\": 4,\"firstName\": \"DDDDD\"},{\"userId\": 5,\"firstName\": \"EEEEE\"}]}";
ArrayNode jsonArray = (ArrayNode) ObjectMapperFactory.create().readTree(data2).get("users");
System.out.printf("*** SENDING: %s%n", jsonArray);
pulsarTemplate.newMessage(jsonArray)
.withTopic("Json_Test")
.withSchema(Schema.JSON(ArrayNode.class))
.send();
};
}
@PulsarListener(topics = "Json_Test", schemaType = SchemaType.JSON, batch = true)
public void listenForData(List<ArrayNode> user) {
System.out.printf("***** LISTEN: %s%n".formatted(user));
}
}
The output looks like:
*** SENDING: [{"userId":1,"firstName":"AAAAA"},{"userId":2,"firstName":"BBBB"},{"userId":3,"firstName":"CCCCC"},{"userId":4,"firstName":"DDDDD"},{"userId":5,"firstName":"EEEEE"}]
***** LISTEN: [{"userId":1,"firstName":"AAAAA"},{"userId":2,"firstName":"BBBB"},{"userId":3,"firstName":"CCCCC"},{"userId":4,"firstName":"DDDDD"},{"userId":5,"firstName":"EEEEE"}]
Data Model
Your data is an array of users. Do you have a requirement to use a JSON-level API, or could you instead deal with a List<User> of POJOs? That would simplify things and make for a much better experience. A Java record is a great choice, such as:
public record User(String userId, String firstName) {}
then you can pass in a List<User> to your PulsarTemplate and everything will work well. For example:
@SpringBootApplication
public class HyunginChoSpringPulsarUserApp {
public static void main(String[] args) {
SpringApplication.run(HyunginChoSpringPulsarUserApp.class, args);
}
@Bean
ApplicationRunner sendDataOnStartup(PulsarTemplate<User> pulsarTemplate) {
return (args) -> {
String data2 = "{\"users\": [{\"userId\": 1,\"firstName\": \"AAAAA\"},{\"userId\": 2,\"firstName\": \"BBBB\"},{\"userId\": 3,\"firstName\": \"CCCCC\"},{\"userId\": 4,\"firstName\": \"DDDDD\"},{\"userId\": 5,\"firstName\": \"EEEEE\"}]}";
ObjectMapper objectMapper = ObjectMapperFactory.create();
JsonNode usersNode = objectMapper.readTree(data2).get("users");
List<User> users = objectMapper.convertValue(usersNode, new TypeReference<>() {});
System.out.printf("*** SENDING: %s%n", users);
for (User user : users) {
pulsarTemplate.newMessage(user)
.withTopic("Json_Test2")
.withSchema(Schema.JSON(User.class))
.send();
}
};
}
@PulsarListener(topics = "Json_Test2", schemaType = SchemaType.JSON, batch = true)
public void listenForData(List<User> users) {
users.forEach((user) -> System.out.printf("***** LISTEN: %s%n".formatted(user)));
}
public record User(String userId, String firstName) {}
}
*** SENDING: [User[userId=1, firstName=AAAAA], User[userId=2, firstName=BBBB], User[userId=3, firstName=CCCCC], User[userId=4, firstName=DDDDD], User[userId=5, firstName=EEEEE]]
...
***** LISTEN: User[userId=1, firstName=AAAAA]
***** LISTEN: User[userId=2, firstName=BBBB]
***** LISTEN: User[userId=3, firstName=CCCCC]
***** LISTEN: User[userId=4, firstName=DDDDD]
***** LISTEN: User[userId=5, firstName=EEEEE]
I hope this helps. Take care.
I am using Spring Data Redis 2.5.7 to consume stream messages from Redis. This is what my consumer code looks like:
package com.dolphin.soa.post.common.mq;
import com.alibaba.fastjson.JSON;
import com.dolphin.soa.post.contract.request.ArticleRequest;
import com.dolphin.soa.post.model.entity.SubRelation;
import com.dolphin.soa.post.service.IArticleService;
import com.dolphin.soa.post.service.ISubRelationService;
import lombok.extern.slf4j.Slf4j;
import misc.enumn.user.SubStatus;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.connection.stream.MapRecord;
import org.springframework.data.redis.core.DefaultTypedTuple;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.data.redis.core.ZSetOperations;
import org.springframework.data.redis.stream.StreamListener;
import org.springframework.stereotype.Component;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
/**
* @author dolphin
*/
@Component
@Slf4j
public class StreamMessageListener implements StreamListener<String, MapRecord<String, String, String>> {
#Value("${dolphin.redis.stream.group}")
private String groupName;
#Value("${dolphin.redis.user.sub.article.key}")
private String subArticleKey;
private final StringRedisTemplate stringRedisTemplate;
private final RedisTemplate<String, Object> articleRedisTemplate;
private final RedisTemplate<String, Long> redisLongTemplate;
private final ISubRelationService subRelationService;
private final IArticleService articleService;
public StreamMessageListener(StringRedisTemplate stringRedisTemplate,
#Qualifier("redisObjectTemplate") RedisTemplate<String, Object> articleRedisTemplate,
ISubRelationService subRelationService,
#Qualifier("redisLongTemplate") RedisTemplate<String, Long> redisLongTemplate,
IArticleService articleService) {
this.stringRedisTemplate = stringRedisTemplate;
this.articleRedisTemplate = articleRedisTemplate;
this.subRelationService = subRelationService;
this.redisLongTemplate = redisLongTemplate;
this.articleService = articleService;
}
@Override
public void onMessage(MapRecord<String, String, String> message) {
try {
Map<String, String> body = message.getValue();
log.debug("receive message from redis:" + JSON.toJSONString(body));
handleArticle(body);
this.stringRedisTemplate.opsForStream().acknowledge(groupName, message);
} catch (Exception e) {
log.error("handle redis stream message error", e);
}
}
private void handleArticle(Map<String, String> body) {
Long channelId = MapUtils.getLongValue(body, "sub_source_id", -1L);
Long articleId = MapUtils.getLongValue(body, "id", -1L);
if (channelId <= 0L || articleId <= 0L) {
log.error("id incorrect", body);
return;
}
ArticleRequest articleRequest = new ArticleRequest();
articleRequest.setChannelId(channelId);
articleRequest.setSubStatus(SubStatus.SUB);
// TODO: maybe paginate to avoid memory overflow; this is dangerous as the number of subscribed users increases
List<SubRelation> relations = subRelationService.list(articleRequest);
if (CollectionUtils.isEmpty(relations)) {
return;
}
relations.forEach(item -> {
var articleIdsSet = new HashSet<ZSetOperations.TypedTuple<Long>>();
ZSetOperations.TypedTuple<Long> singleId = new DefaultTypedTuple<>(articleId, Double.valueOf(articleId));
articleIdsSet.add(singleId);
/**
 * because for now we have less than 2GB of memory,
 * the redis stream only passes the article id and channel id, not the full article,
 * so this step needs one extra query to the database
 */
articleService.getArticleFromCache(Arrays.asList(articleId));
String userSubCacheKey = subArticleKey + item.getUserId();
Boolean isKeyExists = redisLongTemplate.hasKey(userSubCacheKey);
if (isKeyExists) {
/**
 * only when the user has been active recently
 * will redis cache the user's subscription list; then we push the newest article
 * to each subscribed user one by one (maybe we should reduce these operations).
 * as the number of channel subscribers grows,
 * this may become a performance bottleneck
 */
redisLongTemplate.opsForZSet().add(userSubCacheKey, articleIdsSet);
}
});
}
}
I am facing a problem where the Redis stream has messages but the Spring Data Redis consumer does not consume them. When I use this command to check the stream messages:
> XINFO STREAM pydolphin:stream:article
length
45
radix-tree-keys
1
radix-tree-nodes
2
last-generated-id
1652101310592-0
groups
1
first-entry
1652083221122-0
id
2288687
sub_source_id
4817
last-entry
1652101310592-0
id
2288731
sub_source_id
4792
It shows that there are some messages, but the consumer did not consume them. Why did this happen, and what should I do to fix it? Sometimes it consumes one or more messages, but not all of them are consumed; there are always many messages left in the queue. I have already tried to check the pending messages with this command:
XPENDING pydolphin:stream:article pydolphin:stream:group:article - + 20
It only returns 1 pending message, but the stream has 40+ messages.
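For context, a StreamListener like the one above only receives messages if it is registered with a running StreamMessageListenerContainer for the same stream and consumer group. That registration code is not shown in the question; a minimal sketch of what it typically looks like (the consumer name "consumer-1" and the bean wiring are assumptions, not taken from the question) is:
import java.time.Duration;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.redis.connection.RedisConnectionFactory;
import org.springframework.data.redis.connection.stream.Consumer;
import org.springframework.data.redis.connection.stream.MapRecord;
import org.springframework.data.redis.connection.stream.ReadOffset;
import org.springframework.data.redis.connection.stream.StreamOffset;
import org.springframework.data.redis.stream.StreamMessageListenerContainer;
@Configuration
public class StreamConsumerConfig {
    @Bean
    public StreamMessageListenerContainer<String, MapRecord<String, String, String>> streamContainer(
            RedisConnectionFactory connectionFactory, StreamMessageListener listener) {
        // Container options; the poll timeout controls how long each XREADGROUP call blocks.
        StreamMessageListenerContainer.StreamMessageListenerContainerOptions<String, MapRecord<String, String, String>> options =
                StreamMessageListenerContainer.StreamMessageListenerContainerOptions.builder()
                        .pollTimeout(Duration.ofSeconds(1))
                        .build();
        StreamMessageListenerContainer<String, MapRecord<String, String, String>> container =
                StreamMessageListenerContainer.create(connectionFactory, options);
        // Subscribe the listener to the stream and group from the question; the consumer name is made up here.
        container.receive(Consumer.from("pydolphin:stream:group:article", "consumer-1"),
                StreamOffset.create("pydolphin:stream:article", ReadOffset.lastConsumed()),
                listener);
        container.start();
        return container;
    }
}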
I am trying to create a listener on the Hive Metastore where I need to retrieve the query submitted to the metastore. Is there any way to retrieve the query string?
In the MetastoreListener we get events such as onCreate, onDelete, etc.
It is possible to have a post-hook on Hive, but I need the listener on the metastore so that all DDL commands are caught, no matter where they are executed from.
Is there any way to capture the events in the metastore and apply the same events to another, parallel metastore setup?
Context: I am trying to upgrade Hive from version 1.x to 3.x.x,
where the idea is to have a stateless setup of the metastore service in Kubernetes,
but I am not sure how compatible the query syntax is between the two versions. So I want to set up a hot-hot setup in parallel and monitor the results of the queries. Is there any way in the MetastoreListener to transfer the DDL events from one metastore to another and execute them simultaneously?
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.events.AlterTableEvent;
import org.apache.hadoop.hive.metastore.MetaStoreEventListener;
import org.apache.hadoop.hive.metastore.events.CreateTableEvent;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.time.LocalDateTime;
public class HiveMetastoreListener extends MetaStoreEventListener {
private static final Logger LOGGER = LoggerFactory.getLogger(HiveMetastoreListener.class);
private static final ObjectMapper objMapper = new ObjectMapper();
private final DataProducer dataProducer = DataProducer.getInstance();
public HiveMetastoreListener(Configuration config) {
super(config);
}
/**
* Handler for a CreateTable Event
*/
@Override
public void onCreateTable(CreateTableEvent tableEvent) throws MetaException{
super.onCreateTable(tableEvent);
try {
String data = null;
dataProducer.produceToKafka("metastore_topic", LocalDateTime.now().toString(), data);
}catch (Exception e) {
System.out.println("Error:- " + e);
}
}
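(Not part of the original snippet: for what it's worth, the CreateTableEvent itself carries the table definition via getTable(), so the data placeholder above could be filled from it. A sketch only, reusing the producer and topic from the snippet:)
// Sketch: derive a simple payload from the event instead of the null placeholder.
org.apache.hadoop.hive.metastore.api.Table table = tableEvent.getTable();
String data = table.getDbName() + "." + table.getTableName();
dataProducer.produceToKafka("metastore_topic", LocalDateTime.now().toString(), data);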
OK, everyone. Another Dataflow question from a Dataflow newbie. (Just started playing with it this week.)
I'm creating a data pipeline to take in a list of product names and generate autocomplete data. The data processing part all seems to be working fine, but I'm missing something obvious, because when I add my last ".apply" to use either DatastoreIO or TextIO to write the data out, I get a syntax error in my IDE that says the following:
"The method apply(DatastoreV1.Write) is undefined for the type ParDo.SingleOutput>,Entity>"
It gives me an option to add a cast to the method receiver, but that obviously isn't the answer. Do I need to do some other step before I try to write the data out? My last step before trying to write the data is a call to an Entity helper for Dataflow to change my pipeline structure from <String, List<String>> to <Entity>, which seems to me like what I'd need in order to write to Datastore.
I got so frustrated with this thing over the last few days that I even decided to write the data to some AVRO files instead, so I could just load it into Datastore by hand. Imagine how ticked I was when I got all that done and got the exact same error in the exact same place on my call to TextIO. That is why I think I must be missing something very obvious here.
Here is my code. I included it all for reference, but you probably just need to look at the main() at the bottom. Any input would be greatly appreciated! Thanks!
MrSimmonsSr
package com.client.autocomplete;
import com.client.autocomplete.AutocompleteOptions;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.google.datastore.v1.Value;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import com.google.api.services.bigquery.model.TableRow;
import com.google.common.base.MoreObjects;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.DoFn.ProcessContext;
import org.apache.beam.sdk.transforms.DoFn.ProcessElement;
import org.apache.beam.sdk.extensions.jackson.ParseJsons;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.Validation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.List;
import java.util.ArrayList;
/*
* A simple Dataflow pipeline to create autocomplete data from a list of
* product names. It then loads that prefix data into Google Cloud Datastore for consumption by
* a Google Cloud Function. That function will take in a prefix and return a list of 10 product names
*
* Pseudo Code Steps
* 1. Load a list of product names from Cloud Storage
* 2. Generate prefixes for use with autocomplete, based on the product names
* 3. Merge the prefix data together with 10 products per prefix
* 4. Write that prefix data to the Cloud Datastore as a KV with a <String>, List<String> structure
*
*/
public class ClientAutocompletePipeline {
private static final Logger LOG = LoggerFactory.getLogger(ClientAutocompletePipeline.class);
/**
* A DoFn that keys each product name by all of its prefixes.
* This creates one row in the PCollection for each prefix<->product_name pair
*/
private static class AllPrefixes
extends DoFn<String, KV<String, String>> {
private final int minPrefix;
private final int maxPrefix;
public AllPrefixes(int minPrefix) {
this(minPrefix, 10);
}
public AllPrefixes(int minPrefix, int maxPrefix) {
this.minPrefix = minPrefix;
this.maxPrefix = maxPrefix;
}
@ProcessElement
public void processElement(ProcessContext c) {
String productName= c.element().toString();
for (int i = minPrefix; i <= Math.min(productName.length(), maxPrefix); i++) {
c.output(KV.of(productName.substring(0, i), c.element()));
}
}
}
/**
* Takes as input the top product names per prefix, and emits an entity
* suitable for writing to Cloud Datastore.
*
*/
static class FormatForDatastore extends DoFn<KV<String, List<String>>, Entity> {
private String kind;
private String ancestorKey;
public FormatForDatastore(String kind, String ancestorKey) {
this.kind = kind;
this.ancestorKey = ancestorKey;
}
@ProcessElement
public void processElement(ProcessContext c) {
// Initialize an EntityBuilder and get it a valid key
Entity.Builder entityBuilder = Entity.newBuilder();
Key key = makeKey(kind, ancestorKey).build();
entityBuilder.setKey(key);
// New HashMap to hold all the properties of the Entity
Map<String, Value> properties = new HashMap<>();
String prefix = c.element().getKey();
String productsString = "Products[";
// iterate through the product names and add each one to the productsString
for (String productName : c.element().getValue()) {
// products.add(productName);
productsString += productName + ", ";
}
productsString += "]";
properties.put("prefix", makeValue(prefix).build());
properties.put("products", makeValue(productsString).build());
entityBuilder.putAllProperties(properties);
c.output(entityBuilder.build());
}
}
/**
* Options supported by this class.
*
* <p>Inherits standard Beam example configuration options.
*/
public interface Options
extends AutocompleteOptions {
#Description("Input text file")
#Validation.Required
String getInputFile();
void setInputFile(String value);
#Description("Cloud Datastore entity kind")
#Default.String("prefix-product-map")
String getKind();
void setKind(String value);
#Description("Whether output to Cloud Datastore")
#Default.Boolean(true)
Boolean getOutputToDatastore();
void setOutputToDatastore(Boolean value);
#Description("Cloud Datastore ancestor key")
#Default.String("root")
String getDatastoreAncestorKey();
void setDatastoreAncestorKey(String value);
#Description("Cloud Datastore output project ID, defaults to project ID")
String getOutputProject();
void setOutputProject(String value);
}
public static void main(String[] args) throws IOException{
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
// create the pipeline
Pipeline p = Pipeline.create(options);
PCollection<String> toWrite = p
// A step to read in the product names from a text file on GCS
.apply(TextIO.read().from("gs://sample-product-data/clean_product_names.txt"))
// Next expand the product names into KV pairs with prefix as key (<KV<String, String>>)
.apply("Explode Prefixes", ParDo.of(new AllPrefixes(2)))
// Apply a GroupByKey transform to the PCollection "flatCollection" to create "productsGroupedByPrefix".
.apply(GroupByKey.<String, String>create())
// Now format the PCollection for writing into the Google Datastore
.apply("FormatForDatastore", ParDo.of(new FormatForDatastore(options.getKind(),
options.getDatastoreAncestorKey()))
// Write the processed data to the Google Cloud Datastore
// NOTE: This is the line that I'm getting the error on!!
.apply(DatastoreIO.v1().write().withProjectId(MoreObjects.firstNonNull(
options.getOutputProject(), options.getOutputProject()))));
// Run the pipeline.
PipelineResult result = p.run();
}
}
I think you need another closing parenthesis. I've removed some of the extraneous bits and reindented according to the parentheses:
PCollection<String> toWrite = p
.apply(TextIO.read().from("..."))
.apply("Explode Prefixes", ...)
.apply(GroupByKey.<String, String>create())
.apply("FormatForDatastore", ParDo.of(new FormatForDatastore(
options.getKind(), options.getDatastoreAncestorKey()))
.apply(...);
Specifically, you need another parenthesis to close the apply("FormatForDatastore", ...). Right now, it is trying to call ParDo.of(...).apply(...), which doesn't work.
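For concreteness, the tail of the pipeline from the question with that parenthesis added could look like this (a sketch only; FormatForDatastore is assumed here to be adjusted to DoFn<KV<String, Iterable<String>>, Entity>, since that is what GroupByKey actually emits, which is a separate issue from the missing parenthesis):
// Pipeline tail with the missing ')' added; the Datastore write is applied to the Entity collection.
PCollection<Entity> entities = p
        .apply(TextIO.read().from("gs://sample-product-data/clean_product_names.txt"))
        .apply("Explode Prefixes", ParDo.of(new AllPrefixes(2)))
        .apply(GroupByKey.<String, String>create())
        // the extra ')' below closes both ParDo.of(new FormatForDatastore(...)) and this apply(...)
        .apply("FormatForDatastore", ParDo.of(new FormatForDatastore(
                options.getKind(), options.getDatastoreAncestorKey())));
entities.apply(DatastoreIO.v1().write().withProjectId(MoreObjects.firstNonNull(
        options.getOutputProject(), options.getOutputProject())));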
I am printing the list of books from a library using a PrintBooksOperation class. I want to check whether the proper output is written to the console.
This is what I have tried so far. Can someone please explain what I'm doing wrong here? Thanks in advance.
PrintBooksOperation.java
package tw51.biblioteca.io.menu;
import tw51.biblioteca.Lendable;
import tw51.biblioteca.Library;
import tw51.biblioteca.io.Input;
import tw51.biblioteca.io.Output;
import tw51.biblioteca.io.menu.home.MenuOptions;
import java.util.List;
import static tw51.biblioteca.ItemType.Book;
/**
* Prints the Items Of Type Book.
*/
public class PrintBooksOperation implements MenuOptions {
private Library library;
private Output writer;
@Override
public void execute(Library library, Input reader, Output writer) {
this.library = library;
this.writer = writer;
printBooks();
}
private void printBooks() {
writer.formattedHeadings();
writer.write("\n");
List<Lendable> items = library.listItems();
items.stream().filter(item -> item.isOfType(Book)).forEach(item -> {
writer.write("\n" + item.toFormattedString());
});
}
}
PrintBooksOperationTest.java
package tw51.biblioteca.io.menu;
import org.junit.Test;
import tw51.biblioteca.Book;
import tw51.biblioteca.Library;
import tw51.biblioteca.io.Input;
import tw51.biblioteca.io.Output;
import java.util.Arrays;
import java.util.LinkedList;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
/**
*
*/
public class PrintBooksOperationTest {
@Test
public void areTheBooksPrintedCorrectly() {
Input reader = mock(Input.class);
Output writer = mock(Output.class);
Book book = new Book("nin", "#123", "ghy", 2003);
Library library = new Library(new LinkedList<>(Arrays.asList(book)));
PrintBooksOperation print = new PrintBooksOperation();
print.execute(library, reader, writer);
verify(writer).write("");
}
}
Input and Output are interfaces whose implementations handle console read and write.
My Error Message:
Argument(s) are different! Wanted:
output.write(
""
);
-> at tw51.biblioteca.io.menu.PrintBooksOperationTest.areTheBooksPrintedCorrectly(PrintBooksOperationTest.java:28)
Actual invocation has different arguments:
output.write(
"
"
);
Why are the actual arguments empty? The print operation works when I run it. Is there something that I am doing wrong, or is there another way to test the console?
When you call verify on the writer instance, you're asserting that write should have been called with the argument "".
From your implementation, however, you write to it several times:
private void printBooks() {
writer.formattedHeadings();
writer.write("\n"); // <-- This is the first time
List<Lendable> items = library.listItems();
items.stream().filter(item -> item.isOfType(Book)).forEach(item -> {
writer.write("\n" + item.toFormattedString());
});
}
Note that the first time you call write, the argument is actually "\n", which is a newline; this does not match an empty string, so the test fails. Either change the test to check for "\n" or change the method to print what you expect.
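For example, switching the verification to match what the method actually writes could look like this (a sketch, assuming Book exposes the same toFormattedString() used by the operation):
// Verify the calls printBooks() actually makes, instead of write("").
verify(writer).formattedHeadings();
verify(writer).write("\n");
verify(writer).write("\n" + book.toFormattedString());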
The error message says that the actual function call received an argument containing extra whitespace (notice that the quotes are on different lines), while your "expected" value is an empty string ("").
You either need to add this whitespace to your expected value or change your function.