How to serialize Java POJOs generated from Avro - java

I am trying to write a generic serializer for my Avro-generated Java objects. By begging, borrowing and stealing I came up with the following method:
public byte[] serialize(T data) {
SpecificDatumWriter<T> writer = new SpecificDatumWriter<>(tClass);
ByteArrayOutputStream out = new ByteArrayOutputStream();
BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
try {
writer.write(data, encoder);
encoder.flush();
ByteBuffer serialized = ByteBuffer.allocate(out.toByteArray().length);
serialized.put(out.toByteArray());
return serialized.array();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
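For context, a read-side counterpart of this approach would be roughly the following sketch (assumption: the same tClass field holding the generated class is available to the deserializer):
public T deserialize(byte[] bytes) {
    // Sketch only: the reader schema is taken from the generated class (tClass)
    SpecificDatumReader<T> reader = new SpecificDatumReader<>(tClass);
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, null);
    try {
        return reader.read(null, decoder);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}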
I have two questions:
Is the schema included in this byte array? If I try to deserialize this byte array with a tClass of a different version, will it be ok? (as long as the schemas are backwards-compatible)
If this is how I'm meant to serialize Avro POJOs, what, if anything, are the following used for in my Avro-generated POJO:
public org.apache.avro.Schema getSchema() {
return SCHEMA$;
}
private static final org.apache.avro.io.DatumWriter WRITER$ = new org.apache.avro.specific.SpecificDatumWriter(SCHEMA$);
@Override
public void writeExternal(java.io.ObjectOutput out) throws java.io.IOException {
WRITER$.write(this, SpecificData.getEncoder(out));
}
private static final org.apache.avro.io.DatumReader READER$ = new org.apache.avro.specific.SpecificDatumReader(SCHEMA$);
@Override
public void readExternal(java.io.ObjectInput in) throws java.io.IOException {
READER$.read(this, SpecificData.getDecoder(in));
}
Am I missing something?

Related

Kafka Connect JsonConverter with validation

I'm trying to create a Kafka Connect value converter which wraps invalid JSON records in a valid JSON object.
I'm reading the values from Kinesis (using the KinesisSourceConnector), so the input is base64 encoded.
My implementation tries to process the input through ByteArrayConverter, which decodes the data, and delegates the output to JsonConverter as follows (decode is set to true in the configure method):
private final Converter delegate = new JsonConverter();
private final Converter decoder = new ByteArrayConverter();
private boolean decode = false;
@Override
public byte[] fromConnectData(String topic, Schema schema, Object value) {
try {
String decoded = new String(decoder.fromConnectData(topic, schema, value));
LOG.info("decoded string\n" + decoded);
if(decode) {
byte[] bytes = decoder.fromConnectData(topic, schema, value);
return delegate.fromConnectData(topic, schema, bytes);
}
return delegate.fromConnectData(topic, schema, value);
} catch (Exception e) {
LOG.error("something went wrong", e);
return delegate.fromConnectData(topic, schema, wrapInvalidJson(new String(decoder.fromConnectData(topic, schema, value))));
}
}
When I print the decoded string it looks OK (a decoded JSON string), but when I consume the output topic it looks like base64 again, and I'm not sure what I am missing.
Not sure it is optimal, but I went for this approach:
private final Converter delegate = new JsonConverter();
private final Converter decoder = new ByteArrayConverter();
private final Converter stringConverter = new StringConverter();
private final ObjectMapper mapper = new ObjectMapper();
private boolean decode = false;
@Override
public void configure(Map<String, ?> configs, boolean isKey) {
delegate.configure(Collections.singletonMap("schemas.enable", false), false);
if (configs.containsKey("ni.decode.data") && Boolean.valueOf((String) configs.get("ni.decode.data"))) {
decode = true;
}
}
@Override
public byte[] fromConnectData(String topic, Schema schema, Object value) {
if (decode) {
String decoded = new String(decoder.fromConnectData(topic, schema, value));
try {
return mapper.readTree(decoded).toString().getBytes();
} catch (Exception e) {
return wrapInvalidJson(decoded).getBytes();
}
} else {
try {
return delegate.fromConnectData(topic, schema, value);
} catch (Exception e) {
byte[] msg = stringConverter.fromConnectData(topic, schema, value);
return wrapInvalidJson(new String(msg)).getBytes();
}
}
}
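Not part of the original snippet, but a rough usage sketch of how such a converter would be driven (the class name WrappingJsonConverter and the variable rawKinesisBytes are placeholders of mine):
Map<String, String> configs = new HashMap<>();
configs.put("ni.decode.data", "true");              // sets the decode flag read in configure()
Converter converter = new WrappingJsonConverter();  // placeholder name for the custom converter above
converter.configure(configs, false);                // false = value converter, not key converter
byte[] json = converter.fromConnectData("my-topic", Schema.BYTES_SCHEMA, rawKinesisBytes);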

Change my writeJsonFile function from GSON to Jackson

I have some code that takes in a list of descriptors and writes them to different JSON files using the GSON library. I am now trying to change that library to Jackson. I am not a Jackson expert so I'm looking for some help. Here is my code when I am using GSON:
Descriptor Class:
public class Descriptor {
#SerializedName("BatchName")
private String batchName;
#SerializedName("Metadata")
private Metadata metadata;
#SerializedName("SampleInfo")
private SampleInfoJsonModel sampleInfo;
#SerializedName("Files")
private List<String> files;
#SerializedName("ClientData")
private ClientData clientData;
#SerializedName("CaseName")
private String caseName;
public Descriptor() {
this.metadata = new Metadata();
this.sampleInfo = new SampleInfoJsonModel();
this.files = new ArrayList<String>();
this.clientData = new ClientData();
}
public String getBatchName() {
return batchName;
}
public void setBatchName(String batchName) {
this.batchName = batchName;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public SampleInfoJsonModel getSampleInfo() {
return sampleInfo;
}
public void setSampleInfo(SampleInfoJsonModel sampleInfo) {
this.sampleInfo = sampleInfo;
}
public List<String> getFiles() {
return files;
}
public void setFiles(List<String> files) {
this.files = files;
}
public ClientData getClientData() {
return clientData;
}
public void setClientData(ClientData clientData) {
this.clientData = clientData;
}
public String getCaseName() {
return caseName;
}
public void setCaseName(String caseName) {
this.caseName = caseName;
}
public ClientData getClientDataNoCountryCodes() {
// TODO Auto-generated method stub
return null ;
}
}
My write JSON File function:
public static void writeJsonFile(List<Descriptor> descriptors) {
try {
for(Descriptor descriptor : descriptors) {
BufferedWriter buffWrite = new BufferedWriter(new FileWriter("descriptor_"+descriptor.getCaseName()+".json"));
Gson gson = new GsonBuilder().setPrettyPrinting().create();
buffWrite.write(gson.toJson(descriptor));
buffWrite.close();
}
}
catch (IOException ioe) {
System.err.println("Error while writing to json file in writeJsonFile: ");
ioe.printStackTrace();
}
}
Here is what I have written in Jackson:
BufferedWriter buffWrite = new BufferedWriter(new FileWriter("descriptor_"+descriptor.getCaseName()+".json"));
ObjectMapper mapper = new ObjectMapper();
mapper.enable(SerializationFeature.INDENT_OUTPUT);
buffWrite.write(mapper.writeValueAsString(descriptor));
Is this the equivalent of the code below in GSON?
BufferedWriter buffWrite = new BufferedWriter(new FileWriter("descriptor_"+descriptor.getCaseName()+".json"));
Gson gson = new GsonBuilder().setPrettyPrinting().create();
buffWrite.write(gson.toJson(descriptor));
buffWrite.close();
I think you are looking to generate pretty JSON output for your object and write it into a file.
You have to make sure that you are using the Jackson equivalent of @SerializedName, which is @JsonProperty, on your object properties.
You can also use the following to prettify JSON with Jackson's ObjectMapper:
mapper.writerWithDefaultPrettyPrinter().writeValueAsString(descriptorObj)
Note that enabling SerializationFeature.INDENT_OUTPUT, as you are already doing, achieves the same thing.
The Files API is also really useful for file-related operations.
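Putting those pieces together, a minimal sketch of the write path with Jackson and the Files API could look like this (assuming the Descriptor fields are annotated with @JsonProperty as described):
ObjectMapper mapper = new ObjectMapper();
// Pretty-printed JSON, equivalent in spirit to Gson's setPrettyPrinting()
String json = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(descriptor);
// java.nio.file.Files handles the file writing without a BufferedWriter
Files.write(Paths.get("descriptor_" + descriptor.getCaseName() + ".json"), json.getBytes(StandardCharsets.UTF_8));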
I hope this will help!

KafkaAvroDeserializer does not return SpecificRecord but returns GenericRecord

My KafkaProducer is able to use KafkaAvroSerializer to serialize objects to my topic. However, KafkaConsumer.poll() returns a deserialized GenericRecord instead of my serialized class.
My KafkaProducer
KafkaProducer<CharSequence, MyBean> producer;
try (InputStream props = Resources.getResource("producer.props").openStream()) {
Properties properties = new Properties();
properties.load(props);
properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
io.confluent.kafka.serializers.KafkaAvroSerializer.class);
properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
io.confluent.kafka.serializers.KafkaAvroSerializer.class);
properties.put("schema.registry.url", "http://localhost:8081");
MyBean bean = new MyBean();
producer = new KafkaProducer<>(properties);
producer.send(new ProducerRecord<>(topic, bean.getId(), bean));
My KafkaConsumer
try (InputStream props = Resources.getResource("consumer.props").openStream()) {
properties.load(props);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroDeserializer.class);
properties.put("schema.registry.url", "http://localhost:8081");
consumer = new KafkaConsumer<>(properties);
}
consumer.subscribe(Arrays.asList(topic));
try {
while (true) {
ConsumerRecords<CharSequence, MyBean> records = consumer.poll(100);
if (records.isEmpty()) {
continue;
}
for (ConsumerRecord<CharSequence, MyBean> record : records) {
MyBean bean = record.value(); // <-------- This is throwing a cast Exception because it cannot cast GenericRecord to MyBean
System.out.println("consumer received: " + bean);
}
}
The line MyBean bean = record.value(); throws a ClassCastException because it cannot cast GenericRecord to MyBean.
I'm using kafka-client-0.9.0.1 and kafka-avro-serializer-3.0.0.
KafkaAvroDeserializer supports SpecificData
It's not enabled by default. To enable it:
properties.put(KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG, true);
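For context, a sketch of where that property sits relative to the consumer configuration from the question:
// Sketch: the same consumer properties as in the question, plus the specific-reader flag
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroDeserializer.class);
properties.put(KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG, true);
properties.put("schema.registry.url", "http://localhost:8081");
KafkaConsumer<CharSequence, MyBean> consumer = new KafkaConsumer<>(properties);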
KafkaAvroDeserializer does not support ReflectData
Confluent's KafkaAvroDeserializer does not know how to deserialize using Avro ReflectData. I had to extend it to support Avro ReflectData:
/**
* Extends deserializer to support ReflectData.
*
* @param <V>
* value type
*/
public abstract class ReflectKafkaAvroDeserializer<V> extends KafkaAvroDeserializer {
private Schema readerSchema;
private DecoderFactory decoderFactory = DecoderFactory.get();
protected ReflectKafkaAvroDeserializer(Class<V> type) {
readerSchema = ReflectData.get().getSchema(type);
}
@Override
protected Object deserialize(
boolean includeSchemaAndVersion,
String topic,
Boolean isKey,
byte[] payload,
Schema readerSchemaIgnored) throws SerializationException {
if (payload == null) {
return null;
}
int schemaId = -1;
try {
ByteBuffer buffer = ByteBuffer.wrap(payload);
if (buffer.get() != MAGIC_BYTE) {
throw new SerializationException("Unknown magic byte!");
}
schemaId = buffer.getInt();
Schema writerSchema = schemaRegistry.getByID(schemaId);
int start = buffer.position() + buffer.arrayOffset();
int length = buffer.limit() - 1 - idSize;
DatumReader<Object> reader = new ReflectDatumReader(writerSchema, readerSchema);
BinaryDecoder decoder = decoderFactory.binaryDecoder(buffer.array(), start, length, null);
return reader.read(null, decoder);
} catch (IOException e) {
throw new SerializationException("Error deserializing Avro message for id " + schemaId, e);
} catch (RestClientException e) {
throw new SerializationException("Error retrieving Avro schema for id " + schemaId, e);
}
}
}
Define a custom deserializer class which deserializes to MyBean:
public class MyBeanDeserializer extends ReflectKafkaAvroDeserializer<MyBean> {
public MyBeanDeserializer() {
super(MyBean.class);
}
}
Configure KafkaConsumer to use the custom deserializer class:
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, MyBeanDeserializer.class);
Edit: ReflectData support got merged (see below).
To add to Chin Huang's answer, for minimal code and better performance, you should probably implement it this way:
/**
* Extends deserializer to support ReflectData.
*
* @param <V>
* value type
*/
public abstract class SpecificKafkaAvroDeserializer<V extends SpecificRecordBase> extends AbstractKafkaAvroDeserializer implements Deserializer<V> {
private final Schema schema;
private Class<V> type;
private DecoderFactory decoderFactory = DecoderFactory.get();
protected SpecificKafkaAvroDeserializer(Class<V> type, Map<String, ?> props) {
this.type = type;
this.schema = ReflectData.get().getSchema(type);
this.configure(this.deserializerConfig(props));
}
public void configure(Map<String, ?> configs) {
this.configure(new KafkaAvroDeserializerConfig(configs));
}
@Override
protected V deserialize(
boolean includeSchemaAndVersion,
String topic,
Boolean isKey,
byte[] payload,
Schema readerSchemaIgnore) throws SerializationException {
if (payload == null) {
return null;
}
int schemaId = -1;
try {
ByteBuffer buffer = ByteBuffer.wrap(payload);
if (buffer.get() != MAGIC_BYTE) {
throw new SerializationException("Unknown magic byte!");
}
schemaId = buffer.getInt();
Schema schema = schemaRegistry.getByID(schemaId);
Schema readerSchema = ReflectData.get().getSchema(type);
int start = buffer.position() + buffer.arrayOffset();
int length = buffer.limit() - 1 - idSize;
SpecificDatumReader<V> reader = new SpecificDatumReader<>(schema, readerSchema);
BinaryDecoder decoder = decoderFactory.binaryDecoder(buffer.array(), start, length, null);
return reader.read(null, decoder);
} catch (IOException e) {
throw new SerializationException("Error deserializing Avro message for id " + schemaId, e);
} catch (RestClientException e) {
throw new SerializationException("Error retrieving Avro schema for id " + schemaId, e);
}
}
}

How do I write objects via a byte buffer?

I'm trying to:
Write an object (or a series of objects of different types/classes) into a file
Read them back
Check the instances and cast them into objects of their same type/class again
I found these two classes, and this is how I use them. But the data[] array doesn't make much sense to me. Why do you have to put an empty array of data into the deserialize method?
public static byte[] serialize(Object obj) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
ObjectOutputStream os = new ObjectOutputStream(out);
os.writeObject(obj);
return out.toByteArray();
}
public static Object deserialize(byte[] data)
throws IOException, ClassNotFoundException {
ByteArrayInputStream in = new ByteArrayInputStream(data);
ObjectInputStream is = new ObjectInputStream(in);
return is.readObject();
}
public static void main(String[] args) {
try {
Thing p = new Thing(2,4);
byte[]data = new byte[10240];
serialize(p);
Object des = deserialize(data);
} catch (IOException | ClassNotFoundException ex) {
Logger.getLogger(Pruebiña.class.getName())
.log(Level.SEVERE, null, ex);
}
}
How can I fix this? Right now I'm getting the following error when the program reaches the deserialize line:
java.io.StreamCorruptedException: invalid stream header: 00000000
at java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:806)
What can I do to fix this and be able to write the objects and read them back? And yes, the class Thing is Serializable.
If you want to write to a File you don't need the byte arrays at all; use FileInputStream and FileOutputStream, e.g.:
public static void serialize(Object obj, File f) throws IOException {
try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(f))) {
out.writeObject(obj);
}
}
public static Object deserialize(File f)
throws IOException, ClassNotFoundException {
try (ObjectInputStream is = new ObjectInputStream(new FileInputStream(f))) {
return is.readObject();
}
}
static class Thing implements Serializable {
int a,b,c;
}
public static void main(String[] args) throws IOException, ClassNotFoundException {
File f = new File("object.dat");
Thing orig = new Thing();
serialize(orig, f);
Thing back = (Thing) deserialize(f);
}
The serialize method already creates the array; you don't need to create your own array.
Just do this:
byte[] data = serialize(p);
Instead of this:
byte[]data = new byte[10240];
serialize(p);
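A corrected main under that change would be roughly:
public static void main(String[] args) {
    try {
        Thing p = new Thing(2, 4);
        byte[] data = serialize(p);              // use the array returned by serialize
        Thing des = (Thing) deserialize(data);   // the stream header is now valid
    } catch (IOException | ClassNotFoundException ex) {
        Logger.getLogger(Pruebiña.class.getName()).log(Level.SEVERE, null, ex);
    }
}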

Append to ObjectOutputStream iteratively

I want to add lots of data to a file. I defined the HYB class since my object contains different types of data (String and byte[]). I used ObjectOutputStream and ObjectInputStream to write to and read from the file. But my code does not print the expected result. To write my code I used the code in the following pages:
How can I append to an existing java.io.ObjectStream?
ClassCastException when Appending Object OutputStream
I tried to debug my code and find the problem, but I could not. This is my code:
import java.io.*;
import java.io.BufferedOutputStream;
import java.util.*;
public class HYB implements Serializable
{
private static final long serialVersionUID = 1L;
private List<byte[]> data = new ArrayList<>();
public void addRow(String s,byte[] a)
{
data.add(s.getBytes()); // add encoding if necessary
data.add(a);
}
@Override public String toString()
{
StringBuilder sb = new StringBuilder();
synchronized (data)
{
for(int i=0;i<data.size();i+=2)
{
sb.append(new String(data.get(i)));
sb.append(Arrays.toString(data.get(i+1))+"\n");
}
}
return sb.toString();
}
private static void write(File storageFile, HYB hf)
throws IOException {
ObjectOutputStream oos = getOOS(storageFile);
oos.writeObject(hf);
oos.flush();
oos.close();
}
public static ObjectOutputStream getOOS(File file) throws IOException
{
if (file.exists()) {
return new AppendableObjectOutputStream(new FileOutputStream(file, true));
} else {
return new ObjectOutputStream(new FileOutputStream(file));
}
}
private static ObjectInputStream getOIS(FileInputStream fis)
throws IOException {
long pos = fis.getChannel().position();
return pos == 0 ? new ObjectInputStream(fis) :
new AppendableObjectInputStream(fis);
}
private static class AppendableObjectOutputStream extends
ObjectOutputStream {
public AppendableObjectOutputStream(OutputStream out)
throws IOException {
super(out);
}
@Override
protected void writeStreamHeader() throws IOException {
}
}
private static class AppendableObjectInputStream extends ObjectInputStream {
public AppendableObjectInputStream(InputStream in) throws IOException {
super(in);
}
@Override
protected void readStreamHeader() throws IOException {
// do not read a header
}
}
public static void main(String[] args) throws FileNotFoundException, IOException, ClassNotFoundException
{
File x=new File ("test");
HYB hf1 = new HYB();
hf1.addRow("fatemeh",new byte[] {11,12,13});
hf1.addRow("andisheh",new byte[] {14,15,16});
write(x,hf1);
HYB hf = new HYB();
hf.addRow("peter",new byte[] {1,2,3});
hf.addRow("jaqueline",new byte[] {4,5,6});
write(x,hf);
FileInputStream fis = new FileInputStream(x);
HYB hf2 = (HYB) getOIS(fis).readObject();
System.out.println(hf2);
}
}
expected results:
fatemeh[11, 12, 13]
andisheh[14, 15, 16]
peter[1, 2, 3]
jaqueline[4, 5, 6]
actual results:
fatemeh[11, 12, 13]
andisheh[14, 15, 16]
Writing the two HYB objects to the ObjectOutputStream doesn't merge them into a single HYB object; the stream still contains two HYB objects, of which your code reads only one. If you did a second call to readObject(), the second one would be retrieved and could be printed to the screen. So you could just wrap the readObject() and println() calls in a loop that reads until there's nothing else to read from the stream.
You are writing two HYB objects to the stream, but only reading one out.
You need to call readObject() twice.
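Alternatively, a small sketch of a loop that keeps reading until the file is exhausted (reusing getOIS and the File x from the question; EOFException signals the end of the appended stream):
try (FileInputStream fis = new FileInputStream(x);
     ObjectInputStream ois = getOIS(fis)) {
    while (true) {
        try {
            HYB next = (HYB) ois.readObject();
            System.out.println(next);
        } catch (EOFException endOfFile) {
            break; // no more appended objects
        }
    }
}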
