[Spark Dataset]: createDataset API fails with a Java object containing an abstract class

I am trying to convert a JavaRDD to a Dataset using the createDataset(RDD&lt;T&gt; data, Encoder&lt;T&gt; evidence) API, but I am getting the following error. I have reproduced it with a subset of my use case.
My use case:
I am trying to convert a JavaRDD with a complex nested object (containing abstract classes) to a Dataset so that I can write the data in ORC/Parquet format (JavaRDD doesn't support writing ORC/Parquet).
The input data is in Avro format. There is an infinite recursion problem in createDataFrame for Avro types (see https://issues.apache.org/jira/browse/SPARK-25789), which is why I am loading the data into a JavaRDD first.
Requirements:
Encoders.kryo() and Encoders.javaSerialization() work here, but I want to use Encoders.bean().
Encoders.bean(T) leverages the structure of an object to provide a class-specific storage layout. Since I am storing the data in Parquet (columnar storage), each class field can be stored in its own column with Encoders.bean(T), whereas Encoders.kryo(T) and Encoders.javaSerialization(T) map T into a single byte-array (binary) field and therefore store the whole object in a single column.
If a custom serializer is required, please elaborate on the solution.
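For reference, the kryo variant that currently works for me (but packs every Data object into one binary column instead of one column per field) is just:

Dataset&lt;Data&gt; rows = session.createDataset(rowsrdd.rdd(), Encoders.kryo(Data.class));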
Classes Used:
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;

import java.io.Serializable;
import java.util.Map;

@AllArgsConstructor
@NoArgsConstructor
@lombok.Data
public class Data implements Serializable {
    private String id;
    private Map&lt;Type, Aclass&gt; data;
}
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.io.Serializable;

@AllArgsConstructor
@NoArgsConstructor
@Data
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "uad")
@JsonSubTypes(value = {@JsonSubTypes.Type(name = "UADAffinity", value = Bclass.class)})
public abstract class Aclass implements Serializable {
    String t;
}
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.Map;

@AllArgsConstructor
@NoArgsConstructor
@Data
public class Bclass extends Aclass {
    private Map&lt;String, String&gt; data;

    public Bclass(String t, Map&lt;String, String&gt; data) {
        super(t);
        this.data = data;
    }
}
public enum Type {
    A, B;
}
Logic:
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.xyz.schema.Aclass;
import com.xyz.schema.Bclass;
import com.xyz.schema.Data;
import com.xyz.schema.Type;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import java.util.List;
import java.util.Map;

public class DataSetConverter {
    private SparkSession session;

    public DataSetConverter() {
        session = initSpark();
    }

    SparkSession initSpark() {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("123");
        return SparkSession.builder()
                .sparkContext(new SparkContext(conf))
                .getOrCreate();
    }

    public void dataset_test() {
        List&lt;Data&gt; dataList = prepareData();
        JavaSparkContext jsc = new JavaSparkContext(session.sparkContext());
        JavaRDD&lt;Data&gt; rowsrdd = jsc.parallelize(dataList);
        // Fails with the CompileException shown below:
        Dataset&lt;Data&gt; rows = session.createDataset(rowsrdd.rdd(), Encoders.bean(Data.class));
        System.out.println(rows.takeAsList(3));
    }

    public static void main(String[] args) {
        new DataSetConverter().dataset_test();
    }

    private List&lt;Data&gt; prepareData() {
        List&lt;Data&gt; dataList = Lists.newArrayList();
        Data sample1 = getData();
        Data sample2 = getData();
        dataList.add(sample1);
        dataList.add(sample2);
        return dataList;
    }

    private Data getData() {
        Map&lt;Type, Aclass&gt; data = getUadData("ppv");
        return new Data("123", data);
    }

    private Map&lt;Type, Aclass&gt; getUadData(String id) {
        Map&lt;Type, Aclass&gt; result = Maps.newHashMap();
        Map&lt;String, String&gt; data = Maps.newHashMap();
        data.put(id + "11", "11");
        result.put(Type.A, new Bclass("123", data));
        return result;
    }
}
Error:
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 150, Column 11: Cannot instantiate abstract "com.xyz.schema.Aclass"
at org.codehaus.janino.UnitCompiler.compileError(UnitCompiler.java:12124)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:5260)
at org.codehaus.janino.UnitCompiler.access$9800(UnitCompiler.java:215)
at org.codehaus.janino.UnitCompiler$16.visitNewClassInstance(UnitCompiler.java:4433)
at org.codehaus.janino.UnitCompiler$16.visitNewClassInstance(UnitCompiler.java:4394)
at org.codehaus.janino.Java$NewClassInstance.accept(Java.java:5179)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:4394)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:5575)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4703)
at org.codehaus.janino.UnitCompiler.access$8800(UnitCompiler.java:215)
at org.codehaus.janino.UnitCompiler$16.visitConditionalExpression(UnitCompiler.java:4418)
at org.codehaus.janino.UnitCompiler$16.visitConditionalExpression(UnitCompiler.java:4394)
at org.codehaus.janino.Java$ConditionalExpression.accept(Java.java:4504)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:4394)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:5575)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2580)
at org.codehaus.janino.UnitCompiler.access$2700(UnitCompiler.java:215)
at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1503)
at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1487)
at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:3511)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1487)
at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1567)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:3388)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1357)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1330)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:822)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:981)
at org.codehaus.janino.UnitCompiler.access$700(UnitCompiler.java:215)
at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:414)
at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:406)
at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1295)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:406)
at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1306)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:848)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:432)
Please Help!!

Related

Does findById() actually load data from a JPA repository?

I am a Hibernate beginner. I did a couple of simple tutorials and am trying to write a simple shop backend. Everything works as it should, but I am seeing strange things in my unit tests. When I save an entity, then retrieve it using findById(), it seems that I am simply getting the same object I called save() on, without even retrieving actual values from the database:
package com.bo.learnjava.shop1.repository;

import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
import javax.persistence.Table;

@Entity
@Table(name = "PRODUCTS")
public class Product {
    @Id
    @GeneratedValue
    @Column(name = "ID")
    long id;

    @Column(name = "NAME")
    String name = "";

    @Column(name = "PRICE_CENTS")
    int priceCents = 0;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getPriceCents() {
        return priceCents;
    }

    public void setPriceCents(int priceCents) {
        this.priceCents = priceCents;
    }

    public long getId() {
        return id;
    }
}
package com.bo.learnjava.shop1.repository;

import org.springframework.data.repository.PagingAndSortingRepository;
import org.springframework.stereotype.Repository;

@Repository
public interface ProductRepository extends PagingAndSortingRepository&lt;Product, Long&gt; {
}
package com.bo.learnjava.shop1.repository;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Optional;

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.orm.jpa.DataJpaTest;

@DataJpaTest
public class ProductRepositoryTest {
    @Autowired
    ProductRepository repo;

    @Test
    void testProductRepository() {
        Product p = new Product();
        p.setName("Milk");
        p.setPriceCents(134);
        repo.save(p);

        // Modify the value to check that repo.findById() actually retrieves *saved* data
        p.setPriceCents(9999);

        Optional&lt;Product&gt; productFromRepo = repo.findById(p.getId());
        // I expect productFromRepo to contain the values I called save() with
        // (price == 134). But productFromRepo.get() returns exactly the same Java object
        // as p (with price == 9999), so no actual data was retrieved from the database - why?
        assertTrue(productFromRepo.isPresent());
        System.out.println("productFromRepo.priceCents=" + productFromRepo.get().getPriceCents()); // Outputs 9999!
        assertEquals(134, productFromRepo.get().getPriceCents()); // THIS FAILS!!!
    }
}
Why does Hibernate behave like that, and how do I test that stuff I write to the database via Hibernate actually gets retrieved back from the database?
In addition to the comment about the first-level cache: if you are extending JpaRepository, you can use
repo.saveAndFlush(p);
or
repo.save(p); repo.flush();
to write the data to the database immediately.
After that, repo.findById(p.getId()) will return the updated data.
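A minimal sketch of that approach (my own addition, assuming ProductRepository is changed to extend JpaRepository so that saveAndFlush() is available; the sketch also clears the persistence context with TestEntityManager, because otherwise findById() can still hand back the cached managed instance from the first-level cache):

package com.bo.learnjava.shop1.repository;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Optional;

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.orm.jpa.DataJpaTest;
import org.springframework.boot.test.autoconfigure.orm.jpa.TestEntityManager;

@DataJpaTest
public class ProductRepositoryFlushTest {
    @Autowired
    ProductRepository repo;          // assumed: extends JpaRepository&lt;Product, Long&gt;

    @Autowired
    TestEntityManager entityManager;

    @Test
    void findByIdReadsTheSavedValues() {
        Product p = new Product();
        p.setName("Milk");
        p.setPriceCents(134);
        repo.saveAndFlush(p);        // the INSERT is sent to the database right away

        p.setPriceCents(9999);       // only changes the in-memory managed instance

        entityManager.clear();       // detach everything so findById() must query the database

        Optional&lt;Product&gt; fromDb = repo.findById(p.getId());
        assertTrue(fromDb.isPresent());
        assertEquals(134, fromDb.get().getPriceCents()); // value actually read back from the DB
    }
}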

How to give different name for each list value in XML mapping?

I have the following annotation using javax.xml.bind.annotation.XmlElement:
@XmlElement
public List&lt;String&gt; getKeywords() {
    return keywords;
}
Which produces the following XML:
<keywords>keyword1</keywords>
<keywords>keyword2</keywords>
But I would like to get the below Output:
<A>keyword1</A>
<B>keyword2</B>
I mean some customized tag names for each list value.
Here is an example of such a mapping. The main idea is to set @XmlAnyElement on the list getter and provide a mapping function between keywords and tag names. In this example it is done with a map, but the implementation could differ depending on your logic.
Please note that it could be difficult to deserialize the resulting XML.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.bind.JAXBElement;
import javax.xml.bind.annotation.XmlAnyElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.namespace.QName;

@XmlRootElement(name = "response")
public class Response {
    private List&lt;String&gt; keywords;

    static Map&lt;String, String&gt; names = new HashMap&lt;&gt;();
    static {
        names.put("keyword1", "A");
        names.put("keyword2", "B");
    }

    private String getElementName(String keyword) {
        return names.get(keyword);
    }

    @XmlAnyElement
    public List&lt;JAXBElement&lt;String&gt;&gt; getKeywords() {
        List&lt;JAXBElement&lt;String&gt;&gt; elements = new ArrayList&lt;&gt;();
        keywords.forEach(keyword -&gt;
            elements.add(new JAXBElement&lt;&gt;(
                new QName(getElementName(keyword)),
                String.class,
                keyword))
        );
        return elements;
    }

    public void setKeywords(List&lt;String&gt; keywords) {
        this.keywords = keywords;
    }
}
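A quick usage sketch (my addition, not part of the original answer) that marshals a Response to check the customized tag names:

import java.util.Arrays;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Marshaller;

public class ResponseDemo {
    public static void main(String[] args) throws Exception {
        Response response = new Response();
        response.setKeywords(Arrays.asList("keyword1", "keyword2"));

        Marshaller marshaller = JAXBContext.newInstance(Response.class).createMarshaller();
        marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
        marshaller.marshal(response, System.out);
        // Expected output, roughly:
        // &lt;response&gt;
        //     &lt;A&gt;keyword1&lt;/A&gt;
        //     &lt;B&gt;keyword2&lt;/B&gt;
        // &lt;/response&gt;
    }
}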

How can I retrieve a mongodb collection using spring-data?

I want to retrieve List<Document> (as an example) of all documents in a MongoDB collection for given mongo shell query.
You can retrieve a collection without mapping Document to a domain model.
I am not sure what purpose you are chasing, but here is an example:
package com.answers.stackoverflow.spring.mondbretrievedata.data;

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import org.bson.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;

import java.util.ArrayList;
import java.util.List;

@Repository
public class MongoRepository {
    private static final String DatabaseName = "EXAMPLE";
    private static final String CollectionName = "example";

    @Autowired
    private MongoClient client;

    public List&lt;String&gt; allDocuments() {
        final List&lt;String&gt; list = new ArrayList&lt;&gt;();
        final MongoCollection&lt;Document&gt; data = client.getDatabase(DatabaseName).getCollection(CollectionName);
        data.find().map(Document::toJson).forEach(list::add);
        return list;
    }
}
When you use MongoRepository, you have to provide a persistent entity, so use your model class as the type parameter when extending MongoRepository:
public interface YOUR_MODEL_Repository extends MongoRepository&lt;MODEL_CLASS, String&gt; {
}
See the official example on Product -&gt; getAttributes(); for more details visit Spring Data - Mongo DB.
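Since the question asks about running a given mongo shell query, here is a rough sketch of another option, assuming spring-data-mongodb's MongoTemplate is available: BasicQuery accepts the shell-style JSON filter as a string, and the raw documents come back without any domain mapping (class and collection names below are placeholders):

import java.util.List;

import org.bson.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.BasicQuery;
import org.springframework.stereotype.Service;

@Service
public class ShellQueryService {
    @Autowired
    private MongoTemplate mongoTemplate;

    // Runs a shell-style JSON filter, e.g. "{ \"status\": \"ACTIVE\" }",
    // against the given collection and returns the raw documents.
    public List&lt;Document&gt; findByShellQuery(String collectionName, String jsonFilter) {
        return mongoTemplate.find(new BasicQuery(jsonFilter), Document.class, collectionName);
    }
}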

Why does using @Query("") with Elasticsearch report an error (org.elasticsearch.common.ParsingException: no [query] registered for [query])?

My code
package com.tl666.elasticsearch.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.elasticsearch.annotations.Document;

import java.util.Date;

@Data
@Document(indexName = "blog", type = "blogtext")
@NoArgsConstructor
@AllArgsConstructor
public class Blog {
    private Integer id;
    private String title;      // title
    private String text;       // body text
    private String[] images;
    private Person person;
    private Date create_time;
}
The repository used to query Elasticsearch:
package com.tl666.elasticsearch.repository;

import com.tl666.elasticsearch.pojo.Blog;
import org.springframework.data.elasticsearch.annotations.Query;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;

public interface BlogRepository extends ElasticsearchRepository&lt;Blog, Integer&gt; {
    @Query("{ \"query\": { \"match\": { \"title\": \"?0\" }}}")
    Blog findByName(String title);
}
The following exception occurred
org.elasticsearch.common.ParsingException: no [query] registered for [query]
at org.elasticsearch.index.query.AbstractQueryBuilder.parseInnerQueryBuilder(AbstractQueryBuilder.java:337) ~[elasticsearch-6.8.4.jar:6.8.4]
at org.elasticsearch.index.query.WrapperQueryBuilder.doRewrite(WrapperQueryBuilder.java:165) ~[elasticsearch-6.8.4.jar:6.8.4]
at org.elasticsearch.index.query.AbstractQueryBuilder.rewrite(AbstractQueryBuilder.java:284) ~[elasticsearch-6.8.4.jar:6.8.4]
at org.elasticsearch.search.builder.SearchSourceBuilder.rewrite(SearchSourceBuilder.java:949) ~[elasticsearch-6.8.4.jar:6.8.4]
at org.elasticsearch.search.builder.SearchSourceBuilder.rewrite(SearchSourceBuilder.java:80) ~[elasticsearch-6.8.4.jar:6.8.4]
I followed the official documentation, but the above exception occurs.
Can anyone help? Thank you very much.
You need to remove the query section. Try this instead:
#Query("{ \"match\": { \"title\": \"?0\" }}")
Val showed you the correct query, but you don't need a custom query for that; you can search in the title property by defining:
public interface BlogRepository extends ElasticsearchRepository&lt;Blog, Integer&gt; {
    Blog findByTitle(String title);
}
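For completeness, a small usage sketch of the derived query (the service class and injection style are my own assumptions):

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

@Service
public class BlogService {
    @Autowired
    private BlogRepository blogRepository;

    public Blog lookUpByTitle(String title) {
        // Spring Data derives a match query on the "title" field from the method name
        return blogRepository.findByTitle(title);
    }
}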

Duplicate values in the output of ObjectMapper.writeValueAsString

I am using Jackson ObjectMapper to (de)serialize a class with a polymorphic nested class. Deserializing JSON into the class works fine, but when I serialize the class to JSON using the writeValueAsString function, I see duplicate values in the output.
public class Movie {
    private String movieName;

    @JsonTypeInfo(use = Id.NAME, include = As.EXTERNAL_PROPERTY, property = "movieName")
    @JsonSubTypes({@JsonSubTypes.Type(value = StarWarsParams.class, name = "starwars")})
    private MovieParams movieParams;

    /* Getters and setters follow */
}

/* Empty class */
public class MovieParams {
}

public class StarWarsParams extends MovieParams {
    private String characterName;

    @JsonTypeInfo(use = Id.NAME, include = As.EXTERNAL_PROPERTY, property = "characterName")
    @JsonSubTypes({@JsonSubTypes.Type(value = SithParameters.class, name = "Darth Vader")})
    private CharacterParams characterParams;

    /* Getters and setters follow */
}

/* Empty class */
public class CharacterParams {
}

public class SithParameters extends CharacterParams {
    private boolean canShootLightning;
}
The code snippet where the conversion is done is as follows:
Movie movie = new Movie();
movie.setMovieName("starwars");
StarWarsParams starWarsParams = new StarWarsParams();
starWarsParams.setCharacterName("Darth Vader");
SithParameters sithParameters = new SithParameters();
sithParameters.setCanShootLightning(false);
starWarsParams.setCharacterParams(sithParameters);
movie.setMovieParams(starWarsParams);
ObjectMapper mapper = new ObjectMapper();
String jsonStringSample = mapper.writeValueAsString(movie);
System.out.println(jsonStringSample);
The output, in which movieName and characterName appear twice, is as follows:
{"movieName":"starwars","movieParams":{"characterName":"Darth Vader","characterParams":{"canShootLightning":false},"characterName":"Darth Vader"},"movieName":"starwars"}
This problem appears with older versions of Jackson (e.g. 1.9.2) but not with the latest ones from com.fasterxml. Jackson identifies two fields: one from the @JsonTypeInfo annotation and one from the getter. Two solutions:
Use a more recent version of Jackson from com.fasterxml.
Move the @JsonTypeInfo annotation over the getter instead of over the field, e.g.
@JsonTypeInfo(use = Id.NAME, include = As.EXTERNAL_PROPERTY, property = "characterName")
public String getCharacterName() {
    return characterName;
}
Producing a customized JSON object with a custom serializer is very simple.
I have written such a class in my project to get the serialized JSON object; here is an idea of how to implement it in your project.
Application (POJO Class)
import java.io.Serializable;

import org.webservice.business.serializer.ApplicationSerializer;

import com.fasterxml.jackson.databind.annotation.JsonSerialize;

@JsonSerialize(using = ApplicationSerializer.class)
public class Application implements Serializable {
    private static final long serialVersionUID = 1L;

    private double amount;
    private String businessType;
    private String currency;
    private int duration;

    // getters and setters (getAmount(), getDuration(), ...) omitted;
    // the serializer below relies on them
}
Now the ApplicationSerializer class that contains the customization logic:
package org.webservice.business.serializer;

import java.io.IOException;

import org.webservice.business.dto.Application;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;

public class ApplicationSerializer extends JsonSerializer&lt;Application&gt; {

    @Override
    public void serialize(Application prm_objObjectToSerialize, JsonGenerator prm_objJsonGenerator, SerializerProvider prm_objSerializerProvider) throws IOException, JsonProcessingException {
        if (null == prm_objObjectToSerialize) {
            // nothing to write
        } else {
            try {
                prm_objJsonGenerator.writeStartObject();
                prm_objJsonGenerator.writeNumberField("amount", prm_objObjectToSerialize.getAmount());
                prm_objJsonGenerator.writeNumberField("duration", prm_objObjectToSerialize.getDuration());
                prm_objJsonGenerator.writeStringField("businesstype", prm_objObjectToSerialize.getBusinessType());
                prm_objJsonGenerator.writeStringField("currency", prm_objObjectToSerialize.getCurrency());
            } catch (Exception v_exException) {
                v_exException.printStackTrace();
            } finally {
                prm_objJsonGenerator.writeEndObject();
            }
        }
    }
}
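A quick usage sketch (my addition): because @JsonSerialize(using = ApplicationSerializer.class) is placed on the class itself, a plain ObjectMapper picks up the custom serializer automatically:

import com.fasterxml.jackson.databind.ObjectMapper;

public class ApplicationSerializerDemo {
    public static void main(String[] args) throws Exception {
        Application application = new Application();
        // populate the fields via setters here (assumed to exist alongside the getters)

        ObjectMapper mapper = new ObjectMapper();
        // @JsonSerialize on Application routes serialization through ApplicationSerializer,
        // so no module registration is needed.
        System.out.println(mapper.writeValueAsString(application));
    }
}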
