SparkSQL: Generate new column with UUID - Java

I have to add a new column whose value is a UUID. I did this with Spark 1.4 and Java using the following code.
StructType objStructType = inputDataFrame.schema();
StructField []arrStructField=objStructType.fields();
List<StructField> fields = new ArrayList<StructField>();
List<StructField> newfields = new ArrayList<StructField>();
List <StructField> listFields = Arrays.asList(arrStructField);
StructField a = DataTypes.createStructField(leftCol,DataTypes.StringType, true);
fields.add(a);
newfields.addAll(listFields);
newfields.addAll(fields);
final int size = objStructType.size();
JavaRDD<Row> rowRDD = inputDataFrame.javaRDD().map(new Function<Row, Row>() {
private static final long serialVersionUID = 3280804931696581264L;
public Row call(Row tblRow) throws Exception {
Object[] newRow = new Object[size+1];
int rowSize= tblRow.length();
for (int itr = 0; itr < rowSize; itr++)
{
if(tblRow.apply(itr)!=null)
{
newRow[itr] = tblRow.apply(itr);
}
}
newRow[size] = UUID.randomUUID().toString();
return RowFactory.create(newRow);
}
});
inputDataFrame = objsqlContext.createDataFrame(rowRDD, DataTypes.createStructType(newfields));
I'm wondering if there is a neater way to do this in Spark 2. Please advise.

You can register a UDF that generates a UUID and use the callUDF function to add the new column to your inputDataFrame. Please see the sample code below, which uses Spark 2.0.
public class SparkUUIDSample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("SparkUUIDSample").master("local[*]").getOrCreate();
//sample input data
List<Tuple2<String, String>> inputList = new ArrayList<Tuple2<String, String>>();
inputList.add(new Tuple2<String, String>("A", "v1"));
inputList.add(new Tuple2<String, String>("B", "v2"));
//dataset
Dataset<Row> df = spark.createDataset(inputList, Encoders.tuple(Encoders.STRING(), Encoders.STRING())).toDF("key", "value");
df.show();
//register udf
UDF1<String, String> uuid = str -> UUID.randomUUID().toString();
spark.udf().register("uuid", uuid, DataTypes.StringType);
//call udf
df.select(col("*"), callUDF("uuid", col("value"))).show();
//stop
spark.stop();
}
}
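As a follow-up (not part of the original answer): once the UDF is registered, the new column can also be attached with withColumn, which keeps all existing columns and names the generated one explicitly. The uuid UDF above ignores its String argument, so any existing column can be passed; this assumes the same static imports of org.apache.spark.sql.functions as the select/callUDF call above.
//alternative: add the generated UUID as a named column instead of selecting it
Dataset<Row> withUuid = df.withColumn("uuid", callUDF("uuid", col("value")));
withUuid.show();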

Related

Group and Aggregate List of Map<String, Object>

I have a List<Map<String, Object>> input like below:
[{
CURRENCY = USD,
STATUS = NEW,
PUBLISH_REGION = DEL,
SOURCE = ALADDIN,
RECON_STATUS = null,
JOB_ID_COUNT = 783
}, {
CURRENCY = USD,
STATUS = IN_PROGRESS,
PUBLISH_REGION = DEL,
SOURCE = ALADDIN,
RECON_STATUS = null,
JOB_ID_COUNT = 462
}, {
CURRENCY = USD,
STATUS = NEW,
PUBLISH_REGION = DEL,
SOURCE = GROUP,
RECON_STATUS = null,
JOB_ID_COUNT = 4
}]
I am trying to create another List<Map<String, Object>> by grouping on the CURRENCY, PUBLISH_REGION, SOURCE and RECON_STATUS columns, adding each unique STATUS value as a pivot column to the output map, and using JOB_ID_COUNT as the aggregated count.
List<String> groups = new ArrayList<>(asList("SOURCE", "RECON_STATUS", "PUBLISH_REGION", "CURRENCY"));
List<Map<String, Object>> output = input.stream()
.collect(groupingBy(row -> row.get(groups.get(0)), mapping(map -> map.get(groups.get(0)), toList())));
I am expecting the response below:
Output:
[{
CURRENCY = USD,
PUBLISH_REGION = DEL,
SOURCE = ALADDIN,
RECON_STATUS = null,
NEW = 783,
IN_PROGRESS = 462
}, {
CURRENCY = USD,
PUBLISH_REGION = DEL,
SOURCE = GROUP,
RECON_STATUS = null,
NEW = 4,
IN_PROGRESS = 0
}]
I am getting a compile-time error when trying to group by multiple map fields; grouping by a single field works fine. Any help is greatly appreciated.
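For what it's worth, one way to group on several fields with plain streams is to build a composite key inside groupingBy and then pivot each group. A hedged, untested sketch, assuming JOB_ID_COUNT can be parsed as an integer and input is the List<Map<String, Object>> above (the grouped and output names are illustrative):
import java.util.*;
import java.util.stream.Collectors;
//group rows by a composite key built from the four grouping fields
List<String> groups = Arrays.asList("CURRENCY", "PUBLISH_REGION", "SOURCE", "RECON_STATUS");
Map<List<Object>, List<Map<String, Object>>> grouped = input.stream()
.collect(Collectors.groupingBy(row -> groups.stream().map(row::get).collect(Collectors.toList())));
//pivot STATUS -> summed JOB_ID_COUNT inside each group
List<Map<String, Object>> output = grouped.values().stream()
.map(rows -> {
Map<String, Object> out = new HashMap<>();
groups.forEach(g -> out.put(g, rows.get(0).get(g)));
out.put("NEW", 0);
out.put("IN_PROGRESS", 0);
rows.forEach(r -> out.merge((String) r.get("STATUS"),
Integer.parseInt(String.valueOf(r.get("JOB_ID_COUNT"))),
(a, b) -> (Integer) a + (Integer) b));
return out;
})
.collect(Collectors.toList());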
Without Using a Custom Class
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class MultipleFieldSorting2 {
private static Map<String, Object> map, map1, map2;
private static List<Map<String, Object>> lst = new ArrayList<>();
static {
map = new HashMap<>();
map.put("CURRENCY", "USD");
map.put("STATUS", "NEW");
map.put("PUBLISH_REGION", "DEL");
map.put("SOURCE", "ALADDIN");
map.put("RECON_STATUS", null);
map.put("JOB_ID_COUNT", "783");
map1 = new HashMap<>();
map1.put("CURRENCY", "USD");
map1.put("STATUS", "IN_PROGRESS");
map1.put("PUBLISH_REGION", "DEL");
map1.put("SOURCE", "ALADDIN");
map1.put("RECON_STATUS", null);
map1.put("JOB_ID_COUNT", "462");
map2 = new HashMap<>();
map2.put("CURRENCY", "USD");
map2.put("STATUS", "NEW");
map2.put("PUBLISH_REGION", "DEL");
map2.put("SOURCE", "GROUP");
map2.put("RECON_STATUS", null);
map2.put("JOB_ID_COUNT", "4");
lst.add(map);
lst.add(map1);
lst.add(map2);
}
public static Map<String, Object> mapper(Map<String, Object> e){
String key = e.get("CURRENCY") + "-" + e.get("PUBLISH_REGION") + "-" + e.get("SOURCE") + "-" + e.get("RECON_STATUS");
Map<String, Object> groupedValue = res.get(key);
if(groupedValue!=null){
groupedValue.put((String) e.get("STATUS"), groupedValue.get("STATUS")!=null ? groupedValue.get("STATUS")+","+e.get("JOB_ID_COUNT") : e.get("JOB_ID_COUNT"));
if(groupedValue.get("NEW")==null){
groupedValue.put("NEW", 0);
}
if(groupedValue.get("IN_PROGRESS")==null){
groupedValue.put("IN_PROGRESS", 0);
}
}else{
groupedValue = new HashMap<>();
res.put(key, groupedValue);
groupedValue.put("CURRENCY", e.get("CURRENCY"));
groupedValue.put("PUBLISH_REGION", e.get("PUBLISH_REGION"));
groupedValue.put("SOURCE", e.get("SOURCE"));
groupedValue.put("RECON_STATUS", e.get("RECON_STATUS"));
groupedValue.put((String) e.get("STATUS"), e.get("JOB_ID_COUNT"));
}
return groupedValue;
}
static Map<String, Map<String, Object>> res = new HashMap<>();
public static void main(String[] args) {
List<Map<String, Object>> finalResult = new ArrayList<>();
lst.stream()
.map(MultipleFieldSorting2::mapper)
.forEach(result -> {
if(!finalResult.contains(result))
finalResult.add(result);
});
System.out.println(finalResult);
}
}
Tried this solution and it is working:
1. Stream the source List.
2. Map each Map in the list to the class MapWrapper (a POJO where each key is a field).
3. Group by the groupByKey defined in MapWrapper (built from the CURRENCY, PUBLISH_REGION, SOURCE and RECON_STATUS columns).
3.a. The result is a Map<String, List<MapWrapper>>.
4. Stream through the entry set.
5. Map to keep only the values from the Map<String, List<MapWrapper>>.
6. Map each List<MapWrapper> to a Map<String, Object> using MapWrapper::map.
7. Collect to a list.
In short, the solution is:
List<Map<String, Object>> value = lst.stream()
.map(map -> new MapWrapper(map))
.collect(groupingBy(MapWrapper::groupByKey))
.entrySet()
.stream()
.map(e -> e.getValue())
.map(MapWrapper::map).collect(toList());
Working Code
public class MultipleFieldSorting {
private static Map<String, Object> map, map1, map2;
private static List<Map<String, Object>> lst = new ArrayList<>();
static {
map = new HashMap<>();
map.put("CURRENCY", "USD");
map.put("STATUS", "NEW");
map.put("PUBLISH_REGION", "DEL");
map.put("SOURCE", "ALADDIN");
map.put("RECON_STATUS", null);
map.put("JOB_ID_COUNT", "783");
map1 = new HashMap<>();
map1.put("CURRENCY", "USD");
map1.put("STATUS", "IN_PROGRESS");
map1.put("PUBLISH_REGION", "DEL");
map1.put("SOURCE", "ALADDIN");
map1.put("RECON_STATUS", null);
map1.put("JOB_ID_COUNT", "462");
map2 = new HashMap<>();
map2.put("CURRENCY", "USD");
map2.put("STATUS", "NEW");
map2.put("PUBLISH_REGION", "DEL");
map2.put("SOURCE", "GROUP");
map2.put("RECON_STATUS", null);
map2.put("JOB_ID_COUNT", "4");
lst.add(map);
lst.add(map1);
lst.add(map2);
}
public static void main(String[] args) {
List<Map<String, Object>> value = lst.stream()
.map(map -> new MapWrapper(map))
.collect(groupingBy(MapWrapper::groupByKey))
.entrySet()
.stream()
.map(e -> e.getValue())
.map(MapWrapper::map).collect(toList());
System.out.println(value);
}
}
class MapWrapper {
private String currency;
private String status;
private String publish;
private String source;
private String recon_status;
private String job_id;
public MapWrapper(Map<String, Object> map) {
this.currency = (String) map.get("CURRENCY");
this.status = (String) map.get("STATUS");
this.publish = (String) map.get("PUBLISH_REGION");
this.source = (String) map.get("SOURCE");
this.recon_status = (String) map.get("RECON_STATUS");
this.job_id = (String) map.get("JOB_ID_COUNT");
}
String groupByKey() {
return new StringBuilder().append(this.getCurrency()).append("-").append(this.publish).append("-")
.append(this.source).append("-").append(this.recon_status).toString();
}
public static Map<String, Object> map(List<MapWrapper> lst){
Map<String, Object> res = new HashMap<>();
res.put("CURRENCY",lst.get(0).getCurrency());
res.put("PUBLISH_REGION",lst.get(0).getPublish());
res.put("SOURCE",lst.get(0).getSource());
res.put("RECON_STATUS",lst.get(0).getRecon_status());
for(MapWrapper m : lst){
res.put(m.getStatus(), m.getJob_id());
}
if(res.get("NEW")==null){
res.put("NEW", 0);
}
if(res.get("IN_PROGRESS")==null){
res.put("IN_PROGRESS", 0);
}
return res;
}
String getCurrency() {
return currency;
}
void setCurrency(String currency) {
this.currency = currency;
}
String getStatus() {
return status;
}
void setStatus(String status) {
this.status = status;
}
String getPublish() {
return publish;
}
void setPublish(String publish) {
this.publish = publish;
}
String getSource() {
return source;
}
void setSource(String source) {
this.source = source;
}
String getJob_id() {
return job_id;
}
void setJob_id(String job_id) {
this.job_id = job_id;
}
String getRecon_status() {
return recon_status;
}
void setRecon_status(String recon_status) {
this.recon_status = recon_status;
}
}

Convert XML into Dataset<Row>

I'm receiving XML from Kafka and consuming it with the Spark Kafka API using the code below.
public class XMLSparkStreamEntry {
public static void registerPrintValue(SparkSession spark) {
spark.udf().register("registerPrintValue", new UDF1<String, List<Row>>() {
private static final long serialVersionUID = 1L;
List<Row> rows = new ArrayList<Row>();
@Override
public List<Row> call(String t1) throws Exception {
JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
StringReader reader = new StringReader(t1);
FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
for (Employee e : emp) {
rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
e.getDivision(), e.getSupervisor(), e.getTitle()));
}
return rows;
}
}, DataTypes.StringType);
}
public static void main(String[] args) throws StreamingQueryException {
SparkConf conf = new SparkConf();
SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
.getOrCreate();
Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "Kafkademo").load();
Dataset<Row> stringTypeDS = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");
XMLSparkStreamEntry.registerPrintValue(spark);
Dataset<Row> ss = stringTypeDS.select(callUDF("registerPrintValue", stringTypeDS.col("value")));
I'm confused about how to proceed further. I've created one UDF named registerPrintValue, to which I pass the XML string. The XML can contain multiple Employee tags or instances.
In the UDF registration, the third parameter is the return type of the UDF. I've given DataTypes.StringType, but I guess that's wrong, and I don't see another suitable option right now.
How can I convert my XML, which has multiple Employee tags, into a Dataset<Row>? I think the way I'm doing it is wrong.
Updated Code
public class XMLSparkStreamEntry {
static StructType structType = new StructType();
static {
structType = structType.add("FirstName", DataTypes.StringType, false);
structType = structType.add("LastName", DataTypes.StringType, false);
structType = structType.add("Title", DataTypes.StringType, false);
structType = structType.add("ID", DataTypes.StringType, false);
structType = structType.add("Division", DataTypes.StringType, false);
structType = structType.add("Supervisor", DataTypes.StringType, false);
}
static ExpressionEncoder<Row> encoder = RowEncoder.apply(structType);
public static void main(String[] args) throws StreamingQueryException {
SparkConf conf = new SparkConf();
SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
.getOrCreate();
Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "Kafkademo").load();
Dataset<Row> ss = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");
Dataset<Row> finalOP = ss.flatMap(new FlatMapFunction<Row, Row>() {
private static final long serialVersionUID = 1L;
@Override
public Iterator<Row> call(Row t) throws Exception {
JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
StringReader reader = new StringReader(t.getAs("value"));
FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
List<Row> rows = new ArrayList<Row>();
for (Employee e : emp) {
rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
e.getDivision(), e.getSupervisor()));
}
return rows.iterator();
}
}, encoder);
Dataset<Row> wordCounts = finalOP.groupBy("FirstName").count();
StreamingQuery query = wordCounts.writeStream().outputMode("complete").format("console").start();
System.out.println("SHOW SCHEMA");
query.awaitTermination();
}
}
Output I am getting:
+---------+-----+
|FirstName|count|
+---------+-----+
+---------+-----+
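Since the console sink prints an empty aggregation table, it may help to first rule out the JAXB side by unmarshalling one sample payload outside Spark; a minimal sketch, reusing the FileWrapper/Employee classes from the question (samplePayload is an illustrative variable holding one XML message):
//standalone JAXB check, outside Spark
JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
FileWrapper wrapper = (FileWrapper) unmarshaller.unmarshal(new StringReader(samplePayload));
System.out.println("Employees parsed: " + wrapper.getEmployees().size());
If this prints the expected count, the parsing is fine and the empty table is more likely a streaming/ingestion issue (for example, no messages arriving on the Kafkademo topic while the query runs).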

Apache Spark Dataset convert

My aim is to join two tables.
How can I do that in Java?
I am getting an error with the code below.
public class App {
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "C:\\hadoop-common-2.2.0-bin-master");
SparkSession sparkSession = SparkSession.builder().appName("SQL").master("local").getOrCreate();
final Properties cp = new Properties();
cp.put("user", "root");
cp.put("password", "1234");
Dataset<Row> studentData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "student", cp);
Dataset<Row> schoolData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "school", cp);
Dataset<Ogrenci> studentDS = studentData.as(Encoders.bean(Ogrenci.class));
Dataset<Okul> schoolDS = schoolData.as(Encoders.bean(Okul.class));
Dataset<Row> resultDS = studentDS.joinWith(schoolDS, studentData.col("schoolId") == schoolDS.col("id")).drop("schoolId"); ??
resultDS.show();
}
}
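The == comparison between columns doesn't compile in Java; column equality is expressed with equalTo. A minimal, untested sketch of the untyped join, with the column names schoolId and id taken from the question:
//join the two DataFrames on student.schoolId = school.id, then drop the join key
Dataset<Row> resultDF = studentData.join(schoolData, studentData.col("schoolId").equalTo(schoolData.col("id"))).drop(studentData.col("schoolId"));
resultDF.show();
If the typed Datasets are needed, joinWith accepts the same equalTo condition, but it returns a Dataset<Tuple2<Ogrenci, Okul>> rather than a flat Dataset<Row>.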

Not able to load the data in JavaRDD<Row>

I am very new to Spark.
I can see the data using the loadrisk.show() method, but when I create the object JavaRDD<Row> balRDD = loadrisk.javaRDD(); I get a NullPointerException.
public class LoadBalRDD implements Serializable {
public JavaPairRDD getBalRDD(SQLContext sqlContext) {
Dataset<Row> loadrisk = sqlContext.read().format("com.databricks.spark.csv").option("header", "true")
.option("mode", "DROPMALFORMED").load("/home/data/test.csv");
loadrisk.show(); // able to see the result
JavaRDD<Row> balRDD = loadrisk.javaRDD(); // here not loading
JavaPairRDD<String, Balrdd> balRDDMap = balRDD.mapToPair(x -> {
String aml_acc_id = "";
if (!x.isNullAt(x.fieldIndex("aml_acc_id")))
aml_acc_id = x.getAs("aml_acc_id").toString();
Tuple2<String, Balrdd> tp = new Tuple2(x.getAs(x.fieldIndex("aml_acc_id")).toString(),
new Balrdd(aml_acc_id));
return tp;
}).repartitionAndSortWithinPartitions(new CustomAcctIdPartitioner());
return balRDDMap;
}
}
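For comparison, a minimal standalone sketch of the same read and conversion using a SparkSession (the path and names are illustrative); if this runs cleanly, the NullPointerException is probably coming from how the surrounding SQLContext or enclosing class is set up rather than from javaRDD() itself:
//standalone check: read the CSV with SparkSession and convert to JavaRDD<Row>
SparkSession spark = SparkSession.builder().appName("CsvToRddCheck").master("local[*]").getOrCreate();
Dataset<Row> loadrisk = spark.read().option("header", "true").option("mode", "DROPMALFORMED").csv("/home/data/test.csv");
loadrisk.show();
JavaRDD<Row> balRDD = loadrisk.javaRDD();
System.out.println("rows: " + balRDD.count());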

Spark Streaming - Is there a way to union two JavaInputDStreams, perform a transformation on the unified stream and commit offsets?

The Spark consumer has to read topics with the same name from different bootstrap servers, so I need to create two JavaDStreams, perform a union, process the stream and commit the offsets.
JavaInputDStream<ConsumerRecord<String, GenericRecord>> dStream = KafkaUtils.createDirectStream(...);
The problem is that JavaInputDStream doesn't support dStream.union(stream2);
If I instead use
JavaDStream<ConsumerRecord<String, GenericRecord>> dStream= KafkaUtils.createDirectStream(...);
then JavaDStream doesn't support
((CanCommitOffsets) dStream.inputDStream()).commitAsync(os);
Please bear with the long answer.
There is no direct way to do this that I am aware of, so I would first convert the DStreams to Datasets/DataFrames and then perform a union on the two DataFrames/Datasets.
The code below is not tested, but it should work. Please feel free to validate it and make the changes necessary to get it working.
JavaPairInputDStream<String, String> pairDstream1 = KafkaUtils.createDirectStream(ssc,kafkaParams, topics);
JavaPairInputDStream<String, String> pairDstream2 = KafkaUtils.createDirectStream(ssc,kafkaParams, topics);
//Create JavaDStream<String>
JavaDStream<String> dstream1 = pairDstream1.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
//Create JavaDStream<String>
JavaDStream<String> dstream2 = pairDstream2.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
//Create JavaRDD<Row>
pairDstream1.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
@Override
public Row call(String msg) {
Row row = RowFactory.create(msg);
return row;
}
});
//Create JavaRDD<Row>
pairDstream2.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
@Override
public Row call(String msg) {
Row row = RowFactory.create(msg);
return row;
}
});
//Create Schema
StructType schema = DataTypes.createStructType(new StructField[] {DataTypes.createStructField("Message", DataTypes.StringType, true)});
//Get Spark 2.0 session
SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
Dataset<Row> df1 = spark.createDataFrame(rowRDD, schema);
Dataset<Row> df2 = spark.createDataFrame(rowRDD, schema);
//union the two dataframes (union returns a new Dataset)
Dataset<Row> unioned = df1.union(df2);
}
});
}
});
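Alternatively, if you want to stay at the DStream level and still commit offsets, one pattern that may work (a hedged, untested sketch following the documented HasOffsetRanges/CanCommitOffsets usage; stream1 and stream2 stand for the two direct streams from the question, and the offsets1/offsets2 holders are illustrative) is to capture each source stream's offset ranges in a transform() before the union, then commit on the original input streams after processing:
//holders for the latest batch's offset ranges, one per source stream
final AtomicReference<OffsetRange[]> offsets1 = new AtomicReference<>();
final AtomicReference<OffsetRange[]> offsets2 = new AtomicReference<>();
//capture offset ranges before the union (must be the first operation on each direct stream)
JavaDStream<ConsumerRecord<String, GenericRecord>> s1 = stream1.transform(rdd -> {
offsets1.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
return rdd;
});
JavaDStream<ConsumerRecord<String, GenericRecord>> s2 = stream2.transform(rdd -> {
offsets2.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
return rdd;
});
//union the two streams and process the unified stream
JavaDStream<ConsumerRecord<String, GenericRecord>> unioned = s1.union(s2);
unioned.foreachRDD(rdd -> {
//... process the unified RDD here ...
//commit back on the original direct streams
((CanCommitOffsets) stream1.inputDStream()).commitAsync(offsets1.get());
((CanCommitOffsets) stream2.inputDStream()).commitAsync(offsets2.get());
});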
