Convert XML into Dataset<Row> - java

I'm receiving XML from Kafka and consuming it with the Spark Kafka API using the code below:
public class XMLSparkStreamEntry {
public static void registerPrintValue(SparkSession spark) {
spark.udf().register("registerPrintValue", new UDF1<String, List<Row>>() {
private static final long serialVersionUID = 1L;
List<Row> rows = new ArrayList<Row>();
@Override
public List<Row> call(String t1) throws Exception {
JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
StringReader reader = new StringReader(t1);
FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
for (Employee e : emp) {
rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
e.getDivision(), e.getSupervisor(), e.getTitle()));
}
return rows;
}
}, DataTypes.StringType);
}
public static void main(String[] args) throws StreamingQueryException {
SparkConf conf = new SparkConf();
SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
.getOrCreate();
Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "Kafkademo").load();
Dataset<Row> stringTypeDS = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");
XMLSparkStreamEntry.registerPrintValue(spark);
Dataset<Row> ss = stringTypeDS.select(callUDF("registerPrintValue", stringTypeDS.col("value")));
}
}
I'm confused about how to proceed from here. I've created a UDF named registerPrintValue, to which I pass the XML string. The XML can contain multiple Employee tags or instances.
The third parameter of spark.udf().register() is the UDF's return type. I've given DataTypes.StringType, but I guess that's wrong, and I don't see a better option right now.
How can I convert XML containing multiple Employee tags into a Dataset<Row>? I think the way I'm doing it is wrong.
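One alternative to the flatMap approach in the updated code below is to register the UDF with an array-of-struct return type and then explode the resulting column. This is only a hedged sketch (untested here; the parseEmployees name and the schema field names are mine), relying on the FileWrapper and Employee JAXB classes from the code above:

// Hedged sketch, not tested: register the UDF with an array<struct<...>> return type
// (instead of DataTypes.StringType) and explode the result into one row per Employee.
// Requires static imports of functions.callUDF and functions.explode.
public static Dataset<Row> parseEmployees(SparkSession spark, Dataset<Row> stringTypeDS) {
    StructType employeeType = new StructType()
        .add("FirstName", DataTypes.StringType)
        .add("LastName", DataTypes.StringType)
        .add("Title", DataTypes.StringType)
        .add("ID", DataTypes.StringType)
        .add("Division", DataTypes.StringType)
        .add("Supervisor", DataTypes.StringType);

    spark.udf().register("parseEmployees", (UDF1<String, List<Row>>) xml -> {
        Unmarshaller unmarshaller = JAXBContext.newInstance(FileWrapper.class).createUnmarshaller();
        FileWrapper wrapper = (FileWrapper) unmarshaller.unmarshal(new StringReader(xml));
        List<Row> rows = new ArrayList<>();
        for (Employee e : wrapper.getEmployees()) {
            rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(),
                e.getId(), e.getDivision(), e.getSupervisor()));
        }
        return rows;
    }, DataTypes.createArrayType(employeeType));

    // explode(array<struct>) gives one row per Employee; "emp.*" flattens the struct fields
    return stringTypeDS
        .select(explode(callUDF("parseEmployees", stringTypeDS.col("value"))).as("emp"))
        .select("emp.*");
}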
Updated Code
public class XMLSparkStreamEntry {
static StructType structType = new StructType();
static {
structType = structType.add("FirstName", DataTypes.StringType, false);
structType = structType.add("LastName", DataTypes.StringType, false);
structType = structType.add("Title", DataTypes.StringType, false);
structType = structType.add("ID", DataTypes.StringType, false);
structType = structType.add("Division", DataTypes.StringType, false);
structType = structType.add("Supervisor", DataTypes.StringType, false);
}
static ExpressionEncoder<Row> encoder = RowEncoder.apply(structType);
public static void main(String[] args) throws StreamingQueryException {
SparkConf conf = new SparkConf();
SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
.getOrCreate();
Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "Kafkademo").load();
Dataset<Row> ss = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");
Dataset<Row> finalOP = ss.flatMap(new FlatMapFunction<Row, Row>() {
private static final long serialVersionUID = 1L;
@Override
public Iterator<Row> call(Row t) throws Exception {
JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
StringReader reader = new StringReader(t.getAs("value"));
FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
List<Row> rows = new ArrayList<Row>();
for (Employee e : emp) {
rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
e.getDivision(), e.getSupervisor()));
}
return rows.iterator();
}
}, encoder);
Dataset<Row> wordCounts = finalOP.groupBy("FirstName").count();
StreamingQuery query = wordCounts.writeStream().outputMode("complete").format("console").start();
System.out.println("SHOW SCHEMA");
query.awaitTermination();
}
}
Output
This is what I'm getting:
+---------+-----+
|FirstName|count|
+---------+-----+
+---------+-----+
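For what it's worth, one common reason a streaming aggregation over Kafka stays empty (an assumption, not a confirmed diagnosis for this case) is that the Kafka source starts from the latest offsets by default, so messages published before the query started are never read. Re-reading from the beginning can be forced for testing:

// Sketch for testing only: start from the earliest offsets instead of the default "latest".
Dataset<Row> ds1 = spark.readStream().format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "Kafkademo")
    .option("startingOffsets", "earliest")
    .load();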

Related

Apache Spark Dataset convert

My aim is to join two tables. How can I do that in Java? I am getting an error when using this code:
public class App {
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "C:\\hadoop-common-2.2.0-bin-master");
SparkSession sparkSession = SparkSession.builder().appName("SQL").master("local").getOrCreate();
final Properties cp = new Properties();
cp.put("user", "root");
cp.put("password", "1234");
Dataset<Row> studentData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "student", cp);
Dataset<Row> schoolData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "school", cp);
Dataset<Ogrenci> studentDS = studentData.as(Encoders.bean(Ogrenci.class));
Dataset<Okul> schoolDS = schoolData.as(Encoders.bean(Okul.class));
Dataset<Row> resultDS = studentDS.joinWith(schoolDS, studentData.col("schoolId") == schoolDS.col("id")).drop("schoolId"); ??
resultDS.show();
}
}
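Java has no operator overloading, so comparing two Column objects with == produces a boolean rather than a join condition, which is why that line does not work. A hedged sketch of the join built with Column.equalTo, keeping the names from the question and using a plain DataFrame join instead of joinWith:

// Sketch: build the join condition with equalTo(); join(...) returns Dataset<Row>,
// whereas joinWith(...) would return Dataset<Tuple2<Ogrenci, Okul>>.
Dataset<Row> resultDS = studentDS
    .join(schoolDS, studentDS.col("schoolId").equalTo(schoolDS.col("id")))
    .drop("schoolId");
resultDS.show();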

Not able to load the data in JavaRDD<Row>

I am very new to Spark. I can see the data using the loadrisk.show() method, but when I create the object JavaRDD<Row> balRDD = loadrisk.javaRDD(); I get a NullPointerException.
public class LoadBalRDD implements Serializable {
public JavaPairRDD<String, Balrdd> getBalRDD(SQLContext sqlContext) {
Dataset<Row> loadrisk = sqlContext.read().format("com.databricks.spark.csv").option("header", "true")
.option("mode", "DROPMALFORMED").load("/home/data/test.csv");
loadrisk.show(); // able to see the result
JavaRDD<Row> balRDD = loadrisk.javaRDD(); // here not loading
JavaPairRDD<String, Balrdd> balRDDMap = balRDD.mapToPair(x -> {
String aml_acc_id = "";
if (!x.isNullAt(x.fieldIndex("aml_acc_id")))
aml_acc_id = x.getAs("aml_acc_id").toString();
Tuple2<String, Balrdd> tp = new Tuple2(x.getAs(x.fieldIndex("aml_acc_id")).toString(),
new Balrdd(aml_acc_id));
return tp;
}).repartitionAndSortWithinPartitions(new CustomAcctIdPartitioner());
return balRDDMap;
}
}
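It is hard to tell from this snippet where the NullPointerException originates. As a way to narrow it down, here is a minimal Spark 2.x sketch of the same read-and-convert flow, built on SparkSession and the built-in csv source instead of SQLContext and com.databricks.spark.csv (the file path is the placeholder from the question):

// Minimal sketch: read the CSV and convert the Dataset to a JavaRDD<Row>.
SparkSession spark = SparkSession.builder().appName("LoadBalRDD").master("local[*]").getOrCreate();
Dataset<Row> loadrisk = spark.read()
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("/home/data/test.csv");
loadrisk.show();
JavaRDD<Row> balRDD = loadrisk.toJavaRDD();
System.out.println("row count: " + balRDD.count());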

Spark Streaming - Is there a way to union two JavaInputDStreams, perform a transformation on the unified stream, and commit offsets?

The Spark consumer has to read topics with the same name from different bootstrap servers, so I need to create two JavaDStreams, perform a union, process the stream, and commit the offsets.
JavaInputDStream<ConsumerRecord<String, GenericRecord>> dStream = KafkaUtils.createDirectStream(...);
The problem is that JavaInputDStream doesn't support dStream.union(stream2);
If I instead use
JavaDStream<ConsumerRecord<String, GenericRecord>> dStream = KafkaUtils.createDirectStream(...);
then JavaDStream doesn't support
((CanCommitOffsets) dStream.inputDStream()).commitAsync(os);
Please bear with the long answer.
There is no direct way to do this that I am aware of, so I would first convert the DStreams to Datasets/DataFrames and then perform a union on the two DataFrames.
The code below is not tested, but it should work. Please feel free to validate it and make the necessary changes.
JavaPairInputDStream<String, String> pairDstream1 = KafkaUtils.createDirectStream(ssc,kafkaParams, topics);
JavaPairInputDStream<String, String> pairDstream2 = KafkaUtils.createDirectStream(ssc,kafkaParams, topics);
//Create JavaDStream<String>
JavaDStream<String> dstream1 = pairDstream1.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
//Create JavaDStream<String>
JavaDStream<String> dstream2 = pairDstream2.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
//Create JavaRDD<Row> from the first stream
//Note: nesting foreachRDD inside another foreachRDD registers new output operations at runtime,
//which Spark Streaming may reject once the context has started; treat this as the untested outline it is.
dstream1.foreachRDD(new VoidFunction<JavaRDD<String>>() {
  @Override
  public void call(JavaRDD<String> rdd1) {
    JavaRDD<Row> rowRDD1 = rdd1.map(new Function<String, Row>() {
      @Override
      public Row call(String msg) {
        return RowFactory.create(msg);
      }
    });
    //Create JavaRDD<Row> from the second stream
    dstream2.foreachRDD(new VoidFunction<JavaRDD<String>>() {
      @Override
      public void call(JavaRDD<String> rdd2) {
        JavaRDD<Row> rowRDD2 = rdd2.map(new Function<String, Row>() {
          @Override
          public Row call(String msg) {
            return RowFactory.create(msg);
          }
        });
        //Create schema
        StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
        //Get Spark 2.0 session
        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd2.context().getConf());
        Dataset<Row> df1 = spark.createDataFrame(rowRDD1, schema);
        Dataset<Row> df2 = spark.createDataFrame(rowRDD2, schema);
        //Union the two dataframes
        Dataset<Row> unionDf = df1.union(df2);
      }
    });
  }
});
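On the offset-commit part of the question: JavaInputDStream extends JavaDStream, so the lower-case union(...) is available, and offsets can still be committed per source as long as references to the two original JavaInputDStreams are kept. The sketch below is untested; stream1, stream2 and the processing step are placeholders:

// Untested sketch. Needs java.util.concurrent.atomic.AtomicReference and
// OffsetRange, HasOffsetRanges, CanCommitOffsets from org.apache.spark.streaming.kafka010.
AtomicReference<OffsetRange[]> offsets1 = new AtomicReference<>(new OffsetRange[0]);
AtomicReference<OffsetRange[]> offsets2 = new AtomicReference<>(new OffsetRange[0]);

// Capture each source's offset ranges as its batch flows through, via transform().
JavaDStream<ConsumerRecord<String, GenericRecord>> tagged1 = stream1.transform(rdd -> {
    offsets1.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
    return rdd;
});
JavaDStream<ConsumerRecord<String, GenericRecord>> tagged2 = stream2.transform(rdd -> {
    offsets2.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
    return rdd;
});

// union(...) is defined on JavaDStream, and JavaInputDStream extends it.
JavaDStream<ConsumerRecord<String, GenericRecord>> unioned = tagged1.union(tagged2);

unioned.foreachRDD(rdd -> {
    // ... process the unified batch here ...
    // then commit each source's offsets back to its own Kafka cluster
    ((CanCommitOffsets) stream1.inputDStream()).commitAsync(offsets1.get());
    ((CanCommitOffsets) stream2.inputDStream()).commitAsync(offsets2.get());
});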

How to use LinearRegression in Spark based on text files

I'm fairly new to programming with Spark. I want to set up a linear regression model using Spark, based on log files with tabs as "column" separators. All the tutorials and examples I found start off with something like this: JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
However, I have a bunch of log files I want to use, so this is what I have tried so far:
public static void main(String... args)
{
if(!new File("LogisticRegressionModel").exists())
{
buildTrainingModel();
}
else
{
testModel();
}
}
private static void testModel()
{
SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();
Dataset<Row> dataSet = sc.read().option("delimiter", "-").option("header", "false").csv("EI/eyeliteidemo/TAP01.log");
PipelineModel model = PipelineModel.load("LogisticRegressionModel");
Dataset<Row> predictions = model.transform(dataSet);
}
private static void buildTrainingModel()
{
SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
DataTypes.createStructField("features", DataTypes.StringType, false),
});
Dataset<Row> input = sc.read().option("delimiter", "-").option("header", "false").csv("foo/bar/Foo_*.log");
input = input.drop("_c1", "_c3", "_c4");
input = input.select(functions.concat(input.col("_c0"), input.col("_c2"), input.col("_c5")));
input = input.withColumnRenamed("concat(_c0, _c2, _c5)", "features");
input.show(30, false);
Dataset<Row> dataSet = sc.createDataFrame(input.collectAsList(), schema);
Tokenizer tokenizer = new Tokenizer()
.setInputCol("features")
.setOutputCol("rawTokens");
StopWordsRemover swRemover = new StopWordsRemover().setInputCol(tokenizer.getOutputCol()).setOutputCol("cleanedTerms").setStopWords(readStopwords());
HashingTF hashingTF = new HashingTF()
.setNumFeatures(1000)
.setInputCol(swRemover.getOutputCol())
.setOutputCol("hashedTerms");
IDF idf = new IDF().setInputCol(hashingTF.getOutputCol()).setOutputCol("featuresIDF");
LogisticRegression lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, swRemover, hashingTF, idf, lr});
// Fit the pipeline to training documents.
PipelineModel model = pipeline.fit(dataSet);
try
{
model.save("LogisticRegressionModel");
}
catch (IOException e)
{
e.printStackTrace();
}
}
private static String[] readStopwords()
{
List<String> words = new ArrayList<>();
try (Stream<String> stream = Files.lines(Paths.get(LogisticRegressionTest.class.getResource("stopwords_en.txt").toURI()))) {
words = stream
.map(String::toLowerCase)
.collect(Collectors.toList());
} catch (IOException e) {
e.printStackTrace();
}
catch (URISyntaxException e)
{
e.printStackTrace();
}
String[] retWords = new String[words.size()];
return words.toArray(retWords);
}
Unfortunately, I run into exceptions:
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT#3bfc3ba7 but was actually StringType.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:42)
at org.apache.spark.ml.PredictorParams$class.validateAndTransformSchema(Predictor.scala:51)
at org.apache.spark.ml.classification.Classifier.org$apache$spark$ml$classification$ClassifierParams$$super$validateAndTransformSchema(Classifier.scala:58)
at org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:42)
at org.apache.spark.ml.classification.ProbabilisticClassifier.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
at org.apache.spark.ml.classification.LogisticRegression.org$apache$spark$ml$classification$LogisticRegressionParams$$super$validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.classification.LogisticRegressionParams$class.validateAndTransformSchema(LogisticRegression.scala:184)
at org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.Predictor.transformSchema(Predictor.scala:122)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:184)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:136)
at LogisticRegressionTest.buildTrainingModel(LogisticRegressionTest.java:92)
at LogisticRegressionTest.main(LogisticRegressionTest.java:40)
Now my question is: how do I get this datatype issue right? And does my code make sense to Spark experts in the first place?
Thanks!
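The exception itself points at the issue: LogisticRegression expects its features column to be a Vector, and by default it looks for the column literally named "features", which here is the raw StringType column rather than the IDF output. A hedged fix (untested on this data) is to point the estimator at the vectorized column; note also that LogisticRegression is a classifier, and for an actual linear regression org.apache.spark.ml.regression.LinearRegression exposes the same setters:

// Sketch: use the IDF stage's output as the features column and keep a numeric label.
LogisticRegression lr = new LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.001)
    .setFeaturesCol("featuresIDF") // output of the IDF stage above
    .setLabelCol("label");         // must be a numeric (e.g. DoubleType) column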

SparkSQL: Generate a new column with a UUID

I have to add a new column containing a UUID value. I did this with Spark 1.4 and Java using the following code.
StructType objStructType = inputDataFrame.schema();
StructField []arrStructField=objStructType.fields();
List<StructField> fields = new ArrayList<StructField>();
List<StructField> newfields = new ArrayList<StructField>();
List <StructField> listFields = Arrays.asList(arrStructField);
StructField a = DataTypes.createStructField(leftCol,DataTypes.StringType, true);
fields.add(a);
newfields.addAll(listFields);
newfields.addAll(fields);
final int size = objStructType.size();
JavaRDD<Row> rowRDD = inputDataFrame.javaRDD().map(new Function<Row, Row>() {
private static final long serialVersionUID = 3280804931696581264L;
public Row call(Row tblRow) throws Exception {
Object[] newRow = new Object[size+1];
int rowSize= tblRow.length();
for (int itr = 0; itr < rowSize; itr++)
{
if(tblRow.apply(itr)!=null)
{
newRow[itr] = tblRow.apply(itr);
}
}
newRow[size] = UUID.randomUUID().toString();
return RowFactory.create(newRow);
}
});
inputDataFrame = objsqlContext.createDataFrame(rowRDD, DataTypes.createStructType(newfields));
I'm wondering if there is a neater way of doing this in Spark 2. Please advise.
You can register a UDF that generates a UUID and use the callUDF function to add the new column to your inputDataFrame. Please see the sample code below, using Spark 2.0.
public class SparkUUIDSample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("SparkUUIDSample").master("local[*]").getOrCreate();
//sample input data
List<Tuple2<String, String>> inputList = new ArrayList<Tuple2<String, String>>();
inputList.add(new Tuple2<String, String>("A", "v1"));
inputList.add(new Tuple2<String, String>("B", "v2"));
//dataset
Dataset<Row> df = spark.createDataset(inputList, Encoders.tuple(Encoders.STRING(), Encoders.STRING())).toDF("key", "value");
df.show();
//register udf
UDF1<String, String> uuid = str -> UUID.randomUUID().toString();
spark.udf().register("uuid", uuid, DataTypes.StringType);
//call udf
df.select(col("*"), callUDF("uuid", col("value"))).show();
//stop
spark.stop();
}
}
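As a side note, Spark 2.3 and later also ship a built-in, non-deterministic uuid() SQL function, so the column can be added without registering a UDF. A small sketch, assuming the same df as above:

import static org.apache.spark.sql.functions.expr;

// Spark 2.3+: built-in uuid() SQL function, no UDF needed.
df.withColumn("uuid", expr("uuid()")).show();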
