Apache Spark Dataset convert - Java

My aim is to combine two tables.
How can I do that in Java?
I get an error with the following code.
public class App {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir", "C:\\hadoop-common-2.2.0-bin-master");
        SparkSession sparkSession = SparkSession.builder().appName("SQL").master("local").getOrCreate();

        final Properties cp = new Properties();
        cp.put("user", "root");
        cp.put("password", "1234");

        Dataset<Row> studentData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "student", cp);
        Dataset<Row> schoolData = sparkSession.read().jdbc("jdbc:mysql://localhost:3306/dd", "school", cp);

        Dataset<Ogrenci> studentDS = studentData.as(Encoders.bean(Ogrenci.class));
        Dataset<Okul> schoolDS = schoolData.as(Encoders.bean(Okul.class));

        Dataset<Row> resultDS = studentDS.joinWith(schoolDS, studentData.col("schoolId") == schoolDS.col("id")).drop("schoolId"); // ?? <- the error is on this line
        resultDS.show();
    }
}
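
The line marked // ?? is most likely the problem: in Java a join condition on columns is written with Column.equalTo(), not ==, which only compares object references and cannot be passed to joinWith. A minimal sketch of the untyped variant, with the column names schoolId and id taken from the code above:

// Join the two Row datasets on student.schoolId = school.id and drop the join key.
// Column comparison in Java is expressed with equalTo(); the == operator does not build a Column.
Dataset<Row> resultDS = studentData
        .join(schoolData, studentData.col("schoolId").equalTo(schoolData.col("id")))
        .drop("schoolId");
resultDS.show();

If the typed beans are needed instead, studentDS.joinWith(schoolDS, studentDS.col("schoolId").equalTo(schoolDS.col("id"))) returns a Dataset<Tuple2<Ogrenci, Okul>> rather than a Dataset<Row>.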

Related

Convert XML into Dataset<Row>

I'm receiving XML from Kafka and consuming it with the Spark Kafka API using the code below:
public class XMLSparkStreamEntry {

    public static void registerPrintValue(SparkSession spark) {
        spark.udf().register("registerPrintValue", new UDF1<String, List<Row>>() {
            private static final long serialVersionUID = 1L;

            // Note: this list is a field of the UDF instance, so rows accumulate across invocations.
            List<Row> rows = new ArrayList<Row>();

            @Override
            public List<Row> call(String t1) throws Exception {
                JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
                Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
                StringReader reader = new StringReader(t1);
                FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
                List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
                for (Employee e : emp) {
                    rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
                            e.getDivision(), e.getSupervisor(), e.getTitle()));
                }
                return rows;
            }
        }, DataTypes.StringType);
    }

    public static void main(String[] args) throws StreamingQueryException {
        SparkConf conf = new SparkConf();
        SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
                .getOrCreate();

        Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "Kafkademo").load();
        Dataset<Row> stringTypeDS = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

        XMLSparkStreamEntry.registerPrintValue(spark);
        Dataset<Row> ss = stringTypeDS.select(callUDF("registerPrintValue", stringTypeDS.col("value")));
    }
}
I'm confused about how to proceed further. I've created one UDF named registerPrintValue, to which I'm passing the XML string. The XML can contain multiple Employee tags or instances.
The third parameter of spark.udf().register() is the return DataType of the UDF. I've given DataTypes.StringType, but I guess that's wrong, and I don't see another suitable option right now.
How can I convert my XML with multiple Employee tags to a Dataset<Row>? I think the way I'm doing it is wrong.
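
One way out, sketched below under the assumption that the Spark version in use accepts a java.util.List as the result of an array-typed UDF: register the UDF with a return type of array<struct<...>> instead of StringType, then explode the array so every Employee becomes its own row. The FileWrapper and Employee JAXB classes are the ones from the question; everything else (names, schema) is illustrative.

import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.explode;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class EmployeeUdfSketch {

    // Schema of a single Employee record, mirroring the fields used in the question.
    private static final StructType EMPLOYEE = new StructType()
            .add("FirstName", DataTypes.StringType)
            .add("LastName", DataTypes.StringType)
            .add("Title", DataTypes.StringType)
            .add("ID", DataTypes.StringType)
            .add("Division", DataTypes.StringType)
            .add("Supervisor", DataTypes.StringType);

    public static Dataset<Row> parseEmployees(SparkSession spark, Dataset<Row> stringTypeDS) {
        // Declare the UDF result as array<struct<...>> so one XML message can yield many employees.
        spark.udf().register("parseEmployees", (UDF1<String, List<Row>>) xml -> {
            Unmarshaller unmarshaller = JAXBContext.newInstance(FileWrapper.class).createUnmarshaller();
            FileWrapper wrapper = (FileWrapper) unmarshaller.unmarshal(new StringReader(xml));
            List<Row> rows = new ArrayList<>();   // local list: no state kept between calls
            for (Employee e : wrapper.getEmployees()) {
                rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(),
                        e.getId(), e.getDivision(), e.getSupervisor()));
            }
            return rows;
        }, DataTypes.createArrayType(EMPLOYEE));

        // explode() turns each array element into its own row; "emp.*" flattens the struct into columns.
        return stringTypeDS
                .select(explode(callUDF("parseEmployees", col("value"))).as("emp"))
                .select("emp.*");
    }
}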
Updated Code
public class XMLSparkStreamEntry {

    static StructType structType = new StructType();

    static {
        structType = structType.add("FirstName", DataTypes.StringType, false);
        structType = structType.add("LastName", DataTypes.StringType, false);
        structType = structType.add("Title", DataTypes.StringType, false);
        structType = structType.add("ID", DataTypes.StringType, false);
        structType = structType.add("Division", DataTypes.StringType, false);
        structType = structType.add("Supervisor", DataTypes.StringType, false);
    }

    static ExpressionEncoder<Row> encoder = RowEncoder.apply(structType);

    public static void main(String[] args) throws StreamingQueryException {
        SparkConf conf = new SparkConf();
        SparkSession spark = SparkSession.builder().config(conf).appName("Spark Program").master("local[*]")
                .getOrCreate();

        Dataset<Row> ds1 = spark.readStream().format("kafka").option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "Kafkademo").load();
        Dataset<Row> ss = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

        Dataset<Row> finalOP = ss.flatMap(new FlatMapFunction<Row, Row>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Row> call(Row t) throws Exception {
                JAXBContext jaxbContext = JAXBContext.newInstance(FileWrapper.class);
                Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
                StringReader reader = new StringReader(t.getAs("value"));
                FileWrapper person = (FileWrapper) unmarshaller.unmarshal(reader);
                List<Employee> emp = new ArrayList<Employee>(person.getEmployees());
                List<Row> rows = new ArrayList<Row>();
                for (Employee e : emp) {
                    rows.add(RowFactory.create(e.getFirstname(), e.getLastname(), e.getTitle(), e.getId(),
                            e.getDivision(), e.getSupervisor()));
                }
                return rows.iterator();
            }
        }, encoder);

        Dataset<Row> wordCounts = finalOP.groupBy("FirstName").count();

        StreamingQuery query = wordCounts.writeStream().outputMode("complete").format("console").start();
        System.out.println("SHOW SCHEMA");
        query.awaitTermination();
    }
}
Output I'm getting:
+---------+-----+
|FirstName|count|
+---------+-----+
+---------+-----+
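
An empty aggregate like this can simply mean that no records are arriving from the topic. A quick check, sketched here on the assumption that the same ss dataset and a local console are available, is to dump the raw Kafka values before debugging the flatMap/groupBy stage:

// Debugging aid: print the raw Kafka values to the console so it is clear
// whether any data reaches the stream at all.
StreamingQuery rawDump = ss.writeStream()
        .outputMode("append")
        .format("console")
        .option("truncate", false)
        .start();
rawDump.awaitTermination();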

Not able to load the data in JavaRDD<Row>

I am very new to Spark.
I can see the data using loadrisk.show(), but when I create the object JavaRDD<Row> balRDD = loadrisk.javaRDD(); I get a NullPointerException.
public class LoadBalRDD implements Serializable {

    public JavaPairRDD getBalRDD(SQLContext sqlContext) {
        Dataset<Row> loadrisk = sqlContext.read().format("com.databricks.spark.csv").option("header", "true")
                .option("mode", "DROPMALFORMED").load("/home/data/test.csv");
        loadrisk.show(); // able to see the result

        JavaRDD<Row> balRDD = loadrisk.javaRDD(); // here not loading

        JavaPairRDD<String, Balrdd> balRDDMap = balRDD.mapToPair(x -> {
            String aml_acc_id = "";
            if (!x.isNullAt(x.fieldIndex("aml_acc_id")))
                aml_acc_id = x.getAs("aml_acc_id").toString();
            Tuple2<String, Balrdd> tp = new Tuple2(x.getAs(x.fieldIndex("aml_acc_id")).toString(),
                    new Balrdd(aml_acc_id));
            return tp;
        }).repartitionAndSortWithinPartitions(new CustomAcctIdPartitioner());

        return balRDDMap;
    }
}
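
For comparison, the read-and-convert step itself is unproblematic when the session is created on the driver. A minimal, self-contained sketch (the path is the one from the question, the rest is illustrative) that can help rule out the CSV read and the javaRDD() call as the culprit:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CsvToRddCheck {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("CsvToRddCheck")
                .master("local[*]")
                .getOrCreate();

        // Spark 2.x reads CSV natively; the com.databricks.spark.csv format is not required.
        Dataset<Row> loadrisk = spark.read()
                .option("header", "true")
                .option("mode", "DROPMALFORMED")
                .csv("/home/data/test.csv");

        loadrisk.show();

        JavaRDD<Row> balRDD = loadrisk.javaRDD();   // conversion of a driver-side Dataset to a JavaRDD
        System.out.println("row count: " + balRDD.count());

        spark.stop();
    }
}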

How to use LinearRegression in Spark based on text files

I'm fairly new to programming with Spark. I want to set up a linear regression model using Spark, based on log files with tabs as "column" separators. All the tutorials and examples I found about it start off with something like this:
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
However, I have a bunch of log files I want to use. So this is what I have tried so far:
public static void main(String... args)
{
    if (!new File("LogisticRegressionModel").exists())
    {
        buildTrainingModel();
    }
    else
    {
        testModel();
    }
}

private static void testModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();
    Dataset<Row> dataSet = sc.read().option("delimiter", "-").option("header", "false").csv("EI/eyeliteidemo/TAP01.log");
    PipelineModel model = PipelineModel.load("LogisticRegressionModel");
    Dataset<Row> predictions = model.transform(dataSet);
}

private static void buildTrainingModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            DataTypes.createStructField("features", DataTypes.StringType, false),
    });

    Dataset<Row> input = sc.read().option("delimiter", "-").option("header", "false").csv("foo/bar/Foo_*.log");
    input = input.drop("_c1", "_c3", "_c4");
    input = input.select(functions.concat(input.col("_c0"), input.col("_c2"), input.col("_c5")));
    input = input.withColumnRenamed("concat(_c0, _c2, _c5)", "features");
    input.show(30, false);

    Dataset<Row> dataSet = sc.createDataFrame(input.collectAsList(), schema);

    Tokenizer tokenizer = new Tokenizer()
            .setInputCol("features")
            .setOutputCol("rawTokens");
    StopWordsRemover swRemover = new StopWordsRemover().setInputCol(tokenizer.getOutputCol()).setOutputCol("cleanedTerms").setStopWords(readStopwords());
    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(swRemover.getOutputCol())
            .setOutputCol("hashedTerms");
    IDF idf = new IDF().setInputCol(hashingTF.getOutputCol()).setOutputCol("featuresIDF");
    LogisticRegression lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] {tokenizer, swRemover, hashingTF, idf, lr});

    // Fit the pipeline to training documents.
    PipelineModel model = pipeline.fit(dataSet);
    try
    {
        model.save("LogisticRegressionModel");
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
}

private static String[] readStopwords()
{
    List<String> words = new ArrayList<>();
    try (Stream<String> stream = Files.lines(Paths.get(LogisticRegressionTest.class.getResource("stopwords_en.txt").toURI())))
    {
        words = stream
                .map(String::toLowerCase)
                .collect(Collectors.toList());
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    catch (URISyntaxException e)
    {
        e.printStackTrace();
    }
    String[] retWords = new String[words.size()];
    return words.toArray(retWords);
}
Unfortunately, I run into exceptions:
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually StringType.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:42)
at org.apache.spark.ml.PredictorParams$class.validateAndTransformSchema(Predictor.scala:51)
at org.apache.spark.ml.classification.Classifier.org$apache$spark$ml$classification$ClassifierParams$$super$validateAndTransformSchema(Classifier.scala:58)
at org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:42)
at org.apache.spark.ml.classification.ProbabilisticClassifier.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
at org.apache.spark.ml.classification.LogisticRegression.org$apache$spark$ml$classification$LogisticRegressionParams$$super$validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.classification.LogisticRegressionParams$class.validateAndTransformSchema(LogisticRegression.scala:184)
at org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.Predictor.transformSchema(Predictor.scala:122)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:184)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:136)
at LogisticRegressionTest.buildTrainingModel(LogisticRegressionTest.java:92)
at LogisticRegressionTest.main(LogisticRegressionTest.java:40)
Now my question is: how do I get this datatype issue right? And does my code make any sense to Spark experts in the first place?
Thanks!
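
The exception points at the LogisticRegression stage: its featuresCol defaults to "features", which in this pipeline is still the raw string column, while the vectorized output of the IDF stage is named "featuresIDF". A sketch of the likely adjustment (only this stage is shown; the rest of the pipeline stays as above):

// Point the classifier at the vector column produced by the IDF stage instead of the
// raw "features" string column, which is what triggers the VectorUDT requirement error.
LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.001)
        .setFeaturesCol(idf.getOutputCol())   // "featuresIDF", a vector column
        .setLabelCol("label");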

SparkSQL: Generate new column with UUID

I have to add a new column containing a UUID value. I did this with Spark 1.4 and Java using the following code.
StructType objStructType = inputDataFrame.schema();
StructField[] arrStructField = objStructType.fields();
List<StructField> fields = new ArrayList<StructField>();
List<StructField> newfields = new ArrayList<StructField>();
List<StructField> listFields = Arrays.asList(arrStructField);
StructField a = DataTypes.createStructField(leftCol, DataTypes.StringType, true);
fields.add(a);
newfields.addAll(listFields);
newfields.addAll(fields);
final int size = objStructType.size();

JavaRDD<Row> rowRDD = inputDataFrame.javaRDD().map(new Function<Row, Row>() {
    private static final long serialVersionUID = 3280804931696581264L;

    public Row call(Row tblRow) throws Exception {
        Object[] newRow = new Object[size + 1];
        int rowSize = tblRow.length();
        for (int itr = 0; itr < rowSize; itr++) {
            if (tblRow.apply(itr) != null) {
                newRow[itr] = tblRow.apply(itr);
            }
        }
        newRow[size] = UUID.randomUUID().toString();
        return RowFactory.create(newRow);
    }
});

inputDataFrame = objsqlContext.createDataFrame(rowRDD, DataTypes.createStructType(newfields));
I'm wondering if there is a neater way of doing this in Spark 2. Please advise.
You can register a UDF that generates a UUID and use the callUDF function to add the new column to your inputDataFrame. Please see the sample code below, which uses Spark 2.0.
public class SparkUUIDSample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SparkUUIDSample").master("local[*]").getOrCreate();

        // sample input data
        List<Tuple2<String, String>> inputList = new ArrayList<Tuple2<String, String>>();
        inputList.add(new Tuple2<String, String>("A", "v1"));
        inputList.add(new Tuple2<String, String>("B", "v2"));

        // dataset
        Dataset<Row> df = spark.createDataset(inputList, Encoders.tuple(Encoders.STRING(), Encoders.STRING())).toDF("key", "value");
        df.show();

        // register udf
        UDF1<String, String> uuid = str -> UUID.randomUUID().toString();
        spark.udf().register("uuid", uuid, DataTypes.StringType);

        // call udf (col and callUDF are static imports from org.apache.spark.sql.functions)
        df.select(col("*"), callUDF("uuid", col("value"))).show();

        // stop
        spark.stop();
    }
}
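
As a side note, and assuming Spark 2.3 or later (the sample above targets 2.0), the built-in SQL function uuid() avoids registering a UDF at all:

// uuid() is a built-in Spark SQL function from 2.3 onwards; expr() is a static
// import from org.apache.spark.sql.functions.
Dataset<Row> withId = df.withColumn("uuid", expr("uuid()"));
withId.show(false);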

Memory leak in Oracle's Jena adaptation?

(Note: this is also asked on the OTN discussion forums, but I'm not too sure there's much activity there.)
I've got a test using rdf_semantic_graph_support_for_apache_jena_2.11.1_with_12101_server_patch_build0529 and I observe OOM behavior when running the attached test class's main() method.
I'm at a loss to see why, though.
In MAT it seems that there's a whole bunch of Oracle instances hanging around, holding on to a lot of Strings, but I can verify via SQL*Developer that the connections are successfully closed.
Stripping the test down, I see the apparent leakage happen upon just about any interaction with the ModelOracleSem or GraphOracleSem.
public class OracleSemTxIntegrationWithSpringITCase {

    private OracleDataSource oracleDataSource;
    private OraclePool oraclePool;

    @Before
    public void before() throws SQLException {
        java.util.Properties prop = new java.util.Properties();
        prop.setProperty("MinLimit", "2");     // the cache size is 2 at least
        prop.setProperty("MaxLimit", "10");
        prop.setProperty("InitialLimit", "2"); // create 2 connections at startup
        prop.setProperty("InactivityTimeout", "200");          // seconds
        prop.setProperty("AbandonedConnectionTimeout", "100"); // seconds
        prop.setProperty("MaxStatementsLimit", "10");
        prop.setProperty("PropertyCheckInterval", "60");       // seconds

        oracleDataSource = new OracleDataSource();
        oracleDataSource.setURL("jdbc:oracle:thin:@**********");
        oracleDataSource.setUser("rdfuser");
        oracleDataSource.setPassword("****");
        oracleDataSource.setConnectionProperties(prop);

        oraclePool = new OraclePool(oracleDataSource);
    }

    @Test
    public void testTransactionHandlingViaJdbcTransactions() throws Exception {
        final Oracle oracle1 = oraclePool.getOracle();
        final Oracle oracle2 = oraclePool.getOracle();
        final Oracle oracle3 = oraclePool.getOracle();

        final GraphOracleSem graph1 = new GraphOracleSem(oracle1, OracleMetadataDaoITCase.INTEGRATION_TEST_MODEL);
        final Model model1 = new ModelOracleSem(graph1);
        final GraphOracleSem graph2 = new GraphOracleSem(oracle2, OracleMetadataDaoITCase.INTEGRATION_TEST_MODEL);
        final Model model2 = new ModelOracleSem(graph2);

        GraphOracleSem graph3 = new GraphOracleSem(oracle3, OracleMetadataDaoITCase.INTEGRATION_TEST_MODEL);
        Model model3 = new ModelOracleSem(graph3);
        removePersons(model3);
        model3.commit();
        model3.close();

        graph3 = new GraphOracleSem(oracle3, OracleMetadataDaoITCase.INTEGRATION_TEST_MODEL);
        model3 = new ModelOracleSem(graph3);

        model1.add(model1.createResource("http://www.tv2.no/people/person-1"), DC.description, "A dude");
        model2.add(model1.createResource("http://www.tv2.no/people/person-2"), DC.description, "Another dude");

        int countPersons = countPersons(model3);
        assertEquals(0, countPersons);

        model1.commit();
        countPersons = countPersons(model3);
        assertEquals(1, countPersons);

        model2.commit();
        countPersons = countPersons(model3);
        assertEquals(2, countPersons);

        oracle1.commitTransaction();
        oracle2.commitTransaction();
        oracle3.commitTransaction();

        model1.close();
        model2.close();
        model3.close();

        oracle1.dispose();
        oracle2.dispose();
        oracle3.dispose();

        System.err.println("all disposed");
    }

    public static void main(String... args) throws Exception {
        OracleSemTxIntegrationWithSpringITCase me = new OracleSemTxIntegrationWithSpringITCase();
        me.before();
        Stopwatch sw = Stopwatch.createStarted();
        for (int n = 0; n < 1000; n++) {
            me.testTransactionHandlingViaJdbcTransactions();
        }
        System.err.println("DONE: " + sw.stop());
        me.after();
    }

    @After
    public void after() throws SQLException {
        oracleDataSource.close();
    }

    private int countPersons(final Model model) {
        return listPersons(model).size();
    }

    private void removePersons(final Model model) {
        final List<Resource> persons = listPersons(model);
        persons.stream().forEach(per -> model.removeAll(per, null, null));
    }

    private List<Resource> listPersons(final Model model) {
        final List<Resource> persons = Lists.newArrayList();
        ExtendedIterator<Resource> iter = model.listSubjects()
                .filterKeep(new Filter<Resource>() {
                    @Override
                    public boolean accept(Resource o) {
                        return o.getURI().startsWith("http://www.tv2.no/people/person-");
                    }
                });
        iter.forEachRemaining(item -> persons.add(item));
        iter.close();
        return persons;
    }
}
Oracle has provided a fix for this, which I would assume will be publicly available at some time.
