Convert a JavaRDD String to JavaRDD Vector - java

I'm trying to load a csv file as a JavaRDD String and then want to get the data in JavaRDD Vector
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import breeze.collection.mutable.SparseArray;
import scala.collection.immutable.Seq;
public class Trial {
public void start() throws InstantiationException, IllegalAccessException,
ClassNotFoundException {
run();
}
private void run(){
SparkConf conf = new SparkConf().setAppName("csvparser");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> data = jsc.textFile("C:/Users/kalraa2/Documents/trial.csv");
JavaRDD<Vector> datamain = data.flatMap(null);
MultivariateStatisticalSummary mat = Statistics.colStats(datamain.rdd());
System.out.println(mat.mean());
}
private List<Vector> Seq(Vector dv) {
// TODO Auto-generated method stub
return null;
}
public static void main(String[] args) throws Exception {
Trial trial = new Trial();
trial.start();
}
}
The program is running without any error but i'm not able to get anything when trying to run it on spark-machine. Can anyone tell me whether the conversion of string RDD to Vector RDD is correct.
My csv file consist of only one column which are floating numbers

The null in this flatMap invocation might be a problem:
JavaRDD<Vector> datamain = data.flatMap(null);

I solved my answer by changing the code to this
JavaRDD<Vector> datamain = data.map(new Function<String,Vector>(){
public Vector call(String s){
String[] sarray = s.trim().split("\\r?\\n");
double[] values = new double[sarray.length];
for (int i = 0; i < sarray.length; i++) {
values[i] = Double.parseDouble(sarray[i]);
System.out.println(values[i]);
}
return Vectors.dense(values);
}
}
);

Assuming your trial.csv file looks like this
1.0
2.0
3.0
Taking your original code from your question a one line change is required with Java 8
SparkConf conf = new SparkConf().setAppName("csvparser").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> data = jsc.textFile("C:/Users/kalraa2/Documents/trial.csv");
JavaRDD<Vector> datamain = data.map(s -> Vectors.dense(Double.parseDouble(s)));
MultivariateStatisticalSummary mat = Statistics.colStats(datamain.rdd());
System.out.println(mat.mean());
Prints 2.0

Related

Apache flink pattern conditions with list

I wrote a pattern. I have a list for conditions(gettin rules from json).Data(json) is coming form kafka server . I want to filter the data with this list. But it is not working. How can I do that?
I am not sure about keyedstream and alarms in for. Can flink work like this?
main program:
package cep_kafka_eample.cep_kafka;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.JSONDeserializationSchema;
import util.AlarmPatterns;
import util.Rules;
import util.TypeProperties;
import java.io.FileReader;
import java.util.*;
public class MainClass {
public static void main( String[] args ) throws Exception
{
ObjectMapper mapper = new ObjectMapper();
JsonParser parser = new JsonParser();
Object obj = parser.parse(new FileReader(
"c://new 5.json"));
JsonArray array = (JsonArray)obj;
Gson googleJson = new Gson();
List<Rules> ruleList = new ArrayList<>();
for(int i = 0; i< array.size() ; i++) {
Rules jsonObjList = googleJson.fromJson(array.get(i), Rules.class);
ruleList.add(jsonObjList);
}
//apache kafka properties
Properties properties = new Properties();
properties.setProperty("zookeeper.connect", "localhost:2181");
properties.setProperty("bootstrap.servers", "localhost:9092");
//starting flink
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(1000).setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
//get kafka values
FlinkKafkaConsumer010<ObjectNode> myConsumer = new FlinkKafkaConsumer010<>("demo", new JSONDeserializationSchema(),
properties);
List<Pattern<ObjectNode,?>> patternList = new ArrayList<>();
DataStream<ObjectNode> dataStream = env.addSource(myConsumer);
dataStream.windowAll(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5)));
DataStream<ObjectNode> keyedStream = dataStream;
//get pattern list, keyeddatastream
for(Rules rules : ruleList){
List<TypeProperties> typePropertiesList = rules.getTypePropList();
for (int i = 0; i < typePropertiesList.size(); i++) {
TypeProperties typeProperty = typePropertiesList.get(i);
if (typeProperty.getGroupType() != null && typeProperty.getGroupType().equals("group")) {
keyedStream = keyedStream.keyBy(
jsonNode -> jsonNode.get(typeProperty.getPropName().toString())
);
}
}
Pattern<ObjectNode,?> pattern = new AlarmPatterns().getAlarmPattern(rules);
patternList.add(pattern);
}
//CEP pattern and alarms
List<DataStream<Alert>> alertList = new ArrayList<>();
for(Pattern<ObjectNode,?> pattern : patternList){
PatternStream<ObjectNode> patternStream = CEP.pattern(keyedStream, pattern);
DataStream<Alert> alarms = patternStream.select(new PatternSelectFunction<ObjectNode, Alert>() {
private static final long serialVersionUID = 1L;
public Alert select(Map<String, List<ObjectNode>> map) throws Exception {
return new Alert("new message");
}
});
alertList.add(alarms);
}
env.execute("Flink CEP monitoring job");
}
}
getAlarmPattern:
package util;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.IterativeCondition;
import org.apache.flink.streaming.api.datastream.DataStream;
import com.fasterxml.jackson.databind.node.ObjectNode;
public class AlarmPatterns {
public Pattern<ObjectNode, ?> getAlarmPattern(Rules rules) {
//MySimpleConditions conditions = new MySimpleConditions();
Pattern<ObjectNode, ?> alarmPattern = Pattern.<ObjectNode>begin("first")
.where(new IterativeCondition<ObjectNode>() {
#Override
public boolean filter(ObjectNode jsonNodes, Context<ObjectNode> context) throws Exception {
for (Criterias criterias : rules.getCriteriaList()) {
if (criterias.getCriteriaType().equals("equals")) {
return jsonNodes.get(criterias.getPropName()).equals(criterias.getCriteriaValue());
} else if (criterias.getCriteriaType().equals("greaterThen")) {
if (!jsonNodes.get(criterias.getPropName()).equals(criterias.getCriteriaValue())) {
return false;
}
int count = 0;
for (ObjectNode node : context.getEventsForPattern("first")) {
count += node.get("value").asInt();
}
return Integer.compare(count, 5) > 0;
} else if (criterias.getCriteriaType().equals("lessThen")) {
if (!jsonNodes.get(criterias.getPropName()).equals(criterias.getCriteriaValue())) {
return false;
}
int count = 0;
for (ObjectNode node : context.getEventsForPattern("first")) {
count += node.get("value").asInt();
}
return Integer.compare(count, 5) < 0;
}
}
return false;
}
}).times(rules.getRuleCount());
return alarmPattern;
}
}
Thanks for using FlinkCEP!
Could you provide some more details about what exactly is the error message (if any)? This will help a lot at pinning down the problem.
From a first look at the code, I can make the following observations:
At first, the line:
dataStream.windowAll(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5)));
will never be executed, as you never use this stream in the rest of your program.
Second, you should specify a sink to be taken after the select(), e.g. print() method on each of your PatternStreams. If you do not do so, then your output gets discarded. You can have a look here for examples, although the list is far from exhaustive.
Finally, I would recommend adding a within() clause to your pattern, so that you do not run out of memory.
Error was from my json object. I will fix it. When i am run job on intellij cep doesn't work. When submit from flink console it works.

Load CSV files with Apache Spark - Java API

I am new to Spark and working my way through the Java and Scala API and I was interested to put together two examples with a view of comparing both languages in terms of conciseness and readability.
Here is my Scala version:
import java.io.StringReader
import au.com.bytecode.opencsv.CSVReader
import org.apache.spark.{SparkConf, SparkContext}
object LoadCSVScalaExample {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("MyLoadCSVScalaExampleApplication").setMaster("local[*]")
val sc = new SparkContext(conf)
val input = sc.textFile("D:\\MOCK_DATA_spark.csv")
val result = input.map { line => val reader = new CSVReader(new StringReader(line));
reader.readNext()
}
print("This is the total count " + result.count())
}
}
Whereas this is the Java counterpart:
import au.com.bytecode.opencsv.CSVReader;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.io.StringReader;
public class LoadCSVJavaExample implements Function<String, String[]> {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("MyLoadCSVJavaExampleApp").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> csvFile = sc.textFile("D:\\MOCK_DATA_spark.csv");
JavaRDD<String[]> csvData = csvFile.map(new LoadCSVJavaExample());
System.out.println("This prints the total count " + csvData.count());
}
public String[] call(String line) throws Exception {
CSVReader reader = new CSVReader(new StringReader(line));
return reader.readNext();
}
}
I am not sure, however, whether the Java example is actually correct? Can I pick your brain on it? I know I could use the Databrickd Spark csv library, instead, but I was wondering whether the current example is correct and how it can be further improved upon.
Thank you for your help,
I.

Spark - Extract line after String match and save it in ArrayList

I am new to spark and trying to extract a line which contains "Subject:" and save it in an arraylist. I am not facing any error but the array list is empty. Can you please guide me where am i going wrong? or the best way to do this?
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
public final class extractSubject {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local[1]").setAppName("JavaBookExample");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
JavaRDD<String> sample = sc.textFile("/Users/Desktop/sample.txt");
final ArrayList<String> list = new ArrayList<>();
sample.foreach(new VoidFunction<String>(){
public void call(String line) {
if (line.contains("Subject:")) {
System.out.println(line);
list.add(line);
}
}}
);
System.out.println(list);
sc.stop();
}
}
Please keep in mind that Spark applications run distributed and in parallel. Therefore you cannot modify variables outside of functions that are executed by Spark.
Instead you need to return a result from these functions. In your case you need flatMap (instead of foreach that has no result), which concatenates collections that are returned as result of your function.
If a line matches a list that contains the matching line is returned, otherwise you return an empty list.
To print the data in the main function, you first have to gather the possibly distributed data in your master node, by calling collect().
Here an example:
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
public final class extractSubject {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local[1]").setAppName("JavaBookExample");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
//JavaRDD<String> sample = sc.textFile("/Users/Desktop/sample.txt");
JavaRDD<String> sample = sc.parallelize(Arrays.asList("Subject: first",
"nothing here",
"Subject: second",
"dummy"));
JavaRDD<String> subjectLinesRdd = sample.flatMap(new FlatMapFunction<String, String>() {
public Iterable<String> call(String line) {
if (line.contains("Subject:")) {
return Collections.singletonList(line); // line matches → return list with the line as its only element
} else {
return Collections.emptyList(); // ignore line → return empty list
}
}
});
List<String> subjectLines = subjectLinesRdd.collect(); // collect values from Spark workers
System.out.println(subjectLines); // → "[Subject: first, Subject: second]"
sc.stop();
}
}

Getting error in apache spark simple program

i am doing one simple example of word count in apache spark in java with reference of Internet and i m getting error of
Caused by: java.net.UnknownHostException: my.txt
you can see my below code for the reference!
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class MyCount {
public static void main(String[] args) {
// TODO Auto-generated method stub
String file = "hdfs://my.txt";
JavaSparkContext sc = new JavaSparkContext("local", "Simple App");
JavaRDD<String> lines = sc.textFile(file);
long nums = lines.count();
System.out.println(nums);
}
}
Can you try
String file = "hdfs://localhost/my.txt"
PS: make sure you have this file my.txt in hdfs.
In case if you don't have that file hdfs, follow below command to put the file in hdfs from local dir.
Hadoop fs -copyFromLocal /home/training/my.txt hadoop/
Old question but an answer was never accepted, the mistake at the time I read it is mixing the "local" concept of Spark with "localhost."
Using this constructor: JavaSparkContext(java.lang.String master, java.lang.String appName), you would want to use:
JavaSparkContext sc = new JavaSparkContext("localhost", "Simple App");
but the question was using "local". Further, the HDFS filename didn't specify a hostname: "hdfs://SomeNameNode:9000/foo/bar/"or
"hdfs://host:port/absolute-path"
As of 1.6.2, the Javadoc for JavaSparkContext is not showing any constructor that let's you specify the cluster type directly:
http://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/api/java/JavaSparkContext.html
The best constructor for JavaSparkContext wants a SparkConf object. To do something more readable by humans, build a SparkConf object and then pass it to JavaSparkContext, here's an example that sets the appname, specifies Kryo serializer and sets the master:
SparkConf sparkConf = new SparkConf().setAppName("Threshold")
//.setMaster("local[4]");
.setMaster(getMasterString(masterName))
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.registerKryoClasses(kryoClassArray);
// create the JavaSparkContext now:
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
NOTE: the alternate .setMaster("local[4]"); would use local mode, which the OP may have been trying.
I have a more extended answer here that addresses using hostnames vs. IP addresses and a lot more for setting up your SparkConf
You can try this simple word count program
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class First {
public static void main(String[] args) {
SparkConf sf = new SparkConf().setMaster("local[3]").setAppName("parth");
JavaSparkContext sc = new JavaSparkContext(sf);
JavaRDD<String> textFile = sc.textFile("input file path");
JavaRDD<String> words = textFile.flatMap((new FlatMapFunction<String, String>() {
public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }}));
JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }
});
JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer a, Integer b) { return a + b; }
});
counts.saveAsTextFile("outputfile-path");
}
}

Bug dereferencing after pig udf

The is actually related to the question How can I add row numbers for rows in PIG or HIVE?
The 3rd answer provided by srini works fine, but I have trouble to access the data after the udf.
The udf provided by srini is following
import java.io.IOException;
import java.util.Iterator;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.data.DataType;
public class RowCounter extends EvalFunc<DataBag> {
TupleFactory mTupleFactory = TupleFactory.getInstance();
BagFactory mBagFactory = BagFactory.getInstance();
public DataBag exec(Tuple input) throws IOException {
try {
DataBag output = mBagFactory.newDefaultBag();
DataBag bg = (DataBag)input.get(0);
Iterator it = bg.iterator();
Integer count = new Integer(1);
while(it.hasNext())
{ Tuple t = (Tuple)it.next();
t.append(count);
output.add(t);
count = count + 1;
}
return output;
} catch (ExecException ee) {
// error handling goes here
throw ee;
}
}
public Schema outputSchema(Schema input) {
try{
Schema bagSchema = new Schema();
bagSchema.add(new Schema.FieldSchema("RowCounter", DataType.BAG));
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
bagSchema, DataType.BAG));
}catch (Exception e){
return null;
}
}
}
I wrote a simple test pig script as following
A = load 'input.txt' using PigStorage(' ') as (name:chararray, age:int);
/*
--A: {name: chararray,age: int}
(amy,56)
(bob,1)
(bob,9)
(amy,34)
(bob,20)
(amy,78)
*/
B = group A by name;
C = foreach B {
orderedGroup = order A by age;
generate myudfs.RowCounter(orderedGroup) as t;
}
/*
--C: {t: {(RowCounter: {})}}
({(amy,34,1),(amy,56,2),(amy,78,3)})
({(bob,1,1),(bob,9,2),(bob,20,3)})
*/
D = foreach C generate FLATTEN(t);
/*
D: {t::RowCounter: {}}
(amy,34,1)
(amy,56,2)
(amy,78,3)
(bob,1,1)
(bob,9,2)
(bob,20,3)
*/
The problem is how to use D in later operation. I tried multiple ways, but always got the following error
ava.lang.ClassCastException: java.lang.String cannot be cast to org.apache.pig.data.DataBag
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject.processInputBag(POProject.java:575)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject.getNext(POProject.java:248)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.getNext(PhysicalOperator.java:316)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.processPlan(POForEach.java:332)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNext(POForEach.java:284)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce$Reduce.runPipeline(PigGenericMapReduce.java:459)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce$Reduce.processOnePackageOutput(PigGenericMapReduce.java:427)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce$Reduce.reduce(PigGenericMapReduce.java:407)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce$Reduce.reduce(PigGenericMapReduce.java:261)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:572)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:414)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:256)
My guess is that because we don't have the schema for the tuple inside the bag. if this is the reason, how should I modify the udf?
ok, I found the solution by adding the outputSchema as following
public Schema outputSchema(Schema input) {
try{
Schema.FieldSchema counter = new Schema.FieldSchema("counter", DataType.INTEGER);
Schema tupleSchema = new Schema(input.getField(0).schema.getField(0).schema.getFields());
tupleSchema.add(counter);
Schema.FieldSchema tupleFs;
tupleFs = new Schema.FieldSchema("with_counter", tupleSchema, DataType.TUPLE);
Schema bagSchema = new Schema(tupleFs);
return new Schema(new Schema.FieldSchema("row_counter",
bagSchema, DataType.BAG));
}catch (Exception e){
return null;
}
}
}
Thanks.

Categories

Resources