How to use LinearRegression in Spark based on text files - java

I'm fairly new to programming with Spark. I want to set up a linear regression model in Spark based on log files that use tabs as "column" separators. All the tutorials and examples I have found start off with something like this:
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
However, I have a bunch of log files I want to use instead. This is what I have tried so far:
public static void main(String... args)
{
    if (!new File("LogisticRegressionModel").exists())
    {
        buildTrainingModel();
    }
    else
    {
        testModel();
    }
}

private static void testModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();

    Dataset<Row> dataSet = sc.read().option("delimiter", "-").option("header", "false").csv("EI/eyeliteidemo/TAP01.log");

    PipelineModel model = PipelineModel.load("LogisticRegressionModel");

    Dataset<Row> predictions = model.transform(dataSet);
}

private static void buildTrainingModel()
{
    SparkSession sc = SparkSession.builder().master("local[2]").appName("LogisticRegressionTest").getOrCreate();

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            DataTypes.createStructField("features", DataTypes.StringType, false),
    });

    Dataset<Row> input = sc.read().option("delimiter", "-").option("header", "false").csv("foo/bar/Foo_*.log");
    input = input.drop("_c1", "_c3", "_c4");
    input = input.select(functions.concat(input.col("_c0"), input.col("_c2"), input.col("_c5")));
    input = input.withColumnRenamed("concat(_c0, _c2, _c5)", "features");
    input.show(30, false);

    Dataset<Row> dataSet = sc.createDataFrame(input.collectAsList(), schema);

    Tokenizer tokenizer = new Tokenizer()
            .setInputCol("features")
            .setOutputCol("rawTokens");

    StopWordsRemover swRemover = new StopWordsRemover().setInputCol(tokenizer.getOutputCol()).setOutputCol("cleanedTerms").setStopWords(readStopwords());

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(swRemover.getOutputCol())
            .setOutputCol("hashedTerms");

    IDF idf = new IDF().setInputCol(hashingTF.getOutputCol()).setOutputCol("featuresIDF");

    LogisticRegression lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] {tokenizer, swRemover, hashingTF, idf, lr});

    // Fit the pipeline to training documents.
    PipelineModel model = pipeline.fit(dataSet);

    try
    {
        model.save("LogisticRegressionModel");
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
}

private static String[] readStopwords()
{
    List<String> words = new ArrayList();
    try (Stream<String> stream = Files.lines(Paths.get(LogisticRegressionTest.class.getResource("stopwords_en.txt").toURI()))) {
        words = stream
                .map(String::toLowerCase)
                .collect(Collectors.toList());
    } catch (IOException e) {
        e.printStackTrace();
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    String[] retWords = new String[words.size()];
    return words.toArray(retWords);
}
Unfortunately, I run into this exception:
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT#3bfc3ba7 but was actually StringType.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:42)
at org.apache.spark.ml.PredictorParams$class.validateAndTransformSchema(Predictor.scala:51)
at org.apache.spark.ml.classification.Classifier.org$apache$spark$ml$classification$ClassifierParams$$super$validateAndTransformSchema(Classifier.scala:58)
at org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:42)
at org.apache.spark.ml.classification.ProbabilisticClassifier.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
at org.apache.spark.ml.classification.LogisticRegression.org$apache$spark$ml$classification$LogisticRegressionParams$$super$validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.classification.LogisticRegressionParams$class.validateAndTransformSchema(LogisticRegression.scala:184)
at org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema(LogisticRegression.scala:193)
at org.apache.spark.ml.Predictor.transformSchema(Predictor.scala:122)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:184)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:184)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:136)
at LogisticRegressionTest.buildTrainingModel(LogisticRegressionTest.java:92)
at LogisticRegressionTest.main(LogisticRegressionTest.java:40)
Now my question is: how do I get this datatype issue right? And does my code make any sense to Spark experts in the first place?
Thanks!
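Not a full answer, but a sketch of what the exception is pointing at: LogisticRegression validates that its featuresCol contains a Vector, and by default that column is named "features", which in the schema above is the raw StringType column. One way to satisfy the schema check (assuming the rest of the pipeline stays as posted, and leaving aside how the label column actually gets populated) is to point the estimator at the vector column produced by the IDF stage:

// Sketch only: reuses the stages defined in buildTrainingModel() above.
// The IDF stage already outputs a Vector column named "featuresIDF";
// tell LogisticRegression to read that column instead of the raw "features" string.
LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.001)
        .setFeaturesCol(idf.getOutputCol())  // "featuresIDF" (VectorUDT), not "features" (StringType)
        .setLabelCol("label");               // must be a numeric (DoubleType) column

With that change the Pipeline's schema validation no longer hits the StringType column; whether the fitted model is meaningful then depends on how the label values are filled in, which the posted code does not show.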

Related

How to use a tumbling window function for non-keyed streaming data in Flink?

I want to use a tumbling window function in my program (non-keyed data). It processes streaming data, but only at about 300 messages/sec, and I want to take it to at least 5K/sec. For this purpose I want to try a 2-second tumbling window just to see whether it speeds up performance, but I am not sure how to use it in my case.
Note: I am using the GeoMesa HBase platform for saving the messages.
Also, I did not paste my whole application here; the code below should be sufficient to show where I need the window function.
Here is my Flink code:
public class Tranport {
    public static void main(String[] args) throws Exception {
        // fetch runtime arguments
        String bootstrapServers = "xx.xx.xxx.xxx:xxxx";

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set up the Consumer and create a datastream from this source
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", bootstrapServers);
        properties.setProperty("group.id", "group_id");

        final FlinkKafkaConsumer<String> flinkConsumer = new FlinkKafkaConsumer<>("lc", new SimpleStringSchema(), properties);
        flinkConsumer.setStartFromTimestamp(Long.parseLong("0"));

        DataStream<String> readingStream = env.addSource(flinkConsumer);

        readingStream.rebalance().map(new RichMapFunction<String, String>() {

            private static final long serialVersionUID = -2547861355L; // random number

            DataStore lc_live = null;
            SimpleFeatureType sft_live;
            SimpleFeatureBuilder SFbuilderLive; // feature builder for live
            List<SimpleFeature> lc_live_features;

            @Override
            public void open(Configuration parameters) throws Exception {
                System.out.println("In open method.");

                // --- GEOMESA, GEOTOOLS APPROACH ---//
                // define connection parameters to xxx GeoMesa-HBase DataStore
                Map<String, Serializable> params_live = new HashMap<>();
                params_live.put("xxxx", "xxx"); // HBase table name
                params_live.put("xxxx", "xxxx");

                try {
                    lc_live = DataStoreFinder.getDataStore(params_live);
                    if (lc_live == null) {
                        System.out.println("Could not connect to live");
                    } else {
                        System.out.println("Successfully connected to live");
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }

                // create simple feature type for x table in HBASE
                StringBuilder attributes1 = new StringBuilder();
                attributes1.append("xxx:String,");
                attributes1.append("xxx:Long,");
                attributes1.append("source:String,");
                attributes1.append("xxx:String,");
                attributes1.append("xxx:Double,");
                attributes1.append("status:String,");
                attributes1.append("forecast:Double,");
                attributes1.append("carsCount:Integer,");
                attributes1.append("*xxx:Point:srid=4326");
                sft_history = SimpleFeatureTypes.createType("xxxx", attributes1.toString());
                try {
                    lc_history.createSchema(sft_history);
                } catch (IOException e) {
                    e.printStackTrace();
                }

                // Initialize the variables
                numberOfMessagesProcessed = 0;
                numberOfMessagesFailed = 0;
                numberOfMessagesSkipped = 0;

                // for lc_Live
                lc_live_features = new ArrayList<>();
                SFbuilderLive = new SimpleFeatureBuilder(sft_live);
Here I want to create a tumbling window function (windowAll) that takes all the stream messages arriving within a 2-second window and pushes them into the array list used below:
            // live GeoMesa-HBase DataStore
            // copy the list into a local variable and empty the list for the next iteration
            List<SimpleFeature> LocalFeatures = live_features;
            live_features = new ArrayList<>();
            LocalFeatures = Collections.unmodifiableList(LocalFeatures);
            try (FeatureWriter<SimpleFeatureType, SimpleFeature> writer = live.getFeatureWriterAppend(sft_live.getTypeName(), Transaction.AUTO_COMMIT)) {
                System.out.println("Writing " + LocalFeatures.size() + " features to live");
                for (SimpleFeature feature : LocalFeatures) {
                    SimpleFeature toWrite = writer.next();
                    toWrite.setAttributes(feature.getAttributes());
                    ((FeatureIdImpl) toWrite.getIdentifier()).setID(feature.getID());
                    toWrite.getUserData().put(Hints.USE_PROVIDED_FID, Boolean.TRUE);
                    toWrite.getUserData().putAll(feature.getUserData());
                    writer.write();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
It's late, but it might help someone. In Scala you can do something like:
env.addSource(consumer)
    .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(2)))
But remember: if you are not using keyBy(), your data won't be processed in parallel, no matter what value you set in env.setParallelism().
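For a Java version, here is a minimal sketch, assuming the same DataStream<String> readingStream as in the question and that the GeoMesa writing moves into the window function: a ProcessAllWindowFunction receives all elements of each 2-second window as one Iterable, which can be copied into a list and written as a single batch.

readingStream
    .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(2)))
    .process(new ProcessAllWindowFunction<String, String, TimeWindow>() {
        @Override
        public void process(Context context, Iterable<String> elements, Collector<String> out) {
            // collect everything that arrived within this 2-second window
            List<String> batch = new ArrayList<>();
            for (String element : elements) {
                batch.add(element);
            }
            // build SimpleFeatures from "batch" here and write them with
            // getFeatureWriterAppend(...) in one go, as in the question's code
            out.collect("window of " + batch.size() + " messages written");
        }
    });

As noted above, without keyBy() this windowAll() operator runs with parallelism 1, so the batching helps the HBase writes but does not add parallelism.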

Append to existing CSV with headers

I have a method that appends to a .csv file, but the problem is that it adds a header row every time as well. How can I append to the .csv correctly?
I am aware that accumulating everything into a List first would do the job, but this method is called in separate runs.
public static void writeToCSVFileAndSend(String facilityId, int candidateStockTakeContainersCount) throws IOException {
    FileWriter report = new FileWriter("/tmp/MonthlyExpectedComplianceSuggestions.csv", true);
    LocalDate today = java.time.LocalDate.now();
    String[] headers = { "Warehouse", "Expected Count for " + today.getMonth().getDisplayName(TextStyle.SHORT, Locale.ENGLISH) };
    Map<String, Integer> facilityExpectedMonthlyCountMap = new HashMap<String, Integer>() {
        {
            put(facilityId, candidateStockTakeContainersCount);
        }
    };
    try (CSVPrinter printer = new CSVPrinter(report, CSVFormat.DEFAULT
            .withHeader(headers))) {
        facilityExpectedMonthlyCountMap.forEach((a, b) -> {
            try {
                printer.printRecord(a, b);
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
    }
}
Current Output
Warehouse,Expected Count for Dec
A,2147
Warehouse,Expected Count for Dec
B,0
Expected Output
Warehouse,Expected Count for Dec
A,2147
B,0
To avoid multiple headers, create the CSVPrinter object once and only write the header when the file does not exist yet.
Depending on how you are getting the data, you may also split the method in two and pass the CSVPrinter object around.
public static void writeToCSVFileAndSend() throws IOException
{
    File outputCSV = new File("/tmp/MonthlyExpectedComplianceSuggestions.csv");
    LocalDate today = java.time.LocalDate.now();
    String[] headers = { "Warehouse", "Expected Count for " + today.getMonth().getDisplayName(TextStyle.SHORT, Locale.ENGLISH) };

    // only write the header if the file does not exist yet
    boolean headerRequired = !outputCSV.exists();

    FileWriter report = new FileWriter(outputCSV, true); // append mode
    CSVPrinter printer;
    if (headerRequired) {
        printer = new CSVPrinter(report, CSVFormat.DEFAULT.withHeader(headers));
    } else {
        printer = new CSVPrinter(report, CSVFormat.DEFAULT);
    }

    // Iterate through combinations of facilityId and candidateStockTakeContainersCount
    // and call printRecord
    Map<String, Integer> facilityExpectedMonthlyCountMap = new HashMap<String, Integer>();
    // fill in your data in the map here
    facilityExpectedMonthlyCountMap.forEach((a, b) -> {
        try {
            printer.printRecord(a, b);
        } catch (IOException e) {
            e.printStackTrace();
        }
    });
    printer.close();
}

any way to use JavaSparkContext in JavaRdd.map(rdd -> {})?

I am thinking of doing the following. However, I get an error saying that the JavaSparkContext (sc) is not serializable. I am wondering if there is any way to bypass this?
javaRdd.map(rdd -> {
    List<String> data = new ArrayList<>();
    ObjectMapper mapper = new ObjectMapper();
    for (EntityA a : rdd) {
        String json = null;
        try {
            json = mapper.writeValueAsString(a);
        } catch (JsonProcessingException e) {
            e.printStackTrace();
        }
        data.add(json);
    }
    JavaRDD<String> rddData = sc.parallelize(data);
    DataFrame df = sqlContext.read().schema(schema).json(rddData);
});
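No answer is posted here, but the usual workaround is not to touch sc (or sqlContext) inside map() at all, since the SparkContext only exists on the driver. A sketch, under the assumption that the data can be exposed as a JavaRDD<EntityA> (called entityRdd here purely for illustration): serialize to JSON on the executors, then build the DataFrame on the driver from the resulting RDD of strings.

// Executors: convert each entity to a JSON string (no SparkContext needed here).
JavaRDD<String> jsonRdd = entityRdd.map(a -> new ObjectMapper().writeValueAsString(a));

// Driver: sqlContext is available here, so the DataFrame can be built directly
// from the RDD of JSON strings instead of calling sc.parallelize() inside map().
DataFrame df = sqlContext.read().schema(schema).json(jsonRdd);

If creating an ObjectMapper per record is too costly, mapPartitions() lets you create it once per partition instead.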

ElasticSearch - Java API indexing 100K + PDFs using producer & consumer

I am indexing PDFs using the Java API. I have installed the ingest-attachment processor plugin, and from my Java code I convert each PDF to base64 and index the encoded form of the PDF.
The PDFs are available on my machine's d:\ drive. The file paths are stored in an Elasticsearch index named documents_local. So I fetch all the records from documents_local to get the file paths, read each PDF, encode it to base64, and then index it.
For this process I use the scroll API to fetch the file paths from the index, because I have more than 100k documents. Indexing 20,000 PDFs takes about 8 hours with the Java code below.
So I tried to separate this indexing process.
I have created 3 classes:
Controller.java
Producer.java
Consumer.java
In Controller.java I read all the file paths from my index, store them in an ArrayList, and pass them to the Producer class.
In Producer.java I read each PDF using its file path, convert it to base64, and push it onto the queue.
In Consumer.java I read all the messages from the queue that were published by Producer.java.
My idea is to index the encoded files in Consumer.java (this is not implemented yet, and I am not sure how to do it).
Please find my Java code below.
Controller.java
public class Controller {
    private static final int QUEUE_SIZE = 2;
    private static BlockingQueue<String> queue;
    private static Collection<Thread> producerThreadCollection, allThreadCollection;
    private final static String INDEX = "documents_local";
    private final static String ATTACHMENT = "document_suggestion";
    private final static String TYPE = "doc";
    private static final Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[0].getClassName());

    public static void main(String[] args) throws IOException {
        RestHighLevelClient restHighLevelClient = null;
        Document doc = new Document();
        List<String> filePathList = new ArrayList<String>();

        producerThreadCollection = new ArrayList<Thread>();
        allThreadCollection = new ArrayList<Thread>();
        queue = new LinkedBlockingDeque<String>(QUEUE_SIZE);

        SearchRequest searchRequest = new SearchRequest(INDEX);
        searchRequest.types(TYPE);
        final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(60L)); // part of Scroll API
        searchRequest.scroll(scroll); // part of Scroll API
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        QueryBuilder qb = QueryBuilders.matchAllQuery();
        searchSourceBuilder.query(qb);
        searchRequest.source(searchSourceBuilder);

        SearchResponse searchResponse = SearchEngineClient.getInstance3().search(searchRequest);
        String scrollId = searchResponse.getScrollId(); // part of Scroll API
        SearchHit[] searchHits = searchResponse.getHits().getHits();
        long totalHits = searchResponse.getHits().totalHits;
        logger.info("Total Hits --->" + totalHits);

        // part of Scroll API -- Starts
        while (searchHits != null && searchHits.length > 0) {
            SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId);
            scrollRequest.scroll(scroll);
            searchResponse = SearchEngineClient.getInstance3().searchScroll(scrollRequest);
            scrollId = searchResponse.getScrollId();
            searchHits = searchResponse.getHits().getHits();

            for (SearchHit hit : searchHits) {
                Map<String, Object> sourceAsMap = hit.getSourceAsMap();
                if (sourceAsMap != null) {
                    doc.setId((int) sourceAsMap.get("id"));
                    doc.setApp_language(String.valueOf(sourceAsMap.get("app_language")));
                }
                filePathList.add(doc.getPath().concat(doc.getFilename()));
            }
        }

        createAndStartProducers(filePathList);
        createAndStartConsumers(filePathList);

        for (Thread t : allThreadCollection) {
            try {
                t.join();
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }

        System.out.println("Controller finished");
    }

    private static void createAndStartProducers(List<String> filePathList) {
        for (int i = 1; i <= filePathList.size(); i++) {
            Producer producer = new Producer(Paths.get(filePathList.get(i)), queue);
            Thread producerThread = new Thread(producer, "producer-" + i);
            producerThreadCollection.add(producerThread);
            producerThread.start();
        }
        allThreadCollection.addAll(producerThreadCollection);
    }

    private static void createAndStartConsumers(List<String> filePathList) {
        for (int i = 0; i < filePathList.size(); i++) {
            Thread consumerThread = new Thread(new Consumer(queue), "consumer-" + i);
            allThreadCollection.add(consumerThread);
            consumerThread.start();
        }
    }

    public static boolean isProducerAlive() {
        for (Thread t : producerThreadCollection) {
            if (t.isAlive())
                return true;
        }
        return false;
    }
}
Producer.java
public class Producer implements Runnable {
    private Path fileToRead;
    private BlockingQueue<String> queue;
    File file = null;

    public Producer(Path filePath, BlockingQueue<String> q) {
        fileToRead = filePath;
        queue = q;
    }

    public void run() {
        String encodedfile = null;
        BufferedReader reader = null;
        try {
            reader = Files.newBufferedReader(fileToRead);
        } catch (IOException e1) {
            // TODO Auto-generated catch block
            e1.printStackTrace();
        }
        File file = new File(reader.toString());
        if (file.exists() && !file.isDirectory()) {
            try {
                FileInputStream fileInputStreamReader = new FileInputStream(file);
                byte[] bytes = new byte[(int) file.length()];
                fileInputStreamReader.read(bytes);
                encodedfile = new String(Base64.getEncoder().encodeToString(bytes));
                fileInputStreamReader.close();
                System.out.println(Thread.currentThread().getName() + " finished");
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            System.out.println("File not exists");
        }
    }
}
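One thing to note about Producer.run() as posted: the encoded string is never handed over to the consumers. A small sketch of the missing step (assumption: the base64 string is what the consumers should receive), to be placed right after the encoding in run():

try {
    queue.put(encodedfile); // blocks while the bounded queue is full
} catch (InterruptedException e) {
    Thread.currentThread().interrupt();
}

Because the queue is bounded (QUEUE_SIZE = 2), put() naturally throttles the producers until the consumers catch up.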
Consumer.java (incomplete class; I am not sure how to index from the consumer class, so I am just showing its skeleton):
public class Consumer implements Runnable {
    private BlockingQueue<String> queue;
    File file = null;

    public Consumer(BlockingQueue<String> q) {
        queue = q;
    }

    public void run() {
        while (true) {
            String line = queue.poll();
            if (line == null && !Controller.isProducerAlive())
                return;
            if (line != null) {
                System.out.println(Thread.currentThread().getName() + " processing line: " + line);
                // Do something with the line here like see if it contains a string
            }
        }
    }
}
With the piece of code below I have indexed the encoded files, but it takes a long time because I have 100k documents. That is why I am trying the producer/consumer approach:
jsonMap = new HashMap<>();
jsonMap.put("id", doc.getId());
jsonMap.put("app_language", doc.getApp_language());
jsonMap.put("fileContent", result);

String id = Long.toString(doc.getId());

IndexRequest request = new IndexRequest(ATTACHMENT, "doc", id)
        .source(jsonMap)
        .setPipeline(ATTACHMENT);
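For the missing Consumer part, here is a sketch rather than a drop-in implementation. It assumes the producers put the base64 string on the queue (see the note under Producer.java above), that SearchEngineClient.getInstance3() returns the same RestHighLevelClient used in Controller, and that indexing only the file content is enough; if id and app_language are also needed, the queue would have to carry a small DTO instead of a plain String.

public void run() {
    while (true) {
        String encodedFile = queue.poll();
        if (encodedFile == null && !Controller.isProducerAlive())
            return;
        if (encodedFile != null) {
            Map<String, Object> jsonMap = new HashMap<>();
            jsonMap.put("fileContent", encodedFile);
            // same index/pipeline names as in the snippet above
            IndexRequest request = new IndexRequest("document_suggestion", "doc")
                    .source(jsonMap)
                    .setPipeline("document_suggestion");
            try {
                SearchEngineClient.getInstance3().index(request);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

For 100k+ documents it is usually better to group several requests into a BulkRequest (or use the BulkProcessor helper) instead of one index() call per PDF, but the single-request form keeps the sketch short.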

SnakeYAML formatting - remove YAML curly brackets

I have code that dumps a LinkedHashMap into a YAML file:
public class TestDump {
    public static void main(String[] args) {
        LinkedHashMap<String, Object> values = new LinkedHashMap<String, Object>();
        values.put("one", 1);
        values.put("two", 2);
        values.put("three", 3);

        DumperOptions options = new DumperOptions();
        options.setIndent(2);
        options.setPrettyFlow(true);
        Yaml output = new Yaml(options);

        File targetYAMLFile = new File("C:\\temp\\sample.yaml");
        FileWriter writer = null;
        try {
            writer = new FileWriter(targetYAMLFile);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        output.dump(values, writer);
    }
}
But the output looks like this
{
one: 1,
two: 2,
three: 3
}
Is there a way to get output like this instead?
one: 1
two: 2
three: 3
Although the first one is valid YAML, I want the output format to be like the second one.
It looks like this is just some configuration via DumperOptions:
public class TestDump {
    public static void main(String[] args) {
        LinkedHashMap<String, Object> values = new LinkedHashMap<String, Object>();
        values.put("one", 1);
        values.put("two", 2);
        values.put("three", 3);

        DumperOptions options = new DumperOptions();
        options.setIndent(2);
        options.setPrettyFlow(true);
        // Fix below - additional configuration
        options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK);
        Yaml output = new Yaml(options);

        File targetYAMLFile = new File("C:\\temp\\sample.yaml");
        FileWriter writer = null;
        try {
            writer = new FileWriter(targetYAMLFile);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        output.dump(values, writer);
    }
}
This will solve my problem
