Wait for multiple scala-async blocks - Scala

I want to close the CSV writer once all the async blocks have executed.
import java.io.{FileReader, FileWriter}
import com.opencsv.{CSVReader, CSVWriter}
import org.jsoup.helper.StringUtil
import scala.async.Async.{async, await}
import scala.concurrent.ExecutionContext.Implicits.global

var rows = 0
reader.forEach(line => {
  async {
    val csv = new CSV(line(0), line(1), line(2), line(3), line(4))
    entries(0) = csv.id
    entries(1) = csv.name
    val di = async(httpRequest(csv.id))
    var di2 = "not available"
    val di2Future = async(httpRequest(csv.name))
    di2 = await(di2Future)
    entries(2) = await(di)
    entries(3) = di2
    writer.writeNext(entries)
    println(s"${csv.id} completed!!!! ")
    rows += 1
  }
})
writer.close() // runs immediately, before the async blocks above have finished
In the code above the writer is always closed first, so I want to wait for all the async blocks to execute and only then close my CSV writer.

Below is a skeleton solution.
val allResponses = reader.map(l => {
  val entries = ??? // declare the entries data structure here
  for {
    line <- async(l)
    // line <- Future.successful(l)
    csv <- async {
      val csv = CSV(line(0), line(1), line(2), line(3), line(4))
      entries(0) = csv.id
      entries(1) = csv.name
      csv
    }
    di <- async {
      httpRequest(csv.id)
    }
    di2 <- async {
      httpRequest(csv.name)
    }
    e <- async {
      entries(2) = di
      entries(3) = di2
      entries
    }
  } yield e
})

val t = Future.sequence(allResponses)
t.map(a => {
  // use a CSVWriter here: a plain FileWriter has no writeNext method
  val writer = new CSVWriter(new FileWriter("file.txt"))
  a.foreach(i => {
    writer.writeNext(i)
  })
  writer.close()
})
Hope this helps.

An async block produces a Future[A], where A in your case is Unit (the type of the assignment rows += 1).
In general, you can perform operations when a Future completes, like the following:
import scala.util.{Failure, Success}

def myFuture: Future[Something] = ??? // your async process

myFuture.onComplete {
  case Success(result) =>
    ???
  case Failure(exception) =>
    ???
}
If you want to perform something regardless of the outcome, you can skip the pattern matching:
myFuture.onComplete(_ => writer.close()) // e.g.
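Putting the two answers together, here is a minimal, hedged sketch of the whole flow, assuming opencsv's CSVReader/CSVWriter; the object name, file paths and the httpRequest helper are illustrative stand-ins for the asker's own code. Each row becomes a Future, Future.sequence waits for all of them, and the writer is closed only in onComplete:
import java.io.{FileReader, FileWriter}
import com.opencsv.{CSVReader, CSVWriter}
import scala.async.Async.{async, await}
import scala.collection.JavaConverters._
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success}

object CloseWriterWhenDone {
  // hypothetical stand-in for the asker's httpRequest
  def httpRequest(key: String): String = s"detail-for-$key"

  def main(args: Array[String]): Unit = {
    val reader = new CSVReader(new FileReader("in.csv"))
    val writer = new CSVWriter(new FileWriter("out.csv"))

    // one Future per CSV row
    val rowFutures: List[Future[Unit]] = reader.readAll().asScala.toList.map { line =>
      async {
        val detail1 = await(async(httpRequest(line(0))))
        val detail2 = await(async(httpRequest(line(1))))
        // CSVWriter is not thread-safe, so serialize the writes
        writer.synchronized {
          writer.writeNext(Array(line(0), line(1), detail1, detail2))
        }
      }
    }

    // close the writer only after every row's Future has completed
    val all = Future.sequence(rowFutures)
    all.onComplete {
      case Success(_)  => writer.close(); println("all rows written")
      case Failure(ex) => writer.close(); println(s"failed: ${ex.getMessage}")
    }

    // in a short-lived main, also block until done so the JVM does not exit early:
    // import scala.concurrent.Await; import scala.concurrent.duration.Duration
    // Await.ready(all, Duration.Inf)
  }
}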

Related

Exception in thread "streaming-job-executor-11" java.lang.ClassFormatError

I am working with Kafka (Scala) and Spark Streaming (Scala) to insert data from several CSVs into Cassandra tables. I made a producer and a consumer; here are their respective codes.
Producer:
import java.sql.Timestamp
import java.util.Properties
import java.io._
import java.io.File
import java.nio.file.{Files, Paths, Path, SimpleFileVisitor,
FileVisitResult}
import scala.io.Source
import akka.actor.{Actor, ActorSystem, Props}
import com.typesafe.config.ConfigFactory
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
class produceMessages(brokers: String, topic: String) extends Actor {
// All helpers needed to send messages
def filecontent(namefile: String){
for (line <- Source.fromFile(namefile).getLines) {
println(line)
}
}
def getListOfFiles(dir: String):List[File] = {
val d = new File(dir)
if (d.exists && d.isDirectory) {
d.listFiles.filter(_.isFile).toList
} else {
List[File]()
}
}
def between(value: String, a:String, b: String):String = {
// Return a substring between the two strings.
val posA = value.indexOf(a)
val posB = value.lastIndexOf(b)
val adjustedPosA = posA + a.length()
val res = value.substring(adjustedPosA, posB)
return res
}
def getTableName(filePath: String):String = {
//return table name from filePath
val fileName = filePath.toString.split("\\\\").last
val tableName = between(fileName,"100001_","_2017")
return tableName
}
// end of helpers
object kafka {
val producer = {
val props = new Properties()
props.put("metadata.broker.list", brokers)
//props.put(" max.request.size","5242880")
props.put("serializer.class", "kafka.serializer.StringEncoder")
val config = new ProducerConfig(props)
new Producer[String, String](config)
}
}
def receive = {
case "send" => {
val listeFichiers = getListOfFiles("C:\\Users\\acer\\Desktop\\csvs")
for (i <- 0 until listeFichiers.length)yield{
val chemin = listeFichiers(i).toString
val nomTable = getTableName(chemin)
println(nomTable)
val lines = Source.fromFile(chemin).getLines.toArray
val headerLine = lines(0)
println(headerLine)
val data = lines.slice(1,lines.length)
val messages = for (j <- 0 until data.length) yield{
val str = s"${data(j).toString}"
println(str)
new KeyedMessage[String, String](topic, str)
}
//sending the messages
val numberOfLinesInTable = new KeyedMessage[String, String](topic, data.length.toString)
val table = new KeyedMessage[String, String](topic, nomTable)
val header = new KeyedMessage[String, String](topic, headerLine)
kafka.producer.send(numberOfLinesInTable)
kafka.producer.send(table)
kafka.producer.send(header)
kafka.producer.send(messages: _*)
}
}
/*case "delete" =>{
val listeFichiers = getListOfFiles("C:\\Users\\acer\\Desktop\\csvs")
for (file <- listeFichiers){
if (file.isDirectory)
Option(file.listFiles).map(_.toList).getOrElse(Nil).foreach(Files.delete(_))
file.delete
}
}*/
case _ => println("Not a valid message!")
}
}
// Produces some random words between 1 and 100.
object KafkaStreamProducer extends App {
/*
* Get runtime properties from application.conf
*/
val systemConfig = ConfigFactory.load()
val kafkaHost = systemConfig.getString("KafkaStreamProducer.kafkaHost")
println(s"kafkaHost $kafkaHost")
val kafkaTopic = systemConfig.getString("KafkaStreamProducer.kafkaTopic")
println(s"kafkaTopic $kafkaTopic")
val numRecords = systemConfig.getLong("KafkaStreamProducer.numRecords")
println(s"numRecords $numRecords")
val waitMillis = systemConfig.getLong("KafkaStreamProducer.waitMillis")
println(s"waitMillis $waitMillis")
/*
* Set up the Akka Actor
*/
val system = ActorSystem("KafkaStreamProducer")
val messageActor = system.actorOf(Props(new produceMessages(kafkaHost, kafkaTopic)), name="genMessages")
/*
* Message Loop
*/
var numRecsWritten = 0
while(numRecsWritten < numRecords) {
messageActor ! "send"
numRecsWritten += numRecsWritten // note: this adds 0 to itself and never advances; numRecsWritten += 1 was probably intended
println(s"${numRecsWritten} records written.")
//messageActor ! "delete"
Thread sleep waitMillis
}
}
And here is the consumer:
package com.datastax.demo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time}
import org.apache.spark.streaming.kafka.KafkaUtils
import com.datastax.spark.connector._
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.RDD
import java.sql.Timestamp
import java.io.File
import scala.io.Source
import scala.reflect.runtime.universe
import scala.tools.reflect.ToolBox
case class cellmodu(collecttime: Double,sbnid: Double,enodebid: Double,cellid: Double,c373515500: Double,c373515501: Double,c373515502: Double,c373515503: Double,c373515504: Double,c373515505: Double,c373515506: Double,c373515507: Double,c373515508: Double,c373515509: Double,c373515510: Double,c373515511: Double,c373515512: Double,c373515513: Double,c373515514: Double,c373515515: Double,c373515516: Double,c373515517: Double,c373515518: Double,c373515519: Double,c373515520: Double,c373515521: Double,c373515522: Double,c373515523: Double,c373515524: Double,c373515525: Double,c373515526: Double,c373515527: Double,c373515528: Double,c373515529: Double,c373515530: Double,c373515531: Double,c373515532: Double,c373515533: Double,c373515534: Double,c373515535: Double,c373515536: Double,c373515537: Double,c373515538: Double,c373515539: Double,c373515540: Double,c373515541: Double,c373515542: Double,c373515543: Double,c373515544: Double,c373515545: Double,c373515546: Double,c373515547: Double,c373515548: Double,c373515549: Double,c373515550: Double,c373515551: Double,c373515552: Double,c373515553: Double,c373515554: Double,c373515555: Double,c373515556: Double,c373515557: Double,c373515558: Double,c373515559: Double,c373515560: Double,c373515561: Double,c373515562: Double,c373515563: Double,c373515564: Double,c373515565: Double,c373515566: Double,c373515567: Double,c373515568: Double,c373515569: Double,c373515570: Double,c373515571: Double,c373515572: Double,c373515573: Double,c373515574: Double,c373515575: Double,c373515576: Double,c373515577: Double,c373515578: Double,c373515589: Double,c373515590: Double,c373515591: Double,c373515592: Double,c373515593: Double,c373515594: Double,c373515595: Double,c373515596: Double,c373515597: Double,c373515598: Double,c373515601: Double,c373515602: Double,c373515608: Double,c373515609: Double,c373515610: Double,c373515611: Double,c373515616: Double,c373515618: Double,c373515619: Double,c373515620: Double,c373515621: Double,c373515622: Double,c373515623: Double,c373515624: Double,c373515625: Double,c373515626: Double,c373515627: Double,c373515628: Double,c373515629: Double,c373515630: Double,c373515631: Double,c373515632: Double,c373515633: Double,c373515634: Double,c373515635: Double,c373515636: Double,c373515637: Double,c373515638: Double,c373515639: Double,c373515640: Double,c373515641: Double,c373515642: Double,c373515643: Double,c373515644: Double,c373515645: Double,c373515646: Double,c373515647: Double,c373515648: Double,c373515649: Double,c373515650: Double,c373515651: Double,c373515652: Double,c373515653: Double,c373515654: Double,c373515655: Double,c373515656: Double,c373515657: Double,c373515658: Double,c373515659: Double,c373515660: Double,c373515661: Double,c373515662: Double,c373515663: Double,c373515664: Double,c373515665: Double,c373515666: Double,c373515667: Double,c373515668: Double,c373515669: Double,c373515670: Double,c373515671: Double,c373515672: Double,c373515673: Double,c373515674: Double,c373515675: Double,c373515676: Double,c373515677: Double,c373515678: Double,c373515679: Double,c373515680: Double,c373515681: Double,c373515682: Double,c373515683: Double,c373515684: Double,c373515685: Double,c373515686: Double,c373515687: Double,c373515688: Double,c373515689: Double,c373515690: Double,c373515691: Double,c373515692: Double,c373515693: Double,c373515694: Double,c373515695: Double,c373515696: Double,c373515697: Double,c373515698: Double,c373515699: Double,c373515700: Double,c373515701: Double,c373515702: Double,c373515703: 
Double,c373515704: Double,c373515705: Double,c373515706: Double,c373515707: Double,c373515708: Double,c373515709: Double,c373515710: Double,c373515711: Double,c373515712: Double,c373515713: Double,c373515714: Double,c373515715: Double,c373515716: Double,c373515717: Double,c373515718: Double,c373515719: Double,c373515720: Double,c373515721: Double,c373515722: Double,c373515723: Double,c373515724: Double,c373515725: Double,c373515726: Double,c373515727: Double,c373515728: Double,c373515729: Double,c373515730: Double,c373515731: Double,c373515732: Double,c373515733: Double,c373515734: Double,c373515735: Double,c373515736: Double,c373515737: Double,c373515738: Double,c373515739: Double,c373515740: Double,c373515741: Double,c373515742: Double,c373515743: Double,c373515744: Double,c373515745: Double,c373515746: Double,c373515747: Double,c373515748: Double,c373515749: Double,c373515750: Double,c373515751: Double,c373515752: Double,c373515753: Double,c373515754: Double,c373515755: Double,c373515756: Double) {}
object SparkKafkaConsumerCellmodu extends App {
//START OF HELPERS
def isNumeric(str:String): Boolean = str.matches("[-+]?\\d+(\\.\\d+)?")
def printList(args: List[_]): Unit = {args.foreach(println)}
//END OF HELPERS
val appName = "SparkKafkaConsumer"
val conf = new SparkConf()
.set("spark.cores.max", "2")
//.set("spark.executor.memory", "512M")
.set("spark.cassandra.connection.host","localhost")
.setAppName(appName)
val spark: SparkSession = SparkSession.builder.master("local").getOrCreate
val sc = SparkContext.getOrCreate(conf)
val sqlContext = SQLContext.getOrCreate(sc)
import sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
val ssc = new StreamingContext(sc, Milliseconds(1000))
ssc.checkpoint(appName)
val kafkaTopics = Set("test")
//val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")
val kafkaParams = Map(
"bootstrap.servers" -> "localhost:9092",
"fetch.message.max.bytes" -> "5242880")
val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, kafkaTopics)
kafkaStream
.foreachRDD {
(message: RDD[(String, String)]) => {
val rddToArray = message.collect().toList
val msg = rddToArray.map(_._2)
var i = 0
while (i < msg.length){
if(isNumeric(msg(i))){
println("HHHHHHHHHHHHHHHHHHHHHHHHHHHHH")
val numberLines = msg(i).toInt //get number of lines to insert in table
val nameTable = msg(i+1) //get table name
val headerTable = msg(i+2).toLowerCase //get the columns of the table
println(headerTable)
if(msg(i+1)=="CELLMODU"){
val typedCols : Array[String] = headerTable.split(",") // transform headerTable into array to define dataframe dynamically
val listtoinsert:Array[String] = new Array[String](numberLines) // an empty list that will contain the lines to insert in the adequate table
val k = i + 3 //to skip name of table and header
//fill the toinsert array with the lines
for (j <- 0 until numberLines){
listtoinsert(j) = msg(k + j)
println (listtoinsert(j))
}
//convert the array to RDD
val rddtoinsert: RDD[(String)] = sc.parallelize(listtoinsert)
//rddtoinsert.foreach(println)
//convert rdd to dataframe
val df = rddtoinsert.map {
case (v) => v.split(",")
}.map(payload1 => { // instance of dynamic class
cellmodu(payload1(0).toDouble,payload1(1).toDouble,payload1(2).toDouble,payload1(3).toDouble,payload1(4).toDouble,payload1(5).toDouble,payload1(6).toDouble,payload1(7).toDouble,payload1(8).toDouble,payload1(9).toDouble,payload1(10).toDouble,payload1(11).toDouble,payload1(12).toDouble,payload1(13).toDouble,payload1(14).toDouble,payload1(15).toDouble,payload1(16).toDouble,payload1(17).toDouble,payload1(18).toDouble,payload1(19).toDouble,payload1(20).toDouble,payload1(21).toDouble,payload1(22).toDouble,payload1(23).toDouble,payload1(24).toDouble,payload1(25).toDouble,payload1(26).toDouble,payload1(27).toDouble,payload1(28).toDouble,payload1(29).toDouble,payload1(30).toDouble,payload1(31).toDouble,payload1(32).toDouble,payload1(33).toDouble,payload1(34).toDouble,payload1(35).toDouble,payload1(36).toDouble,payload1(37).toDouble,payload1(38).toDouble,payload1(39).toDouble,payload1(40).toDouble,payload1(41).toDouble,payload1(42).toDouble,payload1(43).toDouble,payload1(44).toDouble,payload1(45).toDouble,payload1(46).toDouble,payload1(47).toDouble,payload1(48).toDouble,payload1(49).toDouble,payload1(50).toDouble,payload1(51).toDouble,payload1(52).toDouble,payload1(53).toDouble,payload1(54).toDouble,payload1(55).toDouble,payload1(56).toDouble,payload1(57).toDouble,payload1(58).toDouble,payload1(59).toDouble,payload1(60).toDouble,payload1(61).toDouble,payload1(62).toDouble,payload1(63).toDouble,payload1(64).toDouble,payload1(65).toDouble,payload1(66).toDouble,payload1(67).toDouble,payload1(68).toDouble,payload1(69).toDouble,payload1(70).toDouble,payload1(71).toDouble,payload1(72).toDouble,payload1(73).toDouble,payload1(74).toDouble,payload1(75).toDouble,payload1(76).toDouble,payload1(77).toDouble,payload1(78).toDouble,payload1(79).toDouble,payload1(80).toDouble,payload1(81).toDouble,payload1(82).toDouble,payload1(83).toDouble,payload1(84).toDouble,payload1(85).toDouble,payload1(86).toDouble,payload1(87).toDouble,payload1(88).toDouble,payload1(89).toDouble,payload1(90).toDouble,payload1(91).toDouble,payload1(92).toDouble,payload1(93).toDouble,payload1(94).toDouble,payload1(95).toDouble,payload1(96).toDouble,payload1(97).toDouble,payload1(98).toDouble,payload1(99).toDouble,payload1(100).toDouble,payload1(101).toDouble,payload1(102).toDouble,payload1(103).toDouble,payload1(104).toDouble,payload1(105).toDouble,payload1(106).toDouble,payload1(107).toDouble,payload1(108).toDouble,payload1(109).toDouble,payload1(110).toDouble,payload1(111).toDouble,payload1(112).toDouble,payload1(113).toDouble,payload1(114).toDouble,payload1(115).toDouble,payload1(116).toDouble,payload1(117).toDouble,payload1(118).toDouble,payload1(119).toDouble,payload1(120).toDouble,payload1(121).toDouble,payload1(122).toDouble,payload1(123).toDouble,payload1(124).toDouble,payload1(125).toDouble,payload1(126).toDouble,payload1(127).toDouble,payload1(128).toDouble,payload1(129).toDouble,payload1(130).toDouble,payload1(131).toDouble,payload1(132).toDouble,payload1(133).toDouble,payload1(134).toDouble,payload1(135).toDouble,payload1(136).toDouble,payload1(137).toDouble,payload1(138).toDouble,payload1(139).toDouble,payload1(140).toDouble,payload1(141).toDouble,payload1(142).toDouble,payload1(143).toDouble,payload1(144).toDouble,payload1(145).toDouble,payload1(146).toDouble,payload1(147).toDouble,payload1(148).toDouble,payload1(149).toDouble,payload1(150).toDouble,payload1(151).toDouble,payload1(152).toDouble,payload1(153).toDouble,payload1(154).toDouble,payload1(155).toDouble,payload1(156).toDouble,payload1(157).toDouble,payload1(158).toDouble
,payload1(159).toDouble,payload1(160).toDouble,payload1(161).toDouble,payload1(162).toDouble,payload1(163).toDouble,payload1(164).toDouble,payload1(165).toDouble,payload1(166).toDouble,payload1(167).toDouble,payload1(168).toDouble,payload1(169).toDouble,payload1(170).toDouble,payload1(171).toDouble,payload1(172).toDouble,payload1(173).toDouble,payload1(174).toDouble,payload1(175).toDouble,payload1(176).toDouble,payload1(177).toDouble,payload1(178).toDouble,payload1(179).toDouble,payload1(180).toDouble,payload1(181).toDouble,payload1(182).toDouble,payload1(183).toDouble,payload1(184).toDouble,payload1(185).toDouble,payload1(186).toDouble,payload1(187).toDouble,payload1(188).toDouble,payload1(189).toDouble,payload1(190).toDouble,payload1(191).toDouble,payload1(192).toDouble,payload1(193).toDouble,payload1(194).toDouble,payload1(195).toDouble,payload1(196).toDouble,payload1(197).toDouble,payload1(198).toDouble,payload1(199).toDouble,payload1(200).toDouble,payload1(201).toDouble,payload1(202).toDouble,payload1(203).toDouble,payload1(204).toDouble,payload1(205).toDouble,payload1(206).toDouble,payload1(207).toDouble,payload1(208).toDouble,payload1(209).toDouble,payload1(210).toDouble,payload1(211).toDouble,payload1(212).toDouble,payload1(213).toDouble,payload1(214).toDouble,payload1(215).toDouble,payload1(216).toDouble,payload1(217).toDouble,payload1(218).toDouble,payload1(219).toDouble,payload1(220).toDouble,payload1(221).toDouble,payload1(222).toDouble,payload1(223).toDouble,payload1(224).toDouble,payload1(225).toDouble,payload1(226).toDouble,payload1(227).toDouble,payload1(228).toDouble,payload1(229).toDouble,payload1(230).toDouble,payload1(231).toDouble,payload1(232).toDouble,payload1(233).toDouble,payload1(234).toDouble,payload1(235).toDouble,payload1(236).toDouble,payload1(237).toDouble,payload1(238).toDouble)
}).toDF(typedCols: _*)
//insert dataframe in cassandra table
df
.write
.format("org.apache.spark.sql.cassandra")
.mode(SaveMode.Append)
.options(Map("keyspace" -> "ztedb4g", "table" -> nameTable.toLowerCase)) // tolowercase because the name table comes in uppercase
.save()
df.show(1)
println(s"${df.count()} rows processed.")
}
}
}
}
}
ssc.start()
ssc.awaitTermination()
}
The producer works well and publishes the messages as I want it to, but when I execute the consumer to insert into a table called "Cellmodu" I get the following error:
Exception in thread "streaming-job-executor-11" java.lang.ClassFormatError: com/datastax/demo/cellmodu
at com.datastax.demo.SparkKafkaConsumerCellmodu$$anonfun$1.apply(SparkKafkaConsumerCellmodu.scala:90)
at com.datastax.demo.SparkKafkaConsumerCellmodu$$anonfun$1.apply(SparkKafkaConsumerCellmodu.scala:57)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I keep getting this error over and over again for different streaming jobs, and then nothing is inserted into my table. Note that I executed the exact same code for other tables (with a different case class matching each table's schema, of course) and it worked just fine. I don't understand why I get this error for only a few tables like this one.
As per your exception, it is clearly a java.lang.ClassFormatError. First compare with the other classes that are working fine when inserting into their tables. If you are using XML config information for Cellmodu, please check whether something is wrong in it.
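Not part of the original answer, but since cellmodu declares roughly 230 fields, one hedged workaround worth considering is building the DataFrame from an explicit StructType schema instead of a huge case class. This is only a sketch, untested against this exact setup; it is written as a helper that takes the sqlContext, headerTable and rddtoinsert values already present in the consumer's foreachRDD block:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

// headerTable and rddtoinsert are the values built inside the consumer's foreachRDD block
def toDataFrame(sqlContext: SQLContext, headerTable: String, rddtoinsert: RDD[String]): DataFrame = {
  // build the schema dynamically from the header line instead of a ~230-field case class
  val typedCols: Array[String] = headerTable.split(",")
  val schema = StructType(typedCols.map(name => StructField(name, DoubleType, nullable = true)))

  // parse each CSV line into a Row of doubles
  val rowRdd: RDD[Row] = rddtoinsert.map(line => Row.fromSeq(line.split(",").map(_.toDouble).toSeq))

  sqlContext.createDataFrame(rowRdd, schema)
}
// the returned DataFrame can then be written to Cassandra exactly as in the original code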

How to extract a vector from the session?

I have saved a vector in the session and I want to use a random value from the vector, but I don't know how to extract the value in the session.
Errors:
'httpRequest-6' failed to execute: Vector(437420, 443940, 443932,
437437, 443981, 443956, 443973, 443915, 437445) named 'termIds' does
not support .random function
And
In the 2nd scenario it passes the vector into the GET request like this:
http://someurl/api/thr/Vector(435854)/terms/Vector(437420, 443940, 443932, 437437, 443981, 443956, 443973, 443915, 437445)
instead of:
http://someurl/api/thr/435854/terms/443973
::Here is my script::
class getTerm extends Simulation {
val repeatCount = Integer.getInteger("repeatCount", 1).toInt
val userCount = Integer.getInteger("userCount", 1).toInt
val turl = System.getProperty("turl", "some url")
val httpProtocol = http
.baseURL("http://" + turl)
val headers_10 = Map("Content-Type" -> """application/json""")
var thrIds = ""
var termIds = ""
// Scenario - 1
val getTerms = scenario("Scn 1")
.exec(http("list_of_term")
.get("/api/abc")
.headers(headers_10)
.check(jsonPath("$[*].id")
.findAll.saveAs("thrIds"))
)
.exec(http("get_all_terms")
.get("""/api/thr/${thrIds.random()}/terms""")
.headers(headers_10)
.check(jsonPath("$[*].id")
.findAll.saveAs("termIds"))
)
.exec(session => {
thrIds = session("thrIds").as[Long].toString
termIds = session("termIds").as[Long].toString
println("***************************************")
println("Session ====>>>> " + session)
println("Ths ID ====>>>> " + thrIds)
println("Term ID ====>>>> " + termIds)
println("***************************************")
session}
)
// Scenario - 2
// Want to extract vectors here and pass its value into get call
val getKnownTerms = scenario("Get Known Term")
.exec(_.set("thrIds", thrIds))
.exec(_.set("termIds", termIds))
.repeat (repeatCount){
exec(http("get_k_term")
.get("""/api/thr/${thrIds}/terms/${termIds.random()}""")
.headers(headers_10))
}
val scn = List(getTerms.inject(atOnceUsers(1)), getKnownTerms.inject(nothingFor(20 seconds), atOnceUsers(userCount)))
setUp(scn).protocols(httpProtocol)
}
Here is the solution which may help others.
class getTerm extends Simulation {

  val repeatCount = Integer.getInteger("repeatCount", 1).toInt
  val userCount = Integer.getInteger("userCount", 1).toInt
  val turl = System.getProperty("turl", "some url")

  val httpProtocol = http
    .baseURL("http://" + turl)

  val headers_10 = Map("Content-Type" -> """application/json""")

  // Change - 1
  var thrIds: Seq[String] = _
  var termIds: Seq[String] = _

  // Scenario - 1
  val getTerms = scenario("Scn 1")
    .exec(http("list_of_term")
      .get("/api/abc")
      .headers(headers_10)
      .check(jsonPath("$[*].id")
        .findAll
        .transform { v => thrIds = v; v }
        .saveAs("thrIds"))
    )
    .exec(http("get_all_trms")
      .get("""/api/thr/${thrIds.random()}/terms""")
      .headers(headers_10)
      .check(jsonPath("$[*].id")
        .findAll
        .transform { v => termIds = v; v }
        .saveAs("termIds"))
    )

  // Scenario - 2
  val getKnownTerms = scenario("Get Known Term")
    .exec(_.set("thrIds", thrIds))
    .exec(_.set("termIds", termIds))
    .repeat(repeatCount) {
      exec(http("get_k_term")
        .get("""/api/thr/${thrIds.random()}/terms/${termIds.random()}""")
        .headers(headers_10))
    }

  val scn = List(getTerms.inject(atOnceUsers(1)), getKnownTerms.inject(nothingFor(20 seconds), atOnceUsers(userCount)))

  setUp(scn).protocols(httpProtocol)
}
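As an alternative, the random element can also be picked in a session function rather than with the .random() EL helper. This is only a hedged sketch (Gatling 2.x style); the thrId/termId attribute names are illustrative, and note that here the random pick happens once per user rather than once per request:
val getKnownTerms = scenario("Get Known Term")
  .exec(_.set("thrIds", thrIds))    // vars filled by Scenario 1, as in the solution above
  .exec(_.set("termIds", termIds))
  .exec { session =>
    // pick one random element from each saved vector and store it as a plain value
    val thrs = session("thrIds").as[Seq[String]]
    val terms = session("termIds").as[Seq[String]]
    session
      .set("thrId", thrs(scala.util.Random.nextInt(thrs.size)))
      .set("termId", terms(scala.util.Random.nextInt(terms.size)))
  }
  .repeat(repeatCount) {
    exec(http("get_k_term")
      .get("""/api/thr/${thrId}/terms/${termId}""")
      .headers(headers_10))
  }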

What's the best way to read a multiline input format into one record in Spark?

Below is what the input file (CSV) looks like:
Carrier_create_date,Message,REF_SHEET_CREATEDATE,7/1/2008
Carrier_create_time,Message,REF_SHEET_CREATETIME,8:53:57
Carrier_campaign,Analog,REF_SHEET_CAMPAIGN,25
Carrier_run_no,Analog,REF_SHEET_RUNNO,7
Below is the list of columns each row has:
(Carrier_create_date, Carrier_create_time, Carrier_campaign, Carrier_run_no)
Desired output as dataframe:
7/1/2008,8:53:57,25,7
Basically the input file has a column name and its value on each row.
What I have tried so far is:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkContext, SparkConf}
object coater4CR {
// Define the application Name
val AppName: String = "coater4CR"
// Set the logging level to ERROR
Logger.getLogger("org.apache").setLevel(Level.ERROR)
def main(args: Array[String]): Unit = {
// define the input parmeters
val input_file = "/Users/gangadharkadam/myapps/NlrPraxair/src/main/resources/NLR_Praxair/2008/3QTR2008/Coater_4/C025007.csv"
// Create the Spark configuration and the spark context
println("Initializing the Spark Context...")
val conf = new SparkConf().setAppName(AppName).setMaster("local")
// Define the Spark Context
val sc = new SparkContext(conf)
// Read the csv file
val inputRDD = sc.wholeTextFiles(input_file)
.flatMap(x => x._2.split(" "))
.map(x => {
val rowData = x.split("\n")
var Carrier_create_date: String = ""
var Carrier_create_time: String = ""
var Carrier_campaign: String = ""
var Carrier_run_no: String = ""
for (data <- rowData) {
if (data.trim().startsWith("Carrier_create_date")) {
Carrier_create_date = data.split(",")(3)
} else if (data.trim().startsWith("Carrier_create_time")) {
Carrier_create_time = data.split(",")(3)
} else if (data.trim().startsWith("Carrier_campaign")) {
Carrier_campaign = data.split(",")(3)
} else if (data.trim().startsWith("Carrier_run_no")) {
Carrier_run_no = data.split(",")(3)
}
}
(Carrier_create_date, Carrier_create_time, Carrier_campaign, Carrier_run_no)
}).foreach(println)
}
}
Issues with the above code:
When I run the above code I get an empty result, as below:
(,,,)
When I change
Carrier_campaign = data.split(",")(3)
to
Carrier_campaign = data.split(",")(2)
I get the output below, which is somewhat closer:
(REF_SHEET_CREATEDATE,REF_SHEET_CREATETIME,REF_SHEET_CAMPAIGN,REF_SHEET_RUNNO)
(,,,)
Somehow the above code is not able to pick the last column position from the data row, but it works for column positions 0, 1, and 2.
So my questions are:
What's wrong with the above code?
What's an efficient approach to read this multiline input and load it in tabular form into a database?
Appreciate any help/pointers on this. Thanks.
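Not part of the original post, but here is one hedged sketch of a simpler approach: treat every line as a key/value pair (column name in field 0, value in the last field), collect the pairs into a map, and build a one-row DataFrame from it. It assumes Spark 2.x's SparkSession (on 1.x the same idea works with SQLContext); the object name and file path are illustrative:
import org.apache.spark.sql.SparkSession

object Coater4CRSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local").appName("coater4CR").getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // each line is "<column name>,<type>,<ref name>,<value>"; keep name -> value
    val kv: Map[String, String] = sc.textFile("C025007.csv")   // path is illustrative
      .map(_.trim)
      .filter(_.nonEmpty)
      .map { line =>
        val fields = line.split(",", -1).map(_.trim)           // -1 keeps trailing empty fields
        fields(0) -> fields.last
      }
      .collect()
      .toMap

    val wanted = Seq("Carrier_create_date", "Carrier_create_time", "Carrier_campaign", "Carrier_run_no")
    val values = wanted.map(kv.getOrElse(_, ""))

    // one row, one column per carrier attribute
    val df = Seq((values(0), values(1), values(2), values(3))).toDF(wanted: _*)
    df.show()
  }
}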

Scala/Akka NullPointerException on master actor

I am learning the Akka framework for parallel processing in Scala, and I was migrating a Java project to Scala so I can learn both Akka and Scala at the same time. I am getting a NullPointerException on the master actor when trying to receive a mutable object from the worker actor after some computation in the worker. All the code is below.
import akka.actor._
import java.math.BigInteger
import akka.routing.ActorRefRoutee
import akka.routing.Router
import akka.routing.RoundRobinRoutingLogic
object Main extends App {
val system = ActorSystem("CalcSystem")
val masterActor = system.actorOf(Props[Master], "master")
masterActor.tell(new Calculate, ActorRef.noSender)
}
class Master extends Actor {
private val messages: Int = 10;
var resultList: Seq[String] = _
//val workerRouter = this.context.actorOf(Props[Worker].withRouter(new RoundRobinRouter(2)), "worker")
var router = {
val routees = Vector.fill(5) {
val r = context.actorOf(Props[Worker])
context watch r
ActorRefRoutee(r)
}
Router(RoundRobinRoutingLogic(), routees)
}
def receive() = {
case msg: Calculate =>
processMessages()
case msg: Result =>
resultList :+ msg.getFactorial().toString
println(msg.getFactorial())
if (resultList.length == messages) {
end
}
}
private def processMessages() {
var i: Int = 0
for (i <- 1 to messages) {
// workerRouter.tell(new Work, self)
router.route(new Work, self)
}
}
private def end() {
println("List = " + resultList)
this.context.system.shutdown()
}
}
import akka.actor._
import java.math.BigInteger
class Worker extends Actor {
private val calculator = new Calculator
def receive() = {
case msg: Work =>
println("Called calculator.calculateFactorial: " + context.self.toString())
val result = new Result(calculator.calculateFactorial)
sender.tell(result, this.context.parent)
case _ =>
println("I don't know what to do with this...")
}
}
import java.math.BigInteger
class Result(bigInt: BigInteger) {
def getFactorial(): BigInteger = bigInt
}
import java.math.BigInteger
class Calculator {
def calculateFactorial(): BigInteger = {
var result: BigInteger = BigInteger.valueOf(1)
var i = 0
for(i <- 1 to 4) {
result = result.multiply(BigInteger.valueOf(i))
}
println("result: " + result)
result
}
}
You initialize resultList with null and then try to append to it.

Does your calculation ever stop? In the line
resultList :+ msg.getFactorial().toString
you're creating a copy of the sequence with an element appended, but there is no assignment back to the var resultList.
This line will work as you want:
resultList = resultList :+ msg.getFactorial().toString
I recommend you avoid mutable variables in actors and use context.become:
https://github.com/alexandru/scala-best-practices/blob/master/sections/5-actors.md#52-should-mutate-state-in-actors-only-with-contextbecome
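To make the context.become suggestion concrete, here is a hedged sketch of the Master rewritten without a var. It reuses the question's Worker, Work, Calculate and Result types, and uses context.system.terminate(), which replaces shutdown() in newer Akka versions:
import akka.actor.{Actor, Props}
import akka.routing.{ActorRefRoutee, RoundRobinRoutingLogic, Router}

class Master extends Actor {
  private val messages = 10

  private val router: Router = {
    val routees = Vector.fill(5) {
      val r = context.actorOf(Props[Worker])
      context.watch(r)
      ActorRefRoutee(r)
    }
    Router(RoundRobinRoutingLogic(), routees)
  }

  // start with an empty accumulator instead of a null var
  def receive: Receive = collecting(Vector.empty)

  private def collecting(results: Vector[String]): Receive = {
    case _: Calculate =>
      (1 to messages).foreach(_ => router.route(new Work, self))
    case msg: Result =>
      val updated = results :+ msg.getFactorial().toString
      if (updated.length == messages) {
        println("List = " + updated)
        context.system.terminate()   // shutdown() on older Akka versions
      } else {
        // swap in a new behaviour carrying the grown accumulator
        context.become(collecting(updated))
      }
  }
}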

Is there a cleaner way to do this Group Query in MongoDB from Groovy?

I'm working on learning MongoDB. Language of choice for the current run at it is Groovy.
Working on Group Queries by trying to answer the question of which pet is the most needy one.
Below is my first attempt and it's awful. Any help cleaning this up (or simply confirming that there isn't a cleaner way to do it) would be much appreciated.
Thanks in advance!
package mongo.pets
import com.gmongo.GMongo
import com.mongodb.BasicDBObject
import com.mongodb.DBObject
class StatsController {
def dbPets = new GMongo().getDB('needsHotel').getCollection('pets')
//FIXME OMG THIS IS AWFUL!!!
def index = {
def petsNeed = 'a walk'
def reduce = 'function(doc, aggregator) { aggregator.needsCount += doc.needs.length }'
def key = new BasicDBObject()
key.put("name", true)
def initial = new BasicDBObject()
initial.put ("needsCount", 0)
def maxNeeds = 0
def needyPets = []
dbPets.group(key, new BasicDBObject(), initial, reduce).each {
if (maxNeeds < it['needsCount']) {
maxNeeds = it['needsCount']
needyPets = []
needyPets += it['name']
} else if (maxNeeds == it['needsCount']) {
needyPets += it['name']
}
}
def needyPet = needyPets
[petsNeedingCount: dbPets.find([needs: petsNeed]).count(), petsNeed: petsNeed, mostNeedyPet: needyPet]
}
}
It should be possible to change the whole method to this (but I don't have MongoDB to hand to test it):
def index = {
  def petsNeed = 'a walk'
  def reduce = 'function(doc, aggregator) { aggregator.needsCount += doc.needs.length }'
  def key = [ name: true ] as BasicDBObject
  def initial = [ needsCount: 0 ] as BasicDBObject

  def allPets = dbPets.group( key, new BasicDBObject(), initial, reduce )
  def maxNeeds = allPets*.needsCount.collect { it as Integer }.max()
  def needyPet = allPets.findAll { maxNeeds == it.needsCount as Integer }.name

  [petsNeedingCount: dbPets.find([needs: petsNeed]).count(), petsNeed: petsNeed, mostNeedyPet: needyPet]
}
