Can't save DataFrame to MongoDB - Java
I have Scala code that reads data from Twitter using Spark Streaming, and I would like to do the same in Java. I'm trying to serialize the data with the Jackson ObjectMapper, but I get an error on this line:

MongoSpark.save(dataFrame, writeConfig);

It is underlined with "Cannot resolve method save(org.apache.spark.sql.DataFrame, com.mongodb.spark.config.WriteConfig)". Can I do this some other way? I'm also confused about the following line; can I do the same in Java?

MongoSpark.save(rawTweetsDF.coalesce(1).write.format("org.apache.spark.sql.json").option("forensicdb","LiveRawTweets").mode("append"), writeConfig)

P.S. I'm using Spark 1.6.2.
object tweetstreamingmodel {

  //***********************************************************************************
  @transient
  @volatile private var spark_SparkSession: SparkSession = _ // equivalent of SQLContext

  val naivemodelpth = "/home/smulwa/data/naiveBayesModel"

  case class SummaryStats(Recall: Double, Precision: Double, F1measure: Double, Accuracy: Double)

  var tweetcategory: String = _
  //***********************************************************************************

  def main(args: Array[String]) {
    try {
      var totalTweets: Long = 0

      if (spark_SparkSession == null) {
        spark_SparkSession = SentUtilities.getSparkSession() // get the SparkSession object
      }

      val spark_streamcontext = SentUtilities.getSparkStreamingContext(spark_SparkSession.sparkContext)
      spark_streamcontext.checkpoint("hdfs://KENBO-SPK08.forensics.net:54310/checkpoint/")

      // Load the Naive Bayes model from the local drive.
      val sqlcontext = spark_SparkSession.sqlContext // create an SQLContext from the SparkSession object
      import sqlcontext.implicits._

      val twitteroAuth: Some[OAuthAuthorization] = OAuthUtilities.getTwitterOAuth()
      val tweetfilters = MongoScalaUtil.getTweetFilters(spark_SparkSession)

      val Twitterstream: DStream[Status] = TwitterUtils.createStream(spark_streamcontext, twitteroAuth, tweetfilters,
        StorageLevel.MEMORY_AND_DISK_SER).filter(_.getLang() == "en")

      Twitterstream.foreachRDD { rdd =>
        if (rdd != null && !rdd.isEmpty() && !rdd.partitions.isEmpty) {
          saveRawTweetsToMongoDB(rdd)
          rdd.foreachPartition { partitionOfRecords =>
            if (!partitionOfRecords.isEmpty) {
              partitionOfRecords.foreach(record =>
                MongoScalaUtil.SaveRawtweetstoMongodb(record.toString, record.getUser.getId, record.getId,
                  SentUtilities.getStreamDate(), SentUtilities.getStreamTime())) // mongo_utilities.save(record.toString, spark_SparkSession.sparkContext)
            }
          }
        }
      }

      val jacksonObjectMapper: ObjectMapper = new ObjectMapper()

      // @param rdd -- RDD of Status objects to save.
      def saveRawTweetsToMongoDB(rdd: RDD[Status]): Unit = {
        try {
          val sqlContext = spark_SparkSession.sqlContext
          val tweet = rdd.map(status => jacksonObjectMapper.writeValueAsString(status))
          val rawTweetsDF = sqlContext.read.json(tweet)

          val readConfig: ReadConfig = ReadConfig(Map("uri" ->
            "mongodb://10.0.10.100:27017/forensicdb.LiveRawTweets?readPreference=primaryPreferred"))
          val writeConfig: WriteConfig = WriteConfig(Map("uri" ->
            "mongodb://10.0.10.100:27017/forensicdb.LiveRawTweets"))

          MongoSpark.save(rawTweetsDF.coalesce(1).write.format("org.apache.spark.sql.json").option("forensicdb",
            "LiveRawTweets").mode("append"), writeConfig)
        } catch {
          case e: Exception => println("Error Saving tweets to Mongodb: " + e)
        }
      }
    } catch {
      case e: Exception => e.printStackTrace() // keep streaming setup errors visible
    }
  }
}
And the Java analogue:
public class Main {

    // Set system credentials for access to Twitter
    private static void setTwitterOAuth() {
        System.setProperty("twitter4j.oauth.consumerKey", TwitterCredentials.consumerKey);
        System.setProperty("twitter4j.oauth.consumerSecret", TwitterCredentials.consumerSecret);
        System.setProperty("twitter4j.oauth.accessToken", TwitterCredentials.accessToken);
        System.setProperty("twitter4j.oauth.accessTokenSecret", TwitterCredentials.accessTokenSecret);
    }

    public static void main(String[] args) {
        setTwitterOAuth();

        SparkConf conf = new SparkConf().setMaster("local[2]")
                                        .setAppName("SparkTwitter");

        // Spark contexts
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        JavaStreamingContext jssc = new JavaStreamingContext(sparkContext, new Duration(1000));
        JavaReceiverInputDStream<Status> twitterStream = TwitterUtils.createStream(jssc);

        // Stream that contains only tweets in English
        JavaDStream<Status> enTweetsDStream = twitterStream.filter(status -> "en".equalsIgnoreCase(status.getLang()));
        enTweetsDStream.persist(StorageLevel.MEMORY_AND_DISK());

        enTweetsDStream.foreachRDD(rdd -> {
            if (rdd != null && !rdd.isEmpty() && !rdd.partitions().isEmpty()) {
                saveRawTweetsToMongoDb(rdd, sparkContext);
            }
        });

        enTweetsDStream.print();
        jssc.start();
        jssc.awaitTermination();
    }

    static void saveRawTweetsToMongoDb(JavaRDD<Status> rdd, JavaSparkContext sparkContext) {
        try {
            ObjectMapper objectMapper = new ObjectMapper();
            Function<Status, String> toJsonString = status -> objectMapper.writeValueAsString(status);
            SQLContext sqlContext = new SQLContext(sparkContext);
            JavaRDD<String> tweet = rdd.map(toJsonString);
            DataFrame dataFrame = sqlContext.read().json(tweet);

            // Settings for reading
            Map<String, String> readOverrides = new HashMap<>();
            readOverrides.put("uri", "mongodb://127.0.0.1/forensicdb.LiveRawTweets");
            readOverrides.put("readPreference", "primaryPreferred");
            ReadConfig readConfig = ReadConfig.create(sparkContext).withJavaOptions(readOverrides);

            // Settings for writing
            Map<String, String> writeOverrides = new HashMap<>();
            writeOverrides.put("uri", "mongodb://127.0.0.1/forensicdb.LiveRawTweets");
            WriteConfig writeConfig = WriteConfig.create(sparkContext).withJavaOptions(writeOverrides);

            MongoSpark.write(dataFrame).option("collection", "LiveRawTweets").mode("append").save();
            MongoSpark.save(dataFrame, writeConfig); // this is the call that does not resolve
        } catch (Exception e) {
            System.out.println("Error saving to database: " + e);
        }
    }
}
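For reference, here is a minimal sketch of how the quoted Scala save call might be mirrored in Java. It assumes the mongo-spark-connector build used with Spark 1.6 exposes the save(DataFrameWriter, WriteConfig) overload that the Scala snippet already relies on (the (DataFrame, WriteConfig) signature is the one the IDE cannot resolve), and that the overload is reachable from Java as a static method. The helper name and the "collection" option below are illustrative, not taken from the post:

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;
import org.apache.spark.sql.DataFrame;

// Hypothetical helper: a DataFrameWriter (not the DataFrame itself) is paired with the WriteConfig,
// mirroring MongoSpark.save(rawTweetsDF.coalesce(1).write...., writeConfig) from the Scala code above.
class MongoSaveSketch {
    static void saveToMongo(DataFrame dataFrame, WriteConfig writeConfig) {
        MongoSpark.save(
            dataFrame.coalesce(1)
                     .write()
                     .option("collection", "LiveRawTweets") // assumed collection name
                     .mode("append"),
            writeConfig);
    }
}

The only point of the sketch is that the argument paired with the WriteConfig is a DataFrameWriter rather than the DataFrame itself, which is exactly what the quoted Scala line does.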
Related
Spark - createDataFrame returns NPE
I'm trying to run these lines:

dsFinalSegRfm.show(20, false);
Long compilationTime = System.currentTimeMillis() / 1000;

JavaRDD<CustomerKnowledgeEntity> customerKnowledgeList = dsFinalSegRfm.javaRDD().map(
    (Function<Row, CustomerKnowledgeEntity>) rowRfm -> {
        CustomerKnowledgeEntity customerKnowledge = new CustomerKnowledgeEntity();
        customerKnowledge.setCustomerId(new Long(getString(rowRfm.getAs("CLI_ID"))));
        customerKnowledge.setKnowledgeType("rfm-segmentation");
        customerKnowledge.setKnowledgeTypeId("default");

        InformationsEntity infos = new InformationsEntity();
        infos.setCreationDate(new Date());
        infos.setModificationDate(new Date());
        infos.setUserModification("addKnowledge");
        customerKnowledge.setInformations(infos);

        List<KnowledgeEntity> knowledgeEntityList = new ArrayList<>();
        List<WrappedArray<String>> segList = rowRfm.getList(rowRfm.fieldIndex("SEGS"));
        for (WrappedArray<String> seg : segList) {
            KnowledgeEntity knowledge = new KnowledgeEntity();
            Map<String, Object> attr = new HashMap<>();
            attr.put("segment", seg.apply(1));
            attr.put("segmentSemester", seg.apply(2));
            knowledge.setKnowledgeId(seg.apply(0));
            knowledge.setAttributes(attr);
            knowledge.setPriority(0);
            knowledge.setCount(1);
            knowledge.setDeleted(false);
            knowledgeEntityList.add(knowledge);
        }
        customerKnowledge.setKnowledgeCollections(knowledgeEntityList);
        return customerKnowledge;
    });

Long dataConstructionTime = System.currentTimeMillis() / 1000;

Dataset<Row> dataset = sparkSession
    .createDataFrame(customerKnowledgeList, CustomerKnowledgeEntity.class)
    .repartition(16)
    .cache();

The dsFinalSegRfm.show(20, false); call returns what I expect, but I'm getting a NullPointerException from the createDataFrame method. I'm learning Spark, but I find it very opaque to debug... Any help is appreciated!
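The entity classes are not shown in the post, so purely as a point of comparison, here is a minimal, self-contained bean-to-DataFrame round trip of the kind createDataFrame(JavaRDD, Class) expects (the Person bean and its fields are hypothetical). If a stripped-down bean like this works in the same session, the NPE is more likely related to the entity classes or their nested types than to createDataFrame itself:

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BeanToDataFrameCheck {

    // createDataFrame(JavaRDD, Class) reflects over JavaBean properties, so the bean
    // should be public and serializable, with a no-arg constructor and getter/setter pairs.
    public static class Person implements Serializable {
        private Long id;
        private String name;

        public Person() { }

        public Long getId() { return id; }
        public void setId(Long id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").appName("beanCheck").getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        Person p = new Person();
        p.setId(1L);
        p.setName("test");

        JavaRDD<Person> people = jsc.parallelize(Arrays.asList(p));
        Dataset<Row> df = spark.createDataFrame(people, Person.class);
        df.show();

        spark.stop();
    }
}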
Exception in thread "streaming-job-executor-11" java.lang.ClassFormatError
I am working with kafka (scala) and spark streaming (scala) to insert data from several CSVs to Cassandra tables, and I made a producer and a consumer, here are their respective codes Producer: import java.sql.Timestamp import java.util.Properties import java.io._ import java.io.File import java.nio.file.{Files, Paths, Path, SimpleFileVisitor, FileVisitResult} import scala.io.Source import akka.actor.{Actor, ActorSystem, Props} import com.typesafe.config.ConfigFactory import kafka.producer.{KeyedMessage, Producer, ProducerConfig} class produceMessages(brokers: String, topic: String) extends Actor { // All helpers needed to send messages def filecontent(namefile: String){ for (line <- Source.fromFile(namefile).getLines) { println(line) } } def getListOfFiles(dir: String):List[File] = { val d = new File(dir) if (d.exists && d.isDirectory) { d.listFiles.filter(_.isFile).toList } else { List[File]() } } def between(value: String, a:String, b: String):String = { // Return a substring between the two strings. val posA = value.indexOf(a) val posB = value.lastIndexOf(b) val adjustedPosA = posA + a.length() val res = value.substring(adjustedPosA, posB) return res } def getTableName(filePath: String):String = { //return table name from filePath val fileName = filePath.toString.split("\\\\").last val tableName = between(fileName,"100001_","_2017") return tableName } // end of helpers object kafka { val producer = { val props = new Properties() props.put("metadata.broker.list", brokers) //props.put(" max.request.size","5242880") props.put("serializer.class", "kafka.serializer.StringEncoder") val config = new ProducerConfig(props) new Producer[String, String](config) } } def receive = { case "send" => { val listeFichiers = getListOfFiles("C:\\Users\\acer\\Desktop\\csvs") for (i <- 0 until listeFichiers.length)yield{ val chemin = listeFichiers(i).toString val nomTable = getTableName(chemin) println(nomTable) val lines = Source.fromFile(chemin).getLines.toArray val headerLine = lines(0) println(headerLine) val data = lines.slice(1,lines.length) val messages = for (j <- 0 until data.length) yield{ val str = s"${data(j).toString}" println(str) new KeyedMessage[String, String](topic, str) } //sending the messages val numberOfLinesInTable = new KeyedMessage[String, String](topic, data.length.toString) val table = new KeyedMessage[String, String](topic, nomTable) val header = new KeyedMessage[String, String](topic, headerLine) kafka.producer.send(numberOfLinesInTable) kafka.producer.send(table) kafka.producer.send(header) kafka.producer.send(messages: _*) } } /*case "delete" =>{ val listeFichiers = getListOfFiles("C:\\Users\\acer\\Desktop\\csvs") for (file <- listeFichiers){ if (file.isDirectory) Option(file.listFiles).map(_.toList).getOrElse(Nil).foreach(Files.delete(_)) file.delete } }*/ case _ => println("Not a valid message!") } } // Produces some random words between 1 and 100. 
object KafkaStreamProducer extends App { /* * Get runtime properties from application.conf */ val systemConfig = ConfigFactory.load() val kafkaHost = systemConfig.getString("KafkaStreamProducer.kafkaHost") println(s"kafkaHost $kafkaHost") val kafkaTopic = systemConfig.getString("KafkaStreamProducer.kafkaTopic") println(s"kafkaTopic $kafkaTopic") val numRecords = systemConfig.getLong("KafkaStreamProducer.numRecords") println(s"numRecords $numRecords") val waitMillis = systemConfig.getLong("KafkaStreamProducer.waitMillis") println(s"waitMillis $waitMillis") /* * Set up the Akka Actor */ val system = ActorSystem("KafkaStreamProducer") val messageActor = system.actorOf(Props(new produceMessages(kafkaHost, kafkaTopic)), name="genMessages") /* * Message Loop */ var numRecsWritten = 0 while(numRecsWritten < numRecords) { messageActor ! "send" numRecsWritten += numRecsWritten println(s"${numRecsWritten} records written.") //messageActor ! "delete" Thread sleep waitMillis } } And here is the consumer: package com.datastax.demo import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SQLContext, SaveMode, Row, SparkSession} import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time} import org.apache.spark.streaming.kafka.KafkaUtils import com.datastax.spark.connector._ import kafka.serializer.StringDecoder import org.apache.spark.rdd.RDD import java.sql.Timestamp import java.io.File import scala.io.Source import scala.reflect.runtime.universe import scala.tools.reflect.ToolBox case class cellmodu(collecttime: Double,sbnid: Double,enodebid: Double,cellid: Double,c373515500: Double,c373515501: Double,c373515502: Double,c373515503: Double,c373515504: Double,c373515505: Double,c373515506: Double,c373515507: Double,c373515508: Double,c373515509: Double,c373515510: Double,c373515511: Double,c373515512: Double,c373515513: Double,c373515514: Double,c373515515: Double,c373515516: Double,c373515517: Double,c373515518: Double,c373515519: Double,c373515520: Double,c373515521: Double,c373515522: Double,c373515523: Double,c373515524: Double,c373515525: Double,c373515526: Double,c373515527: Double,c373515528: Double,c373515529: Double,c373515530: Double,c373515531: Double,c373515532: Double,c373515533: Double,c373515534: Double,c373515535: Double,c373515536: Double,c373515537: Double,c373515538: Double,c373515539: Double,c373515540: Double,c373515541: Double,c373515542: Double,c373515543: Double,c373515544: Double,c373515545: Double,c373515546: Double,c373515547: Double,c373515548: Double,c373515549: Double,c373515550: Double,c373515551: Double,c373515552: Double,c373515553: Double,c373515554: Double,c373515555: Double,c373515556: Double,c373515557: Double,c373515558: Double,c373515559: Double,c373515560: Double,c373515561: Double,c373515562: Double,c373515563: Double,c373515564: Double,c373515565: Double,c373515566: Double,c373515567: Double,c373515568: Double,c373515569: Double,c373515570: Double,c373515571: Double,c373515572: Double,c373515573: Double,c373515574: Double,c373515575: Double,c373515576: Double,c373515577: Double,c373515578: Double,c373515589: Double,c373515590: Double,c373515591: Double,c373515592: Double,c373515593: Double,c373515594: Double,c373515595: Double,c373515596: Double,c373515597: Double,c373515598: Double,c373515601: Double,c373515602: Double,c373515608: Double,c373515609: Double,c373515610: Double,c373515611: Double,c373515616: Double,c373515618: Double,c373515619: 
Double,c373515620: Double,c373515621: Double,c373515622: Double,c373515623: Double,c373515624: Double,c373515625: Double,c373515626: Double,c373515627: Double,c373515628: Double,c373515629: Double,c373515630: Double,c373515631: Double,c373515632: Double,c373515633: Double,c373515634: Double,c373515635: Double,c373515636: Double,c373515637: Double,c373515638: Double,c373515639: Double,c373515640: Double,c373515641: Double,c373515642: Double,c373515643: Double,c373515644: Double,c373515645: Double,c373515646: Double,c373515647: Double,c373515648: Double,c373515649: Double,c373515650: Double,c373515651: Double,c373515652: Double,c373515653: Double,c373515654: Double,c373515655: Double,c373515656: Double,c373515657: Double,c373515658: Double,c373515659: Double,c373515660: Double,c373515661: Double,c373515662: Double,c373515663: Double,c373515664: Double,c373515665: Double,c373515666: Double,c373515667: Double,c373515668: Double,c373515669: Double,c373515670: Double,c373515671: Double,c373515672: Double,c373515673: Double,c373515674: Double,c373515675: Double,c373515676: Double,c373515677: Double,c373515678: Double,c373515679: Double,c373515680: Double,c373515681: Double,c373515682: Double,c373515683: Double,c373515684: Double,c373515685: Double,c373515686: Double,c373515687: Double,c373515688: Double,c373515689: Double,c373515690: Double,c373515691: Double,c373515692: Double,c373515693: Double,c373515694: Double,c373515695: Double,c373515696: Double,c373515697: Double,c373515698: Double,c373515699: Double,c373515700: Double,c373515701: Double,c373515702: Double,c373515703: Double,c373515704: Double,c373515705: Double,c373515706: Double,c373515707: Double,c373515708: Double,c373515709: Double,c373515710: Double,c373515711: Double,c373515712: Double,c373515713: Double,c373515714: Double,c373515715: Double,c373515716: Double,c373515717: Double,c373515718: Double,c373515719: Double,c373515720: Double,c373515721: Double,c373515722: Double,c373515723: Double,c373515724: Double,c373515725: Double,c373515726: Double,c373515727: Double,c373515728: Double,c373515729: Double,c373515730: Double,c373515731: Double,c373515732: Double,c373515733: Double,c373515734: Double,c373515735: Double,c373515736: Double,c373515737: Double,c373515738: Double,c373515739: Double,c373515740: Double,c373515741: Double,c373515742: Double,c373515743: Double,c373515744: Double,c373515745: Double,c373515746: Double,c373515747: Double,c373515748: Double,c373515749: Double,c373515750: Double,c373515751: Double,c373515752: Double,c373515753: Double,c373515754: Double,c373515755: Double,c373515756: Double) {} object SparkKafkaConsumerCellmodu extends App { //START OF HELPERS def isNumeric(str:String): Boolean = str.matches("[-+]?\\d+(\\.\\d+)?") def printList(args: List[_]): Unit = {args.foreach(println)} //END OF HELPERS val appName = "SparkKafkaConsumer" val conf = new SparkConf() .set("spark.cores.max", "2") //.set("spark.executor.memory", "512M") .set("spark.cassandra.connection.host","localhost") .setAppName(appName) val spark: SparkSession = SparkSession.builder.master("local").getOrCreate val sc = SparkContext.getOrCreate(conf) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ val ssc = new StreamingContext(sc, Milliseconds(1000)) ssc.checkpoint(appName) val kafkaTopics = Set("test") //val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092") val kafkaParams = Map( "bootstrap.servers" -> "localhost:9092", 
"fetch.message.max.bytes" -> "5242880") val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, kafkaTopics) kafkaStream .foreachRDD { (message: RDD[(String, String)]) => { val rddToArray = message.collect().toList val msg = rddToArray.map(_._2) var i = 0 while (i < msg.length){ if(isNumeric(msg(i))){ println("HHHHHHHHHHHHHHHHHHHHHHHHHHHHH") val numberLines = msg(i).toInt //get number of lines to insert in table val nameTable = msg(i+1) //get table name val headerTable = msg(i+2).toLowerCase //get the columns of the table println(headerTable) if(msg(i+1)=="CELLMODU"){ val typedCols : Array[String] = headerTable.split(",") // transform headerTable into array to define dataframe dynamically val listtoinsert:Array[String] = new Array[String](numberLines) // an empty list that will contain the lines to insert in the adequate table val k = i + 3 //to skip name of table and header //fill the toinsert array with the lines for (j <- 0 until numberLines){ listtoinsert(j) = msg(k + j) println (listtoinsert(j)) } //convert the array to RDD val rddtoinsert: RDD[(String)] = sc.parallelize(listtoinsert) //rddtoinsert.foreach(println) //convert rdd to dataframe val df = rddtoinsert.map { case (v) => v.split(",") }.map(payload1 => { // instance of dynamic class cellmodu(payload1(0).toDouble,payload1(1).toDouble,payload1(2).toDouble,payload1(3).toDouble,payload1(4).toDouble,payload1(5).toDouble,payload1(6).toDouble,payload1(7).toDouble,payload1(8).toDouble,payload1(9).toDouble,payload1(10).toDouble,payload1(11).toDouble,payload1(12).toDouble,payload1(13).toDouble,payload1(14).toDouble,payload1(15).toDouble,payload1(16).toDouble,payload1(17).toDouble,payload1(18).toDouble,payload1(19).toDouble,payload1(20).toDouble,payload1(21).toDouble,payload1(22).toDouble,payload1(23).toDouble,payload1(24).toDouble,payload1(25).toDouble,payload1(26).toDouble,payload1(27).toDouble,payload1(28).toDouble,payload1(29).toDouble,payload1(30).toDouble,payload1(31).toDouble,payload1(32).toDouble,payload1(33).toDouble,payload1(34).toDouble,payload1(35).toDouble,payload1(36).toDouble,payload1(37).toDouble,payload1(38).toDouble,payload1(39).toDouble,payload1(40).toDouble,payload1(41).toDouble,payload1(42).toDouble,payload1(43).toDouble,payload1(44).toDouble,payload1(45).toDouble,payload1(46).toDouble,payload1(47).toDouble,payload1(48).toDouble,payload1(49).toDouble,payload1(50).toDouble,payload1(51).toDouble,payload1(52).toDouble,payload1(53).toDouble,payload1(54).toDouble,payload1(55).toDouble,payload1(56).toDouble,payload1(57).toDouble,payload1(58).toDouble,payload1(59).toDouble,payload1(60).toDouble,payload1(61).toDouble,payload1(62).toDouble,payload1(63).toDouble,payload1(64).toDouble,payload1(65).toDouble,payload1(66).toDouble,payload1(67).toDouble,payload1(68).toDouble,payload1(69).toDouble,payload1(70).toDouble,payload1(71).toDouble,payload1(72).toDouble,payload1(73).toDouble,payload1(74).toDouble,payload1(75).toDouble,payload1(76).toDouble,payload1(77).toDouble,payload1(78).toDouble,payload1(79).toDouble,payload1(80).toDouble,payload1(81).toDouble,payload1(82).toDouble,payload1(83).toDouble,payload1(84).toDouble,payload1(85).toDouble,payload1(86).toDouble,payload1(87).toDouble,payload1(88).toDouble,payload1(89).toDouble,payload1(90).toDouble,payload1(91).toDouble,payload1(92).toDouble,payload1(93).toDouble,payload1(94).toDouble,payload1(95).toDouble,payload1(96).toDouble,payload1(97).toDouble,payload1(98).toDouble,payload1(99).toDouble,payload1(100).toDouble,payload1(10
1).toDouble,payload1(102).toDouble,payload1(103).toDouble,payload1(104).toDouble,payload1(105).toDouble,payload1(106).toDouble,payload1(107).toDouble,payload1(108).toDouble,payload1(109).toDouble,payload1(110).toDouble,payload1(111).toDouble,payload1(112).toDouble,payload1(113).toDouble,payload1(114).toDouble,payload1(115).toDouble,payload1(116).toDouble,payload1(117).toDouble,payload1(118).toDouble,payload1(119).toDouble,payload1(120).toDouble,payload1(121).toDouble,payload1(122).toDouble,payload1(123).toDouble,payload1(124).toDouble,payload1(125).toDouble,payload1(126).toDouble,payload1(127).toDouble,payload1(128).toDouble,payload1(129).toDouble,payload1(130).toDouble,payload1(131).toDouble,payload1(132).toDouble,payload1(133).toDouble,payload1(134).toDouble,payload1(135).toDouble,payload1(136).toDouble,payload1(137).toDouble,payload1(138).toDouble,payload1(139).toDouble,payload1(140).toDouble,payload1(141).toDouble,payload1(142).toDouble,payload1(143).toDouble,payload1(144).toDouble,payload1(145).toDouble,payload1(146).toDouble,payload1(147).toDouble,payload1(148).toDouble,payload1(149).toDouble,payload1(150).toDouble,payload1(151).toDouble,payload1(152).toDouble,payload1(153).toDouble,payload1(154).toDouble,payload1(155).toDouble,payload1(156).toDouble,payload1(157).toDouble,payload1(158).toDouble,payload1(159).toDouble,payload1(160).toDouble,payload1(161).toDouble,payload1(162).toDouble,payload1(163).toDouble,payload1(164).toDouble,payload1(165).toDouble,payload1(166).toDouble,payload1(167).toDouble,payload1(168).toDouble,payload1(169).toDouble,payload1(170).toDouble,payload1(171).toDouble,payload1(172).toDouble,payload1(173).toDouble,payload1(174).toDouble,payload1(175).toDouble,payload1(176).toDouble,payload1(177).toDouble,payload1(178).toDouble,payload1(179).toDouble,payload1(180).toDouble,payload1(181).toDouble,payload1(182).toDouble,payload1(183).toDouble,payload1(184).toDouble,payload1(185).toDouble,payload1(186).toDouble,payload1(187).toDouble,payload1(188).toDouble,payload1(189).toDouble,payload1(190).toDouble,payload1(191).toDouble,payload1(192).toDouble,payload1(193).toDouble,payload1(194).toDouble,payload1(195).toDouble,payload1(196).toDouble,payload1(197).toDouble,payload1(198).toDouble,payload1(199).toDouble,payload1(200).toDouble,payload1(201).toDouble,payload1(202).toDouble,payload1(203).toDouble,payload1(204).toDouble,payload1(205).toDouble,payload1(206).toDouble,payload1(207).toDouble,payload1(208).toDouble,payload1(209).toDouble,payload1(210).toDouble,payload1(211).toDouble,payload1(212).toDouble,payload1(213).toDouble,payload1(214).toDouble,payload1(215).toDouble,payload1(216).toDouble,payload1(217).toDouble,payload1(218).toDouble,payload1(219).toDouble,payload1(220).toDouble,payload1(221).toDouble,payload1(222).toDouble,payload1(223).toDouble,payload1(224).toDouble,payload1(225).toDouble,payload1(226).toDouble,payload1(227).toDouble,payload1(228).toDouble,payload1(229).toDouble,payload1(230).toDouble,payload1(231).toDouble,payload1(232).toDouble,payload1(233).toDouble,payload1(234).toDouble,payload1(235).toDouble,payload1(236).toDouble,payload1(237).toDouble,payload1(238).toDouble) }).toDF(typedCols: _*) //insert dataframe in cassandra table df .write .format("org.apache.spark.sql.cassandra") .mode(SaveMode.Append) .options(Map("keyspace" -> "ztedb4g", "table" -> nameTable.toLowerCase)) // tolowercase because the name table comes in uppercase .save() df.show(1) println(s"${df.count()} rows processed.") } } } } } ssc.start() ssc.awaitTermination() } The producer 
works well and publishes the messages as I want it to, but when I execute the consumer to insert in a table called "Cellmodu" I get the following error: Exception in thread "streaming-job-executor-11" java.lang.ClassFormatError: com/datastax/demo/cellmodu at com.datastax.demo.SparkKafkaConsumerCellmodu$$anonfun$1.apply(SparkKafkaConsumerCellmodu.scala:90) at com.datastax.demo.SparkKafkaConsumerCellmodu$$anonfun$1.apply(SparkKafkaConsumerCellmodu.scala:57) at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628) at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51) at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50) at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50) at scala.util.Try$.apply(Try.scala:192) at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39) at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257) at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257) at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257) at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58) at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) I keep getting this error over and over again for different streaming jobs and then nothing is inserted in my table, note that I tried to executre the exact same code for other tables with different case class of course that matches my table schema and it worked just fine, I don't understand why I get this error for few tables only like this one
As your exception shows, this is clearly a java.lang.ClassFormatError. First, compare the cellmodu class with the other classes that insert into their tables without problems. If you are using XML configuration for Cellmodu, check whether something is wrong in it.
How to extract a vector from the session?
I have saved vector in session and I want to use random value from the vector but dont know how to extract value in session. Errors: 'httpRequest-6' failed to execute: Vector(437420, 443940, 443932, 437437, 443981, 443956, 443973, 443915, 437445) named 'termIds' does not support .random function And In 2nd scenario It passes vector in get request like this way, http://someurl/api/thr/Vector(435854)/terms/Vector(437420, 443940, 443932, 437437, 443981, 443956, 443973, 443915, 437445) instead of using http://someurl/api/thr/435854/terms/443973 ::Here is my script:: class getTerm extends Simulation { val repeatCount = Integer.getInteger("repeatCount", 1).toInt val userCount = Integer.getInteger("userCount", 1).toInt val turl = System.getProperty("turl", "some url") val httpProtocol = http .baseURL("http://" + turl) val headers_10 = Map("Content-Type" -> """application/json""") var thrIds = "" var termIds = "" // Scenario - 1 val getTerms = scenario("Scn 1") .exec(http("list_of_term") .get("/api/abc") .headers(headers_10) .check(jsonPath("$[*].id") .findAll.saveAs("thrIds")) ) .exec(http("get_all_terms") .get("""/api/thr/${thrIds.random()}/terms""") .headers(headers_10) .check(jsonPath("$[*].id") .findAll.saveAs("termIds")) ) .exec(session => { thrIds = session("thrIds").as[Long].toString termIds = session("termIds").as[Long].toString println("***************************************") println("Session ====>>>> " + session) println("Ths ID ====>>>> " + thrIds) println("Term ID ====>>>> " + termIds) println("***************************************") session} ) // Scenario - 2 // Want to extract vectors here and pass its value into get call val getKnownTerms = scenario("Get Known Term") .exec(_.set("thrIds", thrIds)) .exec(_.set("termIds", termIds)) .repeat (repeatCount){ exec(http("get_k_term") .get("""/api/thr/${thrIds}/terms/${termIds.random()}""") .headers(headers_10)) } val scn = List(getTerms.inject(atOnceUsers(1)), getKnownTerms.inject(nothingFor(20 seconds), atOnceUsers(userCount))) setUp(scn).protocols(httpProtocol) }
Here is the solution which may help others. class getTerm extends Simulation { val repeatCount = Integer.getInteger("repeatCount", 1).toInt val userCount = Integer.getInteger("userCount", 1).toInt val turl = System.getProperty("turl", "some url") val httpProtocol = http .baseURL("http://" + turl) val headers_10 = Map("Content-Type" -> """application/json""") // Change - 1 var thrIds: Seq[String] = _ var termIds: Seq[String] = _ // Scenario - 1 val getTerms = scenario("Scn 1") .exec(http("list_of_term") .get("/api/abc") .headers(headers_10) .check(jsonPath("$[*].id") .findAll .transform { v => thrIds = v; v } .saveAs("thrIds")) ) .exec(http("get_all_trms") .get("""/api/thr/${thrIds.random()}/terms""") .headers(headers_10) .check(jsonPath("$[*].id") .findAll .transform { v => termIds = v; v } .saveAs("termIds")) ) // Scenario - 2 val getKnownTerms = scenario("Get Known Term") .exec(_.set("thrIds", thrIds)) .exec(_.set("termIds", termIds)) .repeat (repeatCount){ exec(http("get_k_term") .get("""/api/thr/${thrIds.random()}/terms/${termIds.random()}""") .headers(headers_10)) } val scn = List(getTerms.inject(atOnceUsers(1)), getKnownTerms.inject(nothingFor(20 seconds), atOnceUsers(userCount))) setUp(scn).protocols(httpProtocol) }
Spark streaming - textFileStream/fileStream - Get file name [duplicate]
Spark Streaming's textFileStream and fileStream can monitor a directory and process the new files in a DStream RDD. How do I get the names of the files that are being processed by the DStream RDD at a particular interval?
fileStream produces a UnionRDD of NewHadoopRDDs. The good part about the NewHadoopRDDs created by sc.newAPIHadoopFile is that their names are set to their paths. Here's an example of what you can do with that knowledge:

def namedTextFileStream(ssc: StreamingContext, directory: String): DStream[String] =
  ssc.fileStream[LongWritable, Text, TextInputFormat](directory)
    .transform( rdd =>
      new UnionRDD(rdd.context,
        rdd.dependencies.map( dep =>
          dep.rdd.asInstanceOf[RDD[(LongWritable, Text)]].map(_._2.toString).setName(dep.rdd.name)
        )
      )
    )

def transformByFile[U: ClassTag](unionrdd: RDD[String],
                                 transformFunc: String => RDD[String] => RDD[U]): RDD[U] = {
  new UnionRDD(unionrdd.context,
    unionrdd.dependencies.map{ dep =>
      if (dep.rdd.isEmpty) None
      else {
        val filename = dep.rdd.name
        Some(
          transformFunc(filename)(dep.rdd.asInstanceOf[RDD[String]])
            .setName(filename)
        )
      }
    }.flatten
  )
}

def main(args: Array[String]) = {
  val conf = new SparkConf()
    .setAppName("Process by file")
    .setMaster("local[2]")

  val ssc = new StreamingContext(conf, Seconds(30))

  val dstream = namedTextFileStream(ssc, "/some/directory")

  def byFileTransformer(filename: String)(rdd: RDD[String]): RDD[(String, String)] =
    rdd.map(line => (filename, line))

  val transformed = dstream.transform(rdd => transformByFile(rdd, byFileTransformer))

  // Do some stuff with transformed

  ssc.start()
  ssc.awaitTermination()
}
For those that want some Java code instead of Scala:

JavaPairInputDStream<LongWritable, Text> textFileStream =
    jsc.fileStream(
        inputPath,
        LongWritable.class,
        Text.class,
        TextInputFormat.class,
        FileInputDStream::defaultFilter,
        false
    );

JavaDStream<Tuple2<String, String>> namedTextFileStream = textFileStream.transform((pairRdd, time) -> {
    UnionRDD<Tuple2<LongWritable, Text>> rdd = (UnionRDD<Tuple2<LongWritable, Text>>) pairRdd.rdd();

    List<RDD<Tuple2<LongWritable, Text>>> deps =
        JavaConverters.seqAsJavaListConverter(rdd.rdds()).asJava();

    List<RDD<Tuple2<String, String>>> collectedRdds = deps.stream().map(depRdd -> {
        if (depRdd.isEmpty()) {
            return null;
        }
        JavaRDD<Tuple2<LongWritable, Text>> depJavaRdd = depRdd.toJavaRDD();
        String filename = depRdd.name();
        JavaPairRDD<String, String> newDep = JavaPairRDD.fromJavaRDD(depJavaRdd)
            .mapToPair(t -> new Tuple2<String, String>(filename, t._2().toString()))
            .setName(filename);
        return newDep.rdd();
    }).filter(t -> t != null).collect(Collectors.toList());

    Seq<RDD<Tuple2<String, String>>> rddSeq =
        JavaConverters.asScalaBufferConverter(collectedRdds).asScala().toIndexedSeq();

    ClassTag<Tuple2<String, String>> classTag = scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class);

    return new UnionRDD<Tuple2<String, String>>(rdd.sparkContext(), rddSeq, classTag).toJavaRDD();
});
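A small usage sketch, continuing the Java snippet above: since each element of namedTextFileStream is a (fileName, line) pair, a per-batch line count per source file can be logged roughly like this (illustrative only, not part of the original answer):

namedTextFileStream.foreachRDD(rdd -> {
    // Group the (fileName, line) pairs by file name and count the lines each file contributed in this batch.
    java.util.Map<String, Long> linesPerFile = rdd
        .mapToPair(t -> new Tuple2<String, String>(t._1(), t._2()))
        .countByKey();
    linesPerFile.forEach((file, count) -> System.out.println(file + ": " + count + " lines"));
});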
Alternatively, you can modify FileInputDStream so that, rather than loading the contents of the files into the RDD, it simply creates an RDD from the filenames. This gives a performance boost if you don't actually want to read the data itself into the RDD, or if you want to pass the filenames to an external command as one of your steps. Simply change filesToRDD(..) so that it makes an RDD of the filenames rather than loading the data into the RDD. See: https://github.com/HASTE-project/bin-packing-paper/blob/master/spark/spark-scala-cellprofiler/src/main/scala/FileInputDStream2.scala#L278
Is there a cleaner way to do this Group Query in MongoDB from Groovy?
I'm learning MongoDB, and my language of choice for this run at it is Groovy. I'm working on group queries by trying to answer the question of which pet is the most needy one. Below is my first attempt, and it's awful. Any help cleaning it up (or simply confirming that there isn't a cleaner way to do it) would be much appreciated. Thanks in advance!

package mongo.pets

import com.gmongo.GMongo
import com.mongodb.BasicDBObject
import com.mongodb.DBObject

class StatsController {
    def dbPets = new GMongo().getDB('needsHotel').getCollection('pets')

    //FIXME OMG THIS IS AWFUL!!!
    def index = {
        def petsNeed = 'a walk'
        def reduce = 'function(doc, aggregator) { aggregator.needsCount += doc.needs.length }'

        def key = new BasicDBObject()
        key.put("name", true)

        def initial = new BasicDBObject()
        initial.put("needsCount", 0)

        def maxNeeds = 0
        def needyPets = []
        dbPets.group(key, new BasicDBObject(), initial, reduce).each {
            if (maxNeeds < it['needsCount']) {
                maxNeeds = it['needsCount']
                needyPets = []
                needyPets += it['name']
            } else if (maxNeeds == it['needsCount']) {
                needyPets += it['name']
            }
        }
        def needyPet = needyPets

        [petsNeedingCount: dbPets.find([needs: petsNeed]).count(),
         petsNeed: petsNeed,
         mostNeedyPet: needyPet]
    }
}
It should be possible to change the whole method to this (but I don't have MongoDB available to test it):

def index = {
    def petsNeed = 'a walk'
    def reduce = 'function(doc, aggregator) { aggregator.needsCount += doc.needs.length }'

    def key = [ name: true ] as BasicDBObject
    def initial = [ needsCount: 0 ] as BasicDBObject

    def allPets = dbPets.group( key, new BasicDBObject(), initial, reduce )

    def maxNeeds = allPets*.needsCount.collect { it as Integer }.max()
    def needyPet = allPets.findAll { maxNeeds == it.needsCount as Integer }.name

    [petsNeedingCount: dbPets.find([needs: petsNeed]).count(),
     petsNeed: petsNeed,
     mostNeedyPet: needyPet]
}