I am trying to iterate through a DataFrame to create another one. In this example I am not using data from the first one; it is just to show what I am trying to do. However, the idea is to use the first DataFrame to generate a new, much bigger one based on its data.
Whatever I try inside the VoidFunction, I always get the error in the foreach.
Sample DataFrame to iterate over:
Dataset<Row> obtencionRents = spark.createDataFrame(Arrays.asList(
        new testRentabilidades("0000A0", "PORTAL", "4-ANUAL", "asdasd", "asdasd"),
        new testRentabilidades("00A00", "PORTAL", "", "asdasd", "sdasd"),
        new testRentabilidades("00A", "PORTAL", "4-ANUAL", "asdasd", "asdasd")
), testRentabilidades.class);
foreach function used to iterate over the sample DataFrame:
obtencionRents.toJavaRDD().foreach(new VoidFunction<Row>() {
    public void call(Row r) throws Exception {
        // add registers to a new collection/ArrayList/etc.
    }
});
The error I get:
2021-11-03 17:34:41 INFO DAGScheduler:54 - Job 0 failed: foreach at CargarRentabilidades.java:154, took 0,812094 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:139)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:137)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:73)
at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:419)
at batchload.proceso.builder.CargarRentabilidades$1.call(CargarRentabilidades.java:157)
at batchload.proceso.builder.CargarRentabilidades$1.call(CargarRentabilidades.java:154)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$foreach$1.apply(JavaRDDLike.scala:351)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$foreach$1.apply(JavaRDDLike.scala:351)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:919)
at org.apache.spark.api.java.JavaRDDLike$class.foreach(JavaRDDLike.scala:351)
at org.apache.spark.api.java.AbstractJavaRDDLike.foreach(JavaRDDLike.scala:45)
at batchload.proceso.builder.CargarRentabilidades.transformacionRentabilidades(CargarRentabilidades.java:154)
at batchload.proceso.builder.CargarRentabilidades.coleccionRentabilidades(CargarRentabilidades.java:78)
at batchload.proceso.builder.CargarRentabilidades.coleccionCargaRentabilidades(CargarRentabilidades.java:52)
at batchload.proceso.MainBatch.init(MainBatch.java:59)
at batchload.BatchloadRentabilidades.main(BatchloadRentabilidades.java:24)
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:139)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:137)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:73)
at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:419)
at batchload.proceso.builder.CargarRentabilidades$1.call(CargarRentabilidades.java:157)
at batchload.proceso.builder.CargarRentabilidades$1.call(CargarRentabilidades.java:154)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$foreach$1.apply(JavaRDDLike.scala:351)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$foreach$1.apply(JavaRDDLike.scala:351)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Versions:
mongo-spark-connector_2.11-2.3.0
Java 1.8
IntelliJ 2021.1.2 Community
Spark libraries built for Scala 2.11
Other dependency versions I am using:
Hadoop 2.7, Spark 2.3.0, Java driver 2.7, spark-catalyst/core/hive/sql all at 2.11:2.3.0, scala-library 2.11.12
I'm stuck with this; any help is more than welcome.
Thanks!
This might be due to a serialization issue.
Can you try converting your anonymous function into a static method of a class?
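For illustration, a minimal sketch of what that could look like (the class and method names here are hypothetical, and it assumes the per-row logic only touches plain Java objects; also note that, whatever form the function takes, the SparkSession and createDataFrame cannot be used inside code that runs on the executors, which is where the NullPointerException in your stack trace is thrown):

import org.apache.spark.sql.Row;

// Hypothetical sketch: the per-row logic lives in a static method, and foreach
// gets a method reference instead of an anonymous inner class (which captures,
// and forces Spark to serialize, the enclosing object).
public class RentabilidadesHelper {
    public static void handleRow(Row r) {
        // Plain Java logic only; no SparkSession/Dataset calls here.
        Object firstField = r.get(0); // hypothetical: read some column
        // ... process the row ...
    }
}

// usage
obtencionRents.toJavaRDD().foreach(RentabilidadesHelper::handleRow);

Keep in mind that anything accumulated inside the function lives on the executors; if the goal is to build a new, bigger Dataset from the first one, map/flatMap with an encoder (or collectAsList() on the driver for small data) is usually a better fit than foreach.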
How do I manage this cast?
scala.math.BigDecimal cannot be cast to java.math.BigDecimal
while reading the data from Redshift and writing it to S4HANA via JDBC.
JDBC connection:
connection_sap_options_pfd = {
    "url": db_url,
    "dbTable": 'dbTable_name',
    "user": db_username,
    "password": db_password,
    "customJdbcDriverS3Path": driver_path,
    "className": jdbc_driver_name
}
Writing the DataFrame:
glueContext.write_dynamic_frame.from_options(frame = pfd_dynamicFrame, connection_type="custom.jdbc",
connection_options=connection_sap_options_pfd,
transformation_ctx = "datasink_pfd")
pfd_dynamicFrame has one column of type Decimal(4, 0).
While writing, I get the exception below:
py4j.protocol.Py4JJavaError: An error occurred while calling o109.pyWriteDynamicFrame.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 9.0 failed 4 times, most recent failure: Lost task 1.3 in stage 9.0 (TID 51) (44.109.37.157 executor 4):
java.lang.ClassCastException: scala.math.BigDecimal cannot be cast to java.math.BigDecimal
at org.apache.spark.sql.Row.getDecimal(Row.scala:285)
at org.apache.spark.sql.Row.getDecimal$(Row.scala:285)
at org.apache.spark.sql.catalyst.expressions.GenericRow.getDecimal(rows.scala:166)
at com.amazonaws.services.glue.marketplace.partner.PartnerJDBCRecordWriter.$anonfun$makeSetter$12(PartnerJDBCDataSink.scala:276)
at com.amazonaws.services.glue.marketplace.partner.PartnerJDBCRecordWriter.$anonfun$makeSetter$12$adapted(PartnerJDBCDataSink.scala:275)
at com.amazonaws.services.glue.marketplace.partner.PartnerJDBCRecordWriter.writePartition(PartnerJDBCDataSink.scala:163)
at com.amazonaws.services.glue.marketplace.connector.GlueCustomDataSink.$anonfun$defaultWriteDynamicFrame$1(CustomDataSink.scala:82)
at com.amazonaws.services.glue.marketplace.connector.GlueCustomDataSink.$anonfun$defaultWriteDynamicFrame$1$adapted(CustomDataSink.scala:71)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:138)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1516)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
After upgrading the spark-snowflake connector jar that I use when running my application on AWS EMR 6.6.0 to spark-snowflake_2.12:2.11.0-spark_3.2,
I get the following error:
java.io.NotSerializableException: org.apache.spark.storage.StorageStatus
Serialization stack:
- object not serializable (class: org.apache.spark.storage.StorageStatus, value: org.apache.spark.storage.StorageStatus@6ea056eb)
- element of array (index: 0)
- array (class [Lorg.apache.spark.storage.StorageStatus;, size 17)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.rpc.netty.NettyRpcEnv.serialize(NettyRpcEnv.scala:286)
at org.apache.spark.rpc.netty.RemoteNettyRpcCallContext.send(NettyRpcCallContext.scala:64)
at org.apache.spark.rpc.netty.NettyRpcCallContext.reply(NettyRpcCallContext.scala:32)
at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$receiveAndReply$1.applyOrElse(BlockManagerMasterEndpoint.scala:156)
at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:103)
at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100)
at org.apache.spark.rpc.netty.MessageLoop.org$apache$spark$rpc$netty$MessageLoop$$receiveLoop(MessageLoop.scala:75)
at org.apache.spark.rpc.netty.MessageLoop$$anon$1.run(MessageLoop.scala:41)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Although my application still completes, I am not sure why this error occurs, and it dramatically slows the application down.
We are trying to integrate Spark and Spring Boot and, unfortunately, we keep running into issues. After resolving most of them, we are now stuck on the exception below:
Job aborted due to stage failure: Task 0 in stage 11.0 failed 4 times, most recent failure: Lost task 0.3 in stage 11.0 (TID 14, xxxxx.ax.internal.cloudapp.net, executor 1): java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2233)
at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1405)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2284)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2202)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2060)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2278)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2202)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2060)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:427)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1928)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1941)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1954)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:336)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withNewExecutionId(Dataset.scala:2788)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2128)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2127)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2818)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2127)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2342)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
at org.apache.spark.sql.Dataset.show(Dataset.scala:638)
at org.apache.spark.sql.Dataset.show(Dataset.scala:597)
at org.apache.spark.sql.Dataset.show(Dataset.scala:606)
at com.xxx.xxx.spark.Execute.run(Execute.java:46)
at com.xxx.xxx.spark.Loader.process(Loader.java:505)
at com.xxx.xxx.spark.Loader.run(Loader.java:122)
at org.springframework.boot.SpringApplication.callRunner(SpringApplication.java:732)
at org.springframework.boot.SpringApplication.callRunners(SpringApplication.java:716)
at org.springframework.boot.SpringApplication.afterRefresh(SpringApplication.java:703)
at org.springframework.boot.SpringApplication.run(SpringApplication.java:304)
at org.springframework.boot.SpringApplication.run(SpringApplication.java:1118)
at org.springframework.boot.SpringApplication.run(SpringApplication.java:1107)
at com.xxx.xxx.spark.AnalyseFec.main(AnalyseFec.java:11)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
This exception is raised when trying to manipulate a transformed Dataset (created with a map). The count and collect methods work fine.
The sample below throws the exception on dataf22.show():
StructType schemata = DataTypes.createStructType(
    new StructField[]{
        DataTypes.createStructField("Column1", DataTypes.StringType, false),
        DataTypes.createStructField("Column2", DataTypes.DoubleType, false),
        DataTypes.createStructField("Column3", DataTypes.DoubleType, false),
    });
ExpressionEncoder<Row> encoder = RowEncoder.apply(schemata);

Dataset<Row> dataf2 = session.read()
    .option("header", "true")
    .option("delimiter", separateur)
    // .schema(schemata)
    .csv(csvPath);

dataf2.write().mode(SaveMode.Overwrite).parquet("xxx.parquet");
Dataset<Row> parquetFileDF = session.read().parquet("xxx.parquet");

Dataset<Row> dataf22 = parquetFileDF.map(row -> {
    return RowFactory.create(row.getAs("Column1"),
        Double.parseDouble(row.getAs("Column2").toString().replace(",", ".")),
        Double.parseDouble(row.getAs("Column3").toString().replace(",", ".")));
}, encoder);

dataf22.printSchema();
dataf22.show();

dataf22.groupBy("Column1");
Dataset<Row> ds1 = dataf22.groupBy("Column1").sum("Column2");
ds1.show();
Dataset<Row> ds2 = dataf22.groupBy("Column1").sum("Column3");
ds2.show();
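For what it's worth, here is a minimal sketch (reusing session and the column names above, and assuming Column2/Column3 are read as strings) of how the comma-to-dot conversion could be expressed with built-in column functions instead of a Java lambda, so no user closure has to be serialized to the executors. It only illustrates an alternative to the map() call; it is not a confirmed fix for the packaging problem:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.regexp_replace;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Replace the decimal comma and cast to double, column by column,
// using Catalyst expressions instead of a lambda + RowEncoder.
Dataset<Row> parquetFileDF2 = session.read().parquet("xxx.parquet");
Dataset<Row> dataf22b = parquetFileDF2
        .withColumn("Column2", regexp_replace(col("Column2"), ",", ".").cast("double"))
        .withColumn("Column3", regexp_replace(col("Column3"), ",", ".").cast("double"));
dataf22b.show();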
Initially we were packaging with the spring-boot-maven-plugin, and spark-submit was calling org.springframework.boot.loader.JarLauncher, which launches our starter class.
When we moved to the maven-shade-plugin, with some modifications to support Spring Boot, the exception above disappeared and we were able to execute our program, but only in client mode. In cluster mode the application never actually runs in YARN; after multiple attempts, the application fails without any error that helps to fix the issue.
I feel that once the program is executed on the executors, the problem appears, probably related to classpath or classloader issues.
Did you succeed in making this integration work? If yes, which Maven plugin did you use, and which extra spark-submit parameters did you use (extraClassPath …)?
Thank you
I am currently using Spark 1.5.0 from the Cloudera distribution, and in my Java code I am trying to broadcast a ConcurrentHashMap. In the map() function, when I try to read the broadcast variable, I get a NullPointerException in the resource manager logs. Can anyone please help me out? I am unable to find any resolution for this. The following is my code snippet:
// for broadcasting before calling the mapper
final Broadcast<ConcurrentHashMap<ConstantKeys, Object>> constantmapFinal =
        context.broadcast(constantMap);
.......
// In the map function
JavaRDD<String> outputRDD =
        tempRdd.map(new org.apache.spark.api.java.function.Function() {
            private static final long serialVersionUID = 6104325309455195113L;

            public Object call(final Object arg0) throws Exception {
                ConcurrentHashMap<ConstantKeys, Object> constantMap =
                        constantmapFinal.value(); // line 428
                // ... use the broadcast map and return the mapped value ...
                return arg0;
            }
        });
The exception from resource manager logs:
2016-11-17 10:40:10 ERROR ApplicationMaster:96 - User class threw exception: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 2.0 failed 4 times, most recent failure: Lost task 1.3 in stage 2.0 (TID 20, ******(server name)): java.io.IOException: java.lang.NullPointerException
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at com.***.text.fuzzymatch.execute.FuzzyMatchWrapper$2.call(FuzzyMatchWrapper.java:428)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply$mcV$sp(PairRDDFunctions.scala:1109)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$6.apply(PairRDDFunctions.scala:1108)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1205)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1116)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1095)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.NullPointerException
at java.util.HashMap.put(HashMap.java:493)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:135)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:134)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:192)
at org.apache.spark.broadcast.TorrentBroadcast$.unBlockifyObject(TorrentBroadcast.scala:217)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:178)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 21 more
This works for smaller map sizes. The map can contain numerous key-value pairs, depending on the input request. Can anyone please help me out?
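Not a confirmed fix, but one workaround that is sometimes tried when Kryo's MapSerializer has trouble deserializing a ConcurrentHashMap is to broadcast a plain HashMap copy instead. A minimal sketch, reusing the names context, constantMap and ConstantKeys from the snippet above (they are assumptions on my side):

import java.util.HashMap;
import org.apache.spark.broadcast.Broadcast;

// Copy the concurrent map into a plain HashMap before broadcasting, so a
// standard JDK map is what gets serialized and rebuilt on the executors.
final Broadcast<HashMap<ConstantKeys, Object>> constantmapFinal =
        context.broadcast(new HashMap<ConstantKeys, Object>(constantMap));

Inside call(), the read side stays the same: constantmapFinal.value().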
I'm trying to load a big graph (60 GB) using GraphX in Spark 1.4.1 in local mode with 16 threads. The driver memory is set to 500 GB in spark-defaults.conf. I work on a machine that has 590341 MB free (as shown by the free -m command), which is actually about 576 GB.
Some details about the graph I'm trying to load: it is the Friendster graph downloaded from snap.stanford.edu. The actual size is 30 GB, because Friendster is a directed graph, but I doubled the edges to create an undirected version (that's why the size is doubled: 60 = 2 * 30). I use GraphLoader.edgeListFile from Scala to load the graph, as shown below:
def main(args: Array[String]): Unit = {
  val logFile = "/home/panos/spark-1.2.0/README.md"

  // Create spark configuration and spark context
  val conf = new SparkConf().setAppName("My App").setMaster("local[16]")
  val sc = new SparkContext(conf)

  val currentDir = System.getProperty("user.dir") // get the current directory
  val edgeFile = "file://" + currentDir + "/friendsterUndirected.txt"

  // Load the edges as a graph
  val graph = GraphLoader.edgeListFile(sc, edgeFile, false, 1,
    StorageLevel.MEMORY_AND_DISK, StorageLevel.MEMORY_AND_DISK)
}
The problem is that the graph cannot be loaded; I get a java.lang.NegativeArraySizeException as shown below. I tried with smaller graphs (~4 GB) successfully. Any idea?
16/01/10 13:04:09 WARN TaskSetManager: Stage 0 contains a task of very large size (440 KB). The maximum recommended task size is 100 KB.
16/01/10 13:32:56 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/01/10 13:32:56 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/01/10 13:32:56 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NegativeArraySizeException
at java.lang.reflect.Array.newArray(Native Method)
at java.lang.reflect.Array.newInstance(Array.java:75)
at scala.reflect.ClassTag$class.newArray(ClassTag.scala:62)
at scala.reflect.ClassTag$$anon$1.newArray(ClassTag.scala:144)
at org.apache.spark.util.collection.PrimitiveVector.copyArrayWithLength(PrimitiveVector.scala:87)
at org.apache.spark.util.collection.PrimitiveVector.resize(PrimitiveVector.scala:74)
at org.apache.spark.util.collection.PrimitiveVector.$plus$eq(PrimitiveVector.scala:41)
at org.apache.spark.graphx.impl.EdgePartitionBuilder$mcI$sp.add$mcI$sp(EdgePartitionBuilder.scala:34)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:87)
at org.apache.spark.graphx.GraphLoader$$anonfun$1$$anonfun$apply$1.apply(GraphLoader.scala:76)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:76)
at org.apache.spark.graphx.GraphLoader$$anonfun$1.apply(GraphLoader.scala:74)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$18.apply(RDD.scala:703)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)