Error when connecting to AWS RDS MySQL service in Python/Java
I am using PySpark on an AWS EC2 Linux server to read an Excel file and load it into the AWS RDS MySQL service.
My script:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

if __name__ == '__main__':
    scSpark = SparkSession \
        .builder \
        .appName("reading csv") \
        .config("spark.driver.extraClassPath", "./mysql-connector-java-8.0.16.jar") \
        .getOrCreate()

    data_file = './text.xlsx'
    sdfData = scSpark.read.csv(data_file, header=True, sep=",").cache()
    sdfData.registerTempTable("books")

    output = scSpark.sql('SELECT * from books')
    output.show()

    output.write.format('jdbc').options(
        url='XXX.rds.amazonaws.com',
        driver='com.mysql.cj.jdbc.Driver',
        dbtable='books',
        user='xxx',
        password='xxx').mode('append').save()
I get this error when the script tries to connect to the AWS RDS MySQL service:
Traceback (most recent call last):
File "ETL.py", line 24, in <module>
password='XXX').mode('append').save()
File "/home/ec2-user/.local/lib/python3.7/site-packages/pyspark/sql/readwriter.py", line 738, in save
self._jwrite.save()
File "/home/ec2-user/.local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1322, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ec2-user/.local/lib/python3.7/site-packages/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/home/ec2-user/.local/lib/python3.7/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o45.save.
: java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:101)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:101)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:101)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:218)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:222)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:46)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
I have downloaded the driver, mysql-connector-java-8.0.16.jar, and put it in the same folder as the script.
However, when I run it, the last line of the script keeps throwing that error.
How can I fix this issue?
In the JDBC options, set the url value as:
url='XXX.rds.amazonaws.com?useSSL=FALSE&nullCatalogMeansCurrent=true&zeroDateTimeBehavior=convertToNull'
MySQL Connector/J 8.0 requires SSL, or it must be disabled explicitly.
Reference: https://dev.mysql.com/doc/connector-j/8.0/en/connector-j-connp-props-security.html
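For completeness, a minimal sketch of the corrected write call; the database name mydb and port 3306 are placeholders, not taken from the question. Two things matter here: the url must be a fully-qualified JDBC URL with the jdbc:mysql:// prefix, and since the ClassNotFoundException means the connector jar never made it onto the driver classpath, passing it with spark-submit --jars mysql-connector-java-8.0.16.jar is more reliable than a relative extraClassPath.

output.write.format('jdbc').options(
    # 'mydb' and port 3306 are assumed placeholders; the jdbc:mysql:// prefix is required
    url='jdbc:mysql://XXX.rds.amazonaws.com:3306/mydb?useSSL=false',
    driver='com.mysql.cj.jdbc.Driver',
    dbtable='books',
    user='xxx',
    password='xxx').mode('append').save()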
Related
ClassNotFoundException: org.asynchttpclient.DefaultAsyncHttpClientConfig$Builder error
Recently I upgraded my Selenium version from 3.141.59 to 4.1.0, and whenever I run my script I get the error below (TestNG run under JDK 1.8.0_321):

java.lang.NoClassDefFoundError: org/asynchttpclient/DefaultAsyncHttpClientConfig$Builder
at org.openqa.selenium.remote.http.netty.NettyClient.createHttpClient(NettyClient.java:91)
at org.openqa.selenium.remote.http.netty.NettyClient.<clinit>(NettyClient.java:57)
at org.openqa.selenium.remote.http.netty.NettyClient$Factory.createClient(NettyClient.java)
at org.openqa.selenium.remote.tracing.TracedHttpClient$Factory.createClient(TracedHttpClient.java)
at org.openqa.selenium.remote.HttpCommandExecutor.<init>(HttpCommandExecutor.java:107)
at org.openqa.selenium.remote.RemoteWebDriver.createTracedExecutorWithTracedHttpClient(RemoteWebDriver.java)
at org.openqa.selenium.remote.RemoteWebDriver.<init>(RemoteWebDriver.java:146)
at com.cucumber.utility.BrowserFactory.launchBrowser(BrowserFactory.java:82)
at com.cucumber.step.StepDefinition_.launch_the_browser_and_navigate_to_
at ?.Launch the browser "yes" "Chrome" "105.0" "Windows 10" and navigate to URL Env "h
Caused by: java.lang.ClassNotFoundException: org.asynchttpclient.DefaultAsyncHttpClientConfig$Builder
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.openqa.selenium.remote.http.netty.NettyClient.createHttpClient(NettyClient.java:91)
at org.openqa.selenium.remote.http.netty.NettyClient.<clinit>(NettyClient.java:57)
at org.openqa.selenium.remote.http.netty.NettyClient$Factory.createClient(NettyClient.java)
at org.openqa.selenium.remote.tracing.TracedHttpClient$Factory.createClient(TracedHttpClient.java)
at org.openqa.selenium.remote.HttpCommandExecutor.<init>(HttpCommandExecutor.java:107)
at org.openqa.selenium.remote.RemoteWebDriver.createTracedExecutorWithTracedHttpClient(RemoteWebDriver.java)
at org.openqa.selenium.remote.RemoteWebDriver.<init>(RemoteWebDriver.java:146)
at com.cucumber.utility.BrowserFactory.launchBrowser(BrowserFactory.java:82)
at com.cucumber.step.StepDefinition_.launch_the_browser_and_navigate_to_Paypal
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at cucumber.runtime.Utils$1.call(Utils.java:26)

I had an okhttp3 dependency, which I have now commented out because it was throwing errors.
How to avoid java.io.StreamCorruptedException: invalid stream header: 204356EC when using toPandas() with PySpark?
Whenever I try to read a Spark dataset using PySpark and convert it to a Pandas df for modeling, I get the error java.io.StreamCorruptedException: invalid stream header: 204356EC on the toPandas() step. I am not a Java coder (hence PySpark), so these errors can be pretty cryptic to me. I tried the following things, but I still have this issue:

Made sure my Spark and PySpark versions matched, as suggested here: java.io.StreamCorruptedException when importing a CSV to a Spark DataFrame
Reinstalled Spark using the methods suggested here: Complete Guide to Installing PySpark on MacOS

The logging in the test script below verifies that the Spark and PySpark versions are aligned.

test.py:

import logging
from pyspark.sql import SparkSession
from pyspark import SparkContext
import findspark

findspark.init()

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

sc = SparkContext('local[*]', 'test')
spark = SparkSession(sc)

logging.info('Spark location: {}'.format(findspark.find()))
logging.info('PySpark version: {}'.format(spark.sparkContext.version))

logging.info('Reading spark input dataframe')
test_df = spark.read.csv('./data', header=True, sep='|', inferSchema=True)

logging.info('Converting spark DF to pandas DF')
pandas_df = test_df.toPandas()

logging.info('DF record count: {}'.format(len(pandas_df)))

sc.stop()

Output:

$ python ./test.py
21/05/13 11:54:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2021-05-13 11:54:34 INFO Spark location: /Users/username/server/spark-3.1.1-bin-hadoop2.7
2021-05-13 11:54:34 INFO PySpark version: 3.1.1
2021-05-13 11:54:34 INFO Reading spark input dataframe
2021-05-13 11:54:42 INFO Converting spark DF to pandas DF
21/05/13 11:54:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
21/05/13 11:54:45 ERROR TaskResultGetter: Exception while getting task result
java.io.StreamCorruptedException: invalid stream header: 204356EC
at java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:936)
at java.io.ObjectInputStream.<init>(ObjectInputStream.java:394)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.<init>(JavaSerializer.scala:64)
at org.apache.spark.serializer.JavaDeserializationStream.<init>(JavaSerializer.scala:64)
at org.apache.spark.serializer.JavaSerializerInstance.deserializeStream(JavaSerializer.scala:123)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:108)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3.$anonfun$run$1(TaskResultGetter.scala:97)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3.run(TaskResultGetter.scala:63)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
  File "./test.py", line 23, in <module>
    pandas_df = test_df.toPandas()
  File "/Users/username/server/spark-3.1.1-bin-hadoop2.7/python/pyspark/sql/pandas/conversion.py", line 141, in toPandas
    pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
  File "/Users/username/server/spark-3.1.1-bin-hadoop2.7/python/pyspark/sql/dataframe.py", line 677, in collect
    sock_info = self._jdf.collectToPython()
  File "/Users/username/server/spark-3.1.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
  File "/Users/username/server/spark-3.1.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 111, in deco
    return f(*a, **kw)
  File "/Users/username/server/spark-3.1.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Exception while getting task result: java.io.StreamCorruptedException: invalid stream header: 204356EC
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3519)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3516)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
The issue was resolved for me by ensuring that the serialisation option (registered in the configuration under spark.serializer) was not incompatible with pyarrow (typically used during the conversion between pandas and PySpark if you have it enabled). The fix was to remove the often-recommended spark.serializer: org.apache.spark.serializer.KryoSerializer from the configuration and rely instead on the potentially slower default. For context, our set-up was an ML version of a Databricks Spark cluster (v7.3).
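As a minimal sketch of that fix (the app name is arbitrary and the session shown is a plain local one, not the Databricks set-up above), the point is simply that no spark.serializer override is configured, so Spark falls back to the default JavaSerializer:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("no-kryo")
         # intentionally no .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .getOrCreate())

# per the answer above, with no Kryo override the collect behind toPandas()
# no longer hits the invalid-stream-header mismatch
pandas_df = spark.read.csv('./data', header=True, sep='|', inferSchema=True).toPandas()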
I had this exception with Spark Thrift Server: the driver version and the cluster version were different. In my case I deleted the following setting, so that the driver's version is used across the whole cluster: spark.yarn.archive=hdfs:///spark/3.1.1.zip
JanusGraph libs can't communicate with HBase in a Kerberos environment (Failed to specify server's Kerberos principal name)
I am getting "Failed to specify server's Kerberos principal name" when attempting to connect to HBase with JanusGraph in a Kerberized Hadoop cluster.

First off, a little environment info:
OS: 7.6.1810
Java: 1.8.0_191-b12
Spark: 2.3.2.3.1.0.78-4
YARN: 2.5.0
HBase: 2.0.2.3.1.0.78-4
Hadoop: 3.1.1.3.1.0.78-4
Kerberos: 5 version 1.15.1
JanusGraph: 0.4.0

I did kinit and tested the bundled Gremlin client to ensure the graph.properties for the env works. It was able to connect, create a simple test graph, add some vertices, restart, and retrieve the stored data. So cool, the bundled copy works.

For laziness/simplicity I decided to load the spark-shell with the JanusGraph libs. While attempting to connect to the same graph, it started throwing Kerberos errors. My first thought was that maybe it's a Hadoop/Spark lib/conf conflict (pretty typical). So I built a very simple, barebones Java app to see if that would work. It got the same errors as Spark.

Spark invocations. First attempt:

spark-shell \
--conf spark.driver.userClassPathFirst=true \
--conf spark.executor.userClassPathFirst=true \
--conf spark.driver.userClassPathFirst=true \
--jars /etc/hadoop/conf/core-site.xml,/etc/hbase/conf/hbase-site.xml,groovy-console-2.5.6.jar,javax.servlet-api-3.1.0.jar,netty-buffer-4.1.25.Final.jar,RoaringBitmap-0.5.11.jar,groovy-groovysh-2.5.6-indy.jar,javax.ws.rs-api-2.0.1.jar,netty-codec-4.1.25.Final.jar,activation-1.1.jar,groovy-json-2.5.6-indy.jar,jaxb-api-2.2.2.jar,netty-common-4.1.25.Final.jar,airline-0.6.jar,groovy-jsr223-2.5.6-indy.jar,jaxb-impl-2.2.3-1.jar,netty-handler-4.1.25.Final.jar,antlr-2.7.7.jar,groovy-swing-2.5.6.jar,jbcrypt-0.4.jar,netty-resolver-4.1.25.Final.jar,antlr-3.2.jar,groovy-templates-2.5.6.jar,jboss-logging-3.1.2.GA.jar,netty-transport-4.1.25.Final.jar,antlr-runtime-3.2.jar,groovy-xml-2.5.6.jar,jcabi-log-0.14.jar,noggit-0.6.jar,aopalliance-repackaged-2.4.0-b34.jar,gson-2.2.4.jar,jcabi-manifests-1.1.jar,objenesis-2.1.jar,apacheds-i18n-2.0.0-M15.jar,guava-18.0.jar,jcl-over-slf4j-1.7.25.jar,ohc-core-0.3.4.jar,apacheds-kerberos-codec-2.0.0-M15.jar,hadoop-annotations-2.7.7.jar,je-7.5.11.jar,org.apache.servicemix.bundles.commons-csv-1.0-r706900_3.jar,api-asn1-api-1.0.0-M20.jar,hadoop-auth-2.7.7.jar,jersey-client-1.9.jar,oro-2.0.8.jar,api-util-1.0.0-M20.jar,hadoop-client-2.7.7.jar,jersey-client-2.22.2.jar,osgi-resource-locator-1.0.1.jar,asm-3.1.jar,hadoop-common-2.7.7.jar,jersey-common-2.22.2.jar,paranamer-2.6.jar,asm-5.0.3.jar,hadoop-distcp-2.7.7.jar,jersey-container-servlet-2.22.2.jar,picocli-3.9.2.jar,asm-analysis-5.0.3.jar,hadoop-gremlin-3.4.1.jar,jersey-container-servlet-core-2.22.2.jar,protobuf-java-2.5.0.jar,asm-commons-5.0.3.jar,hadoop-hdfs-2.7.7.jar,jersey-core-1.9.jar,py4j-0.10.7.jar,asm-tree-5.0.3.jar,hadoop-mapreduce-client-app-2.7.7.jar,jersey-guava-2.22.2.jar,pyrolite-4.13.jar,asm-util-5.0.3.jar,hadoop-mapreduce-client-common-2.7.7.jar,jersey-json-1.9.jar,reflections-0.9.9-RC1.jar,astyanax-cassandra-3.10.2.jar,hadoop-mapreduce-client-core-2.7.7.jar,jersey-media-jaxb-2.22.2.jar,reporter-config-base-3.0.0.jar,astyanax-cassandra-all-shaded-3.10.2.jar,hadoop-mapreduce-client-jobclient-2.7.7.jar,jersey-server-1.9.jar,reporter-config3-3.0.0.jar,astyanax-core-3.10.2.jar,hadoop-mapreduce-client-shuffle-2.7.7.jar,jersey-server-2.22.2.jar,scala-library-2.11.8.jar,astyanax-recipes-3.10.2.jar,hadoop-yarn-api-2.7.7.jar,jets3t-0.7.1.jar,scala-reflect-2.11.8.jar,astyanax-thrift-3.10.2.jar,hadoop-yarn-client-2.7.7.jar,jettison-1.3.3.jar,scala-xml_2.11-1.0.5.jar,audience-annotations-0.5.0.jar,hadoop-yarn-common-2.7.7.jar,jetty-6.1.26.jar,servlet-api-2.5.jar,avro-1.7.4.jar,hadoop-yarn-server-common-2.7.7.jar,jetty-sslengine-6.1.26.jar,sesame-model-2.7.10.jar,avro-ipc-1.8.2.jar,hamcrest-core-1.3.jar,jetty-util-6.1.26.jar,sesame-rio-api-2.7.10.jar,avro-mapred-1.8.2-hadoop2.jar,hbase-shaded-client-2.1.5.jar,jffi-1.2.16-native.jar,sesame-rio-datatypes-2.7.10.jar,bigtable-hbase-1.x-shaded-1.11.0.jar,hbase-shaded-mapreduce-2.1.5.jar,jffi-1.2.16.jar,sesame-rio-languages-2.7.10.jar,caffeine-2.3.1.jar,hibernate-validator-4.3.0.Final.jar,jline-2.14.6.jar,sesame-rio-n3-2.7.10.jar,cassandra-all-2.2.13.jar,high-scale-lib-1.0.6.jar,jna-4.0.0.jar,sesame-rio-ntriples-2.7.10.jar,cassandra-driver-core-3.7.1.jar,high-scale-lib-1.1.4.jar,jnr-constants-0.9.9.jar,sesame-rio-rdfxml-2.7.10.jar,cassandra-thrift-2.2.13.jar,hk2-api-2.4.0-b34.jar,jnr-ffi-2.1.7.jar,sesame-rio-trig-2.7.10.jar,checker-compat-qual-2.5.2.jar,hk2-locator-2.4.0-b34.jar,jnr-posix-3.0.44.jar,sesame-rio-trix-2.7.10.jar,chill-java-0.9.3.jar,hk2-utils-2.4.0-b34.jar,jnr-x86asm-1.0.2.jar,sesame-rio-turtle-2.7.10.jar,chill_2.11-0.9.3.jar,hppc-0.7.1.jar,joda-time-2.8.2.jar,sesame-util-2.7.10.jar,commons-cli-1.3.1.jar,htrace-core-3.1.0-incubating.jar,jsch-0.1.54.jar,sigar-1.6.4.jar,commons-codec-1.7.jar,htrace-core4-4.2.0-incubating.jar,json-20090211_1.jar,slf4j-api-1.7.12.jar,commons-collections-3.2.2.jar,httpasyncclient-4.1.2.jar,json-simple-1.1.jar,slf4j-log4j12-1.7.12.jar,commons-configuration-1.10.jar,httpclient-4.4.1.jar,json4s-ast_2.11-3.5.3.jar,snakeyaml-1.11.jar,commons-crypto-1.0.0.jar,httpcore-4.4.1.jar,json4s-core_2.11-3.5.3.jar,snappy-java-1.0.5-M3.jar,commons-httpclient-3.1.jar,httpcore-nio-4.4.5.jar,json4s-jackson_2.11-3.5.3.jar,solr-solrj-7.0.0.jar,commons-io-2.3.jar,httpmime-4.4.1.jar,json4s-scalap_2.11-3.5.3.jar,spark-core_2.11-2.4.0.jar,commons-lang-2.5.jar,ivy-2.3.0.jar,jsp-api-2.1.jar,spark-gremlin-3.4.1.jar,commons-lang3-3.3.1.jar,jackson-annotations-2.6.6.jar,jsr305-3.0.0.jar,spark-kvstore_2.11-2.4.0.jar,commons-logging-1.1.1.jar,jackson-core-2.6.6.jar,jts-core-1.15.0.jar,spark-launcher_2.11-2.4.0.jar,commons-math3-3.2.jar,jackson-core-asl-1.9.13.jar,jul-to-slf4j-1.7.16.jar,spark-network-common_2.11-2.4.0.jar,commons-net-1.4.1.jar,jackson-databind-2.6.6.jar,junit-4.12.jar,spark-network-shuffle_2.11-2.4.0.jar,commons-pool-1.6.jar,jackson-datatype-json-org-2.6.6.jar,kryo-shaded-4.0.2.jar,spark-tags_2.11-2.4.0.jar,commons-text-1.0.jar,jackson-jaxrs-1.9.13.jar,leveldbjni-all-1.8.jar,spark-unsafe_2.11-2.4.0.jar,compress-lzf-1.0.0.jar,jackson-mapper-asl-1.9.13.jar,libthrift-0.9.2.jar,spatial4j-0.7.jar,concurrentlinkedhashmap-lru-1.3.jar,jackson-module-paranamer-2.6.6.jar,log4j-1.2.16.jar,stax-api-1.0-2.jar,crc32ex-0.1.1.jar,jackson-module-scala_2.11-2.6.6.jar,logback-classic-1.1.3.jar,stax-api-1.0.1.jar,curator-client-2.7.1.jar,jackson-xc-1.9.13.jar,logback-core-1.1.3.jar,stax2-api-3.1.4.jar,curator-framework-2.7.1.jar,jamm-0.3.0.jar,lucene-analyzers-common-7.0.0.jar,stream-2.7.0.jar,curator-recipes-2.7.1.jar,janusgraph-all-0.4.0.jar,lucene-core-7.0.0.jar,stringtemplate-3.2.jar,disruptor-3.0.1.jar,janusgraph-berkeleyje-0.4.0.jar,lucene-queries-7.0.0.jar,super-csv-2.1.0.jar,dom4j-1.6.1.jar,janusgraph-bigtable-0.4.0.jar,lucene-queryparser-7.0.0.jar,thrift-server-0.3.7.jar,ecj-4.4.2.jar,janusgraph-cassandra-0.4.0.jar,lucene-sandbox-7.0.0.jar,tinkergraph-gremlin-3.4.1.jar,elasticsearch-rest-client-6.6.0.jar,janusgraph-core-0.4.0.jar,lucene-spatial-7.0.0.jar,unused-1.0.0.jar,exp4j-0.4.8.jar,janusgraph-cql-0.4.0.jar,lucene-spatial-extras-7.0.0.jar,uuid-3.2.jar,findbugs-annotations-1.3.9-1.jar,janusgraph-es-0.4.0.jar,lucene-spatial3d-7.0.0.jar,validation-api-1.1.0.Final.jar,gbench-0.4.3-groovy-2.4.jar,janusgraph-hadoop-0.4.0.jar,lz4-1.3.0.jar,vavr-0.9.0.jar,gmetric4j-1.0.7.jar,janusgraph-hbase-0.4.0.jar,lz4-java-1.4.0.jar,vavr-match-0.9.0.jar,gprof-0.3.1-groovy-2.4.jar,janusgraph-lucene-0.4.0.jar,metrics-core-3.0.2.jar,woodstox-core-asl-4.4.1.jar,gremlin-console-3.4.1.jar,janusgraph-server-0.4.0.jar,metrics-core-3.2.2.jar,xbean-asm6-shaded-4.8.jar,gremlin-core-3.4.1.jar,janusgraph-solr-0.4.0.jar,metrics-ganglia-3.2.2.jar,xercesImpl-2.9.1.jar,gremlin-driver-3.4.1.jar,javapoet-1.8.0.jar,metrics-graphite-3.2.2.jar,xml-apis-1.3.04.jar,gremlin-groovy-3.4.1.jar,javassist-3.18.0-GA.jar,metrics-json-3.1.5.jar,xmlenc-0.52.jar,gremlin-server-3.4.1.jar,javatuples-1.2.jar,metrics-jvm-3.2.2.jar,zookeeper-3.4.6.jar,gremlin-shaded-3.4.1.jar,javax.inject-1.jar,minlog-1.3.0.jar,zstd-jni-1.3.2-2.jar,groovy-2.5.6-indy.jar,javax.inject-2.4.0-b34.jar,netty-3.10.5.Final.jar,groovy-cli-picocli-2.5.6.jar,javax.json-1.0.jar,netty-all-4.1.25.Final.jar

Second attempt (fewer libs):

spark-shell \
--conf spark.driver.userClassPathFirst=true \
--conf spark.executor.userClassPathFirst=true \
--conf spark.driver.userClassPathFirst=true \
--jars /etc/hadoop/conf/core-site.xml,/etc/hbase/conf/hbase-site.xml,gremlin-core-3.4.1.jar,gremlin-driver-3.4.3.jar,gremlin-shaded-3.4.1.jar,groovy-2.5.7.jar,groovy-json-2.5.7.jar,javatuples-1.2.jar,commons-lang3-3.8.1.jar,commons-configuration-1.10.jar,janusgraph-core-0.4.0.jar,hbase-shaded-client-2.1.5.jar,janusgraph-hbase-0.4.0.jar,high-scale-lib-1.1.4.jar

Java attempt:

java \
-cp /etc/hadoop/conf/core-site.xml:/etc/hbase/conf/hbase-site.xml:hbase-shaded-client-2.1.5.jar:janusgraph-hbase-0.4.0.jar:janusgraph-core-0.4.0.jar:commons-lang3-3.8.1.jar:gremlin-driver-3.4.3.jar:groovy-2.5.7.jar:javatuples-1.2.jar:commons-configuration-1.10.jar:gremlin-core-3.4.1.jar:gremlin-shaded-3.4.1.jar:groovy-json-2.5.7.jar:high-scale-lib-1.1.4.jar:Janusgraph_Ingestion.jar:../janusgraph-0.4.0-hadoop2/lib/commons-lang-2.5.jar:../janusgraph-0.4.0-hadoop2/lib/slf4j-api-1.7.12.jar:../janusgraph-0.4.0-hadoop2/lib/slf4j-log4j12-1.7.12.jar:../janusgraph-0.4.0-hadoop2/lib/log4j-1.2.16.jar:../janusgraph-0.4.0-hadoop2/lib/guava-18.0.jar:../janusgraph-0.4.0-hadoop2/lib/commons-logging-1.1.1.jar:../janusgraph-0.4.0-hadoop2/lib/commons-io-2.3.jar:../janusgraph-0.4.0-hadoop2/lib/htrace-core4-4.2.0-incubating.jar \
Entry

As far as the code being executed in the spark-shell or Java:

import org.janusgraph.core.JanusGraphFactory;
val g = JanusGraphFactory.open("/home/devuser/janusgraph-0.4.0-hadoop2/conf/janusgraph-hbase.properties").traversal()

I also tried adding the below before attempting to open the graph:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
val conf = new Configuration();
conf.set("hadoop.security.authentication", "Kerberos");
UserGroupInformation.setConfiguration(conf);
UserGroupInformation.loginUserFromSubject(null);

Including the graph connection config for completeness:

gremlin.graph=org.janusgraph.core.JanusGraphFactory
storage.backend=hbase
storage.hostname=hosta.example.com:2181,hostb.example.com:2181,hostc.example.com:2181
storage.hbase.table=JgraphTest
storage.hbase.ext.zookeeper.znode.parent=/hbase-secure
storage.batch-loading=false
java.security.krb5.conf=/etc/krb5.conf
storage.hbase.ext.hbase.security.authentication=kerberos
storage.hbase.ext.hbase.security.authorization=true
storage.hbase.ext.hadoop.security.authentication=kerberos
storage.hbase.ext.hadoop.security.authorization=true
storage.hbase.ext.hbase.regionserver.kerberos.principal=hbase/_HOST@HDPDEV.example.com
ids.block-size=10000
ids.renew-timeout=3600000
storage.buffer-size=10000
ids.num-partitions=10
ids.partition=true
schema.default=none
cache.db-cache = true
cache.db-cache-clean-wait = 20
cache.db-cache-time = 180000
cache.db-cache-size = 0.5

The expected result would be a usable traversal object. The actual result is below:

19/10/18 11:40:30 TRACE NettyRpcConnection: Connecting to hostb.example.com/192.168.1.101:16000
19/10/18 11:40:30 DEBUG AbstractHBaseSaslRpcClient: Creating SASL GSSAPI client. Server's Kerberos principal name is null
19/10/18 11:40:30 TRACE AbstractRpcClient: Call: IsMasterRunning, callTime: 4ms
19/10/18 11:40:30 DEBUG RpcRetryingCallerImpl: Call exception, tries=7, retries=16, started=8197 ms ago, cancelled=false, msg=java.io.IOException: Call to hostb.example.com/192.168.1.101:16000 failed on local exception: java.io.IOException: Failed to specify server's Kerberos principal name, details=, see https://s.apache.org/timeout, exception=org.apache.hadoop.hbase.MasterNotRunningException: java.io.IOException: Call to hostb.example.com/192.168.1.101:16000 failed on local exception: java.io.IOException: Failed to specify server's Kerberos principal name
at org.apache.hadoop.hbase.client.ConnectionImplementation$MasterServiceStubMaker.makeStub(ConnectionImplementation.java:1175)
at org.apache.hadoop.hbase.client.ConnectionImplementation.getKeepAliveMasterService(ConnectionImplementation.java:1234)
at org.apache.hadoop.hbase.client.ConnectionImplementation.getMaster(ConnectionImplementation.java:1223)
at org.apache.hadoop.hbase.client.MasterCallable.prepare(MasterCallable.java:57)
at org.apache.hadoop.hbase.client.RpcRetryingCallerImpl.callWithRetries(RpcRetryingCallerImpl.java:105)
at org.apache.hadoop.hbase.client.HBaseAdmin.executeCallable(HBaseAdmin.java:3089)
at org.apache.hadoop.hbase.client.HBaseAdmin.getHTableDescriptor(HBaseAdmin.java:569)
at org.apache.hadoop.hbase.client.HBaseAdmin.getTableDescriptor(HBaseAdmin.java:529)
at org.janusgraph.diskstorage.hbase.HBaseAdmin1_0.getTableDescriptor(HBaseAdmin1_0.java:105)
at org.janusgraph.diskstorage.hbase.HBaseStoreManager.ensureTableExists(HBaseStoreManager.java:726)
at org.janusgraph.diskstorage.hbase.HBaseStoreManager.getLocalKeyPartition(HBaseStoreManager.java:537)
at org.janusgraph.diskstorage.hbase.HBaseStoreManager.getDeployment(HBaseStoreManager.java:376)
at org.janusgraph.diskstorage.hbase.HBaseStoreManager.getFeatures(HBaseStoreManager.java:418)
at org.janusgraph.graphdb.configuration.builder.GraphDatabaseConfigurationBuilder.build(GraphDatabaseConfigurationBuilder.java:51)
at org.janusgraph.core.JanusGraphFactory.open(JanusGraphFactory.java:161)
at org.janusgraph.core.JanusGraphFactory.open(JanusGraphFactory.java:132)
at org.janusgraph.core.JanusGraphFactory.open(JanusGraphFactory.java:79)
at $line22.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:26)
at $line22.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:31)
at $line22.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:33)
at $line22.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:35)
at $line22.$read$$iw$$iw$$iw$$iw.<init>(<console>:37)
at $line22.$read$$iw$$iw$$iw.<init>(<console>:39)
at $line22.$read$$iw$$iw.<init>(<console>:41)
at $line22.$read$$iw.<init>(<console>:43)
at $line22.$read.<init>(<console>:45)
at $line22.$read$.<init>(<console>:49)
at $line22.$read$.<clinit>(<console>)
at $line22.$eval$.$print$lzycompute(<console>:7)
at $line22.$eval$.$print(<console>:6)
at $line22.$eval.$print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at scala.tools.nsc.interpreter.ILoop.interpretStartingWith(ILoop.scala:807)
at scala.tools.nsc.interpreter.ILoop.command(ILoop.scala:681)
at scala.tools.nsc.interpreter.ILoop.processLine(ILoop.scala:395)
at scala.tools.nsc.interpreter.ILoop.loop(ILoop.scala:415)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply$mcZ$sp(ILoop.scala:923)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at scala.reflect.internal.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:97)
at scala.tools.nsc.interpreter.ILoop.process(ILoop.scala:909)
at org.apache.spark.repl.Main$.doMain(Main.scala:76)
at org.apache.spark.repl.Main$.main(Main.scala:56)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:904)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Call to hostb.example.com/192.168.1.101:16000 failed on local exception: java.io.IOException: Failed to specify server's Kerberos principal name
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hbase.ipc.IPCUtil.wrapException(IPCUtil.java:221)
at org.apache.hadoop.hbase.ipc.AbstractRpcClient.onCallFinished(AbstractRpcClient.java:390)
at org.apache.hadoop.hbase.ipc.AbstractRpcClient.access$100(AbstractRpcClient.java:95)
at org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:410)
at org.apache.hadoop.hbase.ipc.AbstractRpcClient$3.run(AbstractRpcClient.java:406)
at org.apache.hadoop.hbase.ipc.Call.callComplete(Call.java:103)
at org.apache.hadoop.hbase.ipc.Call.setException(Call.java:118)
at org.apache.hadoop.hbase.ipc.BufferCallBeforeInitHandler.userEventTriggered(BufferCallBeforeInitHandler.java:92)
at org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:329)
at org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:315)
at org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.fireUserEventTriggered(AbstractChannelHandlerContext.java:307)
at org.apache.hbase.thirdparty.io.netty.channel.DefaultChannelPipeline$HeadContext.userEventTriggered(DefaultChannelPipeline.java:1377)
at org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:329)
at org.apache.hbase.thirdparty.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:315)
at org.apache.hbase.thirdparty.io.netty.channel.DefaultChannelPipeline.fireUserEventTriggered(DefaultChannelPipeline.java:929)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection.failInit(NettyRpcConnection.java:179)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection.saslNegotiate(NettyRpcConnection.java:197)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection.access$800(NettyRpcConnection.java:71)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection$3.operationComplete(NettyRpcConnection.java:273)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection$3.operationComplete(NettyRpcConnection.java:261)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:507)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:500)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:479)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:420)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
at org.apache.hbase.thirdparty.io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:82)
at org.apache.hbase.thirdparty.io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:306)
at org.apache.hbase.thirdparty.io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:341)
at org.apache.hbase.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:633)
at org.apache.hbase.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:580)
at org.apache.hbase.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:497)
at org.apache.hbase.thirdparty.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:459)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
at org.apache.hbase.thirdparty.io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Failed to specify server's Kerberos principal name
at org.apache.hadoop.hbase.security.AbstractHBaseSaslRpcClient.<init>(AbstractHBaseSaslRpcClient.java:99)
at org.apache.hadoop.hbase.security.NettyHBaseSaslRpcClient.<init>(NettyHBaseSaslRpcClient.java:43)
at org.apache.hadoop.hbase.security.NettyHBaseSaslRpcClientHandler.<init>(NettyHBaseSaslRpcClientHandler.java:70)
at org.apache.hadoop.hbase.ipc.NettyRpcConnection.saslNegotiate(NettyRpcConnection.java:194)
... 18 more
Well, I feel like an idiot. Apparently the answer was a really simple one: the Gremlin client works fine with just storage.hbase.ext.hbase.regionserver.kerberos.principal, but when using the libs outside of it, storage.hbase.ext.hbase.master.kerberos.principal is needed as well. Well, as far as this goes things are working; on to the next set of problems I made for myself, lol.
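Concretely, a sketch of the one property added to the graph config above (the value simply mirrors the regionserver principal from the question's config; adjust the realm to your environment):

storage.hbase.ext.hbase.master.kerberos.principal=hbase/_HOST@HDPDEV.example.com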
Error writing a file to sequenceiq/hadoop-docker
I deploy a sequenceiq/hadoop-docker container and everything comes up, but when I try to write a file it gives the error: could only be replicated to 0 nodes instead of minReplication (=1). There are 1 datanode(s) running and 1 node(s) are excluded in this operation.

Run:

docker run -it --rm --name=hadoopserver -p 8030:8030 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 19888:19888 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 9000:9000 sequenceiq/hadoop-docker:latest /etc/bootstrap.sh -d

App:

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://localhost:9000");
    System.setProperty("HADOOP_USER_NAME", "root");
    System.setProperty("hadoop.home.dir", "/");
    FileSystem fileSystem = FileSystem.get(conf);
    try (FSDataOutputStream out = fileSystem.create(new Path("test.txt"), true)) {
        out.write("Test".getBytes());
    }
}

Error:

org.apache.hadoop.ipc.RemoteException: File /user/root/test.txt could only be replicated to 0 nodes instead of minReplication (=1). There are 1 datanode(s) running and 1 node(s) are excluded in this operation.
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:1547)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getNewBlockTargets(FSNamesystem.java:3107)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:3031)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:724)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:492)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2049)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2045)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2043)
at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1481)
at org.apache.hadoop.ipc.Client.call(Client.java:1427)
at org.apache.hadoop.ipc.Client.call(Client.java:1337)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:227)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:116)
at com.sun.proxy.$Proxy13.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:440)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:398)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:163)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:155)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:335)
at com.sun.proxy.$Proxy14.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.DataStreamer.locateFollowingBlock(DataStreamer.java:1733)
at org.apache.hadoop.hdfs.DataStreamer.nextBlockOutputStream(DataStreamer.java:1536)
at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:658)

What am I doing wrong?
Maybe this will not be a straight answer to your question, but there is a very simple and good explanation of running hadoop-cluster-docker (on Linux), including a word-count example implementation. It helped me understand a lot.
Creating a Spark RDD on an S3 file
I'm trying to create a JavaRDD on an S3 file but am not able to create the RDD. Can someone help me solve this problem?

Code:

SparkConf conf = new SparkConf().setAppName(appName).setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
javaSparkContext.hadoopConfiguration().set("fs.s3.awsAccessKeyId", accessKey);
javaSparkContext.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", secretKey);
javaSparkContext.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
JavaRDD<String> rawData = sparkContext.textFile("s3://mybucket/sample.txt");

This code throws the exception:

2015-05-06 18:58:57 WARN LoadSnappy:46 - Snappy native library not loaded
java.lang.IllegalArgumentException: java.net.URISyntaxException: Expected scheme-specific part at index 3: s3:
at org.apache.hadoop.fs.Path.initialize(Path.java:148)
at org.apache.hadoop.fs.Path.<init>(Path.java:126)
at org.apache.hadoop.fs.Path.<init>(Path.java:50)
at org.apache.hadoop.fs.FileSystem.globPathsLevel(FileSystem.java:1084)
at org.apache.hadoop.fs.FileSystem.globPathsLevel(FileSystem.java:1087)
at org.apache.hadoop.fs.FileSystem.globPathsLevel(FileSystem.java:1087)
at org.apache.hadoop.fs.FileSystem.globPathsLevel(FileSystem.java:1087)
at org.apache.hadoop.fs.FileSystem.globPathsLevel(FileSystem.java:1087)
at org.apache.hadoop.fs.FileSystem.globStatusInternal(FileSystem.java:1023)
at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:987)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:177)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:208)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:203)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.rdd.RDD.take(RDD.scala:1156)
at org.apache.spark.rdd.RDD.first(RDD.scala:1189)
at org.apache.spark.api.java.JavaRDDLike$class.first(JavaRDDLike.scala:477)
at org.apache.spark.api.java.JavaRDD.first(JavaRDD.scala:32)
at com.cignifi.DataExplorationValidation.processFile(DataExplorationValidation.java:148)
at com.cignifi.DataExplorationValidation.main(DataExplorationValidation.java:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:569)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:166)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:189)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:110)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.net.URISyntaxException: Expected scheme-specific part at index 3: s3:
at java.net.URI$Parser.fail(URI.java:2829)
at java.net.URI$Parser.failExpecting(URI.java:2835)
at java.net.URI$Parser.parse(URI.java:3038)
at java.net.URI.<init>(URI.java:753)
at org.apache.hadoop.fs.Path.initialize(Path.java:145)
... 36 more

Some more details:
Spark version 1.3.0.
Running in local mode using spark-submit.
I tried this on my local machine and on an EC2 instance; in both cases I get the same error.
It should be s3n:// instead of s3://. See External Datasets in the Spark Programming Guide.
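For illustration, a sketch of the corrected read, shown in PySpark for brevity (the Java version is the same idea: switch the path scheme to s3n:// and use the matching fs.s3n.* credential keys; the bucket and credential values are the placeholders from the question):

from pyspark import SparkContext

sc = SparkContext("local", "s3n-example")
accessKey, secretKey = "xxx", "xxx"  # placeholders, fill in real credentials

# NativeS3FileSystem is registered under the s3n scheme, so the credential
# keys must use the fs.s3n.* prefix to match the s3n:// path below
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", accessKey)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secretKey)

rawData = sc.textFile("s3n://mybucket/sample.txt")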