问题描述
我正在创建一个简单的 ETL,它可以读取 10 亿个文件并对它们重新分区(换句话说,压缩到更小的数量以供进一步处理)。
简单的 AWS glue 应用程序:
import org.apache.spark.SparkContext
object Hello {
def main(sysArgs: Array[String]) {
val spark: SparkContext = new SparkContext()
val input_path = "s3a://my-bucket-name/input/*"
val output_path = "s3a://my-bucket-name/output/*"
val num_partitions = 5
val ingestRDD = spark.textFile(input_path)
ingestRDD.repartition(num_partitions).saveAsTextFile(output_path)
}
}
提出以下回溯:
ERROR [main] glue.ProcessLauncher (Logging.scala:logError(70)): Exception in User Class: java.lang.RuntimeException : java.lang.RuntimeException: java.lang.classNotFoundException: Class org.apache.hadoop.mapred.DirectOutputCommitter not found
org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2401)
org.apache.hadoop.mapred.JobConf.getoutputCommitter(JobConf.java:725)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1048)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1499)
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1478)
Hello$.main(hello_world_parallel_rdd_scala:18)
Hello.main(hello_world_parallel_rdd_scala)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:498)
com.amazonaws.services.glue.SparkProcessLauncherPlugin$class.invoke(ProcessLauncher.scala:38)
com.amazonaws.services.glue.ProcessLauncher$$anon$1.invoke(ProcessLauncher.scala:67)
com.amazonaws.services.glue.ProcessLauncher.launch(ProcessLauncher.scala:108)
com.amazonaws.services.glue.ProcessLauncher$.main(ProcessLauncher.scala:21)
com.amazonaws.services.glue.ProcessLauncher.main(ProcessLauncher.scala)
同时这段代码在本地环境、集群和 EMR 集群中都有效。
解决方法
import org.apache.spark.SparkContext
object Hello {
def main(sysArgs: Array[String]) {
val spark: SparkContext = new SparkContext()
spark.hadoopConfiguration.set("mapred.output.committer.class","org.apache.hadoop.mapred.DirectFileOutputCommitter")
val input_path = "s3a://my-bucket-name/input/*"
val output_path = "s3a://my-bucket-name/output/*"
val num_partitions = 5
val ingestRDD = spark.textFile(input_path)
ingestRDD.repartition(num_partitions).saveAsTextFile(output_path)
}
}
,
为pyspark设置hadoopConfiguration,
sc._jsc.hadoopConfiguration().set("mapred.output.committer.class","org.apache.hadoop.mapred.DirectFileOutputCommitter")