规划
cancer01 master/worker
cancer02 worker
cancer03 worker
cancer04 worker
cancer05 worker
准备
su hadoop
安装scala
每台机器上
cd /usr/local
wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.tgz
tar zxf scala-2.11.8.tgz
mv scala-2.11.8 scala
chown -R hadoop:hadoop scala
vim /etc/profile
export SCALA_HOME=/usr/local/scala
export PATH=$PATH:$SCALA_HOME/bin
source /etc/profile
安装spark
wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.1-bin-hadoop2.7.tgz
tar zxf spark-2.0.1-bin-hadoop2.7.tgz
mv spark-2.0.1-bin-hadoop2.7 /usr/local/spark
chown -R hadoop:hadoop spark
vim /etc/profile
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
source /etc/profile
配置
cd /usr/local/spark/conf
mv spark-env.sh.template spark-env.sh
vim spark-env.sh
export SCALA_HOME=/usr/local/scala
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_IP=192.168.11.134
export SPARK_MASTER_PORT=12345
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
复制
在cancer02|03|04|05上建立/usr/local/spark目录
scp -r spark hadoop@cancer02:/usr/local/
scp -r spark hadoop@cancer03:/usr/local/
scp -r spark hadoop@cancer04:/usr/local/
scp -r spark hadoop@cancer05:/usr/local/
启动
$HADOOP_HOME/sbin/start-all.sh
$SPARK_HOME/sbin/start-all.sh
或者
$SPARK_HOME/sbin/start-master.sh
$SPARK_HOME/sbin/start-slaves.sh
验证
运行
./bin/run-example SparkPi 2>&1 | grep "Pi is roughly"
./bin/spark-submit examples/src/main/python/pi.py 2>&1 | grep "Pi is roughly"
运行(scala python)
./bin/spark-shell
Scala样例:
val textFile = sc.textFile("file:///usr/local/spark/README.md");
textFile.count();
textFile.first();
val linesWithSpark = textFile.filter(line => line.contains("Spark"));
linesWithSpark.count();
textFile.filter(line => line.contains("Spark")).count();
配置conf/spark-env.sh
export SPARK_HOME=/var/lib/myspark/spark
export JAVA_HOME=/usr/java/jdk1.7.0_80
export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_LIBRARY_PATH=.:$JAVA_HOME/lib:$JAVA_HOME/jre/lib:$HADOOP_HOME/lib/native
SPARK_MASTER_HOST=10.20.24.199
#web页面端口
SPARK_MASTER_WEBUI_PORT=28686
#Spark的local目录
SPARK_LOCAL_DIRS=/hadoopdata1/sparkdata/local
#worker目录
SPARK_WORKER_DIR=/hadoopdata1/sparkdata/work
#Driver内存大小
SPARK_DRIVER_MEMORY=4G
#Worker的cpu核数
SPARK_WORKER_CORES=16
#worker内存大小
SPARK_WORKER_MEMORY=64g
#Spark的log日志目录
SPARK_LOG_DIR=/var/lib/myspark/spark/logs