Command lines and code for Chapter 3 of the textbook 《Spark编程基础(Python版)》 (Spark Programming Fundamentals, Python Edition) by 林子雨


The code in the textbook 《Spark编程基础(Python版)》 by 林子雨, 郑海山, and 赖永炫 (see the textbook's official website) may be hard to read accurately in its printed form. To help readers understand the code correctly, or copy it directly for hands-on exercises, all of the book's companion code is provided here.

Chapter 3  Spark Environment Setup and Usage

sudo  tar  -zxf  ~/下载/spark-2.4.0-bin-without-hadoop.tgz  -C  /usr/local/
cd  /usr/local
sudo  mv  ./spark-2.4.0-bin-without-hadoop  ./spark
sudo  chown  -R  hadoop:hadoop  ./spark  # hadoop is the username currently logged in to the Linux system
cd  /usr/local/spark
cp  ./conf/spark-env.sh.template  ./conf/spark-env.sh
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
vim ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_162
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=$PATH:${JAVA_HOME}/bin:/usr/local/hbase/bin
export HADOOP_HOME=/usr/local/hadoop
export SPARK_HOME=/usr/local/spark
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=python3
export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
source ~/.bashrc
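
After sourcing ~/.bashrc, a quick way to confirm that PYTHONPATH and PYSPARK_PYTHON took effect is to import pyspark from an ordinary python3 session. This check is a suggestion and not part of the textbook listing:

python3
>>> import pyspark          # succeeds only if $SPARK_HOME/python and the py4j zip are on PYTHONPATH
>>> pyspark.__version__     # should show '2.4.0'
>>> exit()
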
cd  /usr/local/spark
./bin/run-example  SparkPi
./bin/run-example  SparkPi  2>&1  |  grep  "Pi is roughly"
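
The SparkPi example estimates π by Monte Carlo sampling: random points are drawn in the unit square, and the fraction that falls inside the quarter circle approximates π/4. A simplified PySpark sketch of the same idea (not the exact code shipped in examples/src/main/python/pi.py) looks like this:

from random import random
from pyspark import SparkContext

sc = SparkContext("local", "PiSketch")

def inside(_):
    # draw a random point in the unit square and test whether it lies inside the quarter circle
    x, y = random(), random()
    return 1 if x * x + y * y < 1 else 0

n = 100000
count = sc.parallelize(range(n), 2).map(inside).reduce(lambda a, b: a + b)
print("Pi is roughly %f" % (4.0 * count / n))
sc.stop()
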
cd  /usr/local/hadoop
./sbin/start-dfs.sh
./sbin/stop-dfs.sh
pyspark  --master  <master-url>
cd  /usr/local/spark
./bin/pyspark  --master local[4]
cd  /usr/local/spark
./bin/pyspark  --master  local[4]  --jars  code.jar
cd /usr/local/spark
./bin/pyspark --help
cd /usr/local/spark
./bin/pyspark
export PYSPARK_PYTHON=python3    # make pyspark start Python 3; without export, ./bin/pyspark will not see this setting
cd /usr/local/spark
./bin/pyspark
>>> 8*2+5
>>> exit()
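
While still inside the pyspark shell (that is, before running exit()), a one-line RDD operation is a convenient way to confirm that the built-in SparkContext sc works. This extra check is not part of the textbook listing:

>>> sc.parallelize([1, 2, 3, 4, 5]).map(lambda x: x * 2).collect()
[2, 4, 6, 8, 10]
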

WordCount.py

from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
logFile = "file:///usr/local/spark/README.md"
logData = sc.textFile(logFile, 2).cache()
numAs = logData.filter(lambda line: 'a' in line).count()
numBs = logData.filter(lambda line: 'b' in line).count()
print('Lines with a: %s, Lines with b: %s' % (numAs, numBs))
cd /usr/local/spark/mycode/python
python3 WordCount.py
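
Despite its file name, WordCount.py above counts lines containing the letters 'a' and 'b' rather than words. A word count in the usual sense could look like the following sketch (using README.md as an example input; this variant is not part of the Chapter 3 listing):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)
textFile = sc.textFile("file:///usr/local/spark/README.md")
wordCount = textFile.flatMap(lambda line: line.split(" ")) \
                    .map(lambda word: (word, 1)) \
                    .reduceByKey(lambda a, b: a + b)
print(wordCount.take(10))    # print the first 10 (word, count) pairs

It can be run the same way as WordCount.py, either directly with python3 or through spark-submit.
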
cd /usr/local/spark
./bin/spark-submit --help
/usr/local/spark/bin/spark-submit  /usr/local/spark/mycode/python/WordCount.py
/usr/local/spark/bin/spark-submit  \
> /usr/local/spark/mycode/python/WordCount.py
cd /usr/local/spark/conf
sudo mv log4j.properties.template  log4j.properties
vim  log4j.properties
log4j.rootCategory=ERROR, console
sudo  tar  -zxf  ~/下载/spark-2.4.0-bin-without-hadoop.tgz  -C  /usr/local/
cd  /usr/local
sudo  mv  ./spark-2.4.0-bin-without-hadoop  ./spark
sudo  chown  -R  hadoop:hadoop  ./spark    # hadoop is the username currently logged in to the Linux system
vim  ~/.bashrc
export  SPARK_HOME=/usr/local/spark
export  PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source  ~/.bashrc
cd  /usr/local/spark/
cp  ./conf/slaves.template  ./conf/slaves
# contents of ./conf/slaves: one worker-node hostname per line
Slave01
Slave02
cp  ./conf/spark-env.sh.template  ./conf/spark-env.sh
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath) 
export  HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop 
export  SPARK_MASTER_IP=192.168.1.104
# On the Master node: package the configured Spark directory and copy it to each worker node
cd  /usr/local/
tar  -zcf  ~/spark.master.tar.gz  ./spark
cd  ~
scp  ./spark.master.tar.gz  Slave01:/home/hadoop
scp  ./spark.master.tar.gz  Slave02:/home/hadoop
# On each worker node (Slave01 and Slave02): remove any old installation and unpack the copy
sudo  rm  -rf  /usr/local/spark/
sudo  tar  -zxf  ~/spark.master.tar.gz  -C  /usr/local
sudo  chown  -R  hadoop  /usr/local/spark
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
sbin/start-master.sh
cd /usr/local/spark/
sbin/start-slaves.sh
sbin/stop-master.sh
sbin/stop-slaves.sh
cd  /usr/local/hadoop/
sbin/stop-all.sh
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
sbin/start-master.sh
sbin/start-slaves.sh
cd  /usr/local/spark/
bin/spark-submit  \
> --master spark://master:7077  \
> /usr/local/spark/examples/src/main/python/pi.py 2>&1  |  grep  "Pi is roughly"
cd  /usr/local/spark/
bin/pyspark  --master  spark://master:7077
>>> textFile = sc.textFile("hdfs://master:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'
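
Continuing in the same pyspark session, the RDD read from HDFS can be processed further; for instance, counting the lines that mention Spark (a simple follow-up, not part of the textbook listing):

>>> textFile.filter(lambda line: "Spark" in line).count()
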
cd /usr/local/spark/
bin/spark-submit  \
> --master yarn --deploy-mode client  \
> /usr/local/spark/examples/src/main/python/pi.py
cd /usr/local/spark/
bin/pyspark  --master  yarn
>>> textFile = sc.textFile("hdfs://master:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'