Code for Chapter 4: Setting Up and Using the Spark Environment, from 《Spark编程基础(Python版,第2版)》 edited by 林子雨


Command lines and code from the textbook 《Spark编程基础(Python版,第2版)》 written by 林子雨 of Xiamen University. (The textbook's official website provides the command lines and code for every chapter; they can be copied and pasted directly for execution.)
View all command lines and code from the textbook 《Spark编程基础(Python版,第2版)》

sudo  tar  -zxf  ~/Downloads/spark-3.4.0-bin-without-hadoop.tgz  -C  /usr/local/
cd  /usr/local
sudo  mv  ./spark-3.4.0-bin-without-hadoop  ./spark
sudo  chown  -R  hadoop:hadoop  ./spark  # hadoop is the user name currently logged in to the Linux system
cd  /usr/local/spark
cp  ./conf/spark-env.sh.template  ./conf/spark-env.sh
vim  ./conf/spark-env.sh
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
vim ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_371
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export HADOOP_HOME=/usr/local/hadoop
export SPARK_HOME=/usr/local/spark
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PYSPARK_PYTHON=/home/hadoop/anaconda3/envs/pyspark/bin/python3.8
export PATH=$PATH:${JAVA_HOME}/bin:$HADOOP_HOME/bin:$SPARK_HOME/bin
source ~/.bashrc
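After source ~/.bashrc takes effect, the exported variables can be double-checked from a Python interpreter; a minimal sketch (the variable names match the exports above):

import os
# Print the Spark-related variables exported in ~/.bashrc
for name in ("JAVA_HOME", "HADOOP_HOME", "SPARK_HOME", "PYSPARK_PYTHON"):
    print(name, "=", os.environ.get(name))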
cd /usr/local/spark/conf
sudo mv log4j2.properties.template log4j2.properties
vim log4j2.properties
rootLogger.level = error
cd  /usr/local/spark
./bin/run-example  SparkPi
./bin/run-example  SparkPi  2>&1  |  grep  "Pi is roughly"
cd  /usr/local/spark
./bin/pyspark  --master local[4]
cd  /usr/local/spark
./bin/pyspark  --master  local[4]  --jars  code.jar
cd /usr/local/spark
./bin/pyspark --help
cd /usr/local/spark
./bin/pyspark
cd /usr/local/spark
./bin/spark-submit --help
cd  /usr/local/spark
bin/spark-submit  \
> --master local[*]  \
> /usr/local/spark/examples/src/main/python/pi.py 10 2>&1  |  grep  "Pi is roughly"
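For reference, the bundled pi.py estimates pi by random sampling; the following is a minimal sketch of the same idea (not the exact bundled script; the 10 partitions mirror the argument passed above):

import random
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("PiSketch")
sc = SparkContext(conf=conf)
partitions = 10
n = 100000 * partitions

def inside(_):
    # Draw a random point in the unit square; count it if it falls inside the quarter circle
    x, y = random.random(), random.random()
    return 1 if x * x + y * y <= 1 else 0

count = sc.parallelize(range(n), partitions).map(inside).reduce(lambda a, b: a + b)
print("Pi is roughly %f" % (4.0 * count / n))
sc.stop()

Saved to a file, this sketch could be submitted with bin/spark-submit in the same way as pi.py above.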
conda activate pyspark
python
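Inside this interpreter, a quick check can confirm that the active Python is the one pointed to by PYSPARK_PYTHON (a minimal sketch):

>>> import sys
>>> sys.executable   # expected to be under ~/anaconda3/envs/pyspark
>>> sys.version      # expected to report Python 3.8.x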
vim  ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_371
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export HADOOP_HOME=/usr/local/hadoop
export SPARK_HOME=/usr/local/spark
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PYSPARK_PYTHON=/home/hadoop/anaconda3/envs/pyspark/bin/python3.8
cd  /usr/local/spark/
sudo mv  ./conf/workers.template  ./conf/workers
vim  ./conf/workers
hadoop01
hadoop02
hadoop03
vim  ./conf/spark-env.sh
#PART1
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_371
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop

#PART2
export SPARK_MASTER_HOST=hadoop01
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=8081
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export SPARK_HISTORY_OPTS="  
-Dspark.history.fs.logDirectory=hdfs://hadoop01:9000/sparklog 
-Dspark.history.fs.cleaner.enabled=true"

#PART3
export SPARK_WORKER_CORES=1
export SPARK_WORKER_MEMORY=1G
export SPARK_EXECUTOR_CORES=1
export SPARK_EXECUTOR_MEMORY=1G
export SPARK_DRIVER_MEMORY=1G
export SPARK_WORKER_PORT=7078
export SPARK_WORKER_WEBUI_PORT=8082
cd /usr/local/hadoop
./sbin/start-dfs.sh
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir /sparklog
./bin/hdfs dfs -chmod 777 /sparklog
cd /usr/local/spark/conf
sudo mv spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf
# Enable Spark event logging
spark.eventLog.enabled true
# HDFS directory where the event logs are stored
spark.eventLog.dir hdfs://hadoop01:9000/sparklog
# Whether to compress the event logs
spark.eventLog.compress true
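Once these settings are in place, they can be checked later from an interactive pyspark session, assuming the shell picks up conf/spark-defaults.conf; a minimal sketch (sc is the SparkContext created by the shell):

>>> sc.getConf().get("spark.eventLog.enabled")
>>> sc.getConf().get("spark.eventLog.dir")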
cd  /usr/local/
tar  -zcf  ~/spark.master.tar.gz  ./spark
cd  ~
scp  ./spark.master.tar.gz  hadoop02:/home/hadoop
scp  ./spark.master.tar.gz  hadoop03:/home/hadoop
cd  ~
sudo  rm  -rf  /usr/local/spark/
sudo  tar  -zxf  ~/spark.master.tar.gz  -C  /usr/local
sudo  chown  -R  hadoop  /usr/local/spark
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
./sbin/start-history-server.sh
cd  /usr/local/spark/
sbin/start-master.sh
cd /usr/local/spark/
sbin/start-workers.sh
cd /usr/local/spark
sbin/stop-master.sh
sbin/stop-workers.sh
cd  /usr/local/hadoop/
sbin/stop-all.sh
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
sbin/start-master.sh
sbin/start-workers.sh
cd  /usr/local/spark
bin/spark-submit  \
> --master spark://hadoop01:7077  \
> /usr/local/spark/examples/src/main/python/pi.py 10 2>&1  |  grep  "Pi is roughly"
cd  /usr/local/spark/
bin/pyspark  --master  spark://hadoop01:7077
>>> textFile = sc.textFile("hdfs://hadoop01:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop
cd /usr/local/hadoop
./sbin/start-yarn.sh  # Start the YARN cluster
cd /usr/local/spark
./bin/pyspark --master yarn
>>> textFile = sc.textFile("hdfs://hadoop01:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'
cd  /usr/local/spark
bin/spark-submit  \
> --master yarn  \
> /usr/local/spark/examples/src/main/python/pi.py 10 2>&1  |  grep  "Pi is roughly"
cd  /usr/local/spark
bin/spark-submit  \
> --master yarn  \
> --deploy-mode client  \
> /usr/local/spark/examples/src/main/python/pi.py 10 2>&1  |  grep  "Pi is roughly"
cd  /usr/local/spark
bin/spark-submit  \
> --master yarn  \
> --deploy-mode cluster  \
> /usr/local/spark/examples/src/main/python/pi.py 10 2>&1  |  grep  "Pi is roughly"
conda activate pyspark
pip install pyspark==3.4.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
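After the installation finishes, a short interactive check can confirm that the pip-installed library works in local mode (a minimal sketch; the application name PipCheck is arbitrary):

>>> import pyspark
>>> pyspark.__version__
>>> from pyspark import SparkConf, SparkContext
>>> sc = SparkContext(conf=SparkConf().setMaster("local[*]").setAppName("PipCheck"))
>>> sc.parallelize([1, 2, 3, 4]).sum()
>>> sc.stop()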

Next, open a Linux terminal and create a new code file "/usr/local/spark/mycode/python/WordCount.py" with the following content:

from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local[*]").setAppName("My App")
sc = SparkContext(conf = conf)
logFile = "file:///usr/local/spark/README.md"
logData = sc.textFile(logFile, 2).cache()
numAs = logData.filter(lambda line: 'a' in line).count()
numBs = logData.filter(lambda line: 'b' in line).count()
print('Lines with a: %s, Lines with b: %s' % (numAs, numBs))
sc.stop()
conda activate pyspark
cd /usr/local/spark/mycode/python
python WordCount.py
cd /usr/local/spark
./bin/spark-submit --help
/usr/local/spark/bin/spark-submit  \
> --master spark://hadoop01:7077  \
> /usr/local/spark/mycode/python/WordCount.py
https://www.jetbrains.com/pycharm/download
cd ~/Downloads # 假设安装文件在该目录下
sudo tar -zxvf pycharm-community-2022.2.3.tar.gz -C /usr/local
cd /usr/local
sudo mv pycharm-community-2022.2.3 pycharm
sudo chown -R hadoop pycharm
cd /usr/local/pycharm
./bin/pycharm.sh

Enter the following content in the WordCount.py file:

# coding:utf8
from pyspark import SparkConf, SparkContext
if __name__ == '__main__':
    conf = SparkConf().setMaster("local[*]").setAppName("My App")
    sc = SparkContext(conf = conf)
    logFile = "file:///usr/local/spark/README.md"
    logData = sc.textFile(logFile, 2).cache()
    numAs = logData.filter(lambda line: 'a' in line).count()
    numBs = logData.filter(lambda line: 'b' in line).count()
    print('Lines with a: %s, Lines with b: %s' % (numAs, numBs))
    sc.stop()
cd /usr/local/hadoop
./sbin/start-dfs.sh  # Only HDFS is started here; YARN is not started
cd /usr/local/spark
./sbin/start-all.sh  # Starts both the Spark Master and the Workers
logFile = "hdfs://hadoop01:9000/README.md"
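If the program is meant to run against the standalone cluster started above rather than in local mode, the master URL in SparkConf would change as well; a minimal sketch of the adjusted lines (hadoop01 and port 7077 follow the cluster configuration above):

conf = SparkConf().setMaster("spark://hadoop01:7077").setAppName("My App")
logFile = "hdfs://hadoop01:9000/README.md"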