The printed rendering of the code in the textbook 《Spark编程基础(Python版)》 (Spark Programming Fundamentals, Python Edition) by 林子雨, 郑海山, and 赖永炫 (see the textbook's official website) may affect how readers understand the code. To make it easier to read the code correctly, or to copy it directly for hands-on lab work, all of the code accompanying the book is provided here.
Chapter 3 Spark Environment Setup and Usage
sudo tar -zxf ~/下载/spark-2.4.0-bin-without-hadoop.tgz -C /usr/local/    # "~/下载" is the Downloads directory of the currently logged-in user
cd /usr/local
sudo mv ./spark-2.4.0-bin-without-hadoop ./spark
sudo chown -R hadoop:hadoop ./spark    # hadoop is the username of the currently logged-in Linux user
cd /usr/local/spark
cp ./conf/spark-env.sh.template ./conf/spark-env.sh
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
vim ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_162
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=$PATH:${JAVA_HOME}/bin:/usr/local/hbase/bin
export HADOOP_HOME=/usr/local/hadoop
export SPARK_HOME=/usr/local/spark
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=python3
export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
source ~/.bashrc
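As an optional sanity check (not a step from the textbook), the variables just added to ~/.bashrc can be inspected from Python 3 before launching PySpark:
python3
>>> import os
>>> print(os.environ.get("SPARK_HOME"), os.environ.get("PYSPARK_PYTHON"))
/usr/local/spark python3
>>> exit()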
cd /usr/local/spark
./bin/run-example SparkPi
./bin/run-example SparkPi 2>&1 | grep "Pi is roughly"
cd /usr/local/hadoop
./sbin/start-dfs.sh
./sbin/stop-dfs.sh
pyspark --master <master-url>
cd /usr/local/spark
./bin/pyspark --master local[4]
cd /usr/local/spark
./bin/pyspark --master local[4] --jars code.jar
cd /usr/local/spark
./bin/pyspark --help
cd /usr/local/spark
./bin/pyspark
PYSPARK_PYTHON=python3
cd /usr/local/spark
./bin/pyspark
>>> 8*2+5
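Optionally, a small RDD computation can also be tried in the same shell before exiting, using the SparkContext sc that pyspark creates automatically (a quick check, not a command from the textbook):
>>> rdd = sc.parallelize([1, 2, 3, 4, 5])
>>> rdd.map(lambda x: x * 2).reduce(lambda a, b: a + b)
30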
>>> exit()
WordCount.py
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
logFile = "file:///usr/local/spark/README.md"
logData = sc.textFile(logFile, 2).cache()  # read the file as an RDD with 2 partitions and cache it
numAs = logData.filter(lambda line: 'a' in line).count()  # number of lines containing the letter 'a'
numBs = logData.filter(lambda line: 'b' in line).count()  # number of lines containing the letter 'b'
print('Lines with a: %s, Lines with b: %s' % (numAs, numBs))
cd /usr/local/spark/mycode/python
python3 WordCount.py
cd /usr/local/spark
./bin/spark-submit --help
/usr/local/spark/bin/spark-submit /usr/local/spark/mycode/python/WordCount.py
/usr/local/spark/bin/spark-submit \
> /usr/local/spark/mycode/python/WordCount.py
cd /usr/local/spark/conf
sudo mv log4j.properties.template log4j.properties
vim log4j.properties
log4j.rootCategory=ERROR, console
sudo tar -zxf ~/下载/spark-2.4.0-bin-without-hadoop.tgz -C /usr/local/
cd /usr/local
sudo mv ./spark-2.4.0-bin-without-hadoop ./spark
sudo chown -R hadoop:hadoop ./spark    # hadoop is the username of the currently logged-in Linux user
vim ~/.bashrc
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source ~/.bashrc
cd /usr/local/spark/
cp ./conf/slaves.template ./conf/slaves
Slave01
Slave02
cp ./conf/spark-env.sh.template ./conf/spark-env.sh
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_IP=192.168.1.104
cd /usr/local/
tar -zcf ~/spark.master.tar.gz ./spark
cd ~
scp ./spark.master.tar.gz Slave01:/home/hadoop
scp ./spark.master.tar.gz Slave02:/home/hadoop
sudo rm -rf /usr/local/spark/
sudo tar -zxf ~/spark.master.tar.gz -C /usr/local
sudo chown -R hadoop /usr/local/spark
cd /usr/local/hadoop/
sbin/start-all.sh
cd /usr/local/spark/
sbin/start-master.sh
cd /usr/local/spark/
sbin/start-slaves.sh
sbin/stop-master.sh
sbin/stop-slaves.sh
cd /usr/local/hadoop/
sbin/stop-all.sh
cd /usr/local/hadoop/
sbin/start-all.sh
cd /usr/local/spark/
sbin/start-master.sh
sbin/start-slaves.sh
cd /usr/local/spark/
bin/spark-submit \
> --master spark://master:7077 \
> /usr/local/spark/examples/src/main/python/pi.py 2>&1 | grep "Pi is roughly"
cd /usr/local/spark/
bin/pyspark --master spark://master:7077
>>> textFile = sc.textFile("hdfs://master:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'
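As an optional follow-up in the same shell (a sketch, not part of the textbook commands), a simple word count over the same HDFS file could be written as follows; the result depends on the contents of README.md:
>>> wordCounts = textFile.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
>>> wordCounts.take(3)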
cd /usr/local/spark/
bin/spark-submit \
> --master yarn-client \
> /usr/local/spark/examples/src/main/python/pi.py
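Note that yarn-client as a master URL is deprecated in Spark 2.x (spark-submit still accepts it but converts it to YARN client mode); an equivalent form, consistent with the pyspark command below, would be:
cd /usr/local/spark/
bin/spark-submit \
> --master yarn \
> --deploy-mode client \
> /usr/local/spark/examples/src/main/python/pi.py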
cd /usr/local/spark/
bin/pyspark --master yarn
>>> textFile = sc.textFile("hdfs://master:9000/README.md")
>>> textFile.count()
105
>>> textFile.first()
'# Apache Spark'