Code for Chapter 4 of the textbook 《Spark编程基础》, compiled by 林子雨

The code in 《Spark编程基础》 by 林子雨, 赖永炫, and 陶继平 (see the textbook's official website): the way the code is typeset in the printed book may affect how readers understand it, so to help readers read the code correctly, or copy it directly for hands-on lab work, all of the code accompanying the book is provided here.
See the code for all chapters of the textbook

Chapter 4  Spark Environment Setup and Usage

sudo  tar  -zxf  ~/下载/spark-2.1.0-bin-without-hadoop.tgz  -C  /usr/local/
cd  /usr/local
sudo  mv  ./spark-2.1.0-bin-without-hadoop  ./spark
sudo  chown  -R  hadoop:hadoop  ./spark  # hadoop is the username currently logged in to the Linux system
cd  /usr/local/spark
cp  ./conf/spark-env.sh.template  ./conf/spark-env.sh
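# add the following line to the newly created conf/spark-env.sh, so that Spark can find Hadoop's class path: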
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
cd  /usr/local/spark
bin/run-example  SparkPi
bin/run-example  SparkPi  2>&1  |  grep  "Pi is roughly"
cd  /usr/local/hadoop
./sbin/start-dfs.sh
jps
./sbin/stop-dfs.sh
./bin/spark-shell  --master  <master-url>
cd  /usr/local/spark
./bin/spark-shell  --master local[4]
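# local[4] means: run Spark locally, using 4 worker threads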
cd  /usr/local/spark
./bin/spark-shell  --master  local[4]  --jars  code.jar
cd /usr/local/spark
./bin/spark-shell  --help
cd  /usr/local/spark
./bin/spark-shell
scala> 8*2+5
res0: Int = 21
scala> val  textFile = sc.textFile("file:///usr/local/spark/README.md")
scala> textFile.count()
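As an extra sanity check (the following line is not part of the textbook listing), the RDD created above can also be filtered before counting, for example to count how many lines of README.md contain the word "Spark":
scala> textFile.filter(line => line.contains("Spark")).count()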
scala> :quit
https://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/0.13.11/sbt-launch.jar
sudo  mkdir  /usr/local/sbt
sudo  chown  -R  hadoop  /usr/local/sbt  # hadoop here is the current Linux login username
cd  /usr/local/sbt
cp  ~/下载/sbt-launch.jar  .
vim  ./sbt
#!/bin/bash
SBT_OPTS="-Xms512M -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256M"
java $SBT_OPTS -jar `dirname $0`/sbt-launch.jar "$@"
chmod  u+x  ./sbt
./sbt  sbt-version
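# the first run downloads sbt itself and its dependencies, so it may take quite a while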
http://apache.fayea.com/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.zip
sudo  unzip  ~/下载/apache-maven-3.3.9-bin.zip  -d  /usr/local
cd  /usr/local
sudo  mv  ./apache-maven-3.3.9  ./maven
sudo  chown  -R  hadoop  ./maven
cd  ~           # go to the user's home directory
mkdir  ./sparkapp        # create the application's root directory
mkdir  -p  ./sparkapp/src/main/scala     # create the required directory structure
cd  ~
vim  ./sparkapp/src/main/scala/SimpleApp.scala
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
    def main(args: Array[String]) {
        val logFile = "file:///usr/local/spark/README.md"
        val conf = new SparkConf().setAppName("Simple Application")
        val sc = new SparkContext(conf)
        val logData = sc.textFile(logFile, 2).cache()
        val numAs = logData.filter(line => line.contains("a")).count()
        val numBs = logData.filter(line => line.contains("b")).count()
        println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
    }
}
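For comparison only (this variant does not appear in the textbook): since Spark 2.x, the same job can also be written against the SparkSession entry point. A minimal sketch, assuming the same README.md path and that a spark-sql dependency ("org.apache.spark" %% "spark-sql" % "2.1.0") is added alongside spark-core in the sbt file shown below; the object name SimpleAppSession is made up for illustration.

/* SimpleAppSession.scala -- illustrative sketch, not part of the textbook listing */
import org.apache.spark.sql.SparkSession

object SimpleAppSession {
    def main(args: Array[String]) {
        val logFile = "file:///usr/local/spark/README.md"
        // SparkSession is the unified entry point in Spark 2.x; it wraps a SparkContext
        val spark = SparkSession.builder().appName("Simple Application").getOrCreate()
        val logData = spark.sparkContext.textFile(logFile, 2).cache()
        val numAs = logData.filter(line => line.contains("a")).count()
        val numBs = logData.filter(line => line.contains("b")).count()
        println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
        spark.stop()
    }
}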
cd  ~
vim  ./sparkapp/simple.sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0"
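// note: the Scala version must match the one this Spark release (2.1.0) was built with, i.e. 2.11.x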
cd  ~/sparkapp
find  .
.
./src
./src/main
./src/main/scala
./src/main/scala/SimpleApp.scala
./simple.sbt
cd  ~/sparkapp  # make sure this directory is the current directory
/usr/local/sbt/sbt  package
cd  ~
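# the sparkapp2 application reuses the SimpleApp.scala written above; the two commands
# below are not part of the original listing and assume the ~/sparkapp directory from
# the sbt example still exists
mkdir  -p  ./sparkapp2/src/main/scala
cp  ./sparkapp/src/main/scala/SimpleApp.scala  ./sparkapp2/src/main/scala/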
vim  ./sparkapp2/pom.xml
<project>
    <groupId>cn.edu.xmu</groupId>
    <artifactId>simple-project</artifactId>
    <modelVersion>4.0.0</modelVersion>
    <name>Simple Project</name>
    <packaging>jar</packaging>
    <version>1.0</version>
    <repositories>
        <repository>
            <id>jboss</id>
            <name>JBoss Repository</name>
            <url>http://repository.jboss.com/maven2/</url>
        </repository>
    </repositories>
    <dependencies>
        <dependency> <!-- Spark dependency -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>2.11.8</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.5</arg>
                    </args>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
cd  ~/sparkapp2
find  .
.
./pom.xml
./src
./src/main
./src/main/scala
./src/main/scala/SimpleApp.scala
cd  ~/sparkapp2    # make sure this directory is the current directory
/usr/local/maven/bin/mvn  package
spark-submit
  --class <main-class>  # the main class of the application to run (its entry point)
  --master <master-url>  # <master-url> has the same meaning as in Table 4-1
  --deploy-mode <deploy-mode>   # the deployment mode
  ... # other options
  <application-jar>  # the application JAR file
  [application-arguments]  # arguments passed to the main method of the main class
/usr/local/spark/bin/spark-submit  --class  "SimpleApp" ~/sparkapp/target/scala-2.11/simple-project_2.11-1.0.jar
/usr/local/spark/bin/spark-submit  \
> --class "SimpleApp"  \
> ~/sparkapp/target/scala-2.11/simple-project_2.11-1.0.jar
/usr/local/spark/bin/spark-submit  \
> --class "SimpleApp" \
> ~/sparkapp/target/scala-2.11/simple-project_2.11-1.0.jar  2>&1  |  grep "Lines with a:" 
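# cluster deployment: first install and configure Spark on the Master node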
sudo  tar  -zxf  ~/下载/spark-2.1.0-bin-without-hadoop.tgz  -C  /usr/local/
cd  /usr/local
sudo  mv  ./spark-2.1.0-bin-without-hadoop  ./spark
sudo  chown  -R  hadoop:hadoop  ./spark    # hadoop is the username currently logged in to the Linux system
vim  ~/.bashrc
export  SPARK_HOME=/usr/local/spark
export  PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source  ~/.bashrc
cd  /usr/local/spark/
cp  ./conf/slaves.template  ./conf/slaves
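# conf/slaves lists the Worker nodes of the cluster, one hostname per line; here the two Workers are: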
Slave01
Slave02
cp  ./conf/spark-env.sh.template  ./conf/spark-env.sh
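# add the following three lines to conf/spark-env.sh; SPARK_MASTER_IP is the IP address of the Master node (192.168.1.104 in the textbook's cluster)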
export  SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath) 
export  HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop 
export  SPARK_MASTER_IP=192.168.1.104
cd  /usr/local/
tar  -zcf  ~/spark.master.tar.gz  ./spark
cd  ~
scp  ./spark.master.tar.gz  Slave01:/home/hadoop
scp  ./spark.master.tar.gz  Slave02:/home/hadoop
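# the following commands are then executed on each Worker node (Slave01 and Slave02) to unpack and install the copied package: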
sudo  rm  -rf  /usr/local/spark/
sudo  tar  -zxf  ~/spark.master.tar.gz  -C  /usr/local
sudo  chown  -R  hadoop  /usr/local/spark
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
sbin/start-master.sh
sbin/start-slaves.sh
sbin/stop-master.sh
sbin/stop-slaves.sh
cd  /usr/local/hadoop/
sbin/stop-all.sh
cd  /usr/local/hadoop/
sbin/start-all.sh
cd  /usr/local/spark/
sbin/start-master.sh
sbin/start-slaves.sh
bin/spark-submit  \
> --class org.apache.spark.examples.SparkPi  \
> --master spark://master:7077  \
> examples/jars/spark-examples_2.11-2.1.0.jar  100  2>&1  |  grep  "Pi is roughly"
bin/spark-shell  --master  spark://master:7077
scala> val  textFile = sc.textFile("hdfs://master:9000/README.md")
textFile: org.apache.spark.rdd.RDD[String] = hdfs://master:9000/README.md MapPartitionsRDD[1] at textFile at <console>:24
scala> textFile.count()
res0: Long = 99
scala> textFile.first()
res1: String = # Apache Spark
bin/spark-submit  \
> --class org.apache.spark.examples.SparkPi  \
> --master yarn  --deploy-mode cluster  \
> examples/jars/spark-examples_2.11-2.1.0.jar
bin/spark-shell  --master  yarn
scala> val  textFile = sc.textFile("hdfs://master:9000/README.md")
textFile: org.apache.spark.rdd.RDD[String] = hdfs://master:9000/README.md MapPartitionsRDD[1] at textFile at <console>:24
scala> textFile.count()
res0: Long = 99
scala> textFile.first()
res1: String = # Apache Spark