The code in the textbook《大数据基础编程、实验和案例教程》edited by 林子雨 (see the textbook's official website) does not reproduce well in the printed edition, which may get in the way of understanding it. To make it easier to read the code correctly, or to copy it directly for the hands-on experiments, this page provides all of the code that accompanies the book.
Chapter 11 Installation and Use of Data Collection Tools
Textbook page 218
- cd ~
- sudo tar -zxvf ./下载/apache-flume-1.7.0-bin.tar.gz -C /usr/local
Textbook page 219
- cd /usr/local
- sudo mv ./apache-flume-1.7.0-bin ./flume
- sudo vim ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export FLUME_HOME=/usr/local/flume
export FLUME_CONF_DIR=$FLUME_HOME/conf
export PATH=$PATH:$FLUME_HOME/bin
- source ~/.bashrc
- cd /usr/local/flume
- sudo mv ./conf/flume-env.sh.template ./conf/flume-env.sh
- sudo vim ./conf/flume-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
Textbook page 220
- cd /usr/local/flume
- ./bin/flume-ng version
- cd /usr/local/hbase/conf
- sudo vim hbase-env.sh
export HBASE_CLASSPATH=/usr/local/hbase/conf
- cd /usr/local/flume
- sudo vim ./conf/avro.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = 0.0.0.0
# Note this port number; it will be used later in the tutorial
a1.sources.r1.port = 4141
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Textbook page 221
- /usr/local/flume/bin/flume-ng agent -c . -f /usr/local/flume/conf/avro.conf -n a1 -Dflume.root.logger=INFO,console
Textbook page 222
- cd /usr/local/flume
- sudo sh -c 'echo "hello world" > /usr/local/flume/log.00'
- cd /usr/local/flume
- ./bin/flume-ng avro-client --conf conf -H localhost -p 4141 -F /usr/local/flume/log.00
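For readers who prefer to send the test event from code rather than with the flume-ng avro-client tool above, here is a minimal sketch using the Flume client SDK (it assumes the flume-ng-sdk jar shipped with Flume is on the classpath; the object name AvroClientSketch and the message text are illustrative choices):
import java.nio.charset.StandardCharsets
import org.apache.flume.api.RpcClientFactory
import org.apache.flume.event.EventBuilder

// Connect to the Avro source configured above (port 4141) and append one event;
// the agent's logger sink should then print the event body on its console.
object AvroClientSketch {
  def main(args: Array[String]) {
    val client = RpcClientFactory.getDefaultInstance("localhost", 4141)
    try {
      client.append(EventBuilder.withBody("hello world", StandardCharsets.UTF_8))
    } finally {
      client.close()
    }
  }
}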
- cd /usr/local/flume
- sudo vim ./conf/example.conf
Textbook page 223
#example.conf: A single-node Flume configuration
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
# Remember this port number; it will be used later
a1.sources.r1.port = 44444
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
- /usr/local/flume/bin/flume-ng agent --conf ./conf --conf-file ./example.conf --name a1 -Dflume.root.logger=INFO,console
Textbook page 224
- telnet localhost 44444
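The telnet test above can also be reproduced from code. The following is a minimal sketch (the object name NetcatSendSketch and the message text are illustrative choices) that opens a plain socket to the netcat source on port 44444 and sends one line:
import java.io.PrintWriter
import java.net.Socket

// Send one line to the Flume netcat source; the logger sink should log it.
object NetcatSendSketch {
  def main(args: Array[String]) {
    val socket = new Socket("localhost", 44444)
    val out = new PrintWriter(socket.getOutputStream, true)
    out.println("Hello Flume")
    out.flush()
    socket.close()
  }
}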
Textbook page 225
- cd ~/下载
- sudo tar -zxf kafka_2.10-0.10.1.0.tgz -C /usr/local
- cd /usr/local
- sudo mv kafka_2.10-0.10.1.0/ ./kafka
- sudo chown -R hadoop ./kafka
- cd /usr/local/kafka
- bin/zookeeper-server-start.sh config/zookeeper.properties
Textbook page 226
- cd /usr/local/kafka
- ./bin/kafka-server-start.sh config/server.properties
- cd /usr/local/kafka
- ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic dblab
- cd /usr/local/kafka
- ./bin/kafka-topics.sh --list --zookeeper localhost:2181
- cd /usr/local/kafka
- ./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic dblab
hello hadoop
hello xmu
hadoop world
- cd /usr/local/kafka
- ./bin/kafka-console-consumer.sh --zookeeper localhost:2181 --topic dblab --from-beginning
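The same consumption step can also be done programmatically. Below is a minimal sketch using the Java consumer API that ships with Kafka 0.10 (the group id "dblab-sketch" is an arbitrary name chosen here, and the kafka-clients jar from the installation above is assumed to be on the classpath):
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer

// Read the "dblab" topic from the beginning and print every message value,
// mirroring what kafka-console-consumer.sh --from-beginning shows.
object DblabConsumerSketch {
  def main(args: Array[String]) {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("group.id", "dblab-sketch")
    props.put("auto.offset.reset", "earliest")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("dblab"))
    while (true) {
      val records = consumer.poll(1000)
      val it = records.iterator()
      while (it.hasNext) {
        println(it.next().value())
      }
    }
  }
}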
Textbook page 227
- cd ~/下载
- sudo tar -zxvf sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz -C /usr/local
- cd /usr/local
- sudo mv sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop
- sudo chown -R hadoop:hadoop sqoop
Textbook page 228
- cd sqoop/conf/
- cat sqoop-env-template.sh >> sqoop-env.sh
- cd /usr/local/sqoop/conf/
- vim sqoop-env.sh
export HADOOP_COMMON_HOME=/usr/local/hadoop
export HADOOP_MAPRED_HOME=/usr/local/hadoop
export HBASE_HOME=/usr/local/hbase
export HIVE_HOME=/usr/local/hive
#export ZOOCFGDIR= # If you have configured ZooKeeper, set its configuration directory here as well
- vim ~/.bashrc
export SQOOP_HOME=/usr/local/sqoop
export PATH=$PATH:$SBT_HOME/bin:$SQOOP_HOME/bin
export CLASSPATH=$CLASSPATH:$SQOOP_HOME/lib
- source ~/.bashrc
Textbook page 229
- cd ~/下载
- sudo tar -zxvf mysql-connector-java-5.1.40.tar.gz # unpack the MySQL driver package
- # Note: if this file was already unpacked in Chapter 8, there is no need to run the tar command again
- ls # you should now see the unpacked directory mysql-connector-java-5.1.40
- cp ./mysql-connector-java-5.1.40/mysql-connector-java-5.1.40-bin.jar /usr/local/sqoop/lib
- sudo service mysql start
- sqoop list-databases --connect jdbc:mysql://127.0.0.1:3306/ --username root -P
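Under the hood, list-databases opens a JDBC connection and asks the server for its databases. The following is a minimal sketch of the same check done directly through the MySQL connector copied above (the password string is a placeholder for your own root password):
import java.sql.DriverManager

// List the MySQL databases over JDBC, roughly what `sqoop list-databases` reports.
object ListDatabasesSketch {
  def main(args: Array[String]) {
    Class.forName("com.mysql.jdbc.Driver")
    val conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/", "root", "your_password")  // placeholder password
    try {
      val rs = conn.createStatement().executeQuery("SHOW DATABASES")
      while (rs.next()) {
        println(rs.getString(1))
      }
    } finally {
      conn.close()
    }
  }
}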
Textbook page 230
- cd /usr/local/kafka
- ./bin/zookeeper-server-start.sh config/zookeeper.properties
- cd /usr/local/kafka
- ./bin/kafka-server-start.sh config/server.properties
- cd /usr/local/kafka
- bin/kafka-server-start.sh config/server.properties &
Textbook page 231
- cd /usr/local/kafka
- ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic wordsendertest
- # This topic is named wordsendertest; 2181 is ZooKeeper's default port; --partitions is the number of partitions in the topic, and --replication-factor is the number of replicas, which matters on a Kafka cluster; no extra replicas are needed for this single-machine setup
- # You can use list to show all created topics and check that the topic created above exists
- ./bin/kafka-topics.sh --list --zookeeper localhost:2181
- ./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wordsendertest
hello hadoop
hello spark
- cd /usr/local/kafka
- ./bin/kafka-console-consumer.sh --zookeeper localhost:2181 --topic wordsendertest --from-beginning
Textbook page 232
- cd /usr/local/spark
- ./bin/spark-shell
- scala> import org.apache.spark.streaming.kafka._
- cd /usr/local/spark/lib
- mkdir kafka
- cd ~
- cd 下载
- cp ./spark-streaming-kafka_2.10-1.6.2.jar /usr/local/spark/lib/kafka
Textbook page 233
- cd /usr/local/spark/lib
- cd ~
- cd 下载
- cp ./spark-streaming_2.10-1.6.1.jar /usr/local/spark/lib/kafka
- cd /usr/local/kafka/libs
- ls
- cp ./* /usr/local/spark/lib/kafka
- cd /usr/local/spark/lib/kafka
- ls
- rm log4j*
- rm jackson*
- cd /usr/local/spark/conf
- vim spark-env.sh
Textbook page 234
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/spark/lib/hbase/*:/usr/local/spark/lib/kafka/*
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
- cd /usr/local/spark
- ./bin/spark-shell
- scala> import org.apache.spark.streaming.kafka._
- cd /usr/local/spark/mycode
- mkdir kafka
- cd kafka
- mkdir -p src/main/scala
- cd src/main/scala
- vim KafkaWordProducer.scala
import java.util.HashMap
import org.apache.kafka.clients.producer.{ProducerConfig, KafkaProducer, ProducerRecord}
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
object KafkaWordProducer {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: KafkaWordCountProducer <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }
    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args
    // Zookeeper connection properties
    val props = new HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // Send some messages: every second, emit messagesPerSec messages,
    // each consisting of wordsPerMessage random single-digit "words"
    while (true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(10).toString)
          .mkString(" ")
        println(str)
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
      Thread.sleep(1000)
    }
  }
}
Textbook page 235
- vim KafkaWordCount.scala
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka.KafkaUtils
object KafkaWordCount {
  def main(args: Array[String]) {
    StreamingExamples.setStreamingLogLevels()
    val sc = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sc, Seconds(10))
    ssc.checkpoint("file:///usr/local/spark/mycode/kafka/checkpoint") // set the checkpoint directory; to keep it on HDFS instead, write something like ssc.checkpoint("/user/hadoop/checkpoint"), but Hadoop must be running
    val zkQuorum = "localhost:2181" // ZooKeeper server address
    val group = "1" // consumer group of the topic; any name will do, e.g. val group = "test-consumer-group" instead of 1
    val topics = "wordsender" // topic name
    val numThreads = 1 // number of partitions per topic
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
    val lines = lineMap.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val pair = words.map(x => (x, 1))
    val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Minutes(2), Seconds(10), 2) // this line is explained in the section on window transformation operations later in the book
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
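The reduceByKeyAndWindow call above is only explained later in the book. As a self-contained illustration, here is a minimal sketch (using a hypothetical socket text source on port 9999 and an arbitrary local checkpoint directory) of the same kind of windowed word count; the inverse function _ - _ lets Spark subtract the counts of the batch that slides out of the 2-minute window instead of recomputing the whole window, which is why a checkpoint directory is required:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}

// Word counts over a sliding 2-minute window, recomputed every 10 seconds.
// The inverse reduce function requires checkpointing, just as in KafkaWordCount.
object WindowedCountSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WindowedCountSketch").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("file:///tmp/windowed-count-checkpoint")   // arbitrary local path
    val lines = ssc.socketTextStream("localhost", 9999)       // hypothetical test source
    val wordCounts = lines.flatMap(_.split(" "))
                          .map(word => (word, 1))
                          .reduceByKeyAndWindow(_ + _, _ - _, Minutes(2), Seconds(10), 2)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}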
Textbook page 236
- vim StreamingExamples.scala
import org.apache.spark.Logging
import org.apache.log4j.{Level, Logger}
/** Utility functions for Spark Streaming examples. */
object StreamingExamples extends Logging {
  /** Set reasonable logging levels for streaming if the user has not configured log4j. */
  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
Textbook page 237
- cd /usr/local/spark/mycode/kafka/
- vim simple.sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.10.5"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.2"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.6.2"
libraryDependencies += "org.apache.spark" % "spark-streaming-kafka_2.10" % "1.6.2"  
- cd /usr/local/spark/mycode/kafka/
- /usr/local/sbt/sbt package
Textbook page 238
- cd /usr/local/hadoop
- ./sbin/start-dfs.sh
- cd /usr/local/spark
- /usr/local/spark/bin/spark-submit --class "KafkaWordProducer" /usr/local/spark/mycode/kafka/target/scala-2.10/simple-project_2.10-1.0.jar localhost:9092 wordsender 3 5
Textbook page 239
- cd /usr/local/spark
- /usr/local/spark/bin/spark-submit --class "KafkaWordCount" /usr/local/spark/mycode/kafka/target/scala-2.10/simple-project_2.10-1.0.jar
