Code for Chapter 11 — 《大数据基础编程、实验和案例教程(第3版)》 (Big Data Fundamentals: Programming, Experiments and Cases, 3rd Edition), edited by Lin Ziyu (林子雨)


The command lines and code in 《大数据基础编程、实验和案例教程(第3版)》 by Lin Ziyu (see the textbook's official website) do not reproduce well in the printed edition, which can make them hard to read. To help readers interpret the commands and code correctly, or copy them directly for hands-on experiments, the complete set of command lines and code for the book is provided here.
View the code for all chapters of the textbook

Chapter 11  Installation and Use of Data Collection Tools


Textbook page 157

cd ~/Downloads
sudo tar -zxf  kafka_2.12-2.6.0.tgz -C /usr/local
cd /usr/local
sudo mv kafka_2.12-2.6.0 kafka
sudo chown -R hadoop ./kafka
cd  /usr/local/kafka
./bin/zookeeper-server-start.sh  config/zookeeper.properties
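
This command keeps ZooKeeper running in the foreground of the current terminal. As an optional sanity check that is not in the textbook, you can open a new terminal and run the JDK's jps tool; a QuorumPeerMain process should appear in its output once ZooKeeper is up.

jps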

Textbook page 158

cd /usr/local/kafka
./bin/kafka-server-start.sh config/server.properties
cd /usr/local/kafka
./bin/kafka-server-start.sh  config/server.properties  &
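
With the broker now started in the background, you can optionally confirm it came up before creating any topics. Assuming the default log4j configuration shipped with the Kafka tarball, the broker log is written under logs/ in the installation directory (this check is an addition, not part of the textbook):

cd /usr/local/kafka
tail logs/server.log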
cd /usr/local/kafka
./bin/kafka-topics.sh --create --zookeeper localhost:2181 \
> --replication-factor 1 --partitions 1 \
> --topic wordsender
./bin/kafka-topics.sh --list --zookeeper localhost:2181
cd /usr/local/kafka
bin/kafka-console-consumer.sh \
> --bootstrap-server localhost:9092 --topic wordsender
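
At this point the topic can also be exercised end to end without any Spark code, using Kafka's console producer in yet another terminal (an optional extra step, not in the textbook): lines typed into this producer should show up in the console consumer started above.

cd /usr/local/kafka
./bin/kafka-console-producer.sh --bootstrap-server localhost:9092 --topic wordsender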

Textbook page 159

cd  /usr/local/spark
mkdir mycode
cd mycode
mkdir  kafka
cd  kafka
mkdir -p  src/main/scala
cd  src/main/scala
vim  KafkaWordProducer.scala
import java.util.HashMap
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010._
object KafkaWordProducer {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: KafkaWordProducer <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }
    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args
    // Kafka producer configuration: broker list and key/value serializers
    val props = new HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // Send some messages: each message is wordsPerMessage random digits separated by spaces
    while(true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt)
          .map(x => scala.util.Random.nextInt(10).toString)
          .mkString(" ")
        println(str)
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
     Thread.sleep(1000)
    }
  }
}

Textbook page 160
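
The KafkaWordCount program below goes into the same src/main/scala directory as the producer; presumably it is created in the same way, for example:

cd /usr/local/spark/mycode/kafka/src/main/scala
vim KafkaWordCount.scala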

import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object KafkaWordCount{
  def main(args:Array[String]){
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")
    val ssc = new StreamingContext(sc,Seconds(10))
    ssc.checkpoint("file:///usr/local/spark/mycode/kafka/checkpoint") //设置检查点,如果存放在HDFS上面,则写成类似ssc.checkpoint("/user/hadoop/checkpoint")这种形式,但是,要启动Hadoop
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )
    val topics = Array("wordsender")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD(rdd => {
      val offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val maped: RDD[(String, String)] = rdd.map(record => (record.key,record.value))
      val lines = maped.map(_._2)
      val words = lines.flatMap(_.split(" "))
      val pair = words.map(x => (x,1))
      val wordCounts = pair.reduceByKey(_+_)
      wordCounts.foreach(println)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

Textbook page 162

cd  /usr/local/spark/mycode/kafka/
vim  simple.sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.12.17"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.4.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "3.4.0" % "provided"
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.4.0"
libraryDependencies += "org.apache.kafka" % "kafka-clients" % "2.6.0"
cd  /usr/local/spark/mycode/kafka/
/usr/local/sbt/sbt  package
cd  /usr/local/hadoop
./sbin/start-dfs.sh
cd  /usr/local/spark/mycode/kafka/
/usr/local/spark/bin/spark-submit  \
> --class "KafkaWordProducer"   \
> ./target/scala-2.12/simple-project_2.12-1.0.jar  \
> localhost:9092  wordsender  3  5

Textbook page 163

cd  /usr/local/spark/mycode/kafka/
/usr/local/spark/bin/spark-submit  \
> --class "KafkaWordCount"  \
> ./target/scala-2.12/simple-project_2.12-1.0.jar
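
When the experiment is finished, Ctrl+C stops the two spark-submit jobs and the console consumer; the Kafka broker and ZooKeeper can then be shut down with the stop scripts shipped in the Kafka distribution (a suggested cleanup step, not part of the textbook):

cd /usr/local/kafka
./bin/kafka-server-stop.sh
./bin/zookeeper-server-stop.sh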