Code for Chapter 13 of the textbook 《大数据基础编程、实验和案例教程(第2版)》, edited by 林子雨


The command lines and code in the textbook 《大数据基础编程、实验和案例教程(第2版)》 by 林子雨 (see the textbook's official website) do not reproduce well in print, which can make them hard to read. To help readers understand the commands and code correctly, or copy them directly for hands-on experiments, this page provides all of the commands and code that accompany the book.

Chapter 13: A Comprehensive Experiment Case for the Big Data Course

Textbook page 202


```shell
cd /home/hadoop/下载
ls
```

```shell
cd /usr/local
ls
sudo mkdir bigdatacase
# You will be prompted for the password of the current user (the hadoop user in this tutorial)
# Give the hadoop user full permissions on the bigdatacase directory
sudo chown -R hadoop:hadoop ./bigdatacase
cd bigdatacase
# Create a dataset directory to hold the data set
mkdir dataset
# Now unzip the user.zip file
cd ~      # go to the hadoop user's home directory
cd 下载
ls
unzip user.zip -d /usr/local/bigdatacase/dataset
cd /usr/local/bigdatacase/dataset
ls
```

Textbook page 203

```shell
head -5 raw_user.csv
```

```shell
cd /usr/local/bigdatacase/dataset
# Delete the first line (the field names) of raw_user.csv
sed -i '1d' raw_user.csv
# In '1d' above, 1d deletes line 1; likewise, 3d deletes line 3 and nd deletes line n
# Delete the first line of small_user.csv
sed -i '1d' small_user.csv
# Check the first 5 records again with head; the field-name line is gone
head -5 raw_user.csv
head -5 small_user.csv
```
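
Note that `sed -i` edits the file in place. If you would rather keep the original file intact, a minimal non-destructive alternative writes a header-less copy to a new file:

```shell
# Print everything from line 2 onward into a new file, leaving raw_user.csv unchanged
tail -n +2 raw_user.csv > raw_user_noheader.csv
```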

Textbook page 204

```shell
cd /usr/local/bigdatacase/dataset
vim pre_deal.sh
```

```shell
#!/bin/bash
# The first argument given to pre_deal.sh is taken as the input file name
infile=$1
# The second argument is taken as the output file name
outfile=$2
# Note: the final $infile > $outfile must come right after the closing }' characters
awk -F "," 'BEGIN{
        srand();
        id=0;
        Province[0]="山东";Province[1]="山西";Province[2]="河南";Province[3]="河北";Province[4]="陕西";Province[5]="内蒙古";Province[6]="上海市";
        Province[7]="北京市";Province[8]="重庆市";Province[9]="天津市";Province[10]="福建";Province[11]="广东";Province[12]="广西";Province[13]="云南";
        Province[14]="浙江";Province[15]="贵州";Province[16]="新疆";Province[17]="西藏";Province[18]="江西";Province[19]="湖南";Province[20]="湖北";
        Province[21]="黑龙江";Province[22]="吉林";Province[23]="辽宁";Province[24]="江苏";Province[25]="甘肃";Province[26]="青海";Province[27]="四川";
        Province[28]="安徽";Province[29]="宁夏";Province[30]="海南";Province[31]="香港";Province[32]="澳门";Province[33]="台湾";
    }
    {
        id=id+1;
        value=int(rand()*34);
        print id"\t"$1"\t"$2"\t"$3"\t"$5"\t"substr($6,1,10)"\t"Province[value]
    }' $infile > $outfile
```
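
The script assigns each record a random province: `srand()` seeds awk's random-number generator, and `int(rand()*34)` yields an integer between 0 and 33 that indexes the Province array. A minimal standalone demo of that idiom:

```shell
# Print five random integers in [0, 34), the same idiom pre_deal.sh uses to pick a province
awk 'BEGIN{ srand(); for (i = 0; i < 5; i++) print int(rand()*34) }'
```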

Textbook page 205

```shell
cd /usr/local/bigdatacase/dataset
bash ./pre_deal.sh small_user.csv user_table.txt
```

```shell
head -10 user_table.txt
```
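
Beyond eyeballing the first rows, a quick structural check (a minimal sketch) confirms that every line of the output has exactly the 7 tab-separated fields the script prints:

```shell
# The only value printed should be 7 (the number of fields per line)
awk -F '\t' '{ print NF }' user_table.txt | sort -u
```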

Textbook page 206

```shell
cd /usr/local/hadoop
./sbin/start-dfs.sh
```

```shell
jps
```

```shell
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir -p /bigdatacase/dataset
```

Textbook page 207

```shell
cd /usr/local/hadoop
./bin/hdfs dfs -put /usr/local/bigdatacase/dataset/user_table.txt /bigdatacase/dataset
```

```shell
cd /usr/local/hadoop
./bin/hdfs dfs -cat /bigdatacase/dataset/user_table.txt | head -10
```

```shell
service mysql start   # this command can be run from any directory in Linux
```

```shell
cd /usr/local/hive
./bin/hive   # start Hive
```

```hive
hive> create database dblab;
hive> use dblab;
```

Textbook page 208

```hive
hive> CREATE EXTERNAL TABLE dblab.bigdata_user(id INT,uid STRING,item_id STRING,behavior_type INT,item_category STRING,visit_date DATE,province STRING) COMMENT 'Welcome to xmudblab!' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/bigdatacase/dataset';
```
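
Because the table is declared EXTERNAL with LOCATION '/bigdatacase/dataset', Hive reads user_table.txt where it already sits instead of copying it into its warehouse directory. You can confirm the file has not moved:

```shell
cd /usr/local/hadoop
./bin/hdfs dfs -ls /bigdatacase/dataset
```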

```hive
hive> use dblab;                       -- switch to the dblab database
hive> show tables;                     -- list all tables in the database
hive> show create table bigdata_user;  -- show the properties of the bigdata_user table
```

Textbook page 209

```hive
hive> desc bigdata_user;
```

```hive
hive> select * from bigdata_user limit 10;
hive> select behavior_type from bigdata_user limit 10;
```

```hive
hive> select behavior_type from bigdata_user limit 10;  -- view the first 10 users' behavior toward items
```

Textbook page 210

```hive
hive> select visit_date, item_category from bigdata_user limit 20;
```

```hive
hive> select e.bh, e.it from (select behavior_type as bh, item_category as it from bigdata_user) as e limit 20;
```

Textbook page 211

```hive
hive> select count(*) from bigdata_user;
```

Textbook page 212

```hive
hive> select count(distinct uid) from bigdata_user;
```

```hive
hive> select count(*) from (select uid,item_id,behavior_type,item_category,visit_date,province from bigdata_user group by uid,item_id,behavior_type,item_category,visit_date,province having count(*)=1)a;
```

Textbook page 213

```hive
hive> select count(*) from bigdata_user where behavior_type='1' and visit_date<'2014-12-13' and visit_date>'2014-12-10';
```

```hive
hive> select count(distinct uid), day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date);
```

Textbook page 214

```hive
hive> select count(*) from bigdata_user where province='江西' and visit_date='2014-12-12' and behavior_type='4';
```

```hive
hive> select count(*) from bigdata_user where visit_date='2014-12-11' and behavior_type='4';  -- how many users purchased items on 2014-12-11
```

```hive
hive> select count(*) from bigdata_user where visit_date='2014-12-11';  -- how many users clicked on the store on 2014-12-11
```

Textbook page 215

```hive
hive> select count(*) from bigdata_user where uid=10001082 and visit_date='2014-12-12';  -- how many times user 10001082 clicked the site on 2014-12-12
```

```hive
hive> select count(*) from bigdata_user where visit_date='2014-12-12';  -- how many times all users clicked the site that day
```

```hive
hive> select uid from bigdata_user where behavior_type='4' and visit_date='2014-12-12' group by uid having count(behavior_type='4')>5;  -- ids of users who purchased more than 5 times on that day
```

```hive
hive> create table scan(province STRING,scan INT) COMMENT 'This is the search of bigdataday' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;  -- create a new table to store the results
hive> insert overwrite table scan select province,count(behavior_type) from bigdata_user where behavior_type='1' group by province;  -- load the data
hive> select * from scan;  -- show the results
```

Textbook page 216

```hive
hive> create table dblab.user_action(id STRING,uid STRING, item_id STRING, behavior_type STRING, item_category STRING, visit_date DATE, province STRING) COMMENT 'Welcome to XMU dblab! ' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
```

```shell
cd /usr/local/hadoop
./bin/hdfs dfs -ls /user/hive/warehouse/dblab.db/
```

Textbook page 217

```hive
hive> INSERT OVERWRITE TABLE dblab.user_action select * from dblab.bigdata_user;
```

```hive
hive> select * from user_action limit 10;
```

```shell
mysql -u root -p
```

Textbook page 218

```mysql
mysql> show databases;         # list all databases
mysql> create database dblab;  # create the dblab database
mysql> use dblab;              # switch to the dblab database
```

```mysql
mysql> show variables like "char%";
```

Textbook page 219

```mysql
mysql> CREATE TABLE `dblab`.`user_action` (`id` varchar(50),`uid` varchar(50),`item_id` varchar(50),`behavior_type` varchar(10),`item_category` varchar(50), `visit_date` DATE,`province` varchar(20)) ENGINE=InnoDB DEFAULT CHARSET=utf8;
```

```mysql
mysql> exit;
```

```xml
<!-- Hadoop proxy-user settings (these belong in core-site.xml) -->
<property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
</property>
```
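
Hadoop only re-reads core-site.xml on restart, so restart HDFS after saving the change (a minimal sketch, using the installation path from earlier in this chapter):

```shell
cd /usr/local/hadoop
./sbin/stop-dfs.sh
./sbin/start-dfs.sh
```
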
```shell
cd /usr/local/hive
./bin/hive --service hiveserver2 -hiveconf hive.server2.thrift.port=10000
```

```shell
sudo netstat -anp | grep 10000
```
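
Before turning to JDBC code, you can optionally confirm that HiveServer2 accepts connections with Beeline, the JDBC command-line client shipped with Hive (a minimal sketch; type !quit to leave):

```shell
cd /usr/local/hive
./bin/beeline -u jdbc:hive2://localhost:10000
```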

Textbook page 220

```java
import java.sql.*;

public class HivetoMySQL {
    private static String driverName = "org.apache.hive.jdbc.HiveDriver";
    private static String driverName_mysql = "com.mysql.jdbc.Driver";

    public static void main(String[] args) throws SQLException {
        try {
            Class.forName(driverName);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            System.exit(1);
        }
        // the last two arguments are the username and password
        Connection con1 = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", "hive");

        if (con1 == null)
            System.out.println("Connection failed");
        else {
            Statement stmt = con1.createStatement();
            String sql = "select * from dblab.user_action";
            System.out.println("Running: " + sql);
            ResultSet res = stmt.executeQuery(sql);

            // insert the result set into MySQL
            try {
                Class.forName(driverName_mysql);
                Connection con2 = DriverManager.getConnection("jdbc:mysql://localhost:3306/dblab", "root", "root");
                String sql2 = "insert into user_action(id,uid,item_id,behavior_type,item_category,visit_date,province) values (?,?,?,?,?,?,?)";
                PreparedStatement ps = con2.prepareStatement(sql2);
                while (res.next()) {
                    ps.setString(1, res.getString(1));
                    ps.setString(2, res.getString(2));
                    ps.setString(3, res.getString(3));
                    ps.setString(4, res.getString(4));
                    ps.setString(5, res.getString(5));
                    ps.setDate(6, res.getDate(6));
                    ps.setString(7, res.getString(7));
                    ps.executeUpdate();
                }
                ps.close();
                con2.close();
                res.close();
                stmt.close();
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            }
        }
        con1.close();
    }
}
```
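
If you want to compile and run this class from the command line instead of an IDE, a minimal sketch follows. The paths are assumptions based on the installation directories used in this chapter, and mysql-connector-java.jar stands for wherever your MySQL Connector/J jar lives:

```shell
# Assumed paths: Hive jars under /usr/local/hive/lib, Hadoop jars via 'hadoop classpath',
# and MySQL Connector/J as mysql-connector-java.jar in the current directory
javac -cp ".:/usr/local/hive/lib/*" HivetoMySQL.java
java -cp ".:/usr/local/hive/lib/*:$(/usr/local/hadoop/bin/hadoop classpath):mysql-connector-java.jar" HivetoMySQL
```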

Textbook page 221

```shell
mysql -u root -p
```

Textbook page 222

```mysql
mysql> use dblab;
mysql> select * from user_action limit 10;
```

```shell
cd /usr/local/hadoop
./sbin/start-all.sh
```

```shell
cd /usr/local/hbase
./bin/start-hbase.sh
```

Textbook page 223

```shell
cd /usr/local/bigdatacase/dataset
/usr/local/hadoop/bin/hdfs dfs -get /user/hive/warehouse/dblab.db/user_action .
# Copy the user_action data from HDFS to the local current directory; note that '.' means the current directory
cat ./user_action/* | head -10                  # view the first 10 lines of data
cat ./user_action/00000* > user_action.output   # merge the 00000* files into a copy named user_action.output; '*' is a wildcard
head user_action.output                         # view the first 10 lines of user_action.output
```
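
As a quick sanity check (a minimal sketch), the local line count should match the row count that `select count(*)` reported in Hive earlier, since user_action was filled from bigdata_user:

```shell
# Count the lines in the exported file; it should equal the table's row count
wc -l user_action.output
```
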
```java
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class ImportHBase extends Thread {
    public Configuration config;
    public Connection conn;
    public Table table;
    public Admin admin;

    public ImportHBase() {
        config = HBaseConfiguration.create();
        // config.set("hbase.master", "master:60000");
        // config.set("hbase.zookeeper.quorum", "master");
        try {
            conn = ConnectionFactory.createConnection(config);
            admin = conn.getAdmin();
            table = conn.getTable(TableName.valueOf("user_action"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        // the first argument (to hadoop jar) is the class to run; the second is the path to the data file
        if (args.length == 0) {
            throw new Exception("You must set input path!");
        }
        String fileName = args[args.length - 1]; // the input file path is the last argument
        ImportHBase test = new ImportHBase();
        test.importLocalFileToHBase(fileName);
    }

    public void importLocalFileToHBase(String fileName) {
        long st = System.currentTimeMillis();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(
                    fileName)));
            String line = null;
            int count = 0;
            while ((line = br.readLine()) != null) {
                count++;
                put(line);
                if (count % 10000 == 0)
                    System.out.println(count);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                table.close(); // must close the client
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        long en2 = System.currentTimeMillis();
        System.out.println("Total Time: " + (en2 - st) + " ms");
    }

    @SuppressWarnings("deprecation")
    public void put(String line) throws IOException {
        String[] arr = line.split("\t", -1);
        String[] column = {"id", "uid", "item_id", "behavior_type", "item_category", "date", "province"};

        if (arr.length == 7) {
            Put put = new Put(Bytes.toBytes(arr[0])); // rowkey
            for (int i = 1; i < arr.length; i++) {
                put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes(column[i]), Bytes.toBytes(arr[i]));
            }
            table.put(put); // put to server
        }
    }

    public void get(String rowkey, String columnFamily, String column,
            int versions) throws IOException {
        long st = System.currentTimeMillis();
        Get get = new Get(Bytes.toBytes(rowkey));
        get.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(column));
        Scan scanner = new Scan(get);
        scanner.readVersions(versions);
        ResultScanner rsScanner = table.getScanner(scanner);
        for (Result result : rsScanner) {
            final List<Cell> list = result.listCells();
            for (final Cell kv : list) {
                System.out.println(Bytes.toStringBinary(kv.getValueArray()) + "\t"
                        + kv.getTimestamp()); // mid + time
            }
        }
        rsScanner.close();
        long en2 = System.currentTimeMillis();
        System.out.println("Total Time: " + (en2 - st) + " ms");
    }
}
```
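
The run command below expects this class packaged as /usr/local/bigdatacase/hbase/ImportHBase.jar. If you need to rebuild that jar yourself, a minimal sketch, assuming the source file sits in /usr/local/bigdatacase/hbase and using `hbase classpath` to pull in the HBase client jars:

```shell
cd /usr/local/bigdatacase/hbase
# Compile against the HBase client libraries and package the class files into a jar
javac -cp "$(/usr/local/hbase/bin/hbase classpath)" ImportHBase.java
jar cf ImportHBase.jar ImportHBase*.class
```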

Textbook page 226

```hbase
hbase> truncate 'user_action'
```
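
`truncate` empties a table that already exists, so user_action must have been created in HBase beforehand. If it has not been, a minimal sketch that creates it with the 'f1' column family that ImportHBase writes to:

```shell
# Create the table with column family f1 (only needed if it does not exist yet)
echo "create 'user_action', 'f1'" | /usr/local/hbase/bin/hbase shell
```
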
```shell
/usr/local/hadoop/bin/hadoop jar /usr/local/bigdatacase/hbase/ImportHBase.jar ImportHBase /usr/local/bigdatacase/dataset/user_action.output
```

Textbook page 227

```hbase
hbase> scan 'user_action', {LIMIT=>10}   # view only the first 10 rows
```

Textbook page 229

```shell
sudo vim /etc/apt/sources.list
```

```
deb http://mirrors.aliyun.com/ubuntu/ xenial main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ xenial-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ xenial-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ xenial-backports main restricted universe multiverse
```

Textbook page 230

```shell
sudo apt-get update
```

```shell
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 51716619E084DAB9
```

```shell
sudo apt-get install r-base
```

```shell
R
```

```r
> q()
> install.packages('RMySQL')
```

```shell
sudo apt-get install libmariadb-client-lgpl-dev
```

```shell
R
```

Textbook page 231

```r
> install.packages('RMySQL')
```

Textbook page 232

```r
> install.packages('ggplot2')
> install.packages('devtools')
```

```shell
sudo apt-get install libssl-dev
sudo apt-get install libssh2-1-dev
sudo apt-get install libcurl4-openssl-dev
sudo apt-get install libxml2-dev
```

```r
> devtools::install_github('taiyun/recharts')
> ?sort
```

Textbook page 233

```shell
service mysql start
```

```shell
mysql -u root -p
```

```mysql
mysql> use dblab;
mysql> select * from user_action limit 10;
```

```r
> library(RMySQL)
> conn <- dbConnect(MySQL(),dbname='dblab',username='root',password='hadoop',host="127.0.0.1",port=3306)
> user_action <- dbGetQuery(conn,'select * from user_action')
```

Textbook page 234

```r
> summary(user_action$behavior_type)
> summary(as.numeric(user_action$behavior_type))
> library(ggplot2)
> ggplot(user_action,aes(as.numeric(behavior_type)))+geom_histogram()
> temp <- subset(user_action,as.numeric(behavior_type)==4)  # extract the subset of records with behavior_type 4 (purchases)
> count <- sort(table(temp$item_category),decreasing = T)   # sort by frequency
> print(count[1:10])  # show results 1 through 10
```

Textbook page 235

```r
> month <- substr(user_action$visit_date,6,7)  # extract the month from visit_date
> user_action <- cbind(user_action,month)      # add the month as a new column of user_action
> ggplot(user_action,aes(as.numeric(behavior_type),col=factor(month)))+geom_histogram()+facet_grid(.~month)
```

Textbook page 236

```r
> library(recharts)
> rel <- as.data.frame(table(temp$province))
> provinces <- rel$Var1
> x = c()
> for(n in provinces){
+     x[length(x)+1] = nrow(subset(temp,(province==n)))
+ }
> mapData <- data.frame(province=rel$Var1,count=x, stringsAsFactors=F)  # build the map data
> eMap(mapData, namevar=~province, datavar = ~count)  # draw the map of China
```