厦门大学林子雨编著《Spark编程基础(Python版,第2版)》教材中的命令行和代码(教材官网)
提供了教材中的所有章节的命令行和代码,可以直接复制粘贴去执行。
查看《Spark编程基础(Python版,第2版)》教材中的所有命令行和代码
sudo useradd -m hadoop -s /bin/bash
sudo passwd hadoop
sudo adduser hadoop sudo
sudo apt-get update
sudo apt-get install openssh-server
ssh localhost
cd ~/.ssh/ # 若没有该目录,请先执行一次ssh localhost
ssh-keygen -t rsa # 会有提示,都按回车即可
cat ./id_rsa.pub >> ./authorized_keys # 加入授权
cd /usr/lib
sudo mkdir jvm #创建/usr/lib/jvm目录用来存放JDK文件
cd ~ #进入hadoop用户的主目录
cd Downloads
sudo tar -zxvf ./jdk-8u371-linux-x64.tar.gz -C /usr/lib/jvm
vim ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_371
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
source ~/.bashrc
java -version
sudo tar -zxvf ~/Downloads/hadoop-3.3.5.tar.gz -C /usr/local # 解压到/usr/local中
cd /usr/local/
sudo mv ./hadoop-3.3.5/ ./hadoop # 将文件夹名改为hadoop
sudo chown -R hadoop:hadoop ./hadoop # 修改文件权限
cd /usr/local/hadoop
./bin/hadoop version
cd /usr/local/hadoop
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.5.jar
cd /usr/local/hadoop
mkdir input
cp ./etc/hadoop/*.xml ./input # 将配置文件复制到input目录下
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar grep ./input ./output 'dfs[a-z.]+'
cat ./output/* # 查看运行结果
rm -r ./output
修改以后,core-site.xml文件的内容如下:
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/usr/local/hadoop/tmp</value>
<description>Abase for other temporary directories.</description>
</property>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
需要修改配置文件hdfs-site.xml,修改后的内容如下:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/data</value>
</property>
</configuration>
cd /usr/local/hadoop
./bin/hdfs namenode -format
cd /usr/local/hadoop
./sbin/start-dfs.sh #start-dfs.sh是个完整的可执行文件,中间没有空格
jps
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir -p /user/hadoop
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir input #在HDFS中创建hadoop用户对应的input目录
./bin/hdfs dfs -put ./etc/hadoop/*.xml input #把本地文件复制到HDFS中
./bin/hdfs dfs -ls input
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.5.jar grep input output 'dfs[a-z.]+'
./bin/hdfs dfs -cat output/*
./bin/hdfs dfs -rm -r output # 删除 output 文件夹
cd /usr/local/hadoop
./sbin/stop-dfs.sh
export PATH=$PATH:/usr/local/hadoop/sbin
export PATH=$PATH:/usr/local/hadoop/sbin:/usr/local/hadoop/bin
sudo vim /etc/hostname
ifconfig
netstat -nr
sudo vim /etc/hosts
192.168.91.128 hadoop01
192.168.91.129 hadoop02
192.168.91.130 hadoop03
ping hadoop01 -c 3 # 只ping 3次就会停止,否则要按Ctrl+c中断ping命令
ping hadoop02 -c 3
ping hadoop03 -c 3
sudo apt-get install openssh-server
cd ~/.ssh # 如果没有该目录,先执行一次ssh localhost
rm ./id_rsa* # 删除之前生成的公钥和私钥(如果已经存在)
ssh-keygen -t rsa # 执行该命令后,遇到提示信息,一直按回车就可以
cat ./id_rsa.pub >> ./authorized_keys
scp ~/.ssh/id_rsa.pub hadoop@hadoop02:/home/hadoop/
scp ~/.ssh/id_rsa.pub hadoop@hadoop03:/home/hadoop/
mkdir ~/.ssh # 如果不存在该文件夹需先创建,若已存在,则忽略本命令
cat ~/id_rsa.pub >> ~/.ssh/authorized_keys
rm ~/id_rsa.pub # 用完以后就可以删掉
ssh hadoop02
ssh hadoop03
sudo tar -zxvf ~/Downloads/hadoop-3.3.5.tar.gz -C /usr/local # 解压到/usr/local中
cd /usr/local/
sudo mv ./hadoop-3.3.5/ ./hadoop # 将文件夹名改为hadoop
sudo chown -R hadoop:hadoop ./hadoop # 修改文件权限
export PATH=$PATH:/usr/local/hadoop/bin:/usr/local/hadoop/sbin
请把hadoop01节点中的core-site.xml文件修改为如下内容:
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop01:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/usr/local/hadoop/tmp</value>
<description>Abase for other temporary directories.</description>
</property>
</configuration>
把hadoop01节点中的hdfs-site.xml设置为如下内容:
<configuration>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop03:50090</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/data</value>
</property>
</configuration>
把mapred-site.xml文件配置成如下内容:
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop01:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop01:19888</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>
</configuration>
请把hadoop01节点中的yarn-site.xml文件配置成如下内容:
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop01</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
cd /usr/local/hadoop
sudo rm -r ./tmp # 删除 Hadoop 临时文件
sudo rm -r ./logs/* # 删除日志文件
cd /usr/local
tar -zcf ~/hadoop.master.tar.gz ./hadoop # 先压缩再复制
cd ~
scp ./hadoop.master.tar.gz hadoop02:/home/hadoop
scp ./hadoop.master.tar.gz hadoop03:/home/hadoop
cd ~
sudo rm -r /usr/local/hadoop # 删掉旧的(如果存在)
sudo tar -zxf ~/hadoop.master.tar.gz -C /usr/local
sudo chown -R hadoop /usr/local/hadoop
cd /usr/local/hadoop
./bin/hdfs namenode -format
cd /usr/local/hadoop
./sbin/start-dfs.sh
./sbin/start-yarn.sh
./sbin/mr-jobhistory-daemon.sh start historyserver
cd /usr/local/hadoop
./bin/hdfs dfsadmin -report
hdfs dfs -mkdir -p /user/hadoop #此前已经配置了PATH环境变量,所以不用路径全称
hdfs dfs -mkdir input
hdfs dfs -put /usr/local/hadoop/etc/hadoop/*.xml input
hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.5.jar grep input output 'dfs[a-z.]+'
stop-yarn.sh
stop-dfs.sh
mr-jobhistory-daemon.sh stop historyserver
sudo apt-get update
sudo apt-get install mysql-server
service mysql stop
service mysql start
mysql -u root -p
mysql> show variables like 'char%';
mysql> set character_set_server=utf8;
vim /etc/mysql/mysql.conf.d/mysqld.cnf
character_set_server=utf8
service mysql restart
mysql> show variables like 'char%';
cd ~/Downloads #假设安装文件放在这个目录下
sudo tar -zxvf kafka_2.12-3.5.1.tgz -C /usr/local
cd /usr/local
sudo mv kafka_2.12-3.5.1 kafka
sudo chown -R hadoop ./kafka
cd /usr/local/kafka
./bin/zookeeper-server-start.sh config/zookeeper.properties
cd /usr/local/kafka
./bin/kafka-server-start.sh config/server.properties
cd /usr/local/kafka
bin/kafka-server-start.sh config/server.properties &
cd /usr/local/kafka
./bin/kafka-topics.sh --create --bootstrap-server localhost:9092 \
  --replication-factor 1 --partitions 1 --topic wordsendertest
#这个Topic叫wordsendertest,9092是Kafka Broker默认的监听端口号(Kafka 3.0起kafka-topics.sh已移除--zookeeper选项,需通过--bootstrap-server连接Broker),--partitions是Topic里面的分区数,--replication-factor是备份的数量,在Kafka集群中使用,由于这里是单机版,所以不用备份
#可以用list列出所有创建的Topic,来查看上面创建的Topic是否存在
./bin/kafka-topics.sh --list --bootstrap-server localhost:9092
./bin/kafka-console-producer.sh --broker-list localhost:9092 \
  --topic wordsendertest
cd /usr/local/kafka
./bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 \
  --topic wordsendertest --from-beginning
https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/
cd /home/hadoop/Downloads #假设安装文件在这个目录下
sh ./Anaconda3-2023.07-2-Linux-x86_64.sh
cd /home/hadoop/Downloads #假设安装文件在这个目录下
sh ./Anaconda3-2023.07-2-Linux-x86_64.sh
vim ~/.condarc
channels:
  - defaults
show_channel_urls: true
channel_alias: https://mirrors.tuna.tsinghua.edu.cn/anaconda
default_channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/pro
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
  conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
conda create -n pyspark python=3.8
conda activate pyspark
>>> exit()