Hadoop Cluster Setup

1. Cluster Planning

       debian120            debian121                     debian122
HDFS   DataNode, NameNode   DataNode, SecondaryNameNode   DataNode
Yarn   NodeManager          NodeManager                   NodeManager, ResourceManager

2. Prerequisites

# SELinux
# Firewall
# Hostname resolution
# Java environment
# Time synchronization
# Passwordless SSH
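These need to be in place on all three nodes before anything is installed. A minimal sketch of the less obvious steps, assuming Debian hosts; the 192.168.1.x addresses are placeholders for your own network:

# Hostname resolution: every node must resolve all three hostnames
$ sudo vim /etc/hosts
192.168.1.120 debian120
192.168.1.121 debian121
192.168.1.122 debian122
# Java environment: the JDK path matches hadoop-env.sh below
$ sudo vim /etc/profile.d/my.sh
export JAVA_HOME=/opt/app/jdk1.8.0_212
export PATH=$PATH:$JAVA_HOME/bin
# Passwordless SSH: generate a key and push it to every node, self included
$ ssh-keygen -t rsa
$ ssh-copy-id debian120   # repeat for debian121 and debian122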

3. Zookeeper Cluster

$ tar -zxvf zookeeper-3.4.14.tar.gz -C /opt/app/
$ cd /opt/app/zookeeper-3.4.14
$ cp conf/zoo_sample.cfg conf/zoo.cfg
$ mkdir data logs
$ vim conf/zoo.cfg
dataDir=/opt/app/zookeeper-3.4.14/data
dataLogDir=/opt/app/zookeeper-3.4.14/logs
server.1=debian120:2888:3888
server.2=debian121:2888:3888
server.3=debian122:2888:3888
$ echo "1" > data/myid # 2/3
$ bin/zkServer.sh start # all
$ bin/zkServer.sh status # all
$ sudo vim /etc/profile.d/my.sh
export ZOOKEEPER_HOME=/opt/app/zookeeper-3.4.14
export PATH=$PATH:$ZOOKEEPER_HOME/bin
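
If the ensemble formed correctly, one node reports leader and the other two follower; connecting with the bundled CLI is a further check. The output below is illustrative of a healthy cluster:

$ bin/zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /opt/app/zookeeper-3.4.14/bin/../conf/zoo.cfg
Mode: follower
$ bin/zkCli.sh -server debian120:2181
[zk: debian120:2181(CONNECTED) 0] ls /
[zookeeper]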

4. Hadoop & Yarn Cluster

$ tar -zxvf hadoop-2.7.3.tar.gz -C /opt/app/
$ cd /opt/app/hadoop-2.7.3
$ mkdir -p hdfs/{datanode,namenode} tmpdata
$ cd /opt/app/hadoop-2.7.3/etc/hadoop

$ cp hadoop-env.sh hadoop-env.sh.bak
$ vim hadoop-env.sh
export JAVA_HOME=/opt/app/jdk1.8.0_212

$ cp yarn-env.sh yarn-env.sh.bak
$ vim yarn-env.sh
export JAVA_HOME=/opt/app/jdk1.8.0_212

$ cp hdfs-site.xml hdfs-site.xml.bak
$ vim hdfs-site.xml
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/opt/app/hadoop-2.7.3/hdfs/namenode</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/opt/app/hadoop-2.7.3/hdfs/datanode</value>
    </property>
    <property>
        <name>dfs.namenode.http-address</name>
        <value>debian120:50070</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>debian121:50090</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
</configuration>
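
Once the PATH changes below are in effect, an individual key can be sanity-checked against the effective configuration with hdfs getconf, e.g.:

$ hdfs getconf -confKey dfs.replication
2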

$ cp yarn-site.xml yarn-site.xml.bak
$ vim yarn-site.xml
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>debian122:8025</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>debian122:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>debian122:8050</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>debian122:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>debian122:8088</value>
    </property>
</configuration>

$ cp core-site.xml core-site.xml.bak
$ vim core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://debian120:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/app/hadoop-2.7.3/tmpdata</value>
    </property>
    <property>
        <name>io.file.buffer.size</name>
        <value>10240</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>debian120:2181,debian121:2181,debian122:2181</value>
    </property>
    <property>
        <name>ha.zookeeper.session-timeout.ms</name>
        <value>1000</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.groups</name>
        <value>*</value>
    </property>
</configuration>

$ cp mapred-site.xml.template mapred-site.xml
$ vim mapred-site.xml
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

$ cp slaves slaves.bak
$ vim slaves
debian120
debian121
debian122

$ sudo vim /etc/profile.d/my.sh
export HADOOP_HOME=/opt/app/hadoop-2.7.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
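
Everything so far was configured on one node only; the same /opt/app tree and the profile script must exist on debian121 and debian122 as well. A minimal sketch, assuming rsync is available and passwordless SSH is already set up:

$ for host in debian121 debian122; do rsync -a /opt/app/hadoop-2.7.3 ${host}:/opt/app/; done
# repeat the /etc/profile.d/my.sh edit on each node, then reload it
$ source /etc/profile.d/my.sh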

# Initialize the cluster (format the NameNode once, on debian120)
debian120$ hdfs namenode -format
# Start the HDFS cluster
debian120$ start-dfs.sh
Starting namenodes on [debian120]
debian120: starting namenode, logging to /opt/app/hadoop-2.7.3/logs/hadoop-liubin-namenode-debian120.out
debian120: starting datanode, logging to /opt/app/hadoop-2.7.3/logs/hadoop-liubin-datanode-debian120.out
debian121: starting datanode, logging to /opt/app/hadoop-2.7.3/logs/hadoop-liubin-datanode-debian121.out
debian122: starting datanode, logging to /opt/app/hadoop-2.7.3/logs/hadoop-liubin-datanode-debian122.out
Starting secondary namenodes [debian121]
debian121: starting secondarynamenode, logging to /opt/app/hadoop-2.7.3/logs/hadoop-liubin-secondarynamenode-debian121.out
# Start the Yarn cluster
debian122$ start-yarn.sh
starting yarn daemons
resourcemanager running as process 1562. Stop it first.
debian121: starting nodemanager, logging to /opt/app/hadoop-2.7.3/logs/yarn-liubin-nodemanager-debian121.out
debian120: starting nodemanager, logging to /opt/app/hadoop-2.7.3/logs/yarn-liubin-nodemanager-debian120.out
debian122: starting nodemanager, logging to /opt/app/hadoop-2.7.3/logs/yarn-liubin-nodemanager-debian122.out

debian120$ jps
3840 Jps
2026 QuorumPeerMain
3483 DataNode
3741 NodeManager
3373 NameNode
debian121$ jps
1523 DataNode
1720 NodeManager
1611 SecondaryNameNode
1196 QuorumPeerMain
1821 Jps
debian122$ jps
2017 NodeManager
2116 Jps
1399 DataNode
1176 QuorumPeerMain
1562 ResourceManager
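
With every daemon accounted for, a quick smoke test verifies that HDFS and Yarn work together: write a file into HDFS and run the bundled wordcount example (the /input and /output paths are arbitrary):

$ hdfs dfs -mkdir -p /input
$ hdfs dfs -put /etc/hosts /input/
$ yarn jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount /input /output
$ hdfs dfs -cat /output/part-r-00000

The web UIs configured above, http://debian120:50070 for HDFS and http://debian122:8088 for Yarn, are another quick way to confirm the cluster state.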
