1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.master;
20
21 import static org.junit.Assert.*;
22
23 import java.io.IOException;
24 import java.util.List;
25 import java.util.NavigableSet;
26 import java.util.Set;
27 import java.util.TreeSet;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.conf.Configuration;
32 import org.apache.hadoop.hbase.*;
33 import org.apache.hadoop.hbase.client.HTable;
34 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35 import org.apache.hadoop.hbase.testclassification.LargeTests;
36 import org.apache.hadoop.hbase.util.Bytes;
37 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
38 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
39 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
40 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
41 import org.apache.zookeeper.KeeperException;
42 import org.junit.Test;
43 import org.junit.experimental.categories.Category;
44
45
46
47
48 @Category(LargeTests.class)
49 public class TestRollingRestart {
50 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
51
52 @Test (timeout=500000)
53 public void testBasicRollingRestart() throws Exception {
54
55
56 final int NUM_MASTERS = 2;
57 final int NUM_RS = 3;
58 final int NUM_REGIONS_TO_CREATE = 20;
59
60 int expectedNumRS = 3;
61
62
63 log("Starting cluster");
64 Configuration conf = HBaseConfiguration.create();
65 conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
66 conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
67 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
68 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
69 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
70 log("Waiting for active/ready master");
71 cluster.waitForActiveAndReadyMaster();
72 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
73 null);
74 HMaster master = cluster.getMaster();
75
76
77 byte [] table = Bytes.toBytes("tableRestart");
78 byte [] family = Bytes.toBytes("family");
79 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
80 HTable ht = TEST_UTIL.createTable(table, family);
81 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
82 NUM_REGIONS_TO_CREATE);
83 numRegions += 1;
84 log("Waiting for no more RIT\n");
85 blockUntilNoRIT(zkw, master);
86 log("Disabling table\n");
87 TEST_UTIL.getHBaseAdmin().disableTable(table);
88 log("Waiting for no more RIT\n");
89 blockUntilNoRIT(zkw, master);
90 NavigableSet<String> regions = getAllOnlineRegions(cluster);
91 log("Verifying only catalog and namespace regions are assigned\n");
92 if (regions.size() != 2) {
93 for (String oregion : regions) log("Region still online: " + oregion);
94 }
95 assertEquals(2, regions.size());
96 log("Enabling table\n");
97 TEST_UTIL.getHBaseAdmin().enableTable(table);
98 log("Waiting for no more RIT\n");
99 blockUntilNoRIT(zkw, master);
100 log("Verifying there are " + numRegions + " assigned on cluster\n");
101 regions = getAllOnlineRegions(cluster);
102 assertRegionsAssigned(cluster, regions);
103 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
104
105
106 log("Adding a fourth RS");
107 RegionServerThread restarted = cluster.startRegionServer();
108 expectedNumRS++;
109 restarted.waitForServerOnline();
110 log("Additional RS is online");
111 log("Waiting for no more RIT");
112 blockUntilNoRIT(zkw, master);
113 log("Verifying there are " + numRegions + " assigned on cluster");
114 assertRegionsAssigned(cluster, regions);
115 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
116
117
118 List<MasterThread> masterThreads = cluster.getMasterThreads();
119 MasterThread activeMaster = null;
120 MasterThread backupMaster = null;
121 assertEquals(2, masterThreads.size());
122 if (masterThreads.get(0).getMaster().isActiveMaster()) {
123 activeMaster = masterThreads.get(0);
124 backupMaster = masterThreads.get(1);
125 } else {
126 activeMaster = masterThreads.get(1);
127 backupMaster = masterThreads.get(0);
128 }
129
130
131 log("Stopping backup master\n\n");
132 backupMaster.getMaster().stop("Stop of backup during rolling restart");
133 cluster.hbaseCluster.waitOnMaster(backupMaster);
134
135
136 log("Stopping primary master\n\n");
137 activeMaster.getMaster().stop("Stop of active during rolling restart");
138 cluster.hbaseCluster.waitOnMaster(activeMaster);
139
140
141 log("Restarting primary master\n\n");
142 activeMaster = cluster.startMaster();
143 cluster.waitForActiveAndReadyMaster();
144 master = activeMaster.getMaster();
145
146
147 log("Restarting backup master\n\n");
148 backupMaster = cluster.startMaster();
149
150 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
151
152
153
154
155 List<RegionServerThread> regionServers =
156 cluster.getLiveRegionServerThreads();
157 int num = 1;
158 int total = regionServers.size();
159 for (RegionServerThread rst : regionServers) {
160 ServerName serverName = rst.getRegionServer().getServerName();
161 log("Stopping region server " + num + " of " + total + " [ " +
162 serverName + "]");
163 rst.getRegionServer().stop("Stopping RS during rolling restart");
164 cluster.hbaseCluster.waitOnRegionServer(rst);
165 log("Waiting for RS shutdown to be handled by master");
166 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
167 log("RS shutdown done, waiting for no more RIT");
168 blockUntilNoRIT(zkw, master);
169 log("Verifying there are " + numRegions + " assigned on cluster");
170 assertRegionsAssigned(cluster, regions);
171 expectedNumRS--;
172 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
173 log("Restarting region server " + num + " of " + total);
174 restarted = cluster.startRegionServer();
175 restarted.waitForServerOnline();
176 expectedNumRS++;
177 log("Region server " + num + " is back online");
178 log("Waiting for no more RIT");
179 blockUntilNoRIT(zkw, master);
180 log("Verifying there are " + numRegions + " assigned on cluster");
181 assertRegionsAssigned(cluster, regions);
182 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
183 num++;
184 }
185 Thread.sleep(1000);
186 assertRegionsAssigned(cluster, regions);
187
188
189 RegionServerThread metaServer = getServerHostingMeta(cluster);
190 log("Stopping server hosting hbase:meta #1");
191 metaServer.getRegionServer().stop("Stopping hbase:meta server");
192 cluster.hbaseCluster.waitOnRegionServer(metaServer);
193 log("Meta server down #1");
194 expectedNumRS--;
195 log("Waiting for meta server #1 RS shutdown to be handled by master");
196 waitForRSShutdownToStartAndFinish(activeMaster,
197 metaServer.getRegionServer().getServerName());
198 log("Waiting for no more RIT");
199 long start = System.currentTimeMillis();
200 do {
201 blockUntilNoRIT(zkw, master);
202 } while (getNumberOfOnlineRegions(cluster) < numRegions
203 && System.currentTimeMillis()-start < 60000);
204 log("Verifying there are " + numRegions + " assigned on cluster");
205 assertRegionsAssigned(cluster, regions);
206 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
207
208
209 metaServer = getServerHostingMeta(cluster);
210 log("Stopping server hosting hbase:meta #2");
211 metaServer.getRegionServer().stop("Stopping hbase:meta server");
212 cluster.hbaseCluster.waitOnRegionServer(metaServer);
213 log("Meta server down");
214 expectedNumRS--;
215 log("Waiting for RS shutdown to be handled by master");
216 waitForRSShutdownToStartAndFinish(activeMaster,
217 metaServer.getRegionServer().getServerName());
218 log("RS shutdown done, waiting for no more RIT");
219 blockUntilNoRIT(zkw, master);
220 log("Verifying there are " + numRegions + " assigned on cluster");
221 assertRegionsAssigned(cluster, regions);
222 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
223
224
225 cluster.startRegionServer().waitForServerOnline();
226 cluster.startRegionServer().waitForServerOnline();
227 cluster.startRegionServer().waitForServerOnline();
228 Thread.sleep(1000);
229 log("Waiting for no more RIT");
230 blockUntilNoRIT(zkw, master);
231 log("Verifying there are " + numRegions + " assigned on cluster");
232 assertRegionsAssigned(cluster, regions);
233
234 metaServer = getServerHostingMeta(cluster);
235 log("Stopping server hosting hbase:meta (1 of 3)");
236 metaServer.getRegionServer().stop("Stopping hbase:meta server");
237 cluster.hbaseCluster.waitOnRegionServer(metaServer);
238 log("Meta server down (1 of 3)");
239 log("Waiting for RS shutdown to be handled by master");
240 waitForRSShutdownToStartAndFinish(activeMaster,
241 metaServer.getRegionServer().getServerName());
242 log("RS shutdown done, waiting for no more RIT");
243 blockUntilNoRIT(zkw, master);
244 log("Verifying there are " + numRegions + " assigned on cluster");
245 assertRegionsAssigned(cluster, regions);
246
247
248 metaServer = getServerHostingMeta(cluster);
249 log("Stopping server hosting hbase:meta (2 of 3)");
250 metaServer.getRegionServer().stop("Stopping hbase:meta server");
251 cluster.hbaseCluster.waitOnRegionServer(metaServer);
252 log("Meta server down (2 of 3)");
253 log("Waiting for RS shutdown to be handled by master");
254 waitForRSShutdownToStartAndFinish(activeMaster,
255 metaServer.getRegionServer().getServerName());
256 log("RS shutdown done, waiting for no more RIT");
257 blockUntilNoRIT(zkw, master);
258 log("Verifying there are " + numRegions + " assigned on cluster");
259 assertRegionsAssigned(cluster, regions);
260
261
262 metaServer = getServerHostingMeta(cluster);
263 log("Stopping server hosting hbase:meta (3 of 3)");
264 metaServer.getRegionServer().stop("Stopping hbase:meta server");
265 cluster.hbaseCluster.waitOnRegionServer(metaServer);
266 log("Meta server down (3 of 3)");
267 log("Waiting for RS shutdown to be handled by master");
268 waitForRSShutdownToStartAndFinish(activeMaster,
269 metaServer.getRegionServer().getServerName());
270 log("RS shutdown done, waiting for no more RIT");
271 blockUntilNoRIT(zkw, master);
272 log("Verifying there are " + numRegions + " assigned on cluster");
273 assertRegionsAssigned(cluster, regions);
274
275 if (cluster.getRegionServerThreads().size() != 1) {
276 log("Online regionservers:");
277 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
278 log("RS: " + rst.getRegionServer().getServerName());
279 }
280 }
281 assertEquals(2, cluster.getRegionServerThreads().size());
282
283
284
285
286 ht.close();
287
288 TEST_UTIL.shutdownMiniCluster();
289 }
290
291 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
292 throws KeeperException, InterruptedException {
293 ZKAssign.blockUntilNoRIT(zkw);
294 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
295 }
296
297 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
298 ServerName serverName) throws InterruptedException {
299 ServerManager sm = activeMaster.getMaster().getServerManager();
300
301 while (!sm.getDeadServers().isDeadServer(serverName)) {
302 log("Waiting for [" + serverName + "] to be listed as dead in master");
303 Thread.sleep(1);
304 }
305 log("Server [" + serverName + "] marked as dead, waiting for it to " +
306 "finish dead processing");
307 while (sm.areDeadServersInProgress()) {
308 log("Server [" + serverName + "] still being processed, waiting");
309 Thread.sleep(100);
310 }
311 log("Server [" + serverName + "] done with server shutdown processing");
312 }
313
314 private void log(String msg) {
315 LOG.debug("\n\nTRR: " + msg + "\n");
316 }
317
318 private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster)
319 throws IOException {
320 return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
321 }
322
323 private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
324 HRegionInfo region) throws IOException {
325 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
326 if (ProtobufUtil.getOnlineRegions(rst.getRegionServer()).contains(region)) {
327 return rst;
328 }
329 }
330 return null;
331 }
332
333 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
334 int numFound = 0;
335 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
336 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
337 }
338 return numFound;
339 }
340
341 private void assertRegionsAssigned(MiniHBaseCluster cluster,
342 Set<String> expectedRegions) throws IOException {
343 int numFound = getNumberOfOnlineRegions(cluster);
344 if (expectedRegions.size() > numFound) {
345 log("Expected to find " + expectedRegions.size() + " but only found"
346 + " " + numFound);
347 NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
348 for (String region : expectedRegions) {
349 if (!foundRegions.contains(region)) {
350 log("Missing region: " + region);
351 }
352 }
353 assertEquals(expectedRegions.size(), numFound);
354 } else if (expectedRegions.size() < numFound) {
355 int doubled = numFound - expectedRegions.size();
356 log("Expected to find " + expectedRegions.size() + " but found"
357 + " " + numFound + " (" + doubled + " double assignments?)");
358 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
359 for (String region : doubleRegions) {
360 log("Region is double assigned: " + region);
361 }
362 assertEquals(expectedRegions.size(), numFound);
363 } else {
364 log("Success! Found expected number of " + numFound + " regions");
365 }
366 }
367
368 private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster)
369 throws IOException {
370 NavigableSet<String> online = new TreeSet<String>();
371 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
372 for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
373 online.add(region.getRegionNameAsString());
374 }
375 }
376 return online;
377 }
378
379 private NavigableSet<String> getDoubleAssignedRegions(
380 MiniHBaseCluster cluster) throws IOException {
381 NavigableSet<String> online = new TreeSet<String>();
382 NavigableSet<String> doubled = new TreeSet<String>();
383 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
384 for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
385 if(!online.add(region.getRegionNameAsString())) {
386 doubled.add(region.getRegionNameAsString());
387 }
388 }
389 }
390 return doubled;
391 }
392
393
394 }
395