1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.HashMap;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27 import java.util.TreeSet;
28
29 import org.apache.hadoop.hbase.classification.InterfaceAudience;
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.hadoop.hbase.ClusterManager.ServiceType;
32 import org.apache.hadoop.hbase.client.HBaseAdmin;
33 import org.apache.hadoop.hbase.client.HConnection;
34 import org.apache.hadoop.hbase.client.HConnectionManager;
35 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40 import org.apache.hadoop.hbase.util.Bytes;
41 import org.apache.hadoop.hbase.util.Threads;
42
43 import com.google.common.collect.Sets;
44
45
46
47
48
49 @InterfaceAudience.Private
50 public class DistributedHBaseCluster extends HBaseCluster {
51
52 private HBaseAdmin admin;
53
54 private ClusterManager clusterManager;
55
56 public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
57 throws IOException {
58 super(conf);
59 this.clusterManager = clusterManager;
60 this.admin = new HBaseAdmin(conf);
61 this.initialClusterStatus = getClusterStatus();
62 }
63
64 public void setClusterManager(ClusterManager clusterManager) {
65 this.clusterManager = clusterManager;
66 }
67
68 public ClusterManager getClusterManager() {
69 return clusterManager;
70 }
71
72
73
74
75
76 @Override
77 public ClusterStatus getClusterStatus() throws IOException {
78 return admin.getClusterStatus();
79 }
80
81 @Override
82 public ClusterStatus getInitialClusterStatus() throws IOException {
83 return initialClusterStatus;
84 }
85
86 @Override
87 public void close() throws IOException {
88 if (this.admin != null) {
89 admin.close();
90 }
91 }
92
93 @Override
94 public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
95 throws IOException {
96 return admin.getConnection().getAdmin(serverName);
97 }
98
99 @Override
100 public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
101 throws IOException {
102 return admin.getConnection().getClient(serverName);
103 }
104
105 @Override
106 public void startRegionServer(String hostname, int port) throws IOException {
107 LOG.info("Starting RS on: " + hostname);
108 clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
109 }
110
111 @Override
112 public void killRegionServer(ServerName serverName) throws IOException {
113 LOG.info("Aborting RS: " + serverName.getServerName());
114 clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
115 serverName.getHostname(),
116 serverName.getPort());
117 }
118
119 @Override
120 public void stopRegionServer(ServerName serverName) throws IOException {
121 LOG.info("Stopping RS: " + serverName.getServerName());
122 clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
123 serverName.getHostname(),
124 serverName.getPort());
125 }
126
127 @Override
128 public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
129 waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
130 }
131
132 private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
133 throws IOException {
134 LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
135 long start = System.currentTimeMillis();
136
137 while ((System.currentTimeMillis() - start) < timeout) {
138 if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
139 return;
140 }
141 Threads.sleep(1000);
142 }
143 throw new IOException("did timeout waiting for service to stop:" + serverName);
144 }
145
146 @Override
147 public MasterService.BlockingInterface getMaster()
148 throws IOException {
149 HConnection conn = HConnectionManager.getConnection(conf);
150 return conn.getMaster();
151 }
152
153 @Override
154 public void startMaster(String hostname, int port) throws IOException {
155 LOG.info("Starting Master on: " + hostname + ":" + port);
156 clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
157 }
158
159 @Override
160 public void killMaster(ServerName serverName) throws IOException {
161 LOG.info("Aborting Master: " + serverName.getServerName());
162 clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
163 }
164
165 @Override
166 public void stopMaster(ServerName serverName) throws IOException {
167 LOG.info("Stopping Master: " + serverName.getServerName());
168 clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
169 }
170
171 @Override
172 public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
173 waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
174 }
175
176 @Override
177 public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
178 long start = System.currentTimeMillis();
179 while (System.currentTimeMillis() - start < timeout) {
180 try {
181 getMaster();
182 return true;
183 } catch (MasterNotRunningException m) {
184 LOG.warn("Master not started yet " + m);
185 } catch (ZooKeeperConnectionException e) {
186 LOG.warn("Failed to connect to ZK " + e);
187 }
188 Threads.sleep(1000);
189 }
190 return false;
191 }
192
193 @Override
194 public ServerName getServerHoldingRegion(byte[] regionName) throws IOException {
195 HConnection connection = admin.getConnection();
196 HRegionLocation regionLoc = connection.locateRegion(regionName);
197 if (regionLoc == null) {
198 LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName)
199 + " for table " + HRegionInfo.getTableName(regionName) + ", start key [" +
200 Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
201 return null;
202 }
203
204 AdminProtos.AdminService.BlockingInterface client =
205 connection.getAdmin(regionLoc.getServerName());
206 ServerInfo info = ProtobufUtil.getServerInfo(client);
207 return ProtobufUtil.toServerName(info.getServerName());
208 }
209
210 @Override
211 public void waitUntilShutDown() {
212
213 throw new RuntimeException("Not implemented yet");
214 }
215
216 @Override
217 public void shutdown() throws IOException {
218
219 throw new RuntimeException("Not implemented yet");
220 }
221
222 @Override
223 public boolean isDistributedCluster() {
224 return true;
225 }
226
227 @Override
228 public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
229 ClusterStatus current = getClusterStatus();
230
231 LOG.info("Restoring cluster - started");
232
233
234 boolean success = true;
235 success = restoreMasters(initial, current) & success;
236 success = restoreRegionServers(initial, current) & success;
237 success = restoreAdmin() & success;
238
239 LOG.info("Restoring cluster - done");
240 return success;
241 }
242
243 protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
244 List<IOException> deferred = new ArrayList<IOException>();
245
246 final ServerName initMaster = initial.getMaster();
247 if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
248 LOG.info("Restoring cluster - Initial active master : "
249 + initMaster.getHostAndPort()
250 + " has changed to : "
251 + current.getMaster().getHostAndPort());
252
253
254 try {
255 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
256 initMaster.getHostname(), initMaster.getPort())) {
257 LOG.info("Restoring cluster - starting initial active master at:"
258 + initMaster.getHostAndPort());
259 startMaster(initMaster.getHostname(), initMaster.getPort());
260 }
261
262
263
264
265
266 for (ServerName currentBackup : current.getBackupMasters()) {
267 if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
268 LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
269 stopMaster(currentBackup);
270 }
271 }
272 LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
273 stopMaster(current.getMaster());
274 waitForActiveAndReadyMaster();
275 } catch (IOException ex) {
276
277
278 deferred.add(ex);
279 }
280
281
282 for (ServerName backup : initial.getBackupMasters()) {
283 try {
284
285 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
286 backup.getHostname(),
287 backup.getPort())) {
288 LOG.info("Restoring cluster - starting initial backup master: "
289 + backup.getHostAndPort());
290 startMaster(backup.getHostname(), backup.getPort());
291 }
292 } catch (IOException ex) {
293 deferred.add(ex);
294 }
295 }
296 } else {
297
298 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
299 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
300 toStart.addAll(initial.getBackupMasters());
301 toKill.addAll(current.getBackupMasters());
302
303 for (ServerName server : current.getBackupMasters()) {
304 toStart.remove(server);
305 }
306 for (ServerName server: initial.getBackupMasters()) {
307 toKill.remove(server);
308 }
309
310 for (ServerName sn:toStart) {
311 try {
312 if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
313 LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
314 startMaster(sn.getHostname(), sn.getPort());
315 }
316 } catch (IOException ex) {
317 deferred.add(ex);
318 }
319 }
320
321 for (ServerName sn:toKill) {
322 try {
323 if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
324 LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
325 stopMaster(sn);
326 }
327 } catch (IOException ex) {
328 deferred.add(ex);
329 }
330 }
331 }
332 if (!deferred.isEmpty()) {
333 LOG.warn("Restoring cluster - restoring region servers reported "
334 + deferred.size() + " errors:");
335 for (int i=0; i<deferred.size() && i < 3; i++) {
336 LOG.warn(deferred.get(i));
337 }
338 }
339
340 return deferred.isEmpty();
341 }
342
343
344 private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
345 @Override
346 public int compare(ServerName o1, ServerName o2) {
347 int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
348 if (compare != 0) return compare;
349 compare = o1.getPort() - o2.getPort();
350 if (compare != 0) return compare;
351 return 0;
352 }
353 }
354
355 protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
356 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
357 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
358 toStart.addAll(initial.getBackupMasters());
359 toKill.addAll(current.getBackupMasters());
360
361 for (ServerName server : current.getServers()) {
362 toStart.remove(server);
363 }
364 for (ServerName server: initial.getServers()) {
365 toKill.remove(server);
366 }
367
368 List<IOException> deferred = new ArrayList<IOException>();
369
370 for(ServerName sn:toStart) {
371 try {
372 if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
373 sn.getHostname(),
374 sn.getPort())) {
375 LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
376 startRegionServer(sn.getHostname(), sn.getPort());
377 }
378 } catch (IOException ex) {
379 deferred.add(ex);
380 }
381 }
382
383 for(ServerName sn:toKill) {
384 try {
385 if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
386 sn.getHostname(),
387 sn.getPort())) {
388 LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
389 stopRegionServer(sn);
390 }
391 } catch (IOException ex) {
392 deferred.add(ex);
393 }
394 }
395 if (!deferred.isEmpty()) {
396 LOG.warn("Restoring cluster - restoring region servers reported "
397 + deferred.size() + " errors:");
398 for (int i=0; i<deferred.size() && i < 3; i++) {
399 LOG.warn(deferred.get(i));
400 }
401 }
402
403 return deferred.isEmpty();
404 }
405
406 protected boolean restoreAdmin() throws IOException {
407
408
409
410
411 try {
412 admin.close();
413 } catch (IOException ioe) {
414 LOG.warn("While closing the old connection", ioe);
415 }
416 this.admin = new HBaseAdmin(conf);
417 LOG.info("Added new HBaseAdmin");
418 return true;
419 }
420 }