/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionLoad;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.util.Triple;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.ServiceException;
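/**
 * The ServerManager class manages info about region servers.
 * <p>
 * Maintains lists of online and dead servers.  Processes the startups,
 * shutdowns, and deaths of region servers.
 * <p>
 * Servers are distinguished in two different ways.  A given server has a
 * location, specified by hostname and port, and of which there can only be one
 * online at any given time.  A server instance is specified by the location
 * (hostname and port) as well as the startcode (timestamp from when the server
 * was started).  This is used to differentiate a restarted instance of a given
 * server from the original instance.
 */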
@InterfaceAudience.Private
public class ServerManager {
  public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
      "hbase.master.wait.on.regionservers.maxtostart";

  public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
      "hbase.master.wait.on.regionservers.mintostart";

  public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
      "hbase.master.wait.on.regionservers.timeout";

  public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
      "hbase.master.wait.on.regionservers.interval";

  private static final Log LOG = LogFactory.getLog(ServerManager.class);
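  /** Set if we are shutting down the whole cluster; see {@link #shutdownCluster()}. */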
  private volatile boolean clusterShutdown = false;

  /** Map of encoded region name to the last flushed sequence id reported for that region. */
  private final SortedMap<byte[], Long> flushedSequenceIdByRegion =
    new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
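  /** Map of registered servers to their current load. */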
  private final ConcurrentHashMap<ServerName, ServerLoad> onlineServers =
    new ConcurrentHashMap<ServerName, ServerLoad>();
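  /**
   * Map of admin interfaces per registered regionserver; these interfaces we use to control
   * regionservers out on the cluster.
   */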
  private final Map<ServerName, AdminService.BlockingInterface> rsAdmins =
    new HashMap<ServerName, AdminService.BlockingInterface>();
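  /** List of region servers that should not get any more new regions. */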
  private final ArrayList<ServerName> drainingServers =
    new ArrayList<ServerName>();

  private final Server master;
  private final MasterServices services;
  private final HConnection connection;

  private final DeadServer deadservers = new DeadServer();

  /** Maximum and warning thresholds, in ms, for clock skew between master and region server. */
  private final long maxSkew;
  private final long warningSkew;

  /** Produces the retry counters used when pinging region servers in isServerReachable(). */
  private final RetryCounterFactory pingRetryCounterFactory;
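  /**
   * Set of region servers which are dead but not processed immediately. If a
   * server dies before the master enables the ServerShutdownHandler, the server is
   * added to this set and is processed later through
   * {@link ServerManager#processQueuedDeadServers()}.
   */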
  private Set<ServerName> queuedDeadServers = new HashSet<ServerName>();
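  /**
   * Map of region servers which are dead and submitted to ServerShutdownHandler but
   * could not be fully processed immediately. If a server dies before the
   * AssignmentManager finishes the failover cleanup, it is parked here and handled
   * later by {@link ServerManager#processQueuedDeadServers()}. The Boolean value
   * indicates whether log splitting is needed inside ServerShutdownHandler.
   */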
  private Map<ServerName, Boolean> requeuedDeadServers
    = new ConcurrentHashMap<ServerName, Boolean>();
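  /** Listeners that are called on server events. */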
  private List<ServerListener> listeners = new CopyOnWriteArrayList<ServerListener>();
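  /**
   * Constructor.
   * @param master
   * @param services
   * @throws ZooKeeperConnectionException
   */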
  public ServerManager(final Server master, final MasterServices services)
      throws IOException {
    this(master, services, true);
  }

  @SuppressWarnings("deprecation")
  ServerManager(final Server master, final MasterServices services,
      final boolean connect) throws IOException {
    this.master = master;
    this.services = services;
    Configuration c = master.getConfiguration();
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
    this.connection = connect ? HConnectionManager.getConnection(c) : null;
    int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.maximum.ping.server.attempts", 10));
    int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.ping.server.retry.sleep.interval", 100));
    this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
  }
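  /**
   * Add the listener to the notification list.
   * @param listener The ServerListener to register
   */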
  public void registerListener(final ServerListener listener) {
    this.listeners.add(listener);
  }
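  /**
   * Remove the listener from the notification list.
   * @param listener The ServerListener to unregister
   * @return true if the listener was registered
   */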
  public boolean unregisterListener(final ServerListener listener) {
    return this.listeners.remove(listener);
  }
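  /**
   * Let the server manager know a new regionserver has come online.
   * @param ia The remote address
   * @param port The remote port
   * @param serverStartcode
   * @param serverCurrentTime The current time of the region server in ms
   * @return The ServerName we know this server as.
   * @throws IOException
   */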
  ServerName regionServerStartup(final InetAddress ia, final int port,
      final long serverStartcode, long serverCurrentTime)
      throws IOException {
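    // Reject the startup if this server is known to be dead (its shutdown is
    // still being processed), or if its clock is too far out of sync with the
    // master. Otherwise record it, expiring any older instance registered on
    // the same host and port.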
    ServerName sn = ServerName.valueOf(ia.getHostName(), port, serverStartcode);
    checkClockSkew(sn, serverCurrentTime);
    checkIsDead(sn, "STARTUP");
    if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
      LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
        + " could not record the server: " + sn);
    }
    return sn;
  }
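  /**
   * Updates last flushed sequence ids for the regions on server sn.
   * @param sn
   * @param hsl
   */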
  private void updateLastFlushedSequenceIds(ServerName sn, ServerLoad hsl) {
    Map<byte[], RegionLoad> regionsLoad = hsl.getRegionsLoad();
    for (Entry<byte[], RegionLoad> entry : regionsLoad.entrySet()) {
      byte[] encodedRegionName = Bytes.toBytes(HRegionInfo.encodeRegionName(entry.getKey()));
      Long existingValue = flushedSequenceIdByRegion.get(encodedRegionName);
      long l = entry.getValue().getCompleteSequenceId();
      if (existingValue != null) {
        if (l != -1 && l < existingValue) {
          LOG.warn("RegionServer " + sn +
            " indicates a last flushed sequence id (" + l +
            ") that is less than the previous last flushed sequence id (" +
            existingValue + ") for region " +
            Bytes.toString(entry.getKey()) + ". Ignoring.");
          continue;
        }
      }
      flushedSequenceIdByRegion.put(encodedRegionName, l);
    }
  }

  void regionServerReport(ServerName sn,
      ServerLoad sl) throws YouAreDeadException {
    checkIsDead(sn, "REPORT");
    if (null == this.onlineServers.replace(sn, sl)) {
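      // The report is from a server that is not registered as online yet, e.g.
      // the master joined an already-running cluster or the report raced with
      // the startup RPC. Try to record it as a new server; if another instance
      // with the same host and port but a newer start code is already
      // registered, ignore this report.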
      if (!checkAndRecordNewServer(sn, sl)) {
        LOG.info("RegionServerReport ignored, could not record the server: " + sn);
        return;
      }
    }
    updateLastFlushedSequenceIds(sn, sl);
  }
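  /**
   * Checks if a server with the same host name and port is already registered. If not, or if
   * the registered instance has a smaller start code, records the new server.
   * @param serverName the server to check and record
   * @param sl the server load on the server
   * @return true if the server is recorded, otherwise false
   */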
  boolean checkAndRecordNewServer(
      final ServerName serverName, final ServerLoad sl) {
    ServerName existingServer = null;
    synchronized (this.onlineServers) {
      existingServer = findServerWithSameHostnamePortWithLock(serverName);
      if (existingServer != null && (existingServer.getStartcode() > serverName.getStartcode())) {
        LOG.info("Server serverName=" + serverName + " rejected; we already have "
          + existingServer.toString() + " registered with same hostname and port");
        return false;
      }
      recordNewServerWithLock(serverName, sl);
    }

    // Tell our listeners that a server was added
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverAdded(serverName);
      }
    }

    // If an existing instance has an older start code, it is a stale instance
    // of the same server; trigger its expiration so recovery can run.
    if (existingServer != null && (existingServer.getStartcode() < serverName.getStartcode())) {
      LOG.info("Triggering server recovery; existingServer " +
        existingServer + " looks stale, new server:" + serverName);
      expireServer(existingServer);
    }
    return true;
  }
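  /**
   * Checks the clock skew between the server and the master. If the skew exceeds the
   * configured maximum, the server is rejected; if it only exceeds the warning threshold,
   * a warning is logged and the server starts normally.
   * @param serverName Incoming server's name
   * @param serverCurrentTime the current time, in ms, reported by the server
   * @throws ClockOutOfSyncException if the skew exceeds the configured max value
   */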
  private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
      throws ClockOutOfSyncException {
    long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
    if (skew > maxSkew) {
      String message = "Server " + serverName + " has been " +
        "rejected; Reported time is too far out of sync with master. " +
        "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
      LOG.warn(message);
      throw new ClockOutOfSyncException(message);
    } else if (skew > warningSkew) {
      String message = "Reported time for server " + serverName + " is out of sync with master " +
        "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
        "error threshold is " + maxSkew + "ms)";
      LOG.warn(message);
    }
  }
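  /**
   * If this server is on the dead list, reject it with a YouAreDeadException.
   * If it was dead but came back with a new start code, remove the old entry
   * from the dead list.
   * @param serverName
   * @param what STARTUP or REPORT
   * @throws YouAreDeadException
   */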
  private void checkIsDead(final ServerName serverName, final String what)
      throws YouAreDeadException {
    if (this.deadservers.isDeadServer(serverName)) {
      // Host name, port and start code all match an entry on the dead servers
      // list: this is the same instance we are still processing as dead.
      String message = "Server " + what + " rejected; currently processing " +
        serverName + " as dead server";
      LOG.debug(message);
      throw new YouAreDeadException(message);
    }

    // Once the master is initialized, clean out any dead-list entry with the
    // same host name and port as the newly checking-in server.
    if ((this.services == null || ((HMaster) this.services).isInitialized())
        && this.deadservers.cleanPreviousInstance(serverName)) {
      // This server has a new start code, so the old instance is gone for
      // good; it no longer needs to be kept on the dead servers list.
      LOG.debug(what + ":" + " Server " + serverName + " came back up," +
        " removed it from the dead servers list");
    }
  }
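  /**
   * Assumes onlineServers is locked.
   * @return ServerName with matching hostname and port, or null if none found.
   */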
  private ServerName findServerWithSameHostnamePortWithLock(
      final ServerName serverName) {
    for (ServerName sn: this.onlineServers.keySet()) {
      if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
    }
    return null;
  }
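  /**
   * Adds the server to the online servers list. Assumes onlineServers is locked.
   * @param serverName The remote server's name.
   * @param sl the load reported by the server
   */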
  @VisibleForTesting
  void recordNewServerWithLock(final ServerName serverName, final ServerLoad sl) {
    LOG.info("Registering server=" + serverName);
    this.onlineServers.put(serverName, sl);
    this.rsAdmins.remove(serverName);
  }

  /** @return the last flushed sequence id for the region, or -1 if none is known. */
  public long getLastFlushedSequenceId(byte[] encodedRegionName) {
    Long seqId = flushedSequenceIdByRegion.get(encodedRegionName);
    return seqId == null ? -1L : seqId.longValue();
  }
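  /**
   * @param serverName
   * @return ServerLoad if serverName is known, else null
   */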
  public ServerLoad getLoad(final ServerName serverName) {
    return this.onlineServers.get(serverName);
  }
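  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
   * regions being served, ignoring stats about number of requests.
   * @return the average load
   */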
  public double getAverageLoad() {
    int totalLoad = 0;
    int numServers = 0;
    for (ServerLoad sl: this.onlineServers.values()) {
      numServers++;
      totalLoad += sl.getNumberOfRegions();
    }
    // Guard against division by zero when no servers have checked in yet.
    return numServers == 0 ? 0.0 : (double)totalLoad / (double)numServers;
  }
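  /** @return the count of active regionservers */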
  int countOfRegionServers() {
    // Presumes onlineServers is a concurrent map
    return this.onlineServers.size();
  }
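  /** @return Read-only map of servers to serverinfo */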
  public Map<ServerName, ServerLoad> getOnlineServers() {
    // Presumption is that iterating the returned Map is OK.
    synchronized (this.onlineServers) {
      return Collections.unmodifiableMap(this.onlineServers);
    }
  }
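  /** @return the object tracking dead and currently-processing servers */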
  public DeadServer getDeadServers() {
    return this.deadservers;
  }
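  /**
   * Checks if any dead servers are currently in progress.
   * @return true if any RS are being processed as dead, false if not
   */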
  public boolean areDeadServersInProgress() {
    return this.deadservers.areDeadServersInProgress();
  }

  /** Waits for all currently online region servers to check out, i.e. for onlineServers to empty. */
  void letRegionServersShutdown() {
    long previousLogTime = 0;
    int onlineServersCt;
    while ((onlineServersCt = onlineServers.size()) > 0) {
      // Log at most once per second while waiting.
      if (System.currentTimeMillis() > (previousLogTime + 1000)) {
        StringBuilder sb = new StringBuilder();
        // It's ok here to not sync on onlineServers - merely logging.
        for (ServerName key : this.onlineServers.keySet()) {
          if (sb.length() > 0) {
            sb.append(", ");
          }
          sb.append(key);
        }
        LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
        previousLogTime = System.currentTimeMillis();
      }

      synchronized (onlineServers) {
        try {
          // Only wait if no server was removed while the log message was built.
          if (onlineServersCt == onlineServers.size()) onlineServers.wait(100);
        } catch (InterruptedException ignored) {
          // continue
        }
      }
    }
  }
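  /**
   * Expires the passed server. Adds it to the list of dead servers and queues a
   * shutdown processing. If the ServerShutdownHandler is not enabled yet, the
   * server is queued for later processing instead.
   */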
  public synchronized void expireServer(final ServerName serverName) {
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("ServerShutdownHandler is not enabled while the master is initializing; "
        + "delaying expiration of server " + serverName);
      this.queuedDeadServers.add(serverName);
      return;
    }
    if (this.deadservers.isDeadServer(serverName)) {
      // The shutdown of this server is already being processed; nothing to do.
      LOG.warn("Expiration of " + serverName +
        " but server shutdown already in progress");
      return;
    }
    synchronized (onlineServers) {
      if (!this.onlineServers.containsKey(serverName)) {
        LOG.warn("Expiration of " + serverName + " but server not online");
      }
      // Remove the server from the known servers lists and update load info BUT
      // add to deadservers first; do this so it'll show in dead servers list if
      // not in online servers list.
      this.deadservers.add(serverName);
      this.onlineServers.remove(serverName);
      onlineServers.notifyAll();
    }
    this.rsAdmins.remove(serverName);

    // If cluster is going down, yes, servers are going to be expiring; don't
    // process as a dead server.
    if (this.clusterShutdown) {
      LOG.info("Cluster shutdown set; " + serverName +
        " expired; onlineServers=" + this.onlineServers.size());
      if (this.onlineServers.isEmpty()) {
        master.stop("Cluster shutdown set; onlineServer=0");
      }
      return;
    }

    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
    if (carryingMeta) {
      this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName));
    } else {
      this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName, true));
    }
    LOG.debug("Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta);

    // Tell our listeners that a server was removed
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverRemoved(serverName);
      }
    }
  }

  public synchronized void processDeadServer(final ServerName serverName) {
    this.processDeadServer(serverName, false);
  }

  public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitHlog) {
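    // While the assignment manager is cleaning up the zookeeper nodes and
    // rebuilding the in-memory region states, it is better to wait until that
    // cleanup is done before handling a dead server; otherwise the shutdown
    // handler could race with the failover cleanup. So the dead server is
    // parked in requeuedDeadServers and handled later by
    // processQueuedDeadServers().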
    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      requeuedDeadServers.put(serverName, shouldSplitHlog);
      return;
    }

    this.deadservers.add(serverName);
    this.services.getExecutorService().submit(
      new ServerShutdownHandler(this.master, this.services, this.deadservers, serverName,
        shouldSplitHlog));
  }
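  /**
   * Process the servers which died during the master's initialization: first
   * the queued dead servers are expired, then the requeued dead servers are
   * resubmitted for shutdown processing.
   */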
  synchronized void processQueuedDeadServers() {
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("Master hasn't enabled ServerShutdownHandler");
    }
    Iterator<ServerName> serverIterator = queuedDeadServers.iterator();
    while (serverIterator.hasNext()) {
      ServerName tmpServerName = serverIterator.next();
      expireServer(tmpServerName);
      serverIterator.remove();
      requeuedDeadServers.remove(tmpServerName);
    }

    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      LOG.info("AssignmentManager hasn't finished failover cleanup; waiting");
    }

    for (ServerName tmpServerName : requeuedDeadServers.keySet()) {
      processDeadServer(tmpServerName, requeuedDeadServers.get(tmpServerName));
    }
    requeuedDeadServers.clear();
  }
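  /**
   * Removes the server from the drain list.
   */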
  public boolean removeServerFromDrainList(final ServerName sn) {
    // Warn if the server (sn) is not online.
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
        "Removing from draining list anyway, as requested.");
    }
    // Remove the server from the draining servers list.
    return this.drainingServers.remove(sn);
  }
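  /**
   * Adds the server to the drain list.
   */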
  public boolean addServerToDrainList(final ServerName sn) {
    // Warn and reject if the server (sn) is not online.
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
        "Ignoring request to add it to draining list.");
      return false;
    }

    // Add the server to the draining servers list, if it's not already in it.
    if (this.drainingServers.contains(sn)) {
      LOG.warn("Server " + sn + " is already in the draining server list. " +
        "Ignoring request to add it again.");
      return false;
    }
    return this.drainingServers.add(sn);
  }
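  /**
   * Sends an OPEN RPC to the specified server to open the specified region.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
   * @param server server to open a region
   * @param region region to open
   * @param versionOfOfflineNode that needs to be present in the offline node
   *          when RS tries to change the state from OFFLINE to other states.
   * @param favoredNodes
   * @return the region opening state; FAILED_OPENING if no RPC connection to
   *         the server could be found
   */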
  public RegionOpeningState sendRegionOpen(final ServerName server,
      HRegionInfo region, int versionOfOfflineNode, List<ServerName> favoredNodes)
      throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return RegionOpeningState.FAILED_OPENING;
    }
    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
      region, versionOfOfflineNode, favoredNodes,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningState(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }
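  /**
   * Sends an OPEN RPC to the specified server to open the specified regions.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
   * @param server server to open a region
   * @param regionOpenInfos info of a list of regions to open
   * @return a list of region opening states, or null if no RPC connection to
   *         the server could be found
   */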
  public List<RegionOpeningState> sendRegionOpen(ServerName server,
      List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos)
      throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return null;
    }

    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, regionOpenInfos,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningStateList(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }
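  /**
   * Sends a CLOSE RPC to the specified server to close the specified region.
   * <p>
   * A region server could reject the close request because it either does not
   * have the specified region or the region is being split.
   * @param server server to close a region
   * @param region region to close
   * @param versionOfClosingNode the version of znode to compare when RS transitions the znode
   *          from CLOSING state.
   * @param dest if the region is moved to another server, the destination server. null otherwise.
   * @param transitionInZK whether the region server should update the znode state
   * @return true if server acknowledged close, false if not
   * @throws IOException
   */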
  public boolean sendRegionClose(ServerName server, HRegionInfo region,
      int versionOfClosingNode, ServerName dest, boolean transitionInZK) throws IOException {
    if (server == null) throw new NullPointerException("Passed server is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send CLOSE RPC to server " +
        server.toString() + " for region " +
        region.getRegionNameAsString() +
        " failed because no RPC connection found to this server");
    }
    return ProtobufUtil.closeRegion(admin, server, region.getRegionName(),
      versionOfClosingNode, dest, transitionInZK);
  }

  /** Sends a CLOSE RPC with no destination server and the znode transition enabled. */
  public boolean sendRegionClose(ServerName server,
      HRegionInfo region, int versionOfClosingNode) throws IOException {
    return sendRegionClose(server, region, versionOfClosingNode, null, true);
  }
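  /**
   * Sends a MERGE REGIONS RPC to the specified server to merge the specified
   * regions.
   * <p>
   * A region server could reject the request because it does not have one of
   * the specified regions.
   * @param server server to merge regions
   * @param region_a region to merge
   * @param region_b region to merge
   * @param forcible true if do a compulsory merge, otherwise we will only merge
   *          two adjacent regions
   * @throws IOException
   */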
  public void sendRegionsMerge(ServerName server, HRegionInfo region_a,
      HRegionInfo region_b, boolean forcible) throws IOException {
    if (server == null)
      throw new NullPointerException("Passed server is null");
    if (region_a == null || region_b == null)
      throw new NullPointerException("Passed region is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send MERGE REGIONS RPC to server "
        + server.toString() + " for region "
        + region_a.getRegionNameAsString() + ","
        + region_b.getRegionNameAsString()
        + " failed because no RPC connection found to this server");
    }
    ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
  }
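  /**
   * Checks if a region server is reachable and has the expected start code,
   * retrying per the configured ping retry policy.
   */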
  public boolean isServerReachable(ServerName server) {
    if (server == null) throw new NullPointerException("Passed server is null");

    RetryCounter retryCounter = pingRetryCounterFactory.create();
    while (retryCounter.shouldRetry()) {
      try {
        AdminService.BlockingInterface admin = getRsAdmin(server);
        if (admin != null) {
          ServerInfo info = ProtobufUtil.getServerInfo(admin);
          return info != null && info.hasServerName()
            && server.getStartcode() == info.getServerName().getStartCode();
        }
      } catch (IOException ioe) {
        LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes()
          + " of " + retryCounter.getMaxAttempts(), ioe);
        try {
          retryCounter.sleepUntilNextRetry();
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
        }
      }
    }
    return false;
  }
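  /**
   * @param sn
   * @return Admin interface for the remote regionserver named <code>sn</code>
   * @throws IOException
   * @throws RetriesExhaustedException wrapping a ConnectException if failed
   */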
  private AdminService.BlockingInterface getRsAdmin(final ServerName sn)
      throws IOException {
    AdminService.BlockingInterface admin = this.rsAdmins.get(sn);
    if (admin == null) {
      LOG.debug("New admin connection to " + sn.toString());
      admin = this.connection.getAdmin(sn);
      this.rsAdmins.put(sn, admin);
    }
    return admin;
  }
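  /**
   * Wait for the region servers to report in.
   * We will wait until one of this condition is met:
   *  - the master is stopped
   *  - the 'hbase.master.wait.on.regionservers.maxtostart' number of
   *    region servers is reached
   *  - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
   *    there have been no new region server in for
   *    'hbase.master.wait.on.regionservers.interval' time AND
   *    the 'hbase.master.wait.on.regionservers.timeout' is reached
   *
   * @throws InterruptedException
   */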
  public void waitForRegionServers(MonitoredTask status)
      throws InterruptedException {
    final long interval = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
    final long timeout = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
    int minToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
    if (minToStart < 1) {
      LOG.warn(String.format(
        "The value of '%s' (%d) cannot be less than 1, ignoring.",
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      minToStart = 1;
    }
    int maxToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
    if (maxToStart < minToStart) {
      LOG.warn(String.format(
        "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
        WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      maxToStart = Integer.MAX_VALUE;
    }

    long now = System.currentTimeMillis();
    final long startTime = now;
    long slept = 0;
    long lastLogTime = 0;
    long lastCountChange = startTime;
    int count = countOfRegionServers();
    int oldCount = 0;
    while (!this.master.isStopped() &&
        count < maxToStart &&
        (lastCountChange + interval > now || timeout > slept || count < minToStart)) {
      // Log some info at every interval time or if there is a change
      if (oldCount != count || lastLogTime + interval < now) {
        lastLogTime = now;
        String msg =
          "Waiting for region servers count to settle; currently" +
          " checked in " + count + ", slept for " + slept + " ms," +
          " expecting minimum of " + minToStart + ", maximum of " + maxToStart +
          ", timeout of " + timeout + " ms, interval of " + interval + " ms.";
        LOG.info(msg);
        status.setStatus(msg);
      }

      // We sleep for some time
      final long sleepTime = 50;
      Thread.sleep(sleepTime);
      now = System.currentTimeMillis();
      slept = now - startTime;

      oldCount = count;
      count = countOfRegionServers();
      if (count != oldCount) {
        lastCountChange = now;
      }
    }

    LOG.info("Finished waiting for region servers count to settle;" +
      " checked in " + count + ", slept for " + slept + " ms," +
      " expecting minimum of " + minToStart + ", maximum of " + maxToStart + "," +
      " master is " + (this.master.isStopped() ? "stopped." : "running.")
    );
  }
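  /** @return A copy of the internal list of online servers. */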
  public List<ServerName> getOnlineServersList() {
    // Return a fresh copy so callers can iterate without holding any lock.
    return new ArrayList<ServerName>(this.onlineServers.keySet());
  }
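  /** @return A copy of the internal list of draining servers. */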
  public List<ServerName> getDrainingServersList() {
    return new ArrayList<ServerName>(this.drainingServers);
  }
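  /** @return A copy of the internal set of dead-but-not-yet-expired servers. */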
  Set<ServerName> getDeadNotExpiredServers() {
    return new HashSet<ServerName>(this.queuedDeadServers);
  }
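  /**
   * Discards the dead servers that were requeued while failover cleanup was in
   * progress, instead of processing them.
   */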
  void removeRequeuedDeadServers() {
    requeuedDeadServers.clear();
  }
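  /**
   * @return A read-only view of the requeued dead servers and their
   *         corresponding log-split flags.
   */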
  Map<ServerName, Boolean> getRequeuedDeadServers() {
    return Collections.unmodifiableMap(this.requeuedDeadServers);
  }

  public boolean isServerOnline(ServerName serverName) {
    return serverName != null && onlineServers.containsKey(serverName);
  }
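  /**
   * Check if a server is known to be dead. A server can be online,
   * or known to be dead, or unknown to this manager (i.e. not online,
   * not known to be dead either; it is simply not tracked by the master
   * any more, for example, a very old previous instance).
   */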
  public synchronized boolean isServerDead(ServerName serverName) {
    return serverName == null || deadservers.isDeadServer(serverName)
      || queuedDeadServers.contains(serverName)
      || requeuedDeadServers.containsKey(serverName);
  }

  public void shutdownCluster() {
    this.clusterShutdown = true;
    this.master.stop("Cluster shutdown requested");
  }

  public boolean isClusterShutdown() {
    return this.clusterShutdown;
  }
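  /**
   * Stop the ServerManager. Closes the shared cluster connection, if one was opened.
   */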
  public void stop() {
    if (connection != null) {
      try {
        connection.close();
      } catch (IOException e) {
        LOG.error("Attempt to close connection to master failed", e);
      }
    }
  }
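  /**
   * Creates a list of possible destinations for a region. It contains the online servers, but not
   * the draining or dying servers.
   * @param serverToExclude can be null if there is no server to exclude
   */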
  public List<ServerName> createDestinationServersList(final ServerName serverToExclude) {
    final List<ServerName> destServers = getOnlineServersList();

    if (serverToExclude != null) {
      destServers.remove(serverToExclude);
    }

    // Loop through the draining server list and remove them from the server list
    final List<ServerName> drainingServersCopy = getDrainingServersList();
    if (!drainingServersCopy.isEmpty()) {
      for (final ServerName server: drainingServersCopy) {
        destServers.remove(server);
      }
    }

    // Remove the deadNotExpired servers from the server list.
    removeDeadNotExpiredServers(destServers);

    return destServers;
  }
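  /**
   * Calls {@link #createDestinationServersList} without a server to exclude.
   */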
  public List<ServerName> createDestinationServersList() {
    return createDestinationServersList(null);
  }
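  /**
   * Loops through the deadNotExpired server list and removes them from the passed servers.
   * This function should be used carefully outside of this class. You should use a high level
   * method such as {@link #createDestinationServersList()} instead of managing your own list.
   */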
  void removeDeadNotExpiredServers(List<ServerName> servers) {
    Set<ServerName> deadNotExpiredServersCopy = this.getDeadNotExpiredServers();
    if (!deadNotExpiredServersCopy.isEmpty()) {
      for (ServerName server : deadNotExpiredServersCopy) {
        LOG.debug("Removing dead but not expired server: " + server
          + " from eligible server pool.");
        servers.remove(server);
      }
    }
  }
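  /**
   * Clears any dead server with the same host name and port as an online server.
   */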
  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
    for (ServerName serverName : getOnlineServersList()) {
      deadservers.cleanAllPreviousInstances(serverName);
    }
  }
}