1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master.balancer;
21
22 import java.io.IOException;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Random;
29 import java.util.Set;
30 import java.util.Map.Entry;
31
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.hbase.classification.InterfaceAudience;
35 import org.apache.hadoop.conf.Configuration;
36 import org.apache.hadoop.hbase.TableName;
37 import org.apache.hadoop.hbase.HConstants;
38 import org.apache.hadoop.hbase.HRegionInfo;
39 import org.apache.hadoop.hbase.ServerName;
40 import org.apache.hadoop.hbase.catalog.CatalogTracker;
41 import org.apache.hadoop.hbase.catalog.MetaEditor;
42 import org.apache.hadoop.hbase.client.HTable;
43 import org.apache.hadoop.hbase.client.Put;
44 import org.apache.hadoop.hbase.master.RackManager;
45 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
46 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
47 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.FavoredNodes;
48 import org.apache.hadoop.hbase.util.Bytes;
49 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
50
51 import com.google.protobuf.InvalidProtocolBufferException;
52
53
54
55
56
57
58
59
60 @InterfaceAudience.Private
61 public class FavoredNodeAssignmentHelper {
62 private static final Log LOG = LogFactory.getLog(FavoredNodeAssignmentHelper.class);
63 private RackManager rackManager;
64 private Map<String, List<ServerName>> rackToRegionServerMap;
65 private List<String> uniqueRackList;
66 private Map<ServerName, String> regionServerToRackMap;
67 private Random random;
68 private List<ServerName> servers;
69 public static final byte [] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
70 public final static short FAVORED_NODES_NUM = 3;
71
/**
 * Creates a helper that builds its own {@link RackManager} from the given configuration
 * and otherwise behaves like the two-argument constructor.
 *
 * @param servers servers whose racks are resolved when {@link #initialize()} is called
 * @param conf configuration passed to the {@link RackManager} constructor
 */
public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
  this(servers, new RackManager(conf));
}
75
/**
 * Creates a helper over the given servers and rack resolver. The rack lookup structures
 * start out empty; call {@link #initialize()} to populate them.
 *
 * @param servers servers whose racks are resolved when {@link #initialize()} is called
 * @param rackManager resolver used to look up the rack of each server
 */
public FavoredNodeAssignmentHelper(final List<ServerName> servers,
    final RackManager rackManager) {
  // Collaborators supplied by the caller.
  this.rackManager = rackManager;
  this.servers = servers;

  // Internal state, empty until initialize() runs.
  this.random = new Random();
  this.uniqueRackList = new ArrayList<String>();
  this.regionServerToRackMap = new HashMap<ServerName, String>();
  this.rackToRegionServerMap = new HashMap<String, List<ServerName>>();
}
85
86
87
88
89
90
91
92 public static void updateMetaWithFavoredNodesInfo(
93 Map<HRegionInfo, List<ServerName>> regionToFavoredNodes,
94 CatalogTracker catalogTracker) throws IOException {
95 List<Put> puts = new ArrayList<Put>();
96 for (Map.Entry<HRegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
97 Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
98 if (put != null) {
99 puts.add(put);
100 }
101 }
102 MetaEditor.putsToMetaTable(catalogTracker, puts);
103 LOG.info("Added " + puts.size() + " regions in META");
104 }
105
106
107
108
109
110
111
112 public static void updateMetaWithFavoredNodesInfo(
113 Map<HRegionInfo, List<ServerName>> regionToFavoredNodes,
114 Configuration conf) throws IOException {
115 List<Put> puts = new ArrayList<Put>();
116 for (Map.Entry<HRegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
117 Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
118 if (put != null) {
119 puts.add(put);
120 }
121 }
122
123 HTable metaTable = null;
124 try {
125 metaTable = new HTable(conf, TableName.META_TABLE_NAME);
126 metaTable.put(puts);
127 } finally {
128 if (metaTable != null) metaTable.close();
129 }
130 LOG.info("Added " + puts.size() + " regions in META");
131 }
132
133
134
135
136
137
138
139
140 static Put makePutFromRegionInfo(HRegionInfo regionInfo, List<ServerName>favoredNodeList)
141 throws IOException {
142 Put put = null;
143 if (favoredNodeList != null) {
144 put = MetaEditor.makePutFromRegionInfo(regionInfo);
145 byte[] favoredNodes = getFavoredNodes(favoredNodeList);
146 put.addImmutable(HConstants.CATALOG_FAMILY, FAVOREDNODES_QUALIFIER,
147 EnvironmentEdgeManager.currentTimeMillis(), favoredNodes);
148 LOG.info("Create the region " + regionInfo.getRegionNameAsString() +
149 " with favored nodes " + Bytes.toString(favoredNodes));
150 }
151 return put;
152 }
153
154
155
156
157
158
159 public static ServerName[] getFavoredNodesList(byte[] favoredNodes)
160 throws InvalidProtocolBufferException {
161 FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
162 List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
163 ServerName[] servers = new ServerName[protoNodes.size()];
164 int i = 0;
165 for (HBaseProtos.ServerName node : protoNodes) {
166 servers[i++] = ProtobufUtil.toServerName(node);
167 }
168 return servers;
169 }
170
171
172
173
174
175 public static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
176 FavoredNodes.Builder f = FavoredNodes.newBuilder();
177 for (ServerName s : serverAddrList) {
178 HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
179 b.setHostName(s.getHostname());
180 b.setPort(s.getPort());
181 b.setStartCode(s.getStartcode());
182 f.addFavoredNode(b.build());
183 }
184 return f.build().toByteArray();
185 }
186
187
188
189
190
191
192
193
194
195
196 void placePrimaryRSAsRoundRobin(Map<ServerName, List<HRegionInfo>> assignmentMap,
197 Map<HRegionInfo, ServerName> primaryRSMap, List<HRegionInfo> regions) {
198 List<String> rackList = new ArrayList<String>(rackToRegionServerMap.size());
199 rackList.addAll(rackToRegionServerMap.keySet());
200 int rackIndex = random.nextInt(rackList.size());
201 int maxRackSize = 0;
202 for (Map.Entry<String,List<ServerName>> r : rackToRegionServerMap.entrySet()) {
203 if (r.getValue().size() > maxRackSize) {
204 maxRackSize = r.getValue().size();
205 }
206 }
207 int numIterations = 0;
208 int firstServerIndex = random.nextInt(maxRackSize);
209
210 int serverIndex = firstServerIndex;
211 for (HRegionInfo regionInfo : regions) {
212 List<ServerName> currentServerList;
213 String rackName;
214 while (true) {
215 rackName = rackList.get(rackIndex);
216 numIterations++;
217
218 currentServerList = rackToRegionServerMap.get(rackName);
219
220 if (serverIndex >= currentServerList.size()) {
221 if (numIterations % rackList.size() == 0) {
222 if (++serverIndex >= maxRackSize) serverIndex = 0;
223 }
224 if ((++rackIndex) >= rackList.size()) {
225 rackIndex = 0;
226 }
227 } else break;
228 }
229
230
231 ServerName currentServer = currentServerList.get(serverIndex);
232
233
234 primaryRSMap.put(regionInfo, currentServer);
235 List<HRegionInfo> regionsForServer = assignmentMap.get(currentServer);
236 if (regionsForServer == null) {
237 regionsForServer = new ArrayList<HRegionInfo>();
238 assignmentMap.put(currentServer, regionsForServer);
239 }
240 regionsForServer.add(regionInfo);
241
242
243 if (numIterations % rackList.size() == 0) {
244 ++serverIndex;
245 }
246 if ((++rackIndex) >= rackList.size()) {
247 rackIndex = 0;
248 }
249 }
250 }
251
252 Map<HRegionInfo, ServerName[]> placeSecondaryAndTertiaryRS(
253 Map<HRegionInfo, ServerName> primaryRSMap) {
254 Map<HRegionInfo, ServerName[]> secondaryAndTertiaryMap =
255 new HashMap<HRegionInfo, ServerName[]>();
256 for (Map.Entry<HRegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
257
258 HRegionInfo regionInfo = entry.getKey();
259 ServerName primaryRS = entry.getValue();
260 try {
261
262 ServerName[] favoredNodes;
263
264 String primaryRack = rackManager.getRack(primaryRS);
265
266 if (getTotalNumberOfRacks() == 1) {
267 favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
268 } else {
269 favoredNodes = multiRackCase(regionInfo, primaryRS, primaryRack);
270 }
271 if (favoredNodes != null) {
272 secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
273 LOG.debug("Place the secondary and tertiary region server for region "
274 + regionInfo.getRegionNameAsString());
275 }
276 } catch (Exception e) {
277 LOG.warn("Cannot place the favored nodes for region " +
278 regionInfo.getRegionNameAsString() + " because " + e, e);
279 continue;
280 }
281 }
282 return secondaryAndTertiaryMap;
283 }
284
285 private Map<ServerName, Set<HRegionInfo>> mapRSToPrimaries(
286 Map<HRegionInfo, ServerName> primaryRSMap) {
287 Map<ServerName, Set<HRegionInfo>> primaryServerMap =
288 new HashMap<ServerName, Set<HRegionInfo>>();
289 for (Entry<HRegionInfo, ServerName> e : primaryRSMap.entrySet()) {
290 Set<HRegionInfo> currentSet = primaryServerMap.get(e.getValue());
291 if (currentSet == null) {
292 currentSet = new HashSet<HRegionInfo>();
293 }
294 currentSet.add(e.getKey());
295 primaryServerMap.put(e.getValue(), currentSet);
296 }
297 return primaryServerMap;
298 }
299
300
301
302
303
304
305
306
307
308 public Map<HRegionInfo, ServerName[]> placeSecondaryAndTertiaryWithRestrictions(
309 Map<HRegionInfo, ServerName> primaryRSMap) {
310 Map<ServerName, Set<HRegionInfo>> serverToPrimaries =
311 mapRSToPrimaries(primaryRSMap);
312 Map<HRegionInfo, ServerName[]> secondaryAndTertiaryMap =
313 new HashMap<HRegionInfo, ServerName[]>();
314
315 for (Entry<HRegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
316
317 HRegionInfo regionInfo = entry.getKey();
318 ServerName primaryRS = entry.getValue();
319 try {
320
321 String primaryRack = rackManager.getRack(primaryRS);
322 ServerName[] favoredNodes = null;
323 if (getTotalNumberOfRacks() == 1) {
324
325
326 favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
327 } else {
328 favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries,
329 secondaryAndTertiaryMap, primaryRack, primaryRS, regionInfo);
330 }
331 if (favoredNodes != null) {
332 secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
333 LOG.debug("Place the secondary and tertiary region server for region "
334 + regionInfo.getRegionNameAsString());
335 }
336 } catch (Exception e) {
337 LOG.warn("Cannot place the favored nodes for region "
338 + regionInfo.getRegionNameAsString() + " because " + e, e);
339 continue;
340 }
341 }
342 return secondaryAndTertiaryMap;
343 }
344
345 private ServerName[] multiRackCaseWithRestrictions(
346 Map<ServerName, Set<HRegionInfo>> serverToPrimaries,
347 Map<HRegionInfo, ServerName[]> secondaryAndTertiaryMap,
348 String primaryRack, ServerName primaryRS, HRegionInfo regionInfo) throws IOException {
349
350
351
352 Set<String> rackSkipSet = new HashSet<String>();
353 rackSkipSet.add(primaryRack);
354 String secondaryRack = getOneRandomRack(rackSkipSet);
355 List<ServerName> serverList = getServersFromRack(secondaryRack);
356 Set<ServerName> serverSet = new HashSet<ServerName>();
357 serverSet.addAll(serverList);
358 ServerName[] favoredNodes;
359 if (serverList.size() >= 2) {
360
361
362
363 Set<HRegionInfo> primaries = serverToPrimaries.get(primaryRS);
364 Set<ServerName> skipServerSet = new HashSet<ServerName>();
365 while (true) {
366 ServerName[] secondaryAndTertiary = null;
367 if (primaries.size() > 1) {
368
369 for (HRegionInfo primary : primaries) {
370 secondaryAndTertiary = secondaryAndTertiaryMap.get(primary);
371 if (secondaryAndTertiary != null) {
372 if (regionServerToRackMap.get(secondaryAndTertiary[0]).equals(secondaryRack)) {
373 skipServerSet.add(secondaryAndTertiary[0]);
374 }
375 if (regionServerToRackMap.get(secondaryAndTertiary[1]).equals(secondaryRack)) {
376 skipServerSet.add(secondaryAndTertiary[1]);
377 }
378 }
379 }
380 }
381 if (skipServerSet.size() + 2 <= serverSet.size())
382 break;
383 skipServerSet.clear();
384 rackSkipSet.add(secondaryRack);
385
386 if (rackSkipSet.size() == getTotalNumberOfRacks()) {
387
388 skipServerSet.remove(secondaryAndTertiary[0]);
389 skipServerSet.remove(secondaryAndTertiary[1]);
390 break;
391 }
392 secondaryRack = getOneRandomRack(rackSkipSet);
393 serverList = getServersFromRack(secondaryRack);
394 serverSet = new HashSet<ServerName>();
395 serverSet.addAll(serverList);
396 }
397
398
399 ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet);
400 skipServerSet.add(secondaryRS);
401
402 ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);
403
404 if (secondaryRS == null || tertiaryRS == null) {
405 LOG.error("Cannot place the secondary and tertiary"
406 + " region server for region "
407 + regionInfo.getRegionNameAsString());
408 }
409
410 favoredNodes = new ServerName[2];
411 favoredNodes[0] = secondaryRS;
412 favoredNodes[1] = tertiaryRS;
413 } else {
414
415
416 favoredNodes = new ServerName[2];
417 ServerName secondary = getOneRandomServer(secondaryRack);
418 favoredNodes[0] = secondary;
419
420
421 if (getTotalNumberOfRacks() == 2) {
422
423 Set<ServerName> serverSkipSet = new HashSet<ServerName>();
424 serverSkipSet.add(primaryRS);
425 favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
426 } else {
427
428 rackSkipSet.add(secondaryRack);
429 String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
430 favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
431 }
432 }
433 return favoredNodes;
434 }
435
436 private ServerName[] singleRackCase(HRegionInfo regionInfo,
437 ServerName primaryRS,
438 String primaryRack) throws IOException {
439
440
441 List<ServerName> serverList = getServersFromRack(primaryRack);
442 if (serverList.size() <= 2) {
443
444
445 return null;
446 } else {
447
448
449 Set<ServerName> serverSkipSet = new HashSet<ServerName>();
450 serverSkipSet.add(primaryRS);
451
452
453 ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
454
455 serverSkipSet.add(secondaryRS);
456
457
458 ServerName tertiaryRS =
459 getOneRandomServer(primaryRack, serverSkipSet);
460
461 if (secondaryRS == null || tertiaryRS == null) {
462 LOG.error("Cannot place the secondary and terinary" +
463 "region server for region " +
464 regionInfo.getRegionNameAsString());
465 }
466
467 ServerName[] favoredNodes = new ServerName[2];
468 favoredNodes[0] = secondaryRS;
469 favoredNodes[1] = tertiaryRS;
470 return favoredNodes;
471 }
472 }
473
474 private ServerName[] multiRackCase(HRegionInfo regionInfo,
475 ServerName primaryRS,
476 String primaryRack) throws IOException {
477
478
479
480
481
482 Set<String> rackSkipSet = new HashSet<String>();
483 rackSkipSet.add(primaryRack);
484 ServerName[] favoredNodes = new ServerName[2];
485 String secondaryRack = getOneRandomRack(rackSkipSet);
486 List<ServerName> serverList = getServersFromRack(secondaryRack);
487 if (serverList.size() >= 2) {
488
489
490
491 ServerName secondaryRS = getOneRandomServer(secondaryRack);
492
493
494 Set<ServerName> skipServerSet = new HashSet<ServerName>();
495 skipServerSet.add(secondaryRS);
496
497 ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);
498
499 if (secondaryRS == null || tertiaryRS == null) {
500 LOG.error("Cannot place the secondary and terinary" +
501 "region server for region " +
502 regionInfo.getRegionNameAsString());
503 }
504
505 favoredNodes[0] = secondaryRS;
506 favoredNodes[1] = tertiaryRS;
507 } else {
508
509
510 favoredNodes[0] = getOneRandomServer(secondaryRack);
511
512
513 if (getTotalNumberOfRacks() == 2) {
514
515 Set<ServerName> serverSkipSet = new HashSet<ServerName>();
516 serverSkipSet.add(primaryRS);
517 favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
518 } else {
519
520 rackSkipSet.add(secondaryRack);
521 String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
522 favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
523 }
524 }
525 return favoredNodes;
526 }
527
528 boolean canPlaceFavoredNodes() {
529 int serverSize = this.regionServerToRackMap.size();
530 return (serverSize >= FAVORED_NODES_NUM);
531 }
532
533 public void initialize() {
534 for (ServerName sn : this.servers) {
535 String rackName = this.rackManager.getRack(sn);
536 List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
537 if (serverList == null) {
538 serverList = new ArrayList<ServerName>();
539
540 this.uniqueRackList.add(rackName);
541 }
542 if (!serverList.contains(sn)) {
543 serverList.add(sn);
544 this.rackToRegionServerMap.put(rackName, serverList);
545 this.regionServerToRackMap.put(sn, rackName);
546 }
547 }
548 }
549
550 private int getTotalNumberOfRacks() {
551 return this.uniqueRackList.size();
552 }
553
554 private List<ServerName> getServersFromRack(String rack) {
555 return this.rackToRegionServerMap.get(rack);
556 }
557
558 private ServerName getOneRandomServer(String rack,
559 Set<ServerName> skipServerSet) throws IOException {
560 if(rack == null) return null;
561 List<ServerName> serverList = this.rackToRegionServerMap.get(rack);
562 if (serverList == null) return null;
563
564
565 if (skipServerSet != null && serverList.size() <= skipServerSet.size()) {
566 throw new IOException("Cannot randomly pick another random server");
567 }
568
569 ServerName randomServer;
570 do {
571 int randomIndex = random.nextInt(serverList.size());
572 randomServer = serverList.get(randomIndex);
573 } while (skipServerSet != null && skipServerSet.contains(randomServer));
574
575 return randomServer;
576 }
577
578 private ServerName getOneRandomServer(String rack) throws IOException {
579 return this.getOneRandomServer(rack, null);
580 }
581
582 private String getOneRandomRack(Set<String> skipRackSet) throws IOException {
583 if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
584 throw new IOException("Cannot randomly pick another random server");
585 }
586
587 String randomRack;
588 do {
589 int randomIndex = random.nextInt(this.uniqueRackList.size());
590 randomRack = this.uniqueRackList.get(randomIndex);
591 } while (skipRackSet.contains(randomRack));
592
593 return randomRack;
594 }
595
596 public static String getFavoredNodesAsString(List<ServerName> nodes) {
597 StringBuffer strBuf = new StringBuffer();
598 int i = 0;
599 for (ServerName node : nodes) {
600 strBuf.append(node.getHostAndPort());
601 if (++i != nodes.size()) strBuf.append(";");
602 }
603 return strBuf.toString();
604 }
605 }