View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.master;
21  
22  import java.io.IOException;
23  import java.text.DecimalFormat;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Random;
30  import java.util.Scanner;
31  import java.util.Set;
32  import java.util.TreeMap;
33  
34  import org.apache.commons.cli.CommandLine;
35  import org.apache.commons.cli.GnuParser;
36  import org.apache.commons.cli.HelpFormatter;
37  import org.apache.commons.cli.Options;
38  import org.apache.commons.cli.ParseException;
39  import org.apache.commons.lang.StringUtils;
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.hbase.classification.InterfaceAudience;
43  import org.apache.hadoop.conf.Configuration;
44  import org.apache.hadoop.fs.FileSystem;
45  import org.apache.hadoop.hbase.HBaseConfiguration;
46  import org.apache.hadoop.hbase.HConstants;
47  import org.apache.hadoop.hbase.HRegionInfo;
48  import org.apache.hadoop.hbase.ServerName;
49  import org.apache.hadoop.hbase.TableName;
50  import org.apache.hadoop.hbase.catalog.CatalogTracker;
51  import org.apache.hadoop.hbase.client.HBaseAdmin;
52  import org.apache.hadoop.hbase.client.HConnection;
53  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
54  import org.apache.hadoop.hbase.master.balancer.FavoredNodesPlan;
55  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
56  import org.apache.hadoop.hbase.protobuf.RequestConverter;
57  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
58  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateFavoredNodesRequest;
59  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateFavoredNodesResponse;
60  import org.apache.hadoop.hbase.util.FSUtils;
61  import org.apache.hadoop.hbase.util.MunkresAssignment;
62  import org.apache.hadoop.hbase.util.Pair;
63  import org.apache.log4j.Level;
64  import org.apache.log4j.Logger;
65  
66  /**
67   * A tool that is used for manipulating and viewing favored nodes information
68   * for regions. Run with -h to get a list of the options
69   *
70   */
71  @InterfaceAudience.Private
72  public class RegionPlacementMaintainer {
  // Logger shared by all instances of this tool.
  private static final Log LOG = LogFactory.getLog(RegionPlacementMaintainer.class
      .getName());
  //The cost of a placement that should never be assigned.
  private static final float MAX_COST = Float.POSITIVE_INFINITY;

  // The cost of a placement that is undesirable but acceptable.
  private static final float AVOID_COST = 100000f;

  // The amount by which the cost of a placement is increased if it is the
  // last slot of the server. This is done to more evenly distribute the slop
  // amongst servers.
  private static final float LAST_SLOT_COST_PENALTY = 0.5f;

  // The amount by which the cost of a primary placement is penalized if it is
  // not the host currently serving the region. This is done to minimize moves.
  private static final float NOT_CURRENT_HOST_PENALTY = 0.1f;

  // Passed to genAssignmentPlan by getNewAssignmentPlan to choose how the
  // secondary and tertiary favored nodes are placed. Not final, so it may be
  // reassigned elsewhere in the class.
  private static boolean USE_MUNKRES_FOR_PLACING_SECONDARY_AND_TERTIARY = false;

  // Configuration used to create the HBaseAdmin, the RackManager and the
  // region assignment snapshots.
  private Configuration conf;
  // When true, the region locality map is read from the file system and used
  // to build the placement costs.
  private final boolean enforceLocality;
  // When true, primary placements on a server other than the one currently
  // hosting the region are penalized by NOT_CURRENT_HOST_PENALTY.
  private final boolean enforceMinAssignmentMove;
  // Created lazily by getHBaseAdmin(); null until first use.
  private HBaseAdmin admin;
  // Resolves the rack of a region server.
  private RackManager rackManager;
  // Tables to operate on; when empty, no table is filtered out.
  private Set<TableName> targetTableSet;
98  
  /**
   * Creates a maintainer with both locality enforcement and minimal
   * assignment movement enabled.
   *
   * @param conf the configuration passed on to the three-argument constructor
   */
  public RegionPlacementMaintainer(Configuration conf) {
    this(conf, true, true);
  }
102 
103   public RegionPlacementMaintainer(Configuration conf, boolean enforceLocality,
104       boolean enforceMinAssignmentMove) {
105     this.conf = conf;
106     this.enforceLocality = enforceLocality;
107     this.enforceMinAssignmentMove = enforceMinAssignmentMove;
108     this.targetTableSet = new HashSet<TableName>();
109     this.rackManager = new RackManager(conf);
110   }
111   private static void printHelp(Options opt) {
112     new HelpFormatter().printHelp(
113         "RegionPlacement < -w | -u | -n | -v | -t | -h | -overwrite -r regionName -f favoredNodes " +
114         "-diff>" +
115         " [-l false] [-m false] [-d] [-tables t1,t2,...tn] [-zk zk1,zk2,zk3]" +
116         " [-fs hdfs://a.b.c.d:9000] [-hbase_root /HBASE]", opt);
117   }
118 
119   public void setTargetTableName(String[] tableNames) {
120     if (tableNames != null) {
121       for (String table : tableNames)
122         this.targetTableSet.add(TableName.valueOf(table));
123     }
124   }
125 
126   /**
127    * @return the cached HBaseAdmin
128    * @throws IOException
129    */
130   private HBaseAdmin getHBaseAdmin() throws IOException {
131     if (this.admin == null) {
132       this.admin = new HBaseAdmin(this.conf);
133     }
134     return this.admin;
135   }
136 
137   /**
138    * @return the new RegionAssignmentSnapshot
139    * @throws IOException
140    */
141   public SnapshotOfRegionAssignmentFromMeta getRegionAssignmentSnapshot()
142   throws IOException {
143     SnapshotOfRegionAssignmentFromMeta currentAssignmentShapshot =
144       new SnapshotOfRegionAssignmentFromMeta(new CatalogTracker(this.conf));
145     currentAssignmentShapshot.initialize();
146     return currentAssignmentShapshot;
147   }
148 
149   /**
150    * Verify the region placement is consistent with the assignment plan
151    * @param isDetailMode
152    * @return reports
153    * @throws IOException
154    */
155   public List<AssignmentVerificationReport> verifyRegionPlacement(boolean isDetailMode)
156       throws IOException {
157     System.out.println("Start to verify the region assignment and " +
158         "generate the verification report");
159     // Get the region assignment snapshot
160     SnapshotOfRegionAssignmentFromMeta snapshot = this.getRegionAssignmentSnapshot();
161 
162     // Get all the tables
163     Set<TableName> tables = snapshot.getTableSet();
164 
165     // Get the region locality map
166     Map<String, Map<String, Float>> regionLocalityMap = null;
167     if (this.enforceLocality == true) {
168       regionLocalityMap = FSUtils.getRegionDegreeLocalityMappingFromFS(conf);
169     }
170     List<AssignmentVerificationReport> reports = new ArrayList<AssignmentVerificationReport>();
171     // Iterate all the tables to fill up the verification report
172     for (TableName table : tables) {
173       if (!this.targetTableSet.isEmpty() &&
174           !this.targetTableSet.contains(table)) {
175         continue;
176       }
177       AssignmentVerificationReport report = new AssignmentVerificationReport();
178       report.fillUp(table, snapshot, regionLocalityMap);
179       report.print(isDetailMode);
180       reports.add(report);
181     }
182     return reports;
183   }
184 
  /**
   * Generate the assignment plan for the existing table.
   * <p>
   * Every region server is modeled as {@code slotsPerServer} consecutive
   * columns ("slots") of a regions-by-slots cost matrix, so slot {@code s}
   * belongs to {@code servers.get(s / slotsPerServer)}. The primary of each
   * region is taken from the solution of the primary cost matrix. The
   * secondary and tertiary are either solved the same way on their own cost
   * matrices or delegated to {@link FavoredNodeAssignmentHelper}. The three
   * favored nodes of every region are then recorded in {@code plan}.
   *
   * @param tableName the table whose regions are placed
   * @param assignmentSnapshot supplies the regions of the table and the
   * current region to region server assignment
   * @param regionLocalityMap locality values keyed by encoded region name and
   * then by server hostname; when null, locality does not contribute to the
   * costs
   * @param plan the plan that is updated with the favored nodes of each region
   * @param munkresForSecondaryAndTertiary if set on true the assignment plan
   * for the tertiary and secondary will be generated with Munkres algorithm,
   * otherwise will be generated using placeSecondaryAndTertiaryRS
   * @throws IOException
   */
  private void genAssignmentPlan(TableName tableName,
      SnapshotOfRegionAssignmentFromMeta assignmentSnapshot,
      Map<String, Map<String, Float>> regionLocalityMap, FavoredNodesPlan plan,
      boolean munkresForSecondaryAndTertiary) throws IOException {
      // Get the all the regions for the current table
      List<HRegionInfo> regions =
        assignmentSnapshot.getTableToRegionMap().get(tableName);
      int numRegions = regions.size();

      // Get the current assignment map
      Map<HRegionInfo, ServerName> currentAssignmentMap =
        assignmentSnapshot.getRegionToRegionServerMap();

      // Get the all the region servers
      List<ServerName> servers = new ArrayList<ServerName>();
      servers.addAll(getHBaseAdmin().getClusterStatus().getServers());
      
      LOG.info("Start to generate assignment plan for " + numRegions +
          " regions from table " + tableName + " with " +
          servers.size() + " region servers");

      // Every server gets the same number of slots, rounded up, so that there
      // are at least as many slots as regions.
      int slotsPerServer = (int) Math.ceil((float) numRegions /
          servers.size());
      int regionSlots = slotsPerServer * servers.size();

      // Compute the primary, secondary and tertiary costs for each region/server
      // pair. These costs are based only on node locality and rack locality, and
      // will be modified later.
      float[][] primaryCost = new float[numRegions][regionSlots];
      float[][] secondaryCost = new float[numRegions][regionSlots];
      float[][] tertiaryCost = new float[numRegions][regionSlots];

      if (this.enforceLocality && regionLocalityMap != null) {
        // Transform the locality mapping into a 2D array, assuming that any
        // unspecified locality value is 0.
        float[][] localityPerServer = new float[numRegions][regionSlots];
        for (int i = 0; i < numRegions; i++) {
          Map<String, Float> serverLocalityMap =
              regionLocalityMap.get(regions.get(i).getEncodedName());
          if (serverLocalityMap == null) {
            continue;
          }
          for (int j = 0; j < servers.size(); j++) {
            String serverName = servers.get(j).getHostname();
            if (serverName == null) {
              continue;
            }
            Float locality = serverLocalityMap.get(serverName);
            if (locality == null) {
              continue;
            }
            // All slots of one server share the locality of that server.
            for (int k = 0; k < slotsPerServer; k++) {
              // If we can't find the locality of a region to a server, which occurs
              // because locality is only reported for servers which have some
              // blocks of a region local, then the locality for that pair is 0.
              localityPerServer[i][j * slotsPerServer + k] = locality.floatValue();
            }
          }
        }

        // Compute the total rack locality for each region in each rack. The total
        // rack locality is the sum of the localities of a region on all servers in
        // a rack.
        Map<String, Map<HRegionInfo, Float>> rackRegionLocality =
            new HashMap<String, Map<HRegionInfo, Float>>();
        for (int i = 0; i < numRegions; i++) {
          HRegionInfo region = regions.get(i);
          // Stepping by slotsPerServer visits exactly one slot per server, so
          // each server contributes its locality once.
          for (int j = 0; j < regionSlots; j += slotsPerServer) {
            String rack = rackManager.getRack(servers.get(j / slotsPerServer));
            Map<HRegionInfo, Float> rackLocality = rackRegionLocality.get(rack);
            if (rackLocality == null) {
              rackLocality = new HashMap<HRegionInfo, Float>();
              rackRegionLocality.put(rack, rackLocality);
            }
            Float localityObj = rackLocality.get(region);
            float locality = localityObj == null ? 0 : localityObj.floatValue();
            locality += localityPerServer[i][j];
            rackLocality.put(region, locality);
          }
        }
        for (int i = 0; i < numRegions; i++) {
          for (int j = 0; j < regionSlots; j++) {
            String rack = rackManager.getRack(servers.get(j / slotsPerServer));
            Float totalRackLocalityObj =
                rackRegionLocality.get(rack).get(regions.get(i));
            float totalRackLocality = totalRackLocalityObj == null ?
                0 : totalRackLocalityObj.floatValue();

            // Primary cost aims to favor servers with high node locality and low
            // rack locality, so that secondaries and tertiaries can be chosen for
            // nodes with high rack locality. This might give primaries with
            // slightly less locality at first compared to a cost which only
            // considers the node locality, but should be better in the long run.
            primaryCost[i][j] = 1 - (2 * localityPerServer[i][j] -
                totalRackLocality);

            // Secondary cost aims to favor servers with high node locality and high
            // rack locality since the tertiary will be chosen from the same rack as
            // the secondary. This could be negative, but that is okay.
            secondaryCost[i][j] = 2 - (localityPerServer[i][j] + totalRackLocality);

            // Tertiary cost is only concerned with the node locality. It will later
            // be restricted to only hosts on the same rack as the secondary.
            tertiaryCost[i][j] = 1 - localityPerServer[i][j];
          }
        }
      }

      if (this.enforceMinAssignmentMove && currentAssignmentMap != null) {
        // We want to minimize the number of regions which move as the result of a
        // new assignment. Therefore, slightly penalize any placement which is for
        // a host that is not currently serving the region.
        for (int i = 0; i < numRegions; i++) {
          for (int j = 0; j < servers.size(); j++) {
            ServerName currentAddress = currentAssignmentMap.get(regions.get(i));
            if (currentAddress != null &&
                !currentAddress.equals(servers.get(j))) {
              for (int k = 0; k < slotsPerServer; k++) {
                primaryCost[i][j * slotsPerServer + k] += NOT_CURRENT_HOST_PENALTY;
              }
            }
          }
        }
      }

      // Artificially increase cost of last slot of each server to evenly
      // distribute the slop, otherwise there will be a few servers with too few
      // regions and many servers with the max number of regions.
      // NOTE(review): the index penalized below is the first slot of each
      // server's block of slots, not the last one. All slots of a server carry
      // the same cost up to this point, so exactly one slot per server ends up
      // more expensive either way.
      for (int i = 0; i < numRegions; i++) {
        for (int j = 0; j < regionSlots; j += slotsPerServer) {
          primaryCost[i][j] += LAST_SLOT_COST_PENALTY;
          secondaryCost[i][j] += LAST_SLOT_COST_PENALTY;
          tertiaryCost[i][j] += LAST_SLOT_COST_PENALTY;
        }
      }

      // Shuffle rows and columns before solving and map the solution back to
      // the original indices afterwards.
      RandomizedMatrix randomizedMatrix = new RandomizedMatrix(numRegions,
          regionSlots);
      primaryCost = randomizedMatrix.transform(primaryCost);
      int[] primaryAssignment = new MunkresAssignment(primaryCost).solve();
      primaryAssignment = randomizedMatrix.invertIndices(primaryAssignment);

      // Modify the secondary and tertiary costs for each region/server pair to
      // prevent a region from being assigned to the same rack for both primary
      // and either one of secondary or tertiary.
      for (int i = 0; i < numRegions; i++) {
        int slot = primaryAssignment[i];
        String rack = rackManager.getRack(servers.get(slot / slotsPerServer));
        for (int k = 0; k < servers.size(); k++) {
          // Servers on other racks keep their locality based cost.
          if (!rackManager.getRack(servers.get(k)).equals(rack)) {
            continue;
          }
          if (k == slot / slotsPerServer) {
            // Same node, do not place secondary or tertiary here ever.
            for (int m = 0; m < slotsPerServer; m++) {
              secondaryCost[i][k * slotsPerServer + m] = MAX_COST;
              tertiaryCost[i][k * slotsPerServer + m] = MAX_COST;
            }
          } else {
            // Same rack, do not place secondary or tertiary here if possible.
            for (int m = 0; m < slotsPerServer; m++) {
              secondaryCost[i][k * slotsPerServer + m] = AVOID_COST;
              tertiaryCost[i][k * slotsPerServer + m] = AVOID_COST;
            }
          }
        }
      }
      if (munkresForSecondaryAndTertiary) {
        randomizedMatrix = new RandomizedMatrix(numRegions, regionSlots);
        secondaryCost = randomizedMatrix.transform(secondaryCost);
        int[] secondaryAssignment = new MunkresAssignment(secondaryCost).solve();
        secondaryAssignment = randomizedMatrix.invertIndices(secondaryAssignment);

        // Modify the tertiary costs for each region/server pair to ensure that a
        // region is assigned to a tertiary server on the same rack as its secondary
        // server, but not the same server in that rack.
        for (int i = 0; i < numRegions; i++) {
          int slot = secondaryAssignment[i];
          String rack = rackManager.getRack(servers.get(slot / slotsPerServer));
          for (int k = 0; k < servers.size(); k++) {
            if (k == slot / slotsPerServer) {
              // Same node, do not place tertiary here ever.
              for (int m = 0; m < slotsPerServer; m++) {
                tertiaryCost[i][k * slotsPerServer + m] = MAX_COST;
              }
            } else {
              // Other servers on the secondary's rack keep their current cost.
              if (rackManager.getRack(servers.get(k)).equals(rack)) {
                continue;
              }
              // Different rack, do not place tertiary here if possible.
              for (int m = 0; m < slotsPerServer; m++) {
                tertiaryCost[i][k * slotsPerServer + m] = AVOID_COST;
              }
            }
          }
        }

        randomizedMatrix = new RandomizedMatrix(numRegions, regionSlots);
        tertiaryCost = randomizedMatrix.transform(tertiaryCost);
        int[] tertiaryAssignment = new MunkresAssignment(tertiaryCost).solve();
        tertiaryAssignment = randomizedMatrix.invertIndices(tertiaryAssignment);

        for (int i = 0; i < numRegions; i++) {
          // Favored nodes are stored as hostname and port with
          // ServerName.NON_STARTCODE in place of the server's start code.
          List<ServerName> favoredServers =
            new ArrayList<ServerName>(FavoredNodeAssignmentHelper.FAVORED_NODES_NUM);
          ServerName s = servers.get(primaryAssignment[i] / slotsPerServer);
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));

          s = servers.get(secondaryAssignment[i] / slotsPerServer);
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));

          s = servers.get(tertiaryAssignment[i] / slotsPerServer);
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));
          // Update the assignment plan
          plan.updateAssignmentPlan(regions.get(i), favoredServers);
        }
        LOG.info("Generated the assignment plan for " + numRegions +
            " regions from table " + tableName + " with " +
            servers.size() + " region servers");
        LOG.info("Assignment plan for secondary and tertiary generated " +
            "using MunkresAssignment");
      } else {
        // Only the primary comes from the cost matrix here; the secondary and
        // tertiary are chosen by FavoredNodeAssignmentHelper.
        Map<HRegionInfo, ServerName> primaryRSMap = new HashMap<HRegionInfo, ServerName>();
        for (int i = 0; i < numRegions; i++) {
          primaryRSMap.put(regions.get(i), servers.get(primaryAssignment[i] / slotsPerServer));
        }
        FavoredNodeAssignmentHelper favoredNodeHelper =
            new FavoredNodeAssignmentHelper(servers, conf);
        favoredNodeHelper.initialize();
        Map<HRegionInfo, ServerName[]> secondaryAndTertiaryMap =
            favoredNodeHelper.placeSecondaryAndTertiaryWithRestrictions(primaryRSMap);
        for (int i = 0; i < numRegions; i++) {
          List<ServerName> favoredServers =
            new ArrayList<ServerName>(FavoredNodeAssignmentHelper.FAVORED_NODES_NUM);
          HRegionInfo currentRegion = regions.get(i);
          ServerName s = primaryRSMap.get(currentRegion);
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));

          // Index 0 is used as the secondary and index 1 as the tertiary.
          ServerName[] secondaryAndTertiary =
              secondaryAndTertiaryMap.get(currentRegion);
          s = secondaryAndTertiary[0];
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));

          s = secondaryAndTertiary[1];
          favoredServers.add(ServerName.valueOf(s.getHostname(), s.getPort(),
              ServerName.NON_STARTCODE));
          // Update the assignment plan
          plan.updateAssignmentPlan(regions.get(i), favoredServers);
        }
        LOG.info("Generated the assignment plan for " + numRegions +
            " regions from table " + tableName + " with " +
            servers.size() + " region servers");
        LOG.info("Assignment plan for secondary and tertiary generated " +
            "using placeSecondaryAndTertiaryWithRestrictions method");
      }
    }
458 
459   public FavoredNodesPlan getNewAssignmentPlan() throws IOException {
460     // Get the current region assignment snapshot by scanning from the META
461     SnapshotOfRegionAssignmentFromMeta assignmentSnapshot =
462       this.getRegionAssignmentSnapshot();
463 
464     // Get the region locality map
465     Map<String, Map<String, Float>> regionLocalityMap = null;
466     if (this.enforceLocality) {
467       regionLocalityMap = FSUtils.getRegionDegreeLocalityMappingFromFS(conf);
468     }
469     // Initialize the assignment plan
470     FavoredNodesPlan plan = new FavoredNodesPlan();
471 
472     // Get the table to region mapping
473     Map<TableName, List<HRegionInfo>> tableToRegionMap =
474       assignmentSnapshot.getTableToRegionMap();
475     LOG.info("Start to generate the new assignment plan for the " +
476          + tableToRegionMap.keySet().size() + " tables" );
477     for (TableName table : tableToRegionMap.keySet()) {
478       try {
479         if (!this.targetTableSet.isEmpty() &&
480             !this.targetTableSet.contains(table)) {
481           continue;
482         }
483         // TODO: maybe run the placement in parallel for each table
484         genAssignmentPlan(table, assignmentSnapshot, regionLocalityMap, plan,
485             USE_MUNKRES_FOR_PLACING_SECONDARY_AND_TERTIARY);
486       } catch (Exception e) {
487         LOG.error("Get some exceptions for placing primary region server" +
488             "for table " + table + " because " + e);
489       }
490     }
491     LOG.info("Finish to generate the new assignment plan for the " +
492         + tableToRegionMap.keySet().size() + " tables" );
493     return plan;
494   }
495 
496   /**
497    * Some algorithms for solving the assignment problem may traverse workers or
498    * jobs in linear order which may result in skewing the assignments of the
499    * first jobs in the matrix toward the last workers in the matrix if the
500    * costs are uniform. To avoid this kind of clumping, we can randomize the
501    * rows and columns of the cost matrix in a reversible way, such that the
502    * solution to the assignment problem can be interpreted in terms of the
503    * original untransformed cost matrix. Rows and columns are transformed
504    * independently such that the elements contained in any row of the input
505    * matrix are the same as the elements in the corresponding output matrix,
506    * and each row has its elements transformed in the same way. Similarly for
507    * columns.
508    */
509   protected static class RandomizedMatrix {
510     private final int rows;
511     private final int cols;
512     private final int[] rowTransform;
513     private final int[] rowInverse;
514     private final int[] colTransform;
515     private final int[] colInverse;
516 
517     /**
518      * Create a randomization scheme for a matrix of a given size.
519      * @param rows the number of rows in the matrix
520      * @param cols the number of columns in the matrix
521      */
522     public RandomizedMatrix(int rows, int cols) {
523       this.rows = rows;
524       this.cols = cols;
525       Random random = new Random();
526       rowTransform = new int[rows];
527       rowInverse = new int[rows];
528       for (int i = 0; i < rows; i++) {
529         rowTransform[i] = i;
530       }
531       // Shuffle the row indices.
532       for (int i = rows - 1; i >= 0; i--) {
533         int r = random.nextInt(i + 1);
534         int temp = rowTransform[r];
535         rowTransform[r] = rowTransform[i];
536         rowTransform[i] = temp;
537       }
538       // Generate the inverse row indices.
539       for (int i = 0; i < rows; i++) {
540         rowInverse[rowTransform[i]] = i;
541       }
542 
543       colTransform = new int[cols];
544       colInverse = new int[cols];
545       for (int i = 0; i < cols; i++) {
546         colTransform[i] = i;
547       }
548       // Shuffle the column indices.
549       for (int i = cols - 1; i >= 0; i--) {
550         int r = random.nextInt(i + 1);
551         int temp = colTransform[r];
552         colTransform[r] = colTransform[i];
553         colTransform[i] = temp;
554       }
555       // Generate the inverse column indices.
556       for (int i = 0; i < cols; i++) {
557         colInverse[colTransform[i]] = i;
558       }
559     }
560 
561     /**
562      * Copy a given matrix into a new matrix, transforming each row index and
563      * each column index according to the randomization scheme that was created
564      * at construction time.
565      * @param matrix the cost matrix to transform
566      * @return a new matrix with row and column indices transformed
567      */
568     public float[][] transform(float[][] matrix) {
569       float[][] result = new float[rows][cols];
570       for (int i = 0; i < rows; i++) {
571         for (int j = 0; j < cols; j++) {
572           result[rowTransform[i]][colTransform[j]] = matrix[i][j];
573         }
574       }
575       return result;
576     }
577 
578     /**
579      * Copy a given matrix into a new matrix, transforming each row index and
580      * each column index according to the inverse of the randomization scheme
581      * that was created at construction time.
582      * @param matrix the cost matrix to be inverted
583      * @return a new matrix with row and column indices inverted
584      */
585     public float[][] invert(float[][] matrix) {
586       float[][] result = new float[rows][cols];
587       for (int i = 0; i < rows; i++) {
588         for (int j = 0; j < cols; j++) {
589           result[rowInverse[i]][colInverse[j]] = matrix[i][j];
590         }
591       }
592       return result;
593     }
594 
595     /**
596      * Given an array where each element {@code indices[i]} represents the
597      * randomized column index corresponding to randomized row index {@code i},
598      * create a new array with the corresponding inverted indices.
599      * @param indices an array of transformed indices to be inverted
600      * @return an array of inverted indices
601      */
602     public int[] invertIndices(int[] indices) {
603       int[] result = new int[indices.length];
604       for (int i = 0; i < indices.length; i++) {
605         result[rowInverse[i]] = colInverse[indices[i]];
606       }
607       return result;
608     }
609   }
610 
611   /**
612    * Print the assignment plan to the system output stream
613    * @param plan
614    */
615   public static void printAssignmentPlan(FavoredNodesPlan plan) {
616     if (plan == null) return;
617     LOG.info("========== Start to print the assignment plan ================");
618     // sort the map based on region info
619     Map<HRegionInfo, List<ServerName>> assignmentMap =
620       new TreeMap<HRegionInfo, List<ServerName>>(plan.getAssignmentMap());
621     
622     for (Map.Entry<HRegionInfo, List<ServerName>> entry : assignmentMap.entrySet()) {
623       
624       String serverList = FavoredNodeAssignmentHelper.getFavoredNodesAsString(entry.getValue());
625       String regionName = entry.getKey().getRegionNameAsString();
626       LOG.info("Region: " + regionName );
627       LOG.info("Its favored nodes: " + serverList);
628     }
629     LOG.info("========== Finish to print the assignment plan ================");
630   }
631 
632   /**
633    * Update the assignment plan into hbase:meta
634    * @param plan the assignments plan to be updated into hbase:meta
635    * @throws IOException if cannot update assignment plan in hbase:meta
636    */
637   public void updateAssignmentPlanToMeta(FavoredNodesPlan plan)
638   throws IOException {
639     try {
640       LOG.info("Start to update the hbase:meta with the new assignment plan");
641       Map<HRegionInfo, List<ServerName>> assignmentMap =
642         plan.getAssignmentMap();
643       FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(assignmentMap, conf);
644       LOG.info("Updated the hbase:meta with the new assignment plan");
645     } catch (Exception e) {
646       LOG.error("Failed to update hbase:meta with the new assignment" +
647           "plan because " + e.getMessage());
648     }
649   }
650 
651   /**
652    * Update the assignment plan to all the region servers
653    * @param plan
654    * @throws IOException
655    */
656   private void updateAssignmentPlanToRegionServers(FavoredNodesPlan plan)
657   throws IOException{
658     LOG.info("Start to update the region servers with the new assignment plan");
659     // Get the region to region server map
660     Map<ServerName, List<HRegionInfo>> currentAssignment =
661       this.getRegionAssignmentSnapshot().getRegionServerToRegionMap();
662     HConnection connection = this.getHBaseAdmin().getConnection();
663 
664     // track of the failed and succeeded updates
665     int succeededNum = 0;
666     Map<ServerName, Exception> failedUpdateMap =
667       new HashMap<ServerName, Exception>();
668 
669     for (Map.Entry<ServerName, List<HRegionInfo>> entry :
670       currentAssignment.entrySet()) {
671       List<Pair<HRegionInfo, List<ServerName>>> regionUpdateInfos =
672           new ArrayList<Pair<HRegionInfo, List<ServerName>>>();
673       try {
674         // Keep track of the favored updates for the current region server
675         FavoredNodesPlan singleServerPlan = null;
676         // Find out all the updates for the current region server
677         for (HRegionInfo region : entry.getValue()) {
678           List<ServerName> favoredServerList = plan.getFavoredNodes(region);
679           if (favoredServerList != null &&
680               favoredServerList.size() == FavoredNodeAssignmentHelper.FAVORED_NODES_NUM) {
681             // Create the single server plan if necessary
682             if (singleServerPlan == null) {
683               singleServerPlan = new FavoredNodesPlan();
684             }
685             // Update the single server update
686             singleServerPlan.updateAssignmentPlan(region, favoredServerList);
687             regionUpdateInfos.add(
688               new Pair<HRegionInfo, List<ServerName>>(region, favoredServerList));
689           }
690         }
691         if (singleServerPlan != null) {
692           // Update the current region server with its updated favored nodes
693           BlockingInterface currentRegionServer = connection.getAdmin(entry.getKey());
694           UpdateFavoredNodesRequest request =
695               RequestConverter.buildUpdateFavoredNodesRequest(regionUpdateInfos);
696           
697           UpdateFavoredNodesResponse updateFavoredNodesResponse =
698               currentRegionServer.updateFavoredNodes(null, request);
699           LOG.info("Region server " +
700               ProtobufUtil.getServerInfo(currentRegionServer).getServerName() +
701               " has updated " + updateFavoredNodesResponse.getResponse() + " / " +
702               singleServerPlan.getAssignmentMap().size() +
703               " regions with the assignment plan");
704           succeededNum ++;
705         }
706       } catch (Exception e) {
707         failedUpdateMap.put(entry.getKey(), e);
708       }
709     }
710     // log the succeeded updates
711     LOG.info("Updated " + succeededNum + " region servers with " +
712             "the new assignment plan");
713 
714     // log the failed updates
715     int failedNum = failedUpdateMap.size();
716     if (failedNum != 0) {
717       LOG.error("Failed to update the following + " + failedNum +
718           " region servers with its corresponding favored nodes");
719       for (Map.Entry<ServerName, Exception> entry :
720         failedUpdateMap.entrySet() ) {
721         LOG.error("Failed to update " + entry.getKey().getHostAndPort() +
722             " because of " + entry.getValue().getMessage());
723       }
724     }
725   }
726 
727   public void updateAssignmentPlan(FavoredNodesPlan plan)
728       throws IOException {
729     LOG.info("Start to update the new assignment plan for the hbase:meta table and" +
730         " the region servers");
731     // Update the new assignment plan to META
732     updateAssignmentPlanToMeta(plan);
733     // Update the new assignment plan to Region Servers
734     updateAssignmentPlanToRegionServers(plan);
735     LOG.info("Finish to update the new assignment plan for the hbase:meta table and" +
736         " the region servers");
737   }
738 
739   /**
740    * Return how many regions will move per table since their primary RS will
741    * change
742    *
743    * @param newPlan - new AssignmentPlan
744    * @return how many primaries will move per table
745    */
746   public Map<TableName, Integer> getRegionsMovement(FavoredNodesPlan newPlan)
747       throws IOException {
748     Map<TableName, Integer> movesPerTable = new HashMap<TableName, Integer>();
749     SnapshotOfRegionAssignmentFromMeta snapshot = this.getRegionAssignmentSnapshot();
750     Map<TableName, List<HRegionInfo>> tableToRegions = snapshot
751         .getTableToRegionMap();
752     FavoredNodesPlan oldPlan = snapshot.getExistingAssignmentPlan();
753     Set<TableName> tables = snapshot.getTableSet();
754     for (TableName table : tables) {
755       int movedPrimaries = 0;
756       if (!this.targetTableSet.isEmpty()
757           && !this.targetTableSet.contains(table)) {
758         continue;
759       }
760       List<HRegionInfo> regions = tableToRegions.get(table);
761       for (HRegionInfo region : regions) {
762         List<ServerName> oldServers = oldPlan.getFavoredNodes(region);
763         List<ServerName> newServers = newPlan.getFavoredNodes(region);
764         if (oldServers != null && newServers != null) {
765           ServerName oldPrimary = oldServers.get(0);
766           ServerName newPrimary = newServers.get(0);
767           if (oldPrimary.compareTo(newPrimary) != 0) {
768             movedPrimaries++;
769           }
770         }
771       }
772       movesPerTable.put(table, movedPrimaries);
773     }
774     return movesPerTable;
775   }
776 
777   /**
778    * Compares two plans and check whether the locality dropped or increased
779    * (prints the information as a string) also prints the baseline locality
780    *
781    * @param movesPerTable - how many primary regions will move per table
782    * @param regionLocalityMap - locality map from FS
783    * @param newPlan - new assignment plan
784    * @throws IOException
785    */
786   public void checkDifferencesWithOldPlan(Map<TableName, Integer> movesPerTable,
787       Map<String, Map<String, Float>> regionLocalityMap, FavoredNodesPlan newPlan)
788           throws IOException {
789     // localities for primary, secondary and tertiary
790     SnapshotOfRegionAssignmentFromMeta snapshot = this.getRegionAssignmentSnapshot();
791     FavoredNodesPlan oldPlan = snapshot.getExistingAssignmentPlan();
792     Set<TableName> tables = snapshot.getTableSet();
793     Map<TableName, List<HRegionInfo>> tableToRegionsMap = snapshot.getTableToRegionMap();
794     for (TableName table : tables) {
795       float[] deltaLocality = new float[3];
796       float[] locality = new float[3];
797       if (!this.targetTableSet.isEmpty()
798           && !this.targetTableSet.contains(table)) {
799         continue;
800       }
801       List<HRegionInfo> regions = tableToRegionsMap.get(table);
802       System.out.println("==================================================");
803       System.out.println("Assignment Plan Projection Report For Table: " + table);
804       System.out.println("\t Total regions: " + regions.size());
805       System.out.println("\t" + movesPerTable.get(table)
806           + " primaries will move due to their primary has changed");
807       for (HRegionInfo currentRegion : regions) {
808         Map<String, Float> regionLocality = regionLocalityMap.get(currentRegion
809             .getEncodedName());
810         if (regionLocality == null) {
811           continue;
812         }
813         List<ServerName> oldServers = oldPlan.getFavoredNodes(currentRegion);
814         List<ServerName> newServers = newPlan.getFavoredNodes(currentRegion);
815         if (newServers != null && oldServers != null) {
816           int i=0;
817           for (FavoredNodesPlan.Position p : FavoredNodesPlan.Position.values()) {
818             ServerName newServer = newServers.get(p.ordinal());
819             ServerName oldServer = oldServers.get(p.ordinal());
820             Float oldLocality = 0f;
821             if (oldServers != null) {
822               oldLocality = regionLocality.get(oldServer.getHostname());
823               if (oldLocality == null) {
824                 oldLocality = 0f;
825               }
826               locality[i] += oldLocality;
827             }
828             Float newLocality = regionLocality.get(newServer.getHostname());
829             if (newLocality == null) {
830               newLocality = 0f;
831             }
832             deltaLocality[i] += newLocality - oldLocality;
833             i++;
834           }
835         }
836       }
837       DecimalFormat df = new java.text.DecimalFormat( "#.##");
838       for (int i = 0; i < deltaLocality.length; i++) {
839         System.out.print("\t\t Baseline locality for ");
840         if (i == 0) {
841           System.out.print("primary ");
842         } else if (i == 1) {
843           System.out.print("secondary ");
844         } else if (i == 2) {
845           System.out.print("tertiary ");
846         }
847         System.out.println(df.format(100 * locality[i] / regions.size()) + "%");
848         System.out.print("\t\t Locality will change with the new plan: ");
849         System.out.println(df.format(100 * deltaLocality[i] / regions.size())
850             + "%");
851       }
852       System.out.println("\t Baseline dispersion");
853       printDispersionScores(table, snapshot, regions.size(), null, true);
854       System.out.println("\t Projected dispersion");
855       printDispersionScores(table, snapshot, regions.size(), newPlan, true);
856     }
857   }
858 
859   public void printDispersionScores(TableName table,
860       SnapshotOfRegionAssignmentFromMeta snapshot, int numRegions, FavoredNodesPlan newPlan,
861       boolean simplePrint) {
862     if (!this.targetTableSet.isEmpty() && !this.targetTableSet.contains(table)) {
863       return;
864     }
865     AssignmentVerificationReport report = new AssignmentVerificationReport();
866     report.fillUpDispersion(table, snapshot, newPlan);
867     List<Float> dispersion = report.getDispersionInformation();
868     if (simplePrint) {
869       DecimalFormat df = new java.text.DecimalFormat("#.##");
870       System.out.println("\tAvg dispersion score: "
871           + df.format(dispersion.get(0)) + " hosts;\tMax dispersion score: "
872           + df.format(dispersion.get(1)) + " hosts;\tMin dispersion score: "
873           + df.format(dispersion.get(2)) + " hosts;");
874     } else {
875       LOG.info("For Table: " + table + " ; #Total Regions: " + numRegions
876           + " ; The average dispersion score is " + dispersion.get(0));
877     }
878   }
879 
880   public void printLocalityAndDispersionForCurrentPlan(
881       Map<String, Map<String, Float>> regionLocalityMap) throws IOException {
882     SnapshotOfRegionAssignmentFromMeta snapshot = this.getRegionAssignmentSnapshot();
883     FavoredNodesPlan assignmentPlan = snapshot.getExistingAssignmentPlan();
884     Set<TableName> tables = snapshot.getTableSet();
885     Map<TableName, List<HRegionInfo>> tableToRegionsMap = snapshot
886         .getTableToRegionMap();
887     for (TableName table : tables) {
888       float[] locality = new float[3];
889       if (!this.targetTableSet.isEmpty()
890           && !this.targetTableSet.contains(table)) {
891         continue;
892       }
893       List<HRegionInfo> regions = tableToRegionsMap.get(table);
894       for (HRegionInfo currentRegion : regions) {
895         Map<String, Float> regionLocality = regionLocalityMap.get(currentRegion
896             .getEncodedName());
897         if (regionLocality == null) {
898           continue;
899         }
900         List<ServerName> servers = assignmentPlan.getFavoredNodes(currentRegion);
901         if (servers != null) {
902           int i = 0;
903           for (FavoredNodesPlan.Position p : FavoredNodesPlan.Position.values()) {
904             ServerName server = servers.get(p.ordinal());
905             Float currentLocality = 0f;
906             if (servers != null) {
907               currentLocality = regionLocality.get(server.getHostname());
908               if (currentLocality == null) {
909                 currentLocality = 0f;
910               }
911               locality[i] += currentLocality;
912             }
913             i++;
914           }
915         }
916       }
917       for (int i = 0; i < locality.length; i++) {
918         String copy =  null;
919         if (i == 0) {
920           copy = "primary";
921         } else if (i == 1) {
922           copy = "secondary";
923         } else if (i == 2) {
924           copy = "tertiary" ;
925         }
926         float avgLocality = 100 * locality[i] / regions.size();
927         LOG.info("For Table: " + table + " ; #Total Regions: " + regions.size()
928             + " ; The average locality for " + copy+ " is " + avgLocality + " %");
929       }
930       printDispersionScores(table, snapshot, regions.size(), null, false);
931     }
932   }
933 
934   /**
935    * @param favoredNodesStr The String of favored nodes
936    * @return the list of ServerName for the byte array of favored nodes.
937    */
938   public static List<ServerName> getFavoredNodeList(String favoredNodesStr) {
939     String[] favoredNodesArray = StringUtils.split(favoredNodesStr, ",");
940     if (favoredNodesArray == null)
941       return null;
942 
943     List<ServerName> serverList = new ArrayList<ServerName>();
944     for (String hostNameAndPort : favoredNodesArray) {
945       serverList.add(ServerName.valueOf(hostNameAndPort, ServerName.NON_STARTCODE));
946     }
947     return serverList;
948   }
949 
950   public static void main(String args[]) throws IOException {
951     Options opt = new Options();
952     opt.addOption("w", "write", false, "write the assignments to hbase:meta only");
953     opt.addOption("u", "update", false,
954         "update the assignments to hbase:meta and RegionServers together");
955     opt.addOption("n", "dry-run", false, "do not write assignments to META");
956     opt.addOption("v", "verify", false, "verify current assignments against META");
957     opt.addOption("p", "print", false, "print the current assignment plan in META");
958     opt.addOption("h", "help", false, "print usage");
959     opt.addOption("d", "verification-details", false,
960         "print the details of verification report");
961 
962     opt.addOption("zk", true, "to set the zookeeper quorum");
963     opt.addOption("fs", true, "to set HDFS");
964     opt.addOption("hbase_root", true, "to set hbase_root directory");
965 
966     opt.addOption("overwrite", false,
967         "overwrite the favored nodes for a single region," +
968         "for example: -update -r regionName -f server1:port,server2:port,server3:port");
969     opt.addOption("r", true, "The region name that needs to be updated");
970     opt.addOption("f", true, "The new favored nodes");
971 
972     opt.addOption("tables", true,
973         "The list of table names splitted by ',' ;" +
974         "For example: -tables: t1,t2,...,tn");
975     opt.addOption("l", "locality", true, "enforce the maxium locality");
976     opt.addOption("m", "min-move", true, "enforce minium assignment move");
977     opt.addOption("diff", false, "calculate difference between assignment plans");
978     opt.addOption("munkres", false,
979         "use munkres to place secondaries and tertiaries");
980     opt.addOption("ld", "locality-dispersion", false, "print locality and dispersion " +
981     		"information for current plan");
982     try {
983       // Set the log4j
984       Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR);
985       Logger.getLogger("org.apache.hadoop.hbase").setLevel(Level.ERROR);
986       Logger.getLogger("org.apache.hadoop.hbase.master.RegionPlacementMaintainer")
987       .setLevel(Level.INFO);
988 
989       CommandLine cmd = new GnuParser().parse(opt, args);
990       Configuration conf = HBaseConfiguration.create();
991 
992       boolean enforceMinAssignmentMove = true;
993       boolean enforceLocality = true;
994       boolean verificationDetails = false;
995 
996       // Read all the options
997       if ((cmd.hasOption("l") &&
998           cmd.getOptionValue("l").equalsIgnoreCase("false")) ||
999           (cmd.hasOption("locality") &&
1000               cmd.getOptionValue("locality").equalsIgnoreCase("false"))) {
1001         enforceLocality = false;
1002       }
1003 
1004       if ((cmd.hasOption("m") &&
1005           cmd.getOptionValue("m").equalsIgnoreCase("false")) ||
1006           (cmd.hasOption("min-move") &&
1007               cmd.getOptionValue("min-move").equalsIgnoreCase("false"))) {
1008         enforceMinAssignmentMove = false;
1009       }
1010 
1011       if (cmd.hasOption("zk")) {
1012         conf.set(HConstants.ZOOKEEPER_QUORUM, cmd.getOptionValue("zk"));
1013         LOG.info("Setting the zk quorum: " + conf.get(HConstants.ZOOKEEPER_QUORUM));
1014       }
1015 
1016       if (cmd.hasOption("fs")) {
1017         conf.set(FileSystem.FS_DEFAULT_NAME_KEY, cmd.getOptionValue("fs"));
1018         LOG.info("Setting the HDFS: " + conf.get(FileSystem.FS_DEFAULT_NAME_KEY));
1019       }
1020 
1021       if (cmd.hasOption("hbase_root")) {
1022         conf.set(HConstants.HBASE_DIR, cmd.getOptionValue("hbase_root"));
1023         LOG.info("Setting the hbase root directory: " + conf.get(HConstants.HBASE_DIR));
1024       }
1025 
1026       // Create the region placement obj
1027       RegionPlacementMaintainer rp = new RegionPlacementMaintainer(conf, enforceLocality,
1028           enforceMinAssignmentMove);
1029 
1030       if (cmd.hasOption("d") || cmd.hasOption("verification-details")) {
1031         verificationDetails = true;
1032       }
1033 
1034       if (cmd.hasOption("tables")) {
1035         String tableNameListStr = cmd.getOptionValue("tables");
1036         String[] tableNames = StringUtils.split(tableNameListStr, ",");
1037         rp.setTargetTableName(tableNames);
1038       }
1039 
1040       if (cmd.hasOption("munkres")) {
1041         USE_MUNKRES_FOR_PLACING_SECONDARY_AND_TERTIARY = true;
1042       }
1043 
1044       // Read all the modes
1045       if (cmd.hasOption("v") || cmd.hasOption("verify")) {
1046         // Verify the region placement.
1047         rp.verifyRegionPlacement(verificationDetails);
1048       } else if (cmd.hasOption("n") || cmd.hasOption("dry-run")) {
1049         // Generate the assignment plan only without updating the hbase:meta and RS
1050         FavoredNodesPlan plan = rp.getNewAssignmentPlan();
1051         printAssignmentPlan(plan);
1052       } else if (cmd.hasOption("w") || cmd.hasOption("write")) {
1053         // Generate the new assignment plan
1054         FavoredNodesPlan plan = rp.getNewAssignmentPlan();
1055         // Print the new assignment plan
1056         printAssignmentPlan(plan);
1057         // Write the new assignment plan to META
1058         rp.updateAssignmentPlanToMeta(plan);
1059       } else if (cmd.hasOption("u") || cmd.hasOption("update")) {
1060         // Generate the new assignment plan
1061         FavoredNodesPlan plan = rp.getNewAssignmentPlan();
1062         // Print the new assignment plan
1063         printAssignmentPlan(plan);
1064         // Update the assignment to hbase:meta and Region Servers
1065         rp.updateAssignmentPlan(plan);
1066       } else if (cmd.hasOption("diff")) {
1067         FavoredNodesPlan newPlan = rp.getNewAssignmentPlan();
1068         Map<String, Map<String, Float>> locality = FSUtils
1069             .getRegionDegreeLocalityMappingFromFS(conf);
1070         Map<TableName, Integer> movesPerTable = rp.getRegionsMovement(newPlan);
1071         rp.checkDifferencesWithOldPlan(movesPerTable, locality, newPlan);
1072         System.out.println("Do you want to update the assignment plan? [y/n]");
1073         Scanner s = new Scanner(System.in);
1074         String input = s.nextLine().trim();
1075         if (input.equals("y")) {
1076           System.out.println("Updating assignment plan...");
1077           rp.updateAssignmentPlan(newPlan);
1078         }
1079         s.close();
1080       } else if (cmd.hasOption("ld")) {
1081         Map<String, Map<String, Float>> locality = FSUtils
1082             .getRegionDegreeLocalityMappingFromFS(conf);
1083         rp.printLocalityAndDispersionForCurrentPlan(locality);
1084       } else if (cmd.hasOption("p") || cmd.hasOption("print")) {
1085         FavoredNodesPlan plan = rp.getRegionAssignmentSnapshot().getExistingAssignmentPlan();
1086         printAssignmentPlan(plan);
1087       } else if (cmd.hasOption("overwrite")) {
1088         if (!cmd.hasOption("f") || !cmd.hasOption("r")) {
1089           throw new IllegalArgumentException("Please specify: " +
1090               " -update -r regionName -f server1:port,server2:port,server3:port");
1091         }
1092 
1093         String regionName = cmd.getOptionValue("r");
1094         String favoredNodesStr = cmd.getOptionValue("f");
1095         LOG.info("Going to update the region " + regionName + " with the new favored nodes " +
1096             favoredNodesStr);
1097         List<ServerName> favoredNodes = null;
1098         HRegionInfo regionInfo =
1099             rp.getRegionAssignmentSnapshot().getRegionNameToRegionInfoMap().get(regionName);
1100         if (regionInfo == null) {
1101           LOG.error("Cannot find the region " + regionName + " from the META");
1102         } else {
1103           try {
1104             favoredNodes = getFavoredNodeList(favoredNodesStr);
1105           } catch (IllegalArgumentException e) {
1106             LOG.error("Cannot parse the invalid favored nodes because " + e);
1107           }
1108           FavoredNodesPlan newPlan = new FavoredNodesPlan();
1109           newPlan.updateAssignmentPlan(regionInfo, favoredNodes);
1110           rp.updateAssignmentPlan(newPlan);
1111         }
1112       } else {
1113         printHelp(opt);
1114       }
1115     } catch (ParseException e) {
1116       printHelp(opt);
1117     }
1118   }
1119 }