View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayDeque;
21  import java.util.Collection;
22  import java.util.Deque;
23  import java.util.HashMap;
24  import java.util.LinkedList;
25  import java.util.List;
26  import java.util.Map;
27  import java.util.Map.Entry;
28  import java.util.Random;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
33  import org.apache.hadoop.hbase.classification.InterfaceAudience;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.ClusterStatus;
36  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.RegionLoad;
39  import org.apache.hadoop.hbase.ServerLoad;
40  import org.apache.hadoop.hbase.ServerName;
41  import org.apache.hadoop.hbase.master.MasterServices;
42  import org.apache.hadoop.hbase.master.RegionPlan;
43  import org.apache.hadoop.hbase.util.Bytes;
44  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
45  import org.apache.hadoop.hbase.util.Pair;
46  
47  /**
48   * <p>This is a best effort load balancer. Given a Cost function F(C) => x It will
49   * randomly try and mutate the cluster to Cprime. If F(Cprime) < F(C) then the
50   * new cluster state becomes the plan. It includes costs functions to compute the cost of:</p>
51   * <ul>
52   * <li>Region Load</li>
53   * <li>Table Load</li>
54   * <li>Data Locality</li>
55   * <li>Memstore Sizes</li>
56   * <li>Storefile Sizes</li>
57   * </ul>
58   *
59   *
60   * <p>Every cost function returns a number between 0 and 1 inclusive; where 0 is the lowest cost
61   * best solution, and 1 is the highest possible cost and the worst solution.  The computed costs are
62   * scaled by their respective multipliers:</p>
63   *
64   * <ul>
65   *   <li>hbase.master.balancer.stochastic.regionCountCost</li>
66   *   <li>hbase.master.balancer.stochastic.moveCost</li>
67   *   <li>hbase.master.balancer.stochastic.tableSkewCost</li>
68   *   <li>hbase.master.balancer.stochastic.localityCost</li>
69   *   <li>hbase.master.balancer.stochastic.memstoreSizeCost</li>
70   *   <li>hbase.master.balancer.stochastic.storefileSizeCost</li>
71   * </ul>
72   *
73   * <p>In addition to the above configurations, the balancer can be tuned by the following
74   * configuration values:</p>
75   * <ul>
76   *   <li>hbase.master.balancer.stochastic.maxMovePercent which
77   *   controls the maximum fraction of regions that can be moved in a single invocation of this
78   *   balancer.</li>
79   *   <li>hbase.master.balancer.stochastic.stepsPerRegion is the coefficient by which the number of
80   *   regions is multiplied to try and get the number of times the balancer will
81   *   mutate all servers.</li>
82   *   <li>hbase.master.balancer.stochastic.maxSteps which controls the maximum number of times that
83   *   the balancer will try and mutate all the servers. The balancer will use the minimum of this
84   *   value and the above computation.</li>
85   * </ul>
86   *
87   * <p>This balancer is best used with hbase.master.loadbalance.bytable set to false
88   * so that the balancer gets the full picture of all loads on the cluster.</p>
89   */
90  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
91  public class StochasticLoadBalancer extends BaseLoadBalancer {
92  
  // Configuration keys read in setConf(Configuration) to tune the random walk.
  private static final String STEPS_PER_REGION_KEY =
      "hbase.master.balancer.stochastic.stepsPerRegion";
  private static final String MAX_STEPS_KEY =
      "hbase.master.balancer.stochastic.maxSteps";
  private static final String MAX_RUNNING_TIME_KEY =
      "hbase.master.balancer.stochastic.maxRunningTime";
  private static final String KEEP_REGION_LOADS =
      "hbase.master.balancer.stochastic.numRegionLoadsToRemember";

  // Shared random source: selects a picker on every step and drives the pickers' own choices.
  private static final Random RANDOM = new Random(System.currentTimeMillis());
  private static final Log LOG = LogFactory.getLog(StochasticLoadBalancer.class);

  private final RegionLocationFinder regionFinder = new RegionLocationFinder();
  // Last status handed to setClusterStatus(ClusterStatus); read by updateRegionLoad().
  private ClusterStatus clusterStatus = null;
  // Recent RegionLoad samples per region, keyed by Bytes.toString() of the region-load map key.
  // Replaced with a fresh map on every updateRegionLoad() call.
  Map<String, Deque<RegionLoad>> loads = new HashMap<String, Deque<RegionLoad>>();

  // Defaults below are used until setConf(Configuration) overrides them.
  // Upper bound on the number of walk steps per balanceCluster call.
  private int maxSteps = 1000000;
  // Multiplied by region count and server count to size the walk (capped by maxSteps).
  private int stepsPerRegion = 800;
  // Wall-clock budget for one balanceCluster call, in milliseconds.
  private long maxRunningTime = 30 * 1000 * 1; // 30 seconds.
  // Once a region's sample queue reaches this size, the oldest sample is dropped before adding.
  private int numRegionLoadsToRemember = 15;

  // Candidate generators; one is chosen at random on every walk step.
  private RegionPicker[] pickers;
  // The subset of cost functions that need cluster status and region load updates.
  private CostFromRegionLoadFunction[] regionLoadFunctions;
  // Every cost function summed by computeCost.
  private CostFunction[] costFunctions;
  // Keep locality based picker and cost function so that setMasterServices can
  // hand them the master services when those are (re)assigned.
  private LocalityBasedPicker localityPicker;
  private LocalityCostFunction localityCost;
122 
123   @Override
124   public void setConf(Configuration conf) {
125     super.setConf(conf);
126 
127     regionFinder.setConf(conf);
128 
129     maxSteps = conf.getInt(MAX_STEPS_KEY, maxSteps);
130 
131     stepsPerRegion = conf.getInt(STEPS_PER_REGION_KEY, stepsPerRegion);
132     maxRunningTime = conf.getLong(MAX_RUNNING_TIME_KEY, maxRunningTime);
133 
134     numRegionLoadsToRemember = conf.getInt(KEEP_REGION_LOADS, numRegionLoadsToRemember);
135 
136     localityPicker = new LocalityBasedPicker(services);
137     localityCost = new LocalityCostFunction(conf, services);
138 
139     pickers = new RegionPicker[] {
140       new RandomRegionPicker(),
141       new LoadPicker(),
142       localityPicker
143     };
144 
145     regionLoadFunctions = new CostFromRegionLoadFunction[] {
146       new ReadRequestCostFunction(conf),
147       new WriteRequestCostFunction(conf),
148       new MemstoreSizeCostFunction(conf),
149       new StoreFileCostFunction(conf)
150     };
151 
152     costFunctions = new CostFunction[]{
153       new RegionCountSkewCostFunction(conf),
154       new MoveCostFunction(conf),
155       localityCost,
156       new TableSkewCostFunction(conf),
157       regionLoadFunctions[0],
158       regionLoadFunctions[1],
159       regionLoadFunctions[2],
160       regionLoadFunctions[3],
161     };
162   }
163 
164   @Override
165   protected void setSlop(Configuration conf) {
166     this.slop = conf.getFloat("hbase.regions.slop", 0.001F);
167   }
168 
169   @Override
170   public void setClusterStatus(ClusterStatus st) {
171     super.setClusterStatus(st);
172     regionFinder.setClusterStatus(st);
173     this.clusterStatus = st;
174     updateRegionLoad();
175     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
176       cost.setClusterStatus(st);
177     }
178   }
179 
180   @Override
181   public void setMasterServices(MasterServices masterServices) {
182     super.setMasterServices(masterServices);
183     this.regionFinder.setServices(masterServices);
184     this.localityCost.setServices(masterServices);
185     this.localityPicker.setServices(masterServices);
186 
187   }
188 
189   /**
190    * Given the cluster state this will try and approach an optimal balance. This
191    * should always approach the optimal state given enough steps.
192    */
193   @Override
194   public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterState) {
195     if (!needsBalance(new ClusterLoadState(clusterState))) {
196       return null;
197     }
198 
199     long startTime = EnvironmentEdgeManager.currentTimeMillis();
200 
201     // On clusters with lots of HFileLinks or lots of reference files,
202     // instantiating the storefile infos can be quite expensive.
203     // Allow turning this feature off if the locality cost is not going to
204     // be used in any computations.
205     RegionLocationFinder finder = null;
206     if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
207       finder = this.regionFinder;
208     }
209     
210     // Keep track of servers to iterate through them.
211     Cluster cluster = new Cluster(clusterState, loads, finder);
212     double currentCost = computeCost(cluster, Double.MAX_VALUE);
213 
214     double initCost = currentCost;
215     double newCost = currentCost;
216 
217     long computedMaxSteps = Math.min(this.maxSteps,
218         ((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
219     // Perform a stochastic walk to see if we can get a good fit.
220     long step;
221     for (step = 0; step < computedMaxSteps; step++) {
222       int pickerIdx = RANDOM.nextInt(pickers.length);
223       RegionPicker p = pickers[pickerIdx];
224       Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> picks = p.pick(cluster);
225 
226       int leftServer = picks.getFirst().getFirst();
227       int leftRegion = picks.getFirst().getSecond();
228       int rightServer = picks.getSecond().getFirst();
229       int rightRegion = picks.getSecond().getSecond();
230 
231       // We couldn't find a server
232       if (rightServer < 0 || leftServer < 0) {
233         continue;
234       }
235 
236       // We randomly picked to do nothing.
237       if (leftRegion < 0 && rightRegion < 0) {
238         continue;
239       }
240 
241       cluster.moveOrSwapRegion(leftServer,
242           rightServer,
243           leftRegion,
244           rightRegion);
245 
246       newCost = computeCost(cluster, currentCost);
247       // Should this be kept?
248       if (newCost < currentCost) {
249         currentCost = newCost;
250       } else {
251         // Put things back the way they were before.
252         // TODO: undo by remembering old values, using an UndoAction class
253         cluster.moveOrSwapRegion(leftServer,
254             rightServer,
255             rightRegion,
256             leftRegion);
257       }
258 
259       if (EnvironmentEdgeManager.currentTimeMillis() - startTime >
260           maxRunningTime) {
261         break;
262       }
263     }
264 
265     long endTime = EnvironmentEdgeManager.currentTimeMillis();
266 
267     metricsBalancer.balanceCluster(endTime - startTime);
268 
269     if (initCost > currentCost) {
270       List<RegionPlan> plans = createRegionPlans(cluster);
271       if (LOG.isDebugEnabled()) {
272         LOG.debug("Finished computing new load balance plan.  Computation took "
273             + (endTime - startTime) + "ms to try " + step
274             + " different iterations.  Found a solution that moves "
275             + plans.size() + " regions; Going from a computed cost of "
276             + initCost + " to a new cost of " + currentCost);
277       }
278       return plans;
279     }
280     if (LOG.isDebugEnabled()) {
281       LOG.debug("Could not find a better load balance plan.  Tried "
282           + step + " different configurations in " + (endTime - startTime)
283           + "ms, and did not find anything with a computed cost less than " + initCost);
284     }
285     return null;
286   }
287 
288   /**
289    * Create all of the RegionPlan's needed to move from the initial cluster state to the desired
290    * state.
291    *
292    * @param cluster The state of the cluster
293    * @return List of RegionPlan's that represent the moves needed to get to desired final state.
294    */
295   private List<RegionPlan> createRegionPlans(Cluster cluster) {
296     List<RegionPlan> plans = new LinkedList<RegionPlan>();
297     for (int regionIndex = 0;
298          regionIndex < cluster.regionIndexToServerIndex.length; regionIndex++) {
299       int initialServerIndex = cluster.initialRegionIndexToServerIndex[regionIndex];
300       int newServerIndex = cluster.regionIndexToServerIndex[regionIndex];
301 
302       if (initialServerIndex != newServerIndex) {
303         HRegionInfo region = cluster.regions[regionIndex];
304         ServerName initialServer = cluster.servers[initialServerIndex];
305         ServerName newServer = cluster.servers[newServerIndex];
306 
307         if (LOG.isTraceEnabled()) {
308           LOG.trace("Moving Region " + region.getEncodedName() + " from server "
309               + initialServer.getHostname() + " to " + newServer.getHostname());
310         }
311         RegionPlan rp = new RegionPlan(region, initialServer, newServer);
312         plans.add(rp);
313       }
314     }
315     return plans;
316   }
317 
318   /**
319    * Store the current region loads.
320    */
321   private synchronized void updateRegionLoad() {
322     // We create a new hashmap so that regions that are no longer there are removed.
323     // However we temporarily need the old loads so we can use them to keep the rolling average.
324     Map<String, Deque<RegionLoad>> oldLoads = loads;
325     loads = new HashMap<String, Deque<RegionLoad>>();
326 
327     for (ServerName sn : clusterStatus.getServers()) {
328       ServerLoad sl = clusterStatus.getLoad(sn);
329       if (sl == null) {
330         continue;
331       }
332       for (Entry<byte[], RegionLoad> entry : sl.getRegionsLoad().entrySet()) {
333         Deque<RegionLoad> rLoads = oldLoads.get(Bytes.toString(entry.getKey()));
334         if (rLoads == null) {
335           // There was nothing there
336           rLoads = new ArrayDeque<RegionLoad>();
337         } else if (rLoads.size() >= numRegionLoadsToRemember) {
338           rLoads.remove();
339         }
340         rLoads.add(entry.getValue());
341         loads.put(Bytes.toString(entry.getKey()), rLoads);
342 
343       }
344     }
345 
346     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
347       cost.setLoads(loads);
348     }
349   }
350 
351 
352   /**
353    * This is the main cost function.  It will compute a cost associated with a proposed cluster
354    * state.  All different costs will be combined with their multipliers to produce a double cost.
355    *
356    * @param cluster The state of the cluster
357    * @param previousCost the previous cost. This is used as an early out.
358    * @return a double of a cost associated with the proposed cluster state.  This cost is an
359    *         aggregate of all individual cost functions.
360    */
361   protected double computeCost(Cluster cluster, double previousCost) {
362     double total = 0;
363 
364     for (CostFunction c:costFunctions) {
365       if (c.getMultiplier() <= 0) {
366         continue;
367       }
368 
369       total += c.getMultiplier() * c.cost(cluster);
370 
371       if (total > previousCost) {
372         return total;
373       }
374     }
375     return total;
376   }
377 
378   abstract static class RegionPicker {
379     abstract Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster);
380 
381     /**
382      * From a list of regions pick a random one. Null can be returned which
383      * {@link StochasticLoadBalancer#balanceCluster(Map)} recognize as signal to try a region move
384      * rather than swap.
385      *
386      * @param cluster        The state of the cluster
387      * @param server         index of the server
388      * @param chanceOfNoSwap Chance that this will decide to try a move rather
389      *                       than a swap.
390      * @return a random {@link HRegionInfo} or null if an asymmetrical move is
391      *         suggested.
392      */
393     protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
394       // Check to see if this is just a move.
395       if (cluster.regionsPerServer[server].length == 0 || RANDOM.nextFloat() < chanceOfNoSwap) {
396         // signal a move only.
397         return -1;
398       }
399       int rand = RANDOM.nextInt(cluster.regionsPerServer[server].length);
400       return cluster.regionsPerServer[server][rand];
401 
402     }
403     protected int pickRandomServer(Cluster cluster) {
404       if (cluster.numServers < 1) {
405         return -1;
406       }
407 
408       return RANDOM.nextInt(cluster.numServers);
409     }
410     protected int pickOtherRandomServer(Cluster cluster, int serverIndex) {
411       if (cluster.numServers < 2) {
412         return -1;
413       }
414       while (true) {
415         int otherServerIndex = pickRandomServer(cluster);
416         if (otherServerIndex != serverIndex) {
417           return otherServerIndex;
418         }
419       }
420     }
421 
422     protected Pair<Integer, Integer> pickRandomRegions(Cluster cluster,
423                                                        int thisServer,
424                                                        int otherServer) {
425       if (thisServer < 0 || otherServer < 0) {
426         return new Pair<Integer, Integer>(-1, -1);
427       }
428 
429       // Decide who is most likely to need another region
430       int thisRegionCount = cluster.getNumRegions(thisServer);
431       int otherRegionCount = cluster.getNumRegions(otherServer);
432 
433       // Assign the chance based upon the above
434       double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
435       double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
436 
437       int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
438       int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
439 
440       return new Pair<Integer, Integer>(thisRegion, otherRegion);
441     }
442   }
443 
444   static class RandomRegionPicker extends RegionPicker {
445 
446     @Override
447     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
448 
449       int thisServer = pickRandomServer(cluster);
450 
451       // Pick the other server
452       int otherServer = pickOtherRandomServer(cluster, thisServer);
453 
454       Pair<Integer, Integer> regions = pickRandomRegions(cluster, thisServer, otherServer);
455 
456       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
457           new Pair<Integer, Integer>(thisServer, regions.getFirst()),
458           new Pair<Integer, Integer>(otherServer, regions.getSecond())
459 
460       );
461     }
462 
463   }
464 
465   public static class LoadPicker extends RegionPicker {
466 
467     @Override
468     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
469       cluster.sortServersByRegionCount();
470       int thisServer = pickMostLoadedServer(cluster, -1);
471       int otherServer = pickLeastLoadedServer(cluster, thisServer);
472 
473       Pair<Integer, Integer> regions = pickRandomRegions(cluster, thisServer, otherServer);
474       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
475           new Pair<Integer, Integer>(thisServer, regions.getFirst()),
476           new Pair<Integer, Integer>(otherServer, regions.getSecond())
477 
478       );
479     }
480 
481     private int pickLeastLoadedServer(final Cluster cluster, int thisServer) {
482       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
483 
484       int index = 0;
485       while (servers[index] == null || servers[index] == thisServer) {
486         index++;
487         if (index == servers.length) {
488           return -1;
489         }
490       }
491       return servers[index];
492     }
493 
494     private int pickMostLoadedServer(final Cluster cluster, int thisServer) {
495       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
496 
497       int index = servers.length - 1;
498       while (servers[index] == null || servers[index] == thisServer) {
499         index--;
500         if (index < 0) {
501           return -1;
502         }
503       }
504       return servers[index];
505     }
506   }
507 
508   static class LocalityBasedPicker extends RegionPicker {
509 
510     private MasterServices masterServices;
511 
512     LocalityBasedPicker(MasterServices masterServices) {
513       this.masterServices = masterServices;
514     }
515 
516     @Override
517     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
518       if (this.masterServices == null) {
519         return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
520             new Pair<Integer, Integer>(-1,-1),
521             new Pair<Integer, Integer>(-1,-1)
522         );
523       }
524       // Pick a random region server
525       int thisServer = pickRandomServer(cluster);
526 
527       // Pick a random region on this server
528       int thisRegion = pickRandomRegion(cluster, thisServer, 0.0f);
529 
530       if (thisRegion == -1) {
531         return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
532             new Pair<Integer, Integer>(-1,-1),
533             new Pair<Integer, Integer>(-1,-1)
534         );
535       }
536 
537       // Pick the server with the highest locality
538       int otherServer = pickHighestLocalityServer(cluster, thisServer, thisRegion);
539 
540       // pick an region on the other server to potentially swap
541       int otherRegion = this.pickRandomRegion(cluster, otherServer, 0.5f);
542 
543       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
544           new Pair<Integer, Integer>(thisServer,thisRegion),
545           new Pair<Integer, Integer>(otherServer,otherRegion)
546       );
547     }
548 
549     private int pickHighestLocalityServer(Cluster cluster, int thisServer, int thisRegion) {
550       int[] regionLocations = cluster.regionLocations[thisRegion];
551 
552       if (regionLocations == null || regionLocations.length <= 1) {
553         return pickOtherRandomServer(cluster, thisServer);
554       }
555 
556       for (int loc : regionLocations) {
557         if (loc >= 0 && loc != thisServer) { // find the first suitable server
558           return loc;
559         }
560       }
561 
562       // no location found
563       return pickOtherRandomServer(cluster, thisServer);
564     }
565 
566     void setServices(MasterServices services) {
567       this.masterServices = services;
568     }
569   }
570 
571   /**
572    * Base class of StochasticLoadBalancer's Cost Functions.
573    */
574   public abstract static class CostFunction {
575 
576     private float multiplier = 0;
577     private Configuration conf;
578 
579     CostFunction(Configuration c) {
580       this.conf = c;
581     }
582 
583     float getMultiplier() {
584       return multiplier;
585     }
586 
587     void setMultiplier(float m) {
588       this.multiplier = m;
589     }
590 
591     abstract double cost(Cluster cluster);
592 
593     /**
594      * Function to compute a scaled cost using {@link DescriptiveStatistics}. It
595      * assumes that this is a zero sum set of costs.  It assumes that the worst case
596      * possible is all of the elements in one region server and the rest having 0.
597      *
598      * @param stats the costs
599      * @return a scaled set of costs.
600      */
601     protected double costFromArray(double[] stats) {
602       double totalCost = 0;
603       double total = getSum(stats);
604       double mean = total/((double)stats.length);
605       double count = stats.length;
606 
607       // Compute max as if all region servers had 0 and one had the sum of all costs.  This must be
608       // a zero sum cost for this to make sense.
609       double max = ((count - 1) * mean) + (total - mean);
610 
611       // It's possible that there aren't enough regions to go around
612       double min;
613       if (count > total) {
614         min = ((count - total) * mean) + ((1 - mean) * total);
615       } else {
616         // Some will have 1 more than everything else.
617         int numHigh = (int) (total - (Math.floor(mean) * count));
618         int numLow = (int) (count - numHigh);
619 
620         min = (numHigh * (Math.ceil(mean) - mean)) + (numLow * (mean - Math.floor(mean)));
621 
622       }
623       min = Math.max(0, min);
624       for (int i=0; i<stats.length; i++) {
625         double n = stats[i];
626         double diff = Math.abs(mean - n);
627         totalCost += diff;
628       }
629 
630       double scaled =  scale(min, max, totalCost);
631       return scaled;
632     }
633 
634 
635 
636     private double getSum(double[] stats) {
637       double total = 0;
638       for(double s:stats) {
639         total += s;
640       }
641       return total;
642     }
643 
644     /**
645      * Scale the value between 0 and 1.
646      *
647      * @param min   Min value
648      * @param max   The Max value
649      * @param value The value to be scaled.
650      * @return The scaled value.
651      */
652     protected double scale(double min, double max, double value) {
653       if (max == 0 || value == 0) {
654         return 0;
655       }
656       if ((max - min) <= 0) return 0;
657 
658       return Math.max(0d, Math.min(1d, (value - min) / (max - min)));
659     }
660   }
661 
662   /**
663    * Given the starting state of the regions and a potential ending state
664    * compute cost based upon the number of regions that have moved.
665    */
666   public static class MoveCostFunction extends CostFunction {
667     private static final String MOVE_COST_KEY = "hbase.master.balancer.stochastic.moveCost";
668     private static final String MAX_MOVES_PERCENT_KEY =
669         "hbase.master.balancer.stochastic.maxMovePercent";
670     private static final float DEFAULT_MOVE_COST = 100;
671     private static final int DEFAULT_MAX_MOVES = 600;
672     private static final float DEFAULT_MAX_MOVE_PERCENT = 0.25f;
673     private static final int META_MOVE_COST_MULT = 10;
674 
675     private final float maxMovesPercent;
676 
677     MoveCostFunction(Configuration conf) {
678       super(conf);
679 
680       // Move cost multiplier should be the same cost or higher than the rest of the costs to ensure
681       // that large benefits are need to overcome the cost of a move.
682       this.setMultiplier(conf.getFloat(MOVE_COST_KEY, DEFAULT_MOVE_COST));
683       // What percent of the number of regions a single run of the balancer can move.
684       maxMovesPercent = conf.getFloat(MAX_MOVES_PERCENT_KEY, DEFAULT_MAX_MOVE_PERCENT);
685     }
686 
687     @Override
688     double cost(Cluster cluster) {
689       // Try and size the max number of Moves, but always be prepared to move some.
690       int maxMoves = Math.max((int) (cluster.numRegions * maxMovesPercent),
691           DEFAULT_MAX_MOVES);
692 
693       double moveCost = cluster.numMovedRegions;
694 
695       // Don't let this single balance move more than the max moves.
696       // This allows better scaling to accurately represent the actual cost of a move.
697       if (moveCost > maxMoves) {
698         return 1000000;   // return a number much greater than any of the other cost
699       }
700 
701       // hbase:meta region is special
702       if (cluster.numMovedMetaRegions > 0) {
703         // assume each hbase:meta region move costs 10 times
704         moveCost += META_MOVE_COST_MULT * cluster.numMovedMetaRegions;
705       }
706 
707       return scale(0, cluster.numRegions + META_MOVE_COST_MULT, moveCost);
708     }
709   }
710 
711   /**
712    * Compute the cost of a potential cluster state from skew in number of
713    * regions on a cluster.
714    */
715   public static class RegionCountSkewCostFunction extends CostFunction {
716     private static final String REGION_COUNT_SKEW_COST_KEY =
717         "hbase.master.balancer.stochastic.regionCountCost";
718     private static final float DEFAULT_REGION_COUNT_SKEW_COST = 500;
719 
720     private double[] stats = null;
721 
722     RegionCountSkewCostFunction(Configuration conf) {
723       super(conf);
724       // Load multiplier should be the greatest as it is the most general way to balance data.
725       this.setMultiplier(conf.getFloat(REGION_COUNT_SKEW_COST_KEY, DEFAULT_REGION_COUNT_SKEW_COST));
726     }
727 
728     @Override
729     double cost(Cluster cluster) {
730       if (stats == null || stats.length != cluster.numServers) {
731         stats = new double[cluster.numServers];
732       }
733 
734       for (int i =0; i < cluster.numServers; i++) {
735         stats[i] = cluster.regionsPerServer[i].length;
736       }
737 
738       return costFromArray(stats);
739     }
740   }
741 
742   /**
743    * Compute the cost of a potential cluster configuration based upon how evenly
744    * distributed tables are.
745    */
746   public static class TableSkewCostFunction extends CostFunction {
747 
748     private static final String TABLE_SKEW_COST_KEY =
749         "hbase.master.balancer.stochastic.tableSkewCost";
750     private static final float DEFAULT_TABLE_SKEW_COST = 35;
751 
752     TableSkewCostFunction(Configuration conf) {
753       super(conf);
754       this.setMultiplier(conf.getFloat(TABLE_SKEW_COST_KEY, DEFAULT_TABLE_SKEW_COST));
755     }
756 
757     @Override
758     double cost(Cluster cluster) {
759       double max = cluster.numRegions;
760       double min = ((double) cluster.numRegions) / cluster.numServers;
761       double value = 0;
762 
763       for (int i = 0; i < cluster.numMaxRegionsPerTable.length; i++) {
764         value += cluster.numMaxRegionsPerTable[i];
765       }
766 
767       return scale(min, max, value);
768     }
769   }
770 
771 
772   /**
773    * Compute a cost of a potential cluster configuration based upon where
774    * {@link org.apache.hadoop.hbase.regionserver.StoreFile}s are located.
775    */
776   public static class LocalityCostFunction extends CostFunction {
777 
778     private static final String LOCALITY_COST_KEY = "hbase.master.balancer.stochastic.localityCost";
779     private static final float DEFAULT_LOCALITY_COST = 25;
780 
781     private MasterServices services;
782 
783     LocalityCostFunction(Configuration conf, MasterServices srv) {
784       super(conf);
785       this.setMultiplier(conf.getFloat(LOCALITY_COST_KEY, DEFAULT_LOCALITY_COST));
786       this.services = srv;
787     }
788 
789     void setServices(MasterServices srvc) {
790       this.services = srvc;
791     }
792 
793     @Override
794     double cost(Cluster cluster) {
795       double max = 0;
796       double cost = 0;
797 
798       // If there's no master so there's no way anything else works.
799       if (this.services == null) {
800         return cost;
801       }
802 
803       for (int i = 0; i < cluster.regionLocations.length; i++) {
804         max += 1;
805         int serverIndex = cluster.regionIndexToServerIndex[i];
806         int[] regionLocations = cluster.regionLocations[i];
807 
808         // If we can't find where the data is getTopBlock returns null.
809         // so count that as being the best possible.
810         if (regionLocations == null) {
811           continue;
812         }
813 
814         int index = -1;
815         for (int j = 0; j < regionLocations.length; j++) {
816           if (regionLocations[j] >= 0 && regionLocations[j] == serverIndex) {
817             index = j;
818             break;
819           }
820         }
821 
822         if (index < 0) {
823           if (regionLocations.length > 0) {
824             cost += 1;
825           }
826         } else {
827           cost += (double) index / (double) regionLocations.length;
828         }
829       }
830       return scale(0, max, cost);
831     }
832   }
833 
834   /**
835    * Base class the allows writing costs functions from rolling average of some
836    * number from RegionLoad.
837    */
838   public abstract static class CostFromRegionLoadFunction extends CostFunction {
839 
840     private ClusterStatus clusterStatus = null;
841     private Map<String, Deque<RegionLoad>> loads = null;
842     private double[] stats = null;
843     CostFromRegionLoadFunction(Configuration conf) {
844       super(conf);
845     }
846 
847     void setClusterStatus(ClusterStatus status) {
848       this.clusterStatus = status;
849     }
850 
851     void setLoads(Map<String, Deque<RegionLoad>> l) {
852       this.loads = l;
853     }
854 
855 
856     @Override
857     double cost(Cluster cluster) {
858       if (clusterStatus == null || loads == null) {
859         return 0;
860       }
861 
862       if (stats == null || stats.length != cluster.numServers) {
863         stats = new double[cluster.numServers];
864       }
865 
866       for (int i =0; i < stats.length; i++) {
867         //Cost this server has from RegionLoad
868         long cost = 0;
869 
870         // for every region on this server get the rl
871         for(int regionIndex:cluster.regionsPerServer[i]) {
872           Collection<RegionLoad> regionLoadList =  cluster.regionLoads[regionIndex];
873 
874           // Now if we found a region load get the type of cost that was requested.
875           if (regionLoadList != null) {
876             cost += getRegionLoadCost(regionLoadList);
877           }
878         }
879 
880         // Add the total cost to the stats.
881         stats[i] = cost;
882       }
883 
884       // Now return the scaled cost from data held in the stats object.
885       return costFromArray(stats);
886     }
887 
888     protected double getRegionLoadCost(Collection<RegionLoad> regionLoadList) {
889       double cost = 0;
890 
891       for (RegionLoad rl : regionLoadList) {
892         double toAdd = getCostFromRl(rl);
893 
894         if (cost == 0) {
895           cost = toAdd;
896         } else {
897           cost = (.5 * cost) + (.5 * toAdd);
898         }
899       }
900 
901       return cost;
902     }
903 
904     protected abstract double getCostFromRl(RegionLoad rl);
905   }
906 
907   /**
908    * Compute the cost of total number of read requests  The more unbalanced the higher the
909    * computed cost will be.  This uses a rolling average of regionload.
910    */
911 
912   public static class ReadRequestCostFunction extends CostFromRegionLoadFunction {
913 
914     private static final String READ_REQUEST_COST_KEY =
915         "hbase.master.balancer.stochastic.readRequestCost";
916     private static final float DEFAULT_READ_REQUEST_COST = 5;
917 
918     ReadRequestCostFunction(Configuration conf) {
919       super(conf);
920       this.setMultiplier(conf.getFloat(READ_REQUEST_COST_KEY, DEFAULT_READ_REQUEST_COST));
921     }
922 
923 
924     @Override
925     protected double getCostFromRl(RegionLoad rl) {
926       return rl.getReadRequestsCount();
927     }
928   }
929 
930   /**
931    * Compute the cost of total number of write requests.  The more unbalanced the higher the
932    * computed cost will be.  This uses a rolling average of regionload.
933    */
934   public static class WriteRequestCostFunction extends CostFromRegionLoadFunction {
935 
936     private static final String WRITE_REQUEST_COST_KEY =
937         "hbase.master.balancer.stochastic.writeRequestCost";
938     private static final float DEFAULT_WRITE_REQUEST_COST = 5;
939 
940     WriteRequestCostFunction(Configuration conf) {
941       super(conf);
942       this.setMultiplier(conf.getFloat(WRITE_REQUEST_COST_KEY, DEFAULT_WRITE_REQUEST_COST));
943     }
944 
945     @Override
946     protected double getCostFromRl(RegionLoad rl) {
947       return rl.getWriteRequestsCount();
948     }
949   }
950 
951   /**
952    * Compute the cost of total memstore size.  The more unbalanced the higher the
953    * computed cost will be.  This uses a rolling average of regionload.
954    */
955   public static class MemstoreSizeCostFunction extends CostFromRegionLoadFunction {
956 
957     private static final String MEMSTORE_SIZE_COST_KEY =
958         "hbase.master.balancer.stochastic.memstoreSizeCost";
959     private static final float DEFAULT_MEMSTORE_SIZE_COST = 5;
960 
961     MemstoreSizeCostFunction(Configuration conf) {
962       super(conf);
963       this.setMultiplier(conf.getFloat(MEMSTORE_SIZE_COST_KEY, DEFAULT_MEMSTORE_SIZE_COST));
964     }
965 
966     @Override
967     protected double getCostFromRl(RegionLoad rl) {
968       return rl.getMemStoreSizeMB();
969     }
970   }
971   /**
972    * Compute the cost of total open storefiles size.  The more unbalanced the higher the
973    * computed cost will be.  This uses a rolling average of regionload.
974    */
975   public static class StoreFileCostFunction extends CostFromRegionLoadFunction {
976 
977     private static final String STOREFILE_SIZE_COST_KEY =
978         "hbase.master.balancer.stochastic.storefileSizeCost";
979     private static final float DEFAULT_STOREFILE_SIZE_COST = 5;
980 
981     StoreFileCostFunction(Configuration conf) {
982       super(conf);
983       this.setMultiplier(conf.getFloat(STOREFILE_SIZE_COST_KEY, DEFAULT_STOREFILE_SIZE_COST));
984     }
985 
986     @Override
987     protected double getCostFromRl(RegionLoad rl) {
988       return rl.getStorefileSizeMB();
989     }
990   }
991 }