View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Comparator;
23  import java.util.HashMap;
24  import java.util.HashSet;
25  import java.util.List;
26  import java.util.Set;
27  import java.util.TreeSet;
28  
29  import org.apache.hadoop.hbase.classification.InterfaceAudience;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.hbase.ClusterManager.ServiceType;
32  import org.apache.hadoop.hbase.client.HBaseAdmin;
33  import org.apache.hadoop.hbase.client.HConnection;
34  import org.apache.hadoop.hbase.client.HConnectionManager;
35  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40  import org.apache.hadoop.hbase.util.Bytes;
41  import org.apache.hadoop.hbase.util.Threads;
42  
43  import com.google.common.collect.Sets;
44  
45  /**
46   * Manages the interactions with an already deployed distributed cluster (as opposed to
47   * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
48   */
49  @InterfaceAudience.Private
50  public class DistributedHBaseCluster extends HBaseCluster {
51  
52    private HBaseAdmin admin;
53  
54    private ClusterManager clusterManager;
55  
56    public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
57        throws IOException {
58      super(conf);
59      this.clusterManager = clusterManager;
60      this.admin = new HBaseAdmin(conf);
61      this.initialClusterStatus = getClusterStatus();
62    }
63  
64    public void setClusterManager(ClusterManager clusterManager) {
65      this.clusterManager = clusterManager;
66    }
67  
68    public ClusterManager getClusterManager() {
69      return clusterManager;
70    }
71  
72    /**
73     * Returns a ClusterStatus for this HBase cluster
74     * @throws IOException
75     */
76    @Override
77    public ClusterStatus getClusterStatus() throws IOException {
78      return admin.getClusterStatus();
79    }
80  
81    @Override
82    public ClusterStatus getInitialClusterStatus() throws IOException {
83      return initialClusterStatus;
84    }
85  
86    @Override
87    public void close() throws IOException {
88      if (this.admin != null) {
89        admin.close();
90      }
91    }
92  
93    @Override
94    public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
95    throws IOException {
96      return admin.getConnection().getAdmin(serverName);
97    }
98  
99    @Override
100   public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
101   throws IOException {
102     return admin.getConnection().getClient(serverName);
103   }
104 
105   @Override
106   public void startRegionServer(String hostname, int port) throws IOException {
107     LOG.info("Starting RS on: " + hostname);
108     clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
109   }
110 
111   @Override
112   public void killRegionServer(ServerName serverName) throws IOException {
113     LOG.info("Aborting RS: " + serverName.getServerName());
114     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
115             serverName.getHostname(),
116             serverName.getPort());
117   }
118 
119   @Override
120   public void stopRegionServer(ServerName serverName) throws IOException {
121     LOG.info("Stopping RS: " + serverName.getServerName());
122     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
123             serverName.getHostname(),
124             serverName.getPort());
125   }
126 
127   @Override
128   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
129     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
130   }
131 
132   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
133     throws IOException {
134     LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
135     long start = System.currentTimeMillis();
136 
137     while ((System.currentTimeMillis() - start) < timeout) {
138       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
139         return;
140       }
141       Threads.sleep(1000);
142     }
143     throw new IOException("did timeout waiting for service to stop:" + serverName);
144   }
145 
146   @Override
147   public MasterService.BlockingInterface getMaster()
148   throws IOException {
149     HConnection conn = HConnectionManager.getConnection(conf);
150     return conn.getMaster();
151   }
152 
153   @Override
154   public void startMaster(String hostname, int port) throws IOException {
155     LOG.info("Starting Master on: " + hostname + ":" + port);
156     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
157   }
158 
159   @Override
160   public void killMaster(ServerName serverName) throws IOException {
161     LOG.info("Aborting Master: " + serverName.getServerName());
162     clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
163   }
164 
165   @Override
166   public void stopMaster(ServerName serverName) throws IOException {
167     LOG.info("Stopping Master: " + serverName.getServerName());
168     clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
169   }
170 
171   @Override
172   public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
173     waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
174   }
175 
176   @Override
177   public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
178     long start = System.currentTimeMillis();
179     while (System.currentTimeMillis() - start < timeout) {
180       try {
181         getMaster();
182         return true;
183       } catch (MasterNotRunningException m) {
184         LOG.warn("Master not started yet " + m);
185       } catch (ZooKeeperConnectionException e) {
186         LOG.warn("Failed to connect to ZK " + e);
187       }
188       Threads.sleep(1000);
189     }
190     return false;
191   }
192 
193   @Override
194   public ServerName getServerHoldingRegion(byte[] regionName) throws IOException {
195     HConnection connection = admin.getConnection();
196     HRegionLocation regionLoc = connection.locateRegion(regionName);
197     if (regionLoc == null) {
198       LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName)
199           + " for table " + HRegionInfo.getTableName(regionName) + ", start key [" +
200           Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
201       return null;
202     }
203 
204     AdminProtos.AdminService.BlockingInterface client =
205       connection.getAdmin(regionLoc.getServerName());
206     ServerInfo info = ProtobufUtil.getServerInfo(client);
207     return ProtobufUtil.toServerName(info.getServerName());
208   }
209 
210   @Override
211   public void waitUntilShutDown() {
212     // Simply wait for a few seconds for now (after issuing serverManager.kill
213     throw new RuntimeException("Not implemented yet");
214   }
215 
216   @Override
217   public void shutdown() throws IOException {
218     // not sure we want this
219     throw new RuntimeException("Not implemented yet");
220   }
221 
222   @Override
223   public boolean isDistributedCluster() {
224     return true;
225   }
226 
227   @Override
228   public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
229     ClusterStatus current = getClusterStatus();
230 
231     LOG.info("Restoring cluster - started");
232 
233     // do a best effort restore
234     boolean success = true;
235     success = restoreMasters(initial, current) & success;
236     success = restoreRegionServers(initial, current) & success;
237     success = restoreAdmin() & success;
238 
239     LOG.info("Restoring cluster - done");
240     return success;
241   }
242 
243   protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
244     List<IOException> deferred = new ArrayList<IOException>();
245     //check whether current master has changed
246     final ServerName initMaster = initial.getMaster();
247     if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
248       LOG.info("Restoring cluster - Initial active master : "
249               + initMaster.getHostAndPort()
250               + " has changed to : "
251               + current.getMaster().getHostAndPort());
252       // If initial master is stopped, start it, before restoring the state.
253       // It will come up as a backup master, if there is already an active master.
254       try {
255         if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
256                 initMaster.getHostname(), initMaster.getPort())) {
257           LOG.info("Restoring cluster - starting initial active master at:"
258                   + initMaster.getHostAndPort());
259           startMaster(initMaster.getHostname(), initMaster.getPort());
260         }
261 
262         // master has changed, we would like to undo this.
263         // 1. Kill the current backups
264         // 2. Stop current master
265         // 3. Start backup masters
266         for (ServerName currentBackup : current.getBackupMasters()) {
267           if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
268             LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
269             stopMaster(currentBackup);
270           }
271         }
272         LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
273         stopMaster(current.getMaster());
274         waitForActiveAndReadyMaster(); // wait so that active master takes over
275       } catch (IOException ex) {
276         // if we fail to start the initial active master, we do not want to continue stopping
277         // backup masters. Just keep what we have now
278         deferred.add(ex);
279       }
280 
281       //start backup masters
282       for (ServerName backup : initial.getBackupMasters()) {
283         try {
284           //these are not started in backup mode, but we should already have an active master
285           if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
286                   backup.getHostname(),
287                   backup.getPort())) {
288             LOG.info("Restoring cluster - starting initial backup master: "
289                     + backup.getHostAndPort());
290             startMaster(backup.getHostname(), backup.getPort());
291           }
292         } catch (IOException ex) {
293           deferred.add(ex);
294         }
295       }
296     } else {
297       //current master has not changed, match up backup masters
298       Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
299       Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
300       toStart.addAll(initial.getBackupMasters());
301       toKill.addAll(current.getBackupMasters());
302 
303       for (ServerName server : current.getBackupMasters()) {
304         toStart.remove(server);
305       }
306       for (ServerName server: initial.getBackupMasters()) {
307         toKill.remove(server);
308       }
309 
310       for (ServerName sn:toStart) {
311         try {
312           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
313             LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
314             startMaster(sn.getHostname(), sn.getPort());
315           }
316         } catch (IOException ex) {
317           deferred.add(ex);
318         }
319       }
320 
321       for (ServerName sn:toKill) {
322         try {
323           if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
324             LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
325             stopMaster(sn);
326           }
327         } catch (IOException ex) {
328           deferred.add(ex);
329         }
330       }
331     }
332     if (!deferred.isEmpty()) {
333       LOG.warn("Restoring cluster - restoring region servers reported "
334               + deferred.size() + " errors:");
335       for (int i=0; i<deferred.size() && i < 3; i++) {
336         LOG.warn(deferred.get(i));
337       }
338     }
339 
340     return deferred.isEmpty();
341   }
342 
343 
344   private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
345     @Override
346     public int compare(ServerName o1, ServerName o2) {
347       int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
348       if (compare != 0) return compare;
349       compare = o1.getPort() - o2.getPort();
350       if (compare != 0) return compare;
351       return 0;
352     }
353   }
354 
355   protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
356     Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
357     Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
358     toStart.addAll(initial.getBackupMasters());
359     toKill.addAll(current.getBackupMasters());
360 
361     for (ServerName server : current.getServers()) {
362       toStart.remove(server);
363     }
364     for (ServerName server: initial.getServers()) {
365       toKill.remove(server);
366     }
367 
368     List<IOException> deferred = new ArrayList<IOException>();
369 
370     for(ServerName sn:toStart) {
371       try {
372         if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
373                 sn.getHostname(),
374                 sn.getPort())) {
375           LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
376           startRegionServer(sn.getHostname(), sn.getPort());
377         }
378       } catch (IOException ex) {
379         deferred.add(ex);
380       }
381     }
382 
383     for(ServerName sn:toKill) {
384       try {
385         if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
386                 sn.getHostname(),
387                 sn.getPort())) {
388           LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
389           stopRegionServer(sn);
390         }
391       } catch (IOException ex) {
392         deferred.add(ex);
393       }
394     }
395     if (!deferred.isEmpty()) {
396       LOG.warn("Restoring cluster - restoring region servers reported "
397               + deferred.size() + " errors:");
398       for (int i=0; i<deferred.size() && i < 3; i++) {
399         LOG.warn(deferred.get(i));
400       }
401     }
402 
403     return deferred.isEmpty();
404   }
405 
406   protected boolean restoreAdmin() throws IOException {
407     // While restoring above, if the HBase Master which was initially the Active one, was down
408     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
409     // to refresh its connections (otherwise it will return incorrect information) or we can
410     // point it to new instance.
411     try {
412       admin.close();
413     } catch (IOException ioe) {
414       LOG.warn("While closing the old connection", ioe);
415     }
416     this.admin = new HBaseAdmin(conf);
417     LOG.info("Added new HBaseAdmin");
418     return true;
419   }
420 }