View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.concurrent.Callable;
40  import java.util.concurrent.CountDownLatch;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.Executors;
43  import java.util.concurrent.Future;
44  import java.util.concurrent.ScheduledThreadPoolExecutor;
45  import java.util.concurrent.SynchronousQueue;
46  import java.util.concurrent.ThreadPoolExecutor;
47  import java.util.concurrent.TimeUnit;
48  import java.util.concurrent.atomic.AtomicBoolean;
49  
50  import org.apache.commons.io.IOUtils;
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  import org.apache.hadoop.conf.Configuration;
54  import org.apache.hadoop.fs.FileStatus;
55  import org.apache.hadoop.fs.FileSystem;
56  import org.apache.hadoop.fs.Path;
57  import org.apache.hadoop.hbase.ClusterStatus;
58  import org.apache.hadoop.hbase.HBaseTestingUtility;
59  import org.apache.hadoop.hbase.HColumnDescriptor;
60  import org.apache.hadoop.hbase.HConstants;
61  import org.apache.hadoop.hbase.HRegionInfo;
62  import org.apache.hadoop.hbase.HRegionLocation;
63  import org.apache.hadoop.hbase.HTableDescriptor;
64  import org.apache.hadoop.hbase.TableExistsException;
65  import org.apache.hadoop.hbase.testclassification.LargeTests;
66  import org.apache.hadoop.hbase.MiniHBaseCluster;
67  import org.apache.hadoop.hbase.ServerName;
68  import org.apache.hadoop.hbase.TableName;
69  import org.apache.hadoop.hbase.catalog.MetaEditor;
70  import org.apache.hadoop.hbase.client.Delete;
71  import org.apache.hadoop.hbase.client.Durability;
72  import org.apache.hadoop.hbase.client.Get;
73  import org.apache.hadoop.hbase.client.HBaseAdmin;
74  import org.apache.hadoop.hbase.client.HConnection;
75  import org.apache.hadoop.hbase.client.HConnectionManager;
76  import org.apache.hadoop.hbase.client.HTable;
77  import org.apache.hadoop.hbase.client.MetaScanner;
78  import org.apache.hadoop.hbase.client.Put;
79  import org.apache.hadoop.hbase.client.Result;
80  import org.apache.hadoop.hbase.client.ResultScanner;
81  import org.apache.hadoop.hbase.client.Scan;
82  import org.apache.hadoop.hbase.io.hfile.TestHFile;
83  import org.apache.hadoop.hbase.master.AssignmentManager;
84  import org.apache.hadoop.hbase.master.HMaster;
85  import org.apache.hadoop.hbase.master.RegionState;
86  import org.apache.hadoop.hbase.master.RegionStates;
87  import org.apache.hadoop.hbase.master.TableLockManager;
88  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
89  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
90  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
91  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
92  import org.apache.hadoop.hbase.regionserver.HRegion;
93  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
94  import org.apache.hadoop.hbase.regionserver.HRegionServer;
95  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
96  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
97  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
98  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
99  import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
100 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
101 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
102 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
103 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
104 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
105 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
106 import org.apache.zookeeper.KeeperException;
107 import org.junit.AfterClass;
108 import org.junit.Assert;
109 import org.junit.BeforeClass;
110 import org.junit.Ignore;
111 import org.junit.Test;
112 import org.junit.experimental.categories.Category;
113 import org.junit.rules.TestName;
114 
115 import com.google.common.collect.Multimap;
116 
117 /**
118  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
119  */
120 @Category(LargeTests.class)
121 public class TestHBaseFsck {
122   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
123   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
124   private final static Configuration conf = TEST_UTIL.getConfiguration();
125   private final static String FAM_STR = "fam";
126   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
127   private final static int REGION_ONLINE_TIMEOUT = 800;
128   private static RegionStates regionStates;
129   private static ExecutorService executorService;
130 
131   // for the instance, reset every test run
132   private HTable tbl;
133   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
134     Bytes.toBytes("B"), Bytes.toBytes("C") };
135   // one row per region.
136   private final static byte[][] ROWKEYS= new byte[][] {
137     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
138     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
139 
140   @SuppressWarnings("deprecation")
141   @BeforeClass
142   public static void setUpBeforeClass() throws Exception {
143     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
144     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
145     TEST_UTIL.getConfiguration().setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
146     TEST_UTIL.startMiniCluster(3);
147     TEST_UTIL.setHDFSClientRetry(0);
148 
149     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
150         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
151 
152     AssignmentManager assignmentManager =
153       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
154     regionStates = assignmentManager.getRegionStates();
155     TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);
156   }
157 
158   @AfterClass
159   public static void tearDownAfterClass() throws Exception {
160     TEST_UTIL.shutdownMiniCluster();
161   }
162 
163   @Test
164   public void testHBaseFsck() throws Exception {
165     assertNoErrors(doFsck(conf, false));
166     String table = "tableBadMetaAssign";
167     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
168 
169     // We created 1 table, should be fine
170     assertNoErrors(doFsck(conf, false));
171 
172     // Now let's mess it up and change the assignment in hbase:meta to
173     // point to a different region server
174     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
175     Scan scan = new Scan();
176     scan.setStartRow(Bytes.toBytes(table+",,"));
177     ResultScanner scanner = meta.getScanner(scan);
178     HRegionInfo hri = null;
179 
180     Result res = scanner.next();
181     ServerName currServer =
182       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
183           HConstants.SERVER_QUALIFIER));
184     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
185         HConstants.STARTCODE_QUALIFIER));
186 
187     for (JVMClusterUtil.RegionServerThread rs :
188         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
189 
190       ServerName sn = rs.getRegionServer().getServerName();
191 
192       // When we find a diff RS, change the assignment and break
193       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
194           startCode != sn.getStartcode()) {
195         Put put = new Put(res.getRow());
196         put.setDurability(Durability.SKIP_WAL);
197         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
198           Bytes.toBytes(sn.getHostAndPort()));
199         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
200           Bytes.toBytes(sn.getStartcode()));
201         meta.put(put);
202         hri = HRegionInfo.getHRegionInfo(res);
203         break;
204       }
205     }
206 
207     // Try to fix the data
208     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
209         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
210 
211     TEST_UTIL.getHBaseCluster().getMaster()
212       .getAssignmentManager().waitForAssignment(hri);
213 
214     // Should be fixed now
215     assertNoErrors(doFsck(conf, false));
216 
217     // comment needed - what is the purpose of this line
218     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
219     ResultScanner s = t.getScanner(new Scan());
220     s.close();
221     t.close();
222 
223     scanner.close();
224     meta.close();
225   }
226 
227   @Test(timeout=180000)
228   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
229     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
230     HBaseAdmin admin = null;
231     try {
232       admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
233       admin.closeRegion(cluster.getServerHoldingMeta(),
234           HRegionInfo.FIRST_META_REGIONINFO);
235     } finally {
236       if (admin != null) {
237         admin.close();
238       }
239     }
240     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
241     MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
242     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
243     HBaseFsck hbck = doFsck(conf, true);
244     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
245         ERROR_CODE.NULL_META_REGION });
246     assertNoErrors(doFsck(conf, false));
247   }
248 
249   /**
250    * Create a new region in META.
251    */
252   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
253       htd, byte[] startKey, byte[] endKey)
254       throws IOException {
255     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
256     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
257     MetaEditor.addRegionToMeta(meta, hri);
258     meta.close();
259     return hri;
260   }
261 
262   /**
263    * Debugging method to dump the contents of meta.
264    */
265   private void dumpMeta(TableName tableName) throws IOException {
266     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
267     for (byte[] row : metaRows) {
268       LOG.info(Bytes.toString(row));
269     }
270   }
271 
272   /**
273    * This method is used to undeploy a region -- close it and attempt to
274    * remove its state from the Master.
275    */
276   private void undeployRegion(HBaseAdmin admin, ServerName sn,
277       HRegionInfo hri) throws IOException, InterruptedException {
278     try {
279       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
280       if (!hri.isMetaTable()) {
281         admin.offline(hri.getRegionName());
282       }
283     } catch (IOException ioe) {
284       LOG.warn("Got exception when attempting to offline region "
285           + Bytes.toString(hri.getRegionName()), ioe);
286     }
287   }
288   /**
289    * Delete a region from assignments, meta, or completely from hdfs.
290    * @param unassign if true unassign region if assigned
291    * @param metaRow  if true remove region's row from META
292    * @param hdfs if true remove region's dir in HDFS
293    */
294   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
295       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
296       boolean hdfs) throws IOException, InterruptedException {
297     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
298   }
299 
300   /**
301    * Delete a region from assignments, meta, or completely from hdfs.
302    * @param unassign if true unassign region if assigned
303    * @param metaRow  if true remove region's row from META
304    * @param hdfs if true remove region's dir in HDFS
305    * @param regionInfoOnly if true remove a region dir's .regioninfo file
306    */
307   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
308       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
309       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
310     LOG.info("** Before delete:");
311     dumpMeta(htd.getTableName());
312 
313     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
314     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
315       HRegionInfo hri = e.getKey();
316       ServerName hsa = e.getValue();
317       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
318           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
319 
320         LOG.info("RegionName: " +hri.getRegionNameAsString());
321         byte[] deleteRow = hri.getRegionName();
322 
323         if (unassign) {
324           LOG.info("Undeploying region " + hri + " from server " + hsa);
325           undeployRegion(new HBaseAdmin(conf), hsa, hri);
326         }
327 
328         if (regionInfoOnly) {
329           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
330           Path rootDir = FSUtils.getRootDir(conf);
331           FileSystem fs = rootDir.getFileSystem(conf);
332           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
333               hri.getEncodedName());
334           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
335           fs.delete(hriPath, true);
336         }
337 
338         if (hdfs) {
339           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
340           Path rootDir = FSUtils.getRootDir(conf);
341           FileSystem fs = rootDir.getFileSystem(conf);
342           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
343               hri.getEncodedName());
344           HBaseFsck.debugLsr(conf, p);
345           boolean success = fs.delete(p, true);
346           LOG.info("Deleted " + p + " sucessfully? " + success);
347           HBaseFsck.debugLsr(conf, p);
348         }
349 
350         if (metaRow) {
351           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
352           Delete delete = new Delete(deleteRow);
353           meta.delete(delete);
354         }
355       }
356       LOG.info(hri.toString() + hsa.toString());
357     }
358 
359     TEST_UTIL.getMetaTableRows(htd.getTableName());
360     LOG.info("*** After delete:");
361     dumpMeta(htd.getTableName());
362   }
363 
364   /**
365    * Setup a clean table before we start mucking with it.
366    *
367    * @throws IOException
368    * @throws InterruptedException
369    * @throws KeeperException
370    */
371   HTable setupTable(TableName tablename) throws Exception {
372     HTableDescriptor desc = new HTableDescriptor(tablename);
373     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
374     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
375     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
376     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
377 
378     List<Put> puts = new ArrayList<Put>();
379     for (byte[] row : ROWKEYS) {
380       Put p = new Put(row);
381       p.add(FAM, Bytes.toBytes("val"), row);
382       puts.add(p);
383     }
384     tbl.put(puts);
385     tbl.flushCommits();
386     return tbl;
387   }
388 
389   /**
390    * Counts the number of row to verify data loss or non-dataloss.
391    */
392   int countRows() throws IOException {
393      Scan s = new Scan();
394      ResultScanner rs = tbl.getScanner(s);
395      int i = 0;
396      while(rs.next() !=null) {
397        i++;
398      }
399      return i;
400   }
401 
402   /**
403    * delete table in preparation for next test
404    *
405    * @param tablename
406    * @throws IOException
407    */
408   void deleteTable(TableName tablename) throws IOException {
409     HBaseAdmin admin = new HBaseAdmin(conf);
410     admin.getConnection().clearRegionCache();
411     if (admin.isTableEnabled(tablename)) {
412       admin.disableTableAsync(tablename);
413     }
414     long totalWait = 0;
415     long maxWait = 30*1000;
416     long sleepTime = 250;
417     while (!admin.isTableDisabled(tablename)) {
418       try {
419         Thread.sleep(sleepTime);
420         totalWait += sleepTime;
421         if (totalWait >= maxWait) {
422           fail("Waited too long for table to be disabled + " + tablename);
423         }
424       } catch (InterruptedException e) {
425         e.printStackTrace();
426         fail("Interrupted when trying to disable table " + tablename);
427       }
428     }
429     admin.deleteTable(tablename);
430   }
431 
432   /**
433    * This creates a clean table and confirms that the table is clean.
434    */
435   @Test
436   public void testHBaseFsckClean() throws Exception {
437     assertNoErrors(doFsck(conf, false));
438     TableName table = TableName.valueOf("tableClean");
439     try {
440       HBaseFsck hbck = doFsck(conf, false);
441       assertNoErrors(hbck);
442 
443       setupTable(table);
444       assertEquals(ROWKEYS.length, countRows());
445 
446       // We created 1 table, should be fine
447       hbck = doFsck(conf, false);
448       assertNoErrors(hbck);
449       assertEquals(0, hbck.getOverlapGroups(table).size());
450       assertEquals(ROWKEYS.length, countRows());
451     } finally {
452       deleteTable(table);
453     }
454   }
455 
456   /**
457    * Test thread pooling in the case where there are more regions than threads
458    */
459   @Test
460   public void testHbckThreadpooling() throws Exception {
461     TableName table =
462         TableName.valueOf("tableDupeStartKey");
463     try {
464       // Create table with 4 regions
465       setupTable(table);
466 
467       // limit number of threads to 1.
468       Configuration newconf = new Configuration(conf);
469       newconf.setInt("hbasefsck.numthreads", 1);
470       assertNoErrors(doFsck(newconf, false));
471 
472       // We should pass without triggering a RejectedExecutionException
473     } finally {
474       deleteTable(table);
475     }
476   }
477 
478   @Test
479   public void testHbckFixOrphanTable() throws Exception {
480     TableName table = TableName.valueOf("tableInfo");
481     FileSystem fs = null;
482     Path tableinfo = null;
483     try {
484       setupTable(table);
485       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
486 
487       Path hbaseTableDir = FSUtils.getTableDir(
488           FSUtils.getRootDir(conf), table);
489       fs = hbaseTableDir.getFileSystem(conf);
490       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
491       tableinfo = status.getPath();
492       fs.rename(tableinfo, new Path("/.tableinfo"));
493 
494       //to report error if .tableinfo is missing.
495       HBaseFsck hbck = doFsck(conf, false);
496       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
497 
498       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
499       hbck = doFsck(conf, true);
500       assertNoErrors(hbck);
501       status = null;
502       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
503       assertNotNull(status);
504 
505       HTableDescriptor htd = admin.getTableDescriptor(table);
506       htd.setValue("NOT_DEFAULT", "true");
507       admin.disableTable(table);
508       admin.modifyTable(table, htd);
509       admin.enableTable(table);
510       fs.delete(status.getPath(), true);
511 
512       // fix OrphanTable with cache
513       htd = admin.getTableDescriptor(table); // warms up cached htd on master
514       hbck = doFsck(conf, true);
515       assertNoErrors(hbck);
516       status = null;
517       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
518       assertNotNull(status);
519       htd = admin.getTableDescriptor(table);
520       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
521     } finally {
522       fs.rename(new Path("/.tableinfo"), tableinfo);
523       deleteTable(table);
524     }
525   }
526 
527   /**
528    * This test makes sure that parallel instances of Hbck is disabled.
529    *
530    * @throws Exception
531    */
532   @Test
533   public void testParallelHbck() throws Exception {
534     final ExecutorService service;
535     final Future<HBaseFsck> hbck1,hbck2;
536 
537     class RunHbck implements Callable<HBaseFsck>{
538       boolean fail = true;
539       @Override
540       public HBaseFsck call(){
541         try{
542           return doFsck(conf, false);
543         } catch(Exception e){
544           if (e.getMessage().contains("Duplicate hbck")) {
545             fail = false;
546           } else {
547             LOG.fatal("hbck failed.", e);
548           }
549         }
550         // If we reach here, then an exception was caught
551         if (fail) fail();
552         return null;
553       }
554     }
555     service = Executors.newFixedThreadPool(2);
556     hbck1 = service.submit(new RunHbck());
557     hbck2 = service.submit(new RunHbck());
558     service.shutdown();
559     //wait for 15 seconds, for both hbck calls finish
560     service.awaitTermination(15, TimeUnit.SECONDS);
561     HBaseFsck h1 = hbck1.get();
562     HBaseFsck h2 = hbck2.get();
563     // Make sure only one of the calls was successful
564     assert(h1 == null || h2 == null);
565     if (h1 != null) {
566       assert(h1.getRetCode() >= 0);
567     }
568     if (h2 != null) {
569       assert(h2.getRetCode() >= 0);
570     }
571   }
572 
573   /**
574    * This create and fixes a bad table with regions that have a duplicate
575    * start key
576    */
577   @Test
578   public void testDupeStartKey() throws Exception {
579     TableName table =
580         TableName.valueOf("tableDupeStartKey");
581     try {
582       setupTable(table);
583       assertNoErrors(doFsck(conf, false));
584       assertEquals(ROWKEYS.length, countRows());
585 
586       // Now let's mess it up, by adding a region with a duplicate startkey
587       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
588           Bytes.toBytes("A"), Bytes.toBytes("A2"));
589       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
590       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
591           .waitForAssignment(hriDupe);
592       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
593       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
594 
595       HBaseFsck hbck = doFsck(conf, false);
596       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
597             ERROR_CODE.DUPE_STARTKEYS});
598       assertEquals(2, hbck.getOverlapGroups(table).size());
599       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
600 
601       // fix the degenerate region.
602       doFsck(conf,true);
603 
604       // check that the degenerate region is gone and no data loss
605       HBaseFsck hbck2 = doFsck(conf,false);
606       assertNoErrors(hbck2);
607       assertEquals(0, hbck2.getOverlapGroups(table).size());
608       assertEquals(ROWKEYS.length, countRows());
609     } finally {
610       deleteTable(table);
611     }
612   }
613 
614   /**
615    * Get region info from local cluster.
616    */
617   Map<ServerName, List<String>> getDeployedHRIs(
618       final HBaseAdmin admin) throws IOException {
619     ClusterStatus status = admin.getClusterStatus();
620     Collection<ServerName> regionServers = status.getServers();
621     Map<ServerName, List<String>> mm =
622         new HashMap<ServerName, List<String>>();
623     HConnection connection = admin.getConnection();
624     for (ServerName hsi : regionServers) {
625       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
626 
627       // list all online regions from this region server
628       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
629       List<String> regionNames = new ArrayList<String>();
630       for (HRegionInfo hri : regions) {
631         regionNames.add(hri.getRegionNameAsString());
632       }
633       mm.put(hsi, regionNames);
634     }
635     return mm;
636   }
637 
638   /**
639    * Returns the HSI a region info is on.
640    */
641   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
642     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
643       if (e.getValue().contains(hri.getRegionNameAsString())) {
644         return e.getKey();
645       }
646     }
647     return null;
648   }
649 
650   /**
651    * This create and fixes a bad table with regions that have a duplicate
652    * start key
653    */
654   @Test
655   public void testDupeRegion() throws Exception {
656     TableName table =
657         TableName.valueOf("tableDupeRegion");
658     try {
659       setupTable(table);
660       assertNoErrors(doFsck(conf, false));
661       assertEquals(ROWKEYS.length, countRows());
662 
663       // Now let's mess it up, by adding a region with a duplicate startkey
664       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
665           Bytes.toBytes("A"), Bytes.toBytes("B"));
666 
667       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
668       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
669           .waitForAssignment(hriDupe);
670       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
671       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
672 
673       // Yikes! The assignment manager can't tell between diff between two
674       // different regions with the same start/endkeys since it doesn't
675       // differentiate on ts/regionId!  We actually need to recheck
676       // deployments!
677       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
678       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
679         Thread.sleep(250);
680       }
681 
682       LOG.debug("Finished assignment of dupe region");
683 
684       // TODO why is dupe region different from dupe start keys?
685       HBaseFsck hbck = doFsck(conf, false);
686       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
687             ERROR_CODE.DUPE_STARTKEYS});
688       assertEquals(2, hbck.getOverlapGroups(table).size());
689       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
690 
691       // fix the degenerate region.
692       doFsck(conf,true);
693 
694       // check that the degenerate region is gone and no data loss
695       HBaseFsck hbck2 = doFsck(conf,false);
696       assertNoErrors(hbck2);
697       assertEquals(0, hbck2.getOverlapGroups(table).size());
698       assertEquals(ROWKEYS.length, countRows());
699     } finally {
700       deleteTable(table);
701     }
702   }
703 
704   /**
705    * This creates and fixes a bad table with regions that has startkey == endkey
706    */
707   @Test
708   public void testDegenerateRegions() throws Exception {
709     TableName table =
710         TableName.valueOf("tableDegenerateRegions");
711     try {
712       setupTable(table);
713       assertNoErrors(doFsck(conf,false));
714       assertEquals(ROWKEYS.length, countRows());
715 
716       // Now let's mess it up, by adding a region with a duplicate startkey
717       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
718           Bytes.toBytes("B"), Bytes.toBytes("B"));
719       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
720       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
721           .waitForAssignment(hriDupe);
722       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
723       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
724 
725       HBaseFsck hbck = doFsck(conf,false);
726       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
727           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
728       assertEquals(2, hbck.getOverlapGroups(table).size());
729       assertEquals(ROWKEYS.length, countRows());
730 
731       // fix the degenerate region.
732       doFsck(conf,true);
733 
734       // check that the degenerate region is gone and no data loss
735       HBaseFsck hbck2 = doFsck(conf,false);
736       assertNoErrors(hbck2);
737       assertEquals(0, hbck2.getOverlapGroups(table).size());
738       assertEquals(ROWKEYS.length, countRows());
739     } finally {
740       deleteTable(table);
741     }
742   }
743 
744   /**
745    * This creates and fixes a bad table where a region is completely contained
746    * by another region.
747    */
748   @Test
749   public void testContainedRegionOverlap() throws Exception {
750     TableName table =
751         TableName.valueOf("tableContainedRegionOverlap");
752     try {
753       setupTable(table);
754       assertEquals(ROWKEYS.length, countRows());
755 
756       // Mess it up by creating an overlap in the metadata
757       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
758           Bytes.toBytes("A2"), Bytes.toBytes("B"));
759       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
760       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
761           .waitForAssignment(hriOverlap);
762       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
763       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
764 
765       HBaseFsck hbck = doFsck(conf, false);
766       assertErrors(hbck, new ERROR_CODE[] {
767           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
768       assertEquals(2, hbck.getOverlapGroups(table).size());
769       assertEquals(ROWKEYS.length, countRows());
770 
771       // fix the problem.
772       doFsck(conf, true);
773 
774       // verify that overlaps are fixed
775       HBaseFsck hbck2 = doFsck(conf,false);
776       assertNoErrors(hbck2);
777       assertEquals(0, hbck2.getOverlapGroups(table).size());
778       assertEquals(ROWKEYS.length, countRows());
779     } finally {
780        deleteTable(table);
781     }
782   }
783 
784   /**
785    * This creates and fixes a bad table where an overlap group of
786    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
787    * region. Mess around the meta data so that closeRegion/offlineRegion
788    * throws exceptions.
789    */
790   @Test
791   public void testSidelineOverlapRegion() throws Exception {
792     TableName table =
793         TableName.valueOf("testSidelineOverlapRegion");
794     try {
795       setupTable(table);
796       assertEquals(ROWKEYS.length, countRows());
797 
798       // Mess it up by creating an overlap
799       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
800       HMaster master = cluster.getMaster();
801       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
802         Bytes.toBytes("A"), Bytes.toBytes("AB"));
803       master.assignRegion(hriOverlap1);
804       master.getAssignmentManager().waitForAssignment(hriOverlap1);
805       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
806         Bytes.toBytes("AB"), Bytes.toBytes("B"));
807       master.assignRegion(hriOverlap2);
808       master.getAssignmentManager().waitForAssignment(hriOverlap2);
809 
810       HBaseFsck hbck = doFsck(conf, false);
811       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
812         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
813       assertEquals(3, hbck.getOverlapGroups(table).size());
814       assertEquals(ROWKEYS.length, countRows());
815 
816       // mess around the overlapped regions, to trigger NotServingRegionException
817       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
818       ServerName serverName = null;
819       byte[] regionName = null;
820       for (HbckInfo hbi: overlapGroups.values()) {
821         if ("A".equals(Bytes.toString(hbi.getStartKey()))
822             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
823           regionName = hbi.getRegionName();
824 
825           // get an RS not serving the region to force bad assignment info in to META.
826           int k = cluster.getServerWith(regionName);
827           for (int i = 0; i < 3; i++) {
828             if (i != k) {
829               HRegionServer rs = cluster.getRegionServer(i);
830               serverName = rs.getServerName();
831               break;
832             }
833           }
834 
835           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
836           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
837             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
838           admin.offline(regionName);
839           break;
840         }
841       }
842 
843       assertNotNull(regionName);
844       assertNotNull(serverName);
845       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
846       Put put = new Put(regionName);
847       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
848         Bytes.toBytes(serverName.getHostAndPort()));
849       meta.put(put);
850 
851       // fix the problem.
852       HBaseFsck fsck = new HBaseFsck(conf);
853       fsck.connect();
854       fsck.setDisplayFullReport(); // i.e. -details
855       fsck.setTimeLag(0);
856       fsck.setFixAssignments(true);
857       fsck.setFixMeta(true);
858       fsck.setFixHdfsHoles(true);
859       fsck.setFixHdfsOverlaps(true);
860       fsck.setFixHdfsOrphans(true);
861       fsck.setFixVersionFile(true);
862       fsck.setSidelineBigOverlaps(true);
863       fsck.setMaxMerge(2);
864       fsck.onlineHbck();
865 
866       // verify that overlaps are fixed, and there are less rows
867       // since one region is sidelined.
868       HBaseFsck hbck2 = doFsck(conf,false);
869       assertNoErrors(hbck2);
870       assertEquals(0, hbck2.getOverlapGroups(table).size());
871       assertTrue(ROWKEYS.length > countRows());
872     } finally {
873        deleteTable(table);
874     }
875   }
876 
877   /**
878    * This creates and fixes a bad table where a region is completely contained
879    * by another region, and there is a hole (sort of like a bad split)
880    */
881   @Test
882   public void testOverlapAndOrphan() throws Exception {
883     TableName table =
884         TableName.valueOf("tableOverlapAndOrphan");
885     try {
886       setupTable(table);
887       assertEquals(ROWKEYS.length, countRows());
888 
889       // Mess it up by creating an overlap in the metadata
890       TEST_UTIL.getHBaseAdmin().disableTable(table);
891       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
892           Bytes.toBytes("B"), true, true, false, true);
893       TEST_UTIL.getHBaseAdmin().enableTable(table);
894 
895       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
896           Bytes.toBytes("A2"), Bytes.toBytes("B"));
897       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
898       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
899           .waitForAssignment(hriOverlap);
900       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
901       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
902 
903       HBaseFsck hbck = doFsck(conf, false);
904       assertErrors(hbck, new ERROR_CODE[] {
905           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
906           ERROR_CODE.HOLE_IN_REGION_CHAIN});
907 
908       // fix the problem.
909       doFsck(conf, true);
910 
911       // verify that overlaps are fixed
912       HBaseFsck hbck2 = doFsck(conf,false);
913       assertNoErrors(hbck2);
914       assertEquals(0, hbck2.getOverlapGroups(table).size());
915       assertEquals(ROWKEYS.length, countRows());
916     } finally {
917        deleteTable(table);
918     }
919   }
920 
921   /**
922    * This creates and fixes a bad table where a region overlaps two regions --
923    * a start key contained in another region and its end key is contained in
924    * yet another region.
925    */
926   @Test
927   public void testCoveredStartKey() throws Exception {
928     TableName table =
929         TableName.valueOf("tableCoveredStartKey");
930     try {
931       setupTable(table);
932       assertEquals(ROWKEYS.length, countRows());
933 
934       // Mess it up by creating an overlap in the metadata
935       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
936           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
937       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
938       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
939           .waitForAssignment(hriOverlap);
940       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
941       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
942 
943       HBaseFsck hbck = doFsck(conf, false);
944       assertErrors(hbck, new ERROR_CODE[] {
945           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
946           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
947       assertEquals(3, hbck.getOverlapGroups(table).size());
948       assertEquals(ROWKEYS.length, countRows());
949 
950       // fix the problem.
951       doFsck(conf, true);
952 
953       // verify that overlaps are fixed
954       HBaseFsck hbck2 = doFsck(conf, false);
955       assertErrors(hbck2, new ERROR_CODE[0]);
956       assertEquals(0, hbck2.getOverlapGroups(table).size());
957       assertEquals(ROWKEYS.length, countRows());
958     } finally {
959       deleteTable(table);
960     }
961   }
962 
963   /**
964    * This creates and fixes a bad table with a missing region -- hole in meta
965    * and data missing in the fs.
966    */
967   @Test
968   public void testRegionHole() throws Exception {
969     TableName table =
970         TableName.valueOf("tableRegionHole");
971     try {
972       setupTable(table);
973       assertEquals(ROWKEYS.length, countRows());
974 
975       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
976       TEST_UTIL.getHBaseAdmin().disableTable(table);
977       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
978           Bytes.toBytes("C"), true, true, true);
979       TEST_UTIL.getHBaseAdmin().enableTable(table);
980 
981       HBaseFsck hbck = doFsck(conf, false);
982       assertErrors(hbck, new ERROR_CODE[] {
983           ERROR_CODE.HOLE_IN_REGION_CHAIN});
984       // holes are separate from overlap groups
985       assertEquals(0, hbck.getOverlapGroups(table).size());
986 
987       // fix hole
988       doFsck(conf, true);
989 
990       // check that hole fixed
991       assertNoErrors(doFsck(conf,false));
992       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
993     } finally {
994       deleteTable(table);
995     }
996   }
997 
998   /**
999    * This creates and fixes a bad table with a missing region -- hole in meta
1000    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
1001    */
1002   @Test
1003   public void testHDFSRegioninfoMissing() throws Exception {
1004     TableName table =
1005         TableName.valueOf("tableHDFSRegioininfoMissing");
1006     try {
1007       setupTable(table);
1008       assertEquals(ROWKEYS.length, countRows());
1009 
1010       // Mess it up by leaving a hole in the meta data
1011       TEST_UTIL.getHBaseAdmin().disableTable(table);
1012       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1013           Bytes.toBytes("C"), true, true, false, true);
1014       TEST_UTIL.getHBaseAdmin().enableTable(table);
1015 
1016       HBaseFsck hbck = doFsck(conf, false);
1017       assertErrors(hbck, new ERROR_CODE[] {
1018           ERROR_CODE.ORPHAN_HDFS_REGION,
1019           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1020           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1021       // holes are separate from overlap groups
1022       assertEquals(0, hbck.getOverlapGroups(table).size());
1023 
1024       // fix hole
1025       doFsck(conf, true);
1026 
1027       // check that hole fixed
1028       assertNoErrors(doFsck(conf, false));
1029       assertEquals(ROWKEYS.length, countRows());
1030     } finally {
1031       deleteTable(table);
1032     }
1033   }
1034 
1035   /**
1036    * This creates and fixes a bad table with a region that is missing meta and
1037    * not assigned to a region server.
1038    */
1039   @Test
1040   public void testNotInMetaOrDeployedHole() throws Exception {
1041     TableName table =
1042         TableName.valueOf("tableNotInMetaOrDeployedHole");
1043     try {
1044       setupTable(table);
1045       assertEquals(ROWKEYS.length, countRows());
1046 
1047       // Mess it up by leaving a hole in the meta data
1048       TEST_UTIL.getHBaseAdmin().disableTable(table);
1049       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1050           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1051       TEST_UTIL.getHBaseAdmin().enableTable(table);
1052 
1053       HBaseFsck hbck = doFsck(conf, false);
1054       assertErrors(hbck, new ERROR_CODE[] {
1055           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1056       // holes are separate from overlap groups
1057       assertEquals(0, hbck.getOverlapGroups(table).size());
1058 
1059       // fix hole
1060       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1061           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1062 
1063       // check that hole fixed
1064       assertNoErrors(doFsck(conf,false));
1065       assertEquals(ROWKEYS.length, countRows());
1066     } finally {
1067       deleteTable(table);
1068     }
1069   }
1070 
1071   /**
1072    * This creates fixes a bad table with a hole in meta.
1073    */
1074   @Test
1075   public void testNotInMetaHole() throws Exception {
1076     TableName table =
1077         TableName.valueOf("tableNotInMetaHole");
1078     try {
1079       setupTable(table);
1080       assertEquals(ROWKEYS.length, countRows());
1081 
1082       // Mess it up by leaving a hole in the meta data
1083       TEST_UTIL.getHBaseAdmin().disableTable(table);
1084       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1085           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1086       TEST_UTIL.getHBaseAdmin().enableTable(table);
1087 
1088       HBaseFsck hbck = doFsck(conf, false);
1089       assertErrors(hbck, new ERROR_CODE[] {
1090           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1091       // holes are separate from overlap groups
1092       assertEquals(0, hbck.getOverlapGroups(table).size());
1093 
1094       // fix hole
1095       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1096           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1097 
1098       // check that hole fixed
1099       assertNoErrors(doFsck(conf,false));
1100       assertEquals(ROWKEYS.length, countRows());
1101     } finally {
1102       deleteTable(table);
1103     }
1104   }
1105 
1106   /**
1107    * This creates and fixes a bad table with a region that is in meta but has
1108    * no deployment or data hdfs
1109    */
1110   @Test
1111   public void testNotInHdfs() throws Exception {
1112     TableName table =
1113         TableName.valueOf("tableNotInHdfs");
1114     try {
1115       setupTable(table);
1116       assertEquals(ROWKEYS.length, countRows());
1117 
1118       // make sure data in regions, if in hlog only there is no data loss
1119       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1120 
1121       // Mess it up by leaving a hole in the hdfs data
1122       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1123           Bytes.toBytes("C"), false, false, true); // don't rm meta
1124 
1125       HBaseFsck hbck = doFsck(conf, false);
1126       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1127       // holes are separate from overlap groups
1128       assertEquals(0, hbck.getOverlapGroups(table).size());
1129 
1130       // fix hole
1131       doFsck(conf, true);
1132 
1133       // check that hole fixed
1134       assertNoErrors(doFsck(conf,false));
1135       assertEquals(ROWKEYS.length - 2, countRows());
1136     } finally {
1137       deleteTable(table);
1138     }
1139   }
1140 
1141   /**
1142    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1143    * remove the table.
1144    */
1145   @Test
1146   public void testNoHdfsTable() throws Exception {
1147     TableName table = TableName.valueOf("NoHdfsTable");
1148     setupTable(table);
1149     assertEquals(ROWKEYS.length, countRows());
1150 
1151     // make sure data in regions, if in hlog only there is no data loss
1152     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1153 
1154     // Mess it up by deleting hdfs dirs
1155     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1156         Bytes.toBytes("A"), false, false, true); // don't rm meta
1157     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1158         Bytes.toBytes("B"), false, false, true); // don't rm meta
1159     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1160         Bytes.toBytes("C"), false, false, true); // don't rm meta
1161     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1162         Bytes.toBytes(""), false, false, true); // don't rm meta
1163 
1164     // also remove the table directory in hdfs
1165     deleteTableDir(table);
1166 
1167     HBaseFsck hbck = doFsck(conf, false);
1168     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1169         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1170         ERROR_CODE.NOT_IN_HDFS,});
1171     // holes are separate from overlap groups
1172     assertEquals(0, hbck.getOverlapGroups(table).size());
1173 
1174     // fix hole
1175     doFsck(conf, true); // detect dangling regions and remove those
1176 
1177     // check that hole fixed
1178     assertNoErrors(doFsck(conf,false));
1179     assertFalse("Table "+ table + " should have been deleted",
1180         TEST_UTIL.getHBaseAdmin().tableExists(table));
1181   }
1182 
1183   public void deleteTableDir(TableName table) throws IOException {
1184     Path rootDir = FSUtils.getRootDir(conf);
1185     FileSystem fs = rootDir.getFileSystem(conf);
1186     Path p = FSUtils.getTableDir(rootDir, table);
1187     HBaseFsck.debugLsr(conf, p);
1188     boolean success = fs.delete(p, true);
1189     LOG.info("Deleted " + p + " sucessfully? " + success);
1190   }
1191 
1192   /**
1193    * when the hbase.version file missing, It is fix the fault.
1194    */
1195   @Test
1196   public void testNoVersionFile() throws Exception {
1197     // delete the hbase.version file
1198     Path rootDir = FSUtils.getRootDir(conf);
1199     FileSystem fs = rootDir.getFileSystem(conf);
1200     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1201     fs.delete(versionFile, true);
1202 
1203     // test
1204     HBaseFsck hbck = doFsck(conf, false);
1205     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1206     // fix hbase.version missing
1207     doFsck(conf, true);
1208 
1209     // no version file fixed
1210     assertNoErrors(doFsck(conf, false));
1211   }
1212 
1213   /**
1214    * The region is not deployed when the table is disabled.
1215    */
1216   @Test
1217   public void testRegionShouldNotBeDeployed() throws Exception {
1218     TableName table =
1219         TableName.valueOf("tableRegionShouldNotBeDeployed");
1220     try {
1221       LOG.info("Starting testRegionShouldNotBeDeployed.");
1222       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1223       assertTrue(cluster.waitForActiveAndReadyMaster());
1224 
1225 
1226       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1227           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1228       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1229       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1230 
1231       // Write the .tableinfo
1232       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1233       fstd.createTableDescriptor(htdDisabled);
1234       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1235           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1236 
1237       // Let's just assign everything to first RS
1238       HRegionServer hrs = cluster.getRegionServer(0);
1239 
1240       // Create region files.
1241       TEST_UTIL.getHBaseAdmin().disableTable(table);
1242       TEST_UTIL.getHBaseAdmin().enableTable(table);
1243 
1244       // Disable the table and close its regions
1245       TEST_UTIL.getHBaseAdmin().disableTable(table);
1246       HRegionInfo region = disabledRegions.remove(0);
1247       byte[] regionName = region.getRegionName();
1248 
1249       // The region should not be assigned currently
1250       assertTrue(cluster.getServerWith(regionName) == -1);
1251 
1252       // Directly open a region on a region server.
1253       // If going through AM/ZK, the region won't be open.
1254       // Even it is opened, AM will close it which causes
1255       // flakiness of this test.
1256       HRegion r = HRegion.openHRegion(
1257         region, htdDisabled, hrs.getWAL(region), conf);
1258       hrs.addToOnlineRegions(r);
1259 
1260       HBaseFsck hbck = doFsck(conf, false);
1261       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1262 
1263       // fix this fault
1264       doFsck(conf, true);
1265 
1266       // check result
1267       assertNoErrors(doFsck(conf, false));
1268     } finally {
1269       TEST_UTIL.getHBaseAdmin().enableTable(table);
1270       deleteTable(table);
1271     }
1272   }
1273 
1274   /**
1275    * This creates two tables and mess both of them and fix them one by one
1276    */
1277   @Test
1278   public void testFixByTable() throws Exception {
1279     TableName table1 =
1280         TableName.valueOf("testFixByTable1");
1281     TableName table2 =
1282         TableName.valueOf("testFixByTable2");
1283     try {
1284       setupTable(table1);
1285       // make sure data in regions, if in hlog only there is no data loss
1286       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1287       // Mess them up by leaving a hole in the hdfs data
1288       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1289         Bytes.toBytes("C"), false, false, true); // don't rm meta
1290 
1291       setupTable(table2);
1292       // make sure data in regions, if in hlog only there is no data loss
1293       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1294       // Mess them up by leaving a hole in the hdfs data
1295       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1296         Bytes.toBytes("C"), false, false, true); // don't rm meta
1297 
1298       HBaseFsck hbck = doFsck(conf, false);
1299       assertErrors(hbck, new ERROR_CODE[] {
1300         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1301 
1302       // fix hole in table 1
1303       doFsck(conf, true, table1);
1304       // check that hole in table 1 fixed
1305       assertNoErrors(doFsck(conf, false, table1));
1306       // check that hole in table 2 still there
1307       assertErrors(doFsck(conf, false, table2),
1308         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1309 
1310       // fix hole in table 2
1311       doFsck(conf, true, table2);
1312       // check that hole in both tables fixed
1313       assertNoErrors(doFsck(conf, false));
1314       assertEquals(ROWKEYS.length - 2, countRows());
1315     } finally {
1316       deleteTable(table1);
1317       deleteTable(table2);
1318     }
1319   }
1320   /**
1321    * A split parent in meta, in hdfs, and not deployed
1322    */
1323   @Test
1324   public void testLingeringSplitParent() throws Exception {
1325     TableName table =
1326         TableName.valueOf("testLingeringSplitParent");
1327     HTable meta = null;
1328     try {
1329       setupTable(table);
1330       assertEquals(ROWKEYS.length, countRows());
1331 
1332       // make sure data in regions, if in hlog only there is no data loss
1333       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1334       HRegionLocation location = tbl.getRegionLocation("B");
1335 
1336       // Delete one region from meta, but not hdfs, unassign it.
1337       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1338         Bytes.toBytes("C"), true, true, false);
1339 
1340       // Create a new meta entry to fake it as a split parent.
1341       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
1342       HRegionInfo hri = location.getRegionInfo();
1343 
1344       HRegionInfo a = new HRegionInfo(tbl.getName(),
1345         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1346       HRegionInfo b = new HRegionInfo(tbl.getName(),
1347         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1348 
1349       hri.setOffline(true);
1350       hri.setSplit(true);
1351 
1352       MetaEditor.addRegionToMeta(meta, hri, a, b);
1353       meta.flushCommits();
1354       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1355 
1356       HBaseFsck hbck = doFsck(conf, false);
1357       assertErrors(hbck, new ERROR_CODE[] {
1358         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1359 
1360       // regular repair cannot fix lingering split parent
1361       hbck = doFsck(conf, true);
1362       assertErrors(hbck, new ERROR_CODE[] {
1363         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1364       assertFalse(hbck.shouldRerun());
1365       hbck = doFsck(conf, false);
1366       assertErrors(hbck, new ERROR_CODE[] {
1367         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1368 
1369       // fix lingering split parent
1370       hbck = new HBaseFsck(conf);
1371       hbck.connect();
1372       hbck.setDisplayFullReport(); // i.e. -details
1373       hbck.setTimeLag(0);
1374       hbck.setFixSplitParents(true);
1375       hbck.onlineHbck();
1376       assertTrue(hbck.shouldRerun());
1377 
1378       Get get = new Get(hri.getRegionName());
1379       Result result = meta.get(get);
1380       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1381         HConstants.SPLITA_QUALIFIER).isEmpty());
1382       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1383         HConstants.SPLITB_QUALIFIER).isEmpty());
1384       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1385 
1386       // fix other issues
1387       doFsck(conf, true);
1388 
1389       // check that all are fixed
1390       assertNoErrors(doFsck(conf, false));
1391       assertEquals(ROWKEYS.length, countRows());
1392     } finally {
1393       deleteTable(table);
1394       IOUtils.closeQuietly(meta);
1395     }
1396   }
1397 
1398   /**
1399    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1400    * valid cases where the daughters are there.
1401    */
1402   @Test
1403   public void testValidLingeringSplitParent() throws Exception {
1404     TableName table =
1405         TableName.valueOf("testLingeringSplitParent");
1406     HTable meta = null;
1407     try {
1408       setupTable(table);
1409       assertEquals(ROWKEYS.length, countRows());
1410 
1411       // make sure data in regions, if in hlog only there is no data loss
1412       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1413       HRegionLocation location = tbl.getRegionLocation("B");
1414 
1415       meta = new HTable(conf, TableName.META_TABLE_NAME);
1416       HRegionInfo hri = location.getRegionInfo();
1417 
1418       // do a regular split
1419       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1420       byte[] regionName = location.getRegionInfo().getRegionName();
1421       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1422       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1423           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1424 
1425       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1426       // for some time until children references are deleted. HBCK erroneously sees this as
1427       // overlapping regions
1428       HBaseFsck hbck = doFsck(
1429         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1430       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1431 
1432       // assert that the split hbase:meta entry is still there.
1433       Get get = new Get(hri.getRegionName());
1434       Result result = meta.get(get);
1435       assertNotNull(result);
1436       assertNotNull(HRegionInfo.getHRegionInfo(result));
1437 
1438       assertEquals(ROWKEYS.length, countRows());
1439 
1440       // assert that we still have the split regions
1441       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1442       assertNoErrors(doFsck(conf, false));
1443     } finally {
1444       deleteTable(table);
1445       IOUtils.closeQuietly(meta);
1446     }
1447   }
1448 
1449   /**
1450    * Split crashed after write to hbase:meta finished for the parent region, but
1451    * failed to write daughters (pre HBASE-7721 codebase)
1452    */
1453   @Test(timeout=75000)
1454   public void testSplitDaughtersNotInMeta() throws Exception {
1455     TableName table =
1456         TableName.valueOf("testSplitdaughtersNotInMeta");
1457     HTable meta = null;
1458     try {
1459       setupTable(table);
1460       assertEquals(ROWKEYS.length, countRows());
1461 
1462       // make sure data in regions, if in hlog only there is no data loss
1463       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1464       HRegionLocation location = tbl.getRegionLocation("B");
1465 
1466       meta = new HTable(conf, TableName.META_TABLE_NAME);
1467       HRegionInfo hri = location.getRegionInfo();
1468 
1469       // do a regular split
1470       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1471       byte[] regionName = location.getRegionInfo().getRegionName();
1472       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1473       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1474           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1475 
1476       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1477 
1478       // Delete daughter regions from meta, but not hdfs, unassign it.
1479       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1480       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1481       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1482 
1483       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1484       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1485       meta.flushCommits();
1486 
1487       HBaseFsck hbck = doFsck(conf, false);
1488       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1489           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1490 
1491       // now fix it. The fix should not revert the region split, but add daughters to META
1492       hbck = doFsck(
1493         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1494       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1495           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1496 
1497       // assert that the split hbase:meta entry is still there.
1498       Get get = new Get(hri.getRegionName());
1499       Result result = meta.get(get);
1500       assertNotNull(result);
1501       assertNotNull(HRegionInfo.getHRegionInfo(result));
1502 
1503       assertEquals(ROWKEYS.length, countRows());
1504 
1505       // assert that we still have the split regions
1506       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1507       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1508     } finally {
1509       deleteTable(table);
1510       IOUtils.closeQuietly(meta);
1511     }
1512   }
1513 
1514   /**
1515    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1516    * meta and data missing in the fs.
1517    */
1518   @Test(timeout=120000)
1519   public void testMissingFirstRegion() throws Exception {
1520     TableName table =
1521         TableName.valueOf("testMissingFirstRegion");
1522     try {
1523       setupTable(table);
1524       assertEquals(ROWKEYS.length, countRows());
1525 
1526       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1527       TEST_UTIL.getHBaseAdmin().disableTable(table);
1528       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1529           true, true);
1530       TEST_UTIL.getHBaseAdmin().enableTable(table);
1531 
1532       HBaseFsck hbck = doFsck(conf, false);
1533       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1534       // fix hole
1535       doFsck(conf, true);
1536       // check that hole fixed
1537       assertNoErrors(doFsck(conf, false));
1538     } finally {
1539       deleteTable(table);
1540     }
1541   }
1542 
1543   /**
1544    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1545    * meta and data missing in the fs.
1546    */
1547   @Test(timeout=120000)
1548   public void testRegionDeployedNotInHdfs() throws Exception {
1549     TableName table =
1550         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1551     try {
1552       setupTable(table);
1553       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1554 
1555       // Mess it up by deleting region dir
1556       deleteRegion(conf, tbl.getTableDescriptor(),
1557         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1558         false, true);
1559 
1560       HBaseFsck hbck = doFsck(conf, false);
1561       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1562       // fix hole
1563       doFsck(conf, true);
1564       // check that hole fixed
1565       assertNoErrors(doFsck(conf, false));
1566     } finally {
1567       deleteTable(table);
1568     }
1569   }
1570 
1571   /**
1572    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1573    * the fs.
1574    */
1575   @Test(timeout=120000)
1576   public void testMissingLastRegion() throws Exception {
1577     TableName table =
1578         TableName.valueOf("testMissingLastRegion");
1579     try {
1580       setupTable(table);
1581       assertEquals(ROWKEYS.length, countRows());
1582 
1583       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1584       TEST_UTIL.getHBaseAdmin().disableTable(table);
1585       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1586           true, true);
1587       TEST_UTIL.getHBaseAdmin().enableTable(table);
1588 
1589       HBaseFsck hbck = doFsck(conf, false);
1590       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1591       // fix hole
1592       doFsck(conf, true);
1593       // check that hole fixed
1594       assertNoErrors(doFsck(conf, false));
1595     } finally {
1596       deleteTable(table);
1597     }
1598   }
1599 
1600   /**
1601    * Test -noHdfsChecking option can detect and fix assignments issue.
1602    */
1603   @Test
1604   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1605     TableName table =
1606         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1607     try {
1608       setupTable(table);
1609       assertEquals(ROWKEYS.length, countRows());
1610 
1611       // Mess it up by closing a region
1612       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1613         Bytes.toBytes("B"), true, false, false, false);
1614 
1615       // verify there is no other errors
1616       HBaseFsck hbck = doFsck(conf, false);
1617       assertErrors(hbck, new ERROR_CODE[] {
1618         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1619 
1620       // verify that noHdfsChecking report the same errors
1621       HBaseFsck fsck = new HBaseFsck(conf);
1622       fsck.connect();
1623       fsck.setDisplayFullReport(); // i.e. -details
1624       fsck.setTimeLag(0);
1625       fsck.setCheckHdfs(false);
1626       fsck.onlineHbck();
1627       assertErrors(fsck, new ERROR_CODE[] {
1628         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1629 
1630       // verify that fixAssignments works fine with noHdfsChecking
1631       fsck = new HBaseFsck(conf);
1632       fsck.connect();
1633       fsck.setDisplayFullReport(); // i.e. -details
1634       fsck.setTimeLag(0);
1635       fsck.setCheckHdfs(false);
1636       fsck.setFixAssignments(true);
1637       fsck.onlineHbck();
1638       assertTrue(fsck.shouldRerun());
1639       fsck.onlineHbck();
1640       assertNoErrors(fsck);
1641 
1642       assertEquals(ROWKEYS.length, countRows());
1643     } finally {
1644       deleteTable(table);
1645     }
1646   }
1647 
1648   /**
1649    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1650    * However, it can not fix it without checking Hdfs because we need to get
1651    * the region info from Hdfs in this case, then to patch the meta.
1652    */
1653   @Test
1654   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1655     TableName table =
1656         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1657     try {
1658       setupTable(table);
1659       assertEquals(ROWKEYS.length, countRows());
1660 
1661       // Mess it up by deleting a region from the metadata
1662       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1663         Bytes.toBytes("B"), false, true, false, false);
1664 
1665       // verify there is no other errors
1666       HBaseFsck hbck = doFsck(conf, false);
1667       assertErrors(hbck, new ERROR_CODE[] {
1668         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1669 
1670       // verify that noHdfsChecking report the same errors
1671       HBaseFsck fsck = new HBaseFsck(conf);
1672       fsck.connect();
1673       fsck.setDisplayFullReport(); // i.e. -details
1674       fsck.setTimeLag(0);
1675       fsck.setCheckHdfs(false);
1676       fsck.onlineHbck();
1677       assertErrors(fsck, new ERROR_CODE[] {
1678         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1679 
1680       // verify that fixMeta doesn't work with noHdfsChecking
1681       fsck = new HBaseFsck(conf);
1682       fsck.connect();
1683       fsck.setDisplayFullReport(); // i.e. -details
1684       fsck.setTimeLag(0);
1685       fsck.setCheckHdfs(false);
1686       fsck.setFixAssignments(true);
1687       fsck.setFixMeta(true);
1688       fsck.onlineHbck();
1689       assertFalse(fsck.shouldRerun());
1690       assertErrors(fsck, new ERROR_CODE[] {
1691         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1692 
1693       // fix the cluster so other tests won't be impacted
1694       fsck = doFsck(conf, true);
1695       assertTrue(fsck.shouldRerun());
1696       fsck = doFsck(conf, true);
1697       assertNoErrors(fsck);
1698     } finally {
1699       deleteTable(table);
1700     }
1701   }
1702 
1703   /**
1704    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1705    * and -noHdfsChecking can't detect orphan Hdfs region.
1706    */
1707   @Test
1708   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1709     TableName table =
1710         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1711     try {
1712       setupTable(table);
1713       assertEquals(ROWKEYS.length, countRows());
1714 
1715       // Mess it up by creating an overlap in the metadata
1716       TEST_UTIL.getHBaseAdmin().disableTable(table);
1717       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1718         Bytes.toBytes("B"), true, true, false, true);
1719       TEST_UTIL.getHBaseAdmin().enableTable(table);
1720 
1721       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1722         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1723       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1724       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1725         .waitForAssignment(hriOverlap);
1726       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1727       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1728 
1729       HBaseFsck hbck = doFsck(conf, false);
1730       assertErrors(hbck, new ERROR_CODE[] {
1731         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1732         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1733 
1734       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1735       HBaseFsck fsck = new HBaseFsck(conf);
1736       fsck.connect();
1737       fsck.setDisplayFullReport(); // i.e. -details
1738       fsck.setTimeLag(0);
1739       fsck.setCheckHdfs(false);
1740       fsck.onlineHbck();
1741       assertErrors(fsck, new ERROR_CODE[] {
1742         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1743 
1744       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1745       fsck = new HBaseFsck(conf);
1746       fsck.connect();
1747       fsck.setDisplayFullReport(); // i.e. -details
1748       fsck.setTimeLag(0);
1749       fsck.setCheckHdfs(false);
1750       fsck.setFixHdfsHoles(true);
1751       fsck.setFixHdfsOverlaps(true);
1752       fsck.setFixHdfsOrphans(true);
1753       fsck.onlineHbck();
1754       assertFalse(fsck.shouldRerun());
1755       assertErrors(fsck, new ERROR_CODE[] {
1756         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1757     } finally {
1758       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1759         TEST_UTIL.getHBaseAdmin().enableTable(table);
1760       }
1761       deleteTable(table);
1762     }
1763   }
1764 
1765   /**
1766    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1767    * legitimate hfile and return it.
1768    * @param fs
1769    * @param table
1770    * @return Path of a flushed hfile.
1771    * @throws IOException
1772    */
1773   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1774     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1775     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1776     Path famDir = new Path(regionDir, FAM_STR);
1777 
1778     // keep doing this until we get a legit hfile
1779     while (true) {
1780       FileStatus[] hfFss = fs.listStatus(famDir);
1781       if (hfFss.length == 0) {
1782         continue;
1783       }
1784       for (FileStatus hfs : hfFss) {
1785         if (!hfs.isDir()) {
1786           return hfs.getPath();
1787         }
1788       }
1789     }
1790   }
1791 
1792   /**
1793    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1794    */
1795   @Test(timeout=180000)
1796   public void testQuarantineCorruptHFile() throws Exception {
1797     TableName table = TableName.valueOf(name.getMethodName());
1798     try {
1799       setupTable(table);
1800       assertEquals(ROWKEYS.length, countRows());
1801       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1802 
1803       FileSystem fs = FileSystem.get(conf);
1804       Path hfile = getFlushedHFile(fs, table);
1805 
1806       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1807       TEST_UTIL.getHBaseAdmin().disableTable(table);
1808 
1809       // create new corrupt file called deadbeef (valid hfile name)
1810       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1811       TestHFile.truncateFile(fs, hfile, corrupt);
1812       LOG.info("Created corrupted file " + corrupt);
1813       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1814 
1815       // we cannot enable here because enable never finished due to the corrupt region.
1816       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1817       assertEquals(res.getRetCode(), 0);
1818       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1819       assertEquals(hfcc.getHFilesChecked(), 5);
1820       assertEquals(hfcc.getCorrupted().size(), 1);
1821       assertEquals(hfcc.getFailures().size(), 0);
1822       assertEquals(hfcc.getQuarantined().size(), 1);
1823       assertEquals(hfcc.getMissing().size(), 0);
1824 
1825       // Its been fixed, verify that we can enable.
1826       TEST_UTIL.getHBaseAdmin().enableTable(table);
1827     } finally {
1828       deleteTable(table);
1829     }
1830   }
1831 
1832   /**
1833   * Test that use this should have a timeout, because this method could potentially wait forever.
1834   */
1835   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1836                                 int corrupt, int fail, int quar, int missing) throws Exception {
1837     try {
1838       setupTable(table);
1839       assertEquals(ROWKEYS.length, countRows());
1840       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1841 
1842       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1843       TEST_UTIL.getHBaseAdmin().disableTable(table);
1844 
1845       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1846           table.getNameAsString()};
1847       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1848       HBaseFsck res = hbck.exec(exec, args);
1849 
1850       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1851       assertEquals(hfcc.getHFilesChecked(), check);
1852       assertEquals(hfcc.getCorrupted().size(), corrupt);
1853       assertEquals(hfcc.getFailures().size(), fail);
1854       assertEquals(hfcc.getQuarantined().size(), quar);
1855       assertEquals(hfcc.getMissing().size(), missing);
1856 
1857       // its been fixed, verify that we can enable
1858       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1859       admin.enableTableAsync(table);
1860       while (!admin.isTableEnabled(table)) {
1861         try {
1862           Thread.sleep(250);
1863         } catch (InterruptedException e) {
1864           e.printStackTrace();
1865           fail("Interrupted when trying to enable table " + table);
1866         }
1867       }
1868     } finally {
1869       deleteTable(table);
1870     }
1871   }
1872 
1873   /**
1874    * This creates a table and simulates the race situation where a concurrent compaction or split
1875    * has removed an hfile after the corruption checker learned about it.
1876    */
1877   @Test(timeout=180000)
1878   public void testQuarantineMissingHFile() throws Exception {
1879     TableName table = TableName.valueOf(name.getMethodName());
1880     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1881     // inject a fault in the hfcc created.
1882     final FileSystem fs = FileSystem.get(conf);
1883     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1884       @Override
1885       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1886         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1887           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1888           @Override
1889           protected void checkHFile(Path p) throws IOException {
1890             if (attemptedFirstHFile.compareAndSet(false, true)) {
1891               assertTrue(fs.delete(p, true)); // make sure delete happened.
1892             }
1893             super.checkHFile(p);
1894           }
1895         };
1896       }
1897     };
1898     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1899   }
1900 
1901   /**
1902    * This creates a table and simulates the race situation where a concurrent compaction or split
1903    * has removed an colfam dir before the corruption checker got to it.
1904    */
1905   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1906   // files in a column family on initial creation -- as suggested by Matteo.
1907   @Ignore @Test(timeout=180000)
1908   public void testQuarantineMissingFamdir() throws Exception {
1909     TableName table = TableName.valueOf(name.getMethodName());
1910     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1911     // inject a fault in the hfcc created.
1912     final FileSystem fs = FileSystem.get(conf);
1913     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1914       @Override
1915       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1916         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1917           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1918           @Override
1919           protected void checkColFamDir(Path p) throws IOException {
1920             if (attemptedFirstHFile.compareAndSet(false, true)) {
1921               assertTrue(fs.delete(p, true)); // make sure delete happened.
1922             }
1923             super.checkColFamDir(p);
1924           }
1925         };
1926       }
1927     };
1928     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1929   }
1930 
1931   /**
1932    * This creates a table and simulates the race situation where a concurrent compaction or split
1933    * has removed a region dir before the corruption checker got to it.
1934    */
1935   @Test(timeout=180000)
1936   public void testQuarantineMissingRegionDir() throws Exception {
1937     TableName table = TableName.valueOf(name.getMethodName());
1938     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1939     // inject a fault in the hfcc created.
1940     final FileSystem fs = FileSystem.get(conf);
1941     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1942       @Override
1943       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1944         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1945           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1946           @Override
1947           protected void checkRegionDir(Path p) throws IOException {
1948             if (attemptedFirstHFile.compareAndSet(false, true)) {
1949               assertTrue(fs.delete(p, true)); // make sure delete happened.
1950             }
1951             super.checkRegionDir(p);
1952           }
1953         };
1954       }
1955     };
1956     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1957   }
1958 
1959   /**
1960    * Test fixing lingering reference file.
1961    */
1962   @Test
1963   public void testLingeringReferenceFile() throws Exception {
1964     TableName table =
1965         TableName.valueOf("testLingeringReferenceFile");
1966     try {
1967       setupTable(table);
1968       assertEquals(ROWKEYS.length, countRows());
1969 
1970       // Mess it up by creating a fake reference file
1971       FileSystem fs = FileSystem.get(conf);
1972       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1973       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1974       Path famDir = new Path(regionDir, FAM_STR);
1975       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1976       fs.create(fakeReferenceFile);
1977 
1978       HBaseFsck hbck = doFsck(conf, false);
1979       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1980       // fix reference file
1981       doFsck(conf, true);
1982       // check that reference file fixed
1983       assertNoErrors(doFsck(conf, false));
1984     } finally {
1985       deleteTable(table);
1986     }
1987   }
1988 
1989   /**
1990    * Test mission REGIONINFO_QUALIFIER in hbase:meta
1991    */
1992   @Test
1993   public void testMissingRegionInfoQualifier() throws Exception {
1994     TableName table =
1995         TableName.valueOf("testMissingRegionInfoQualifier");
1996     try {
1997       setupTable(table);
1998 
1999       // Mess it up by removing the RegionInfo for one region.
2000       final List<Delete> deletes = new LinkedList<Delete>();
2001       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
2002       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
2003 
2004         @Override
2005         public boolean processRow(Result rowResult) throws IOException {
2006           HRegionInfo hri = MetaScanner.getHRegionInfo(rowResult);
2007           if (hri != null && !hri.getTable().isSystemTable()) {
2008             Delete delete = new Delete(rowResult.getRow());
2009             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2010             deletes.add(delete);
2011           }
2012           return true;
2013         }
2014 
2015         @Override
2016         public void close() throws IOException {
2017         }
2018       });
2019       meta.delete(deletes);
2020 
2021       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2022       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2023         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2024       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2025         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2026       meta.close();
2027 
2028       HBaseFsck hbck = doFsck(conf, false);
2029       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2030 
2031       // fix reference file
2032       hbck = doFsck(conf, true);
2033 
2034       // check that reference file fixed
2035       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2036     } finally {
2037       deleteTable(table);
2038     }
2039   }
2040 
2041 
2042   /**
2043    * Test pluggable error reporter. It can be plugged in
2044    * from system property or configuration.
2045    */
2046   @Test
2047   public void testErrorReporter() throws Exception {
2048     try {
2049       MockErrorReporter.calledCount = 0;
2050       doFsck(conf, false);
2051       assertEquals(MockErrorReporter.calledCount, 0);
2052 
2053       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2054       doFsck(conf, false);
2055       assertTrue(MockErrorReporter.calledCount > 20);
2056     } finally {
2057       conf.set("hbasefsck.errorreporter",
2058         PrintingErrorReporter.class.getName());
2059       MockErrorReporter.calledCount = 0;
2060     }
2061   }
2062 
2063   static class MockErrorReporter implements ErrorReporter {
2064     static int calledCount = 0;
2065 
2066     @Override
2067     public void clear() {
2068       calledCount++;
2069     }
2070 
2071     @Override
2072     public void report(String message) {
2073       calledCount++;
2074     }
2075 
2076     @Override
2077     public void reportError(String message) {
2078       calledCount++;
2079     }
2080 
2081     @Override
2082     public void reportError(ERROR_CODE errorCode, String message) {
2083       calledCount++;
2084     }
2085 
2086     @Override
2087     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2088       calledCount++;
2089     }
2090 
2091     @Override
2092     public void reportError(ERROR_CODE errorCode,
2093         String message, TableInfo table, HbckInfo info) {
2094       calledCount++;
2095     }
2096 
2097     @Override
2098     public void reportError(ERROR_CODE errorCode, String message,
2099         TableInfo table, HbckInfo info1, HbckInfo info2) {
2100       calledCount++;
2101     }
2102 
2103     @Override
2104     public int summarize() {
2105       return ++calledCount;
2106     }
2107 
2108     @Override
2109     public void detail(String details) {
2110       calledCount++;
2111     }
2112 
2113     @Override
2114     public ArrayList<ERROR_CODE> getErrorList() {
2115       calledCount++;
2116       return new ArrayList<ERROR_CODE>();
2117     }
2118 
2119     @Override
2120     public void progress() {
2121       calledCount++;
2122     }
2123 
2124     @Override
2125     public void print(String message) {
2126       calledCount++;
2127     }
2128 
2129     @Override
2130     public void resetErrors() {
2131       calledCount++;
2132     }
2133 
2134     @Override
2135     public boolean tableHasErrors(TableInfo table) {
2136       calledCount++;
2137       return false;
2138     }
2139   }
2140 
2141   @Test(timeout=180000)
2142   public void testCheckTableLocks() throws Exception {
2143     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2144     EnvironmentEdgeManager.injectEdge(edge);
2145     // check no errors
2146     HBaseFsck hbck = doFsck(conf, false);
2147     assertNoErrors(hbck);
2148 
2149     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2150 
2151     // obtain one lock
2152     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2153     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2154         "testCheckTableLocks");
2155     writeLock.acquire();
2156     hbck = doFsck(conf, false);
2157     assertNoErrors(hbck); // should not have expired, no problems
2158 
2159     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2160         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2161 
2162     hbck = doFsck(conf, false);
2163     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2164 
2165     final CountDownLatch latch = new CountDownLatch(1);
2166     new Thread() {
2167       @Override
2168       public void run() {
2169         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2170             "testCheckTableLocks");
2171         try {
2172           latch.countDown();
2173           readLock.acquire();
2174         } catch (IOException ex) {
2175           fail();
2176         } catch (IllegalStateException ex) {
2177           return; // expected, since this will be reaped under us.
2178         }
2179         fail("should not have come here");
2180       };
2181     }.start();
2182 
2183     latch.await(); // wait until thread starts
2184     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2185 
2186     hbck = doFsck(conf, false);
2187     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2188 
2189     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2190         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2191 
2192     hbck = doFsck(conf, false);
2193     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2194 
2195     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2196                                                                  // which is not injectable through EnvironmentEdge
2197     Threads.sleep(10);
2198     hbck = doFsck(conf, true); // now fix both cases
2199 
2200     hbck = doFsck(conf, false);
2201     assertNoErrors(hbck);
2202 
2203     // ensure that locks are deleted
2204     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2205         "should acquire without blocking");
2206     writeLock.acquire(); // this should not block.
2207     writeLock.release(); // release for clean state
2208   }
2209 
2210   /**
2211    * Test orphaned table ZNode (for table states)
2212    */
2213   @Test
2214   public void testOrphanedTableZNode() throws Exception {
2215     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2216 
2217     try {
2218       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getZKTable().
2219       setEnablingTable(table);
2220 
2221       try {
2222         setupTable(table);
2223         Assert.fail(
2224           "Create table should fail when its ZNode has already existed with ENABLING state.");
2225       } catch(TableExistsException t) {
2226         //Expected exception
2227       }
2228       // The setup table was interrupted in some state that needs to some cleanup.
2229       try {
2230         deleteTable(table);
2231       } catch (IOException e) {
2232         // Because create table failed, it is expected that the cleanup table would
2233         // throw some exception.  Ignore and continue.
2234       }
2235 
2236       HBaseFsck hbck = doFsck(conf, false);
2237       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2238 
2239       // fix the orphaned ZK entry
2240       hbck = doFsck(conf, true);
2241 
2242       // check that orpahned ZK table entry is gone.
2243       hbck = doFsck(conf, false);
2244       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2245       // Now create table should succeed.
2246       setupTable(table);
2247     } finally {
2248       // This code could be called that either a table was created successfully or set up
2249       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2250       try {
2251         deleteTable(table);
2252       } catch (IOException e) {
2253         // The cleanup table would throw some exception if create table failed in some state.
2254         // Ignore this exception
2255       }
2256     }
2257   }
2258 
2259   @Test
2260   public void testMetaOffline() throws Exception {
2261     // check no errors
2262     HBaseFsck hbck = doFsck(conf, false);
2263     assertNoErrors(hbck);
2264     deleteMetaRegion(conf, true, false, false);
2265     hbck = doFsck(conf, false);
2266     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2267     // inconsistency and whether we will be fixing it or not.
2268     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2269     hbck = doFsck(conf, true);
2270     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2271     hbck = doFsck(conf, false);
2272     assertNoErrors(hbck);
2273   }
2274 
2275   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2276       boolean regionInfoOnly) throws IOException, InterruptedException {
2277     HConnection connection = HConnectionManager.getConnection(conf);
2278     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2279         HConstants.EMPTY_START_ROW);
2280     ServerName hsa = metaLocation.getServerName();
2281     HRegionInfo hri = metaLocation.getRegionInfo();
2282     if (unassign) {
2283       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2284       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2285     }
2286 
2287     if (regionInfoOnly) {
2288       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2289       Path rootDir = FSUtils.getRootDir(conf);
2290       FileSystem fs = rootDir.getFileSystem(conf);
2291       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2292           hri.getEncodedName());
2293       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2294       fs.delete(hriPath, true);
2295     }
2296 
2297     if (hdfs) {
2298       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2299       Path rootDir = FSUtils.getRootDir(conf);
2300       FileSystem fs = rootDir.getFileSystem(conf);
2301       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2302           hri.getEncodedName());
2303       HBaseFsck.debugLsr(conf, p);
2304       boolean success = fs.delete(p, true);
2305       LOG.info("Deleted " + p + " sucessfully? " + success);
2306       HBaseFsck.debugLsr(conf, p);
2307     }
2308   }
2309 
2310   @Test
2311   public void testTableWithNoRegions() throws Exception {
2312     // We might end up with empty regions in a table
2313     // see also testNoHdfsTable()
2314     TableName table =
2315         TableName.valueOf(name.getMethodName());
2316     try {
2317       // create table with one region
2318       HTableDescriptor desc = new HTableDescriptor(table);
2319       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2320       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2321       TEST_UTIL.getHBaseAdmin().createTable(desc);
2322       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2323 
2324       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2325       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2326           false, true);
2327 
2328       HBaseFsck hbck = doFsck(conf, false);
2329       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2330 
2331       doFsck(conf, true);
2332 
2333       // fix hole
2334       doFsck(conf, true);
2335 
2336       // check that hole fixed
2337       assertNoErrors(doFsck(conf, false));
2338     } finally {
2339       deleteTable(table);
2340     }
2341 
2342   }
2343 
2344   @Test
2345   public void testHbckAfterRegionMerge() throws Exception {
2346     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2347     HTable meta = null;
2348     try {
2349       // disable CatalogJanitor
2350       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2351       setupTable(table);
2352       assertEquals(ROWKEYS.length, countRows());
2353 
2354       // make sure data in regions, if in hlog only there is no data loss
2355       TEST_UTIL.getHBaseAdmin().flush(table.getName());
2356       HRegionInfo region1 = tbl.getRegionLocation("A").getRegionInfo();
2357       HRegionInfo region2 = tbl.getRegionLocation("B").getRegionInfo();
2358 
2359       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2360 
2361       assertNotEquals(region1, region2);
2362 
2363       // do a region merge
2364       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
2365       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2366           region2.getEncodedNameAsBytes(), false);
2367 
2368       // wait until region merged
2369       long timeout = System.currentTimeMillis() + 30 * 1000;
2370       while (true) {
2371         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2372           break;
2373         } else if (System.currentTimeMillis() > timeout) {
2374           fail("Time out waiting on region " + region1.getEncodedName()
2375               + " and " + region2.getEncodedName() + " be merged");
2376         }
2377         Thread.sleep(10);
2378       }
2379 
2380       assertEquals(ROWKEYS.length, countRows());
2381 
2382       HBaseFsck hbck = doFsck(conf, false);
2383       assertNoErrors(hbck); // no errors
2384 
2385     } finally {
2386       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2387       deleteTable(table);
2388       IOUtils.closeQuietly(meta);
2389     }
2390   }
2391 
2392   @Test
2393   public void testRegionBoundariesCheck() throws Exception {
2394     HBaseFsck hbck = doFsck(conf, false);
2395     assertNoErrors(hbck); // no errors
2396     try {
2397       hbck.checkRegionBoundaries();
2398     } catch (IllegalArgumentException e) {
2399       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2400         fail("Table directory path is not valid." + e.getMessage());
2401       }
2402     }
2403   }
2404 
  /**
   * JUnit rule giving access to the name of the currently running test method;
   * testTableWithNoRegions() uses it to derive its table name.
   */
  @org.junit.Rule
  public TestName name = new TestName();
2407 
2408   @Test
2409   public void testReadOnlyProperty() throws Exception {
2410     HBaseFsck hbck = doFsck(conf, false);
2411     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2412       hbck.shouldIgnorePreCheckPermission());
2413 
2414     hbck = doFsck(conf, true);
2415     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2416       hbck.shouldIgnorePreCheckPermission());
2417 
2418     hbck = doFsck(conf, true);
2419     hbck.setIgnorePreCheckPermission(true);
2420     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2421       hbck.shouldIgnorePreCheckPermission());
2422   }
2423 
2424   @Test (timeout=180000)
2425   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2426     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2427     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2428     try {
2429       HTableDescriptor desc = new HTableDescriptor(table);
2430       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2431       TEST_UTIL.getHBaseAdmin().createTable(desc);
2432       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2433       for (int i = 0; i < 5; i++) {
2434         Put p1 = new Put(("r" + i).getBytes());
2435         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2436         tbl.put(p1);
2437       }
2438       TEST_UTIL.getHBaseAdmin().flush(desc.getTableName().toString());
2439       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2440       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2441       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2442       cluster.getServerWith(regions.get(0).getRegionName());
2443       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2444       st.prepare();
2445       st.stepsBeforePONR(regionServer, regionServer, false);
2446       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2447       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2448       for (RegionState state : regionsInTransition.values()) {
2449         am.regionOffline(state.getRegion());
2450       }
2451       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2452       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2453       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2454       am.assign(regionsMap);
2455       am.waitForAssignment(regions.get(0).getRegionInfo());
2456       HBaseFsck hbck = doFsck(conf, false);
2457       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2458           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2459       // holes are separate from overlap groups
2460       assertEquals(0, hbck.getOverlapGroups(table).size());
2461 
2462       // fix hole
2463       assertErrors(
2464         doFsck(
2465           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2466         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2467           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2468 
2469       // check that hole fixed
2470       assertNoErrors(doFsck(conf, false));
2471       assertEquals(5, countRows());
2472     } finally {
2473       if (tbl != null) {
2474         tbl.close();
2475         tbl = null;
2476       }
2477       deleteTable(table);
2478     }
2479   }
2480 }