View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import java.io.DataInput;
21  import java.io.IOException;
22  import java.nio.ByteBuffer;
23  import java.util.ArrayList;
24  import java.util.List;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.conf.Configuration;
30  import org.apache.hadoop.fs.Path;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.KeyValue;
33  import org.apache.hadoop.hbase.KeyValue.KVComparator;
34  import org.apache.hadoop.hbase.fs.HFileSystem;
35  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
36  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
37  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
38  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
39  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
40  import org.apache.hadoop.hbase.util.ByteBufferUtils;
41  import org.apache.hadoop.hbase.util.Bytes;
42  import org.apache.hadoop.hbase.util.IdLock;
43  import org.apache.hadoop.io.WritableUtils;
44  import org.cloudera.htrace.Trace;
45  import org.cloudera.htrace.TraceScope;
46  
47  import com.google.common.annotations.VisibleForTesting;
48  
49  /**
50   * {@link HFile} reader for version 2.
51   */
52  @InterfaceAudience.Private
53  public class HFileReaderV2 extends AbstractHFileReader {
54  
55    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
56  
57    /** Minor versions in HFile V2 starting with this number have hbase checksums */
58    public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
59    /** In HFile V2 minor version that does not support checksums */
60    public static final int MINOR_VERSION_NO_CHECKSUM = 0;
61  
62    /** HFile minor version that introduced pbuf filetrailer */
63    public static final int PBUF_TRAILER_MINOR_VERSION = 2;
64  
65    /**
66     * The size of a (key length, value length) tuple that prefixes each entry in
67     * a data block.
68     */
69    public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
70  
71    protected boolean includesMemstoreTS = false;
72    protected boolean decodeMemstoreTS = false;
73    protected boolean shouldIncludeMemstoreTS() {
74      return includesMemstoreTS;
75    }
76  
77    /** Filesystem-level block reader. */
78    protected HFileBlock.FSReader fsBlockReader;
79  
80    /**
81     * A "sparse lock" implementation allowing to lock on a particular block
82     * identified by offset. The purpose of this is to avoid two clients loading
83     * the same block, and have all but one client wait to get the block from the
84     * cache.
85     */
86    private IdLock offsetLock = new IdLock();
87  
88    /**
89     * Blocks read from the load-on-open section, excluding data root index, meta
90     * index, and file info.
91     */
92    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
93  
94    /** Minimum minor version supported by this HFile format */
95    static final int MIN_MINOR_VERSION = 0;
96  
97    /** Maximum minor version supported by this HFile format */
98    // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
99    // the file. This version can read Writables version 1.
100   static final int MAX_MINOR_VERSION = 3;
101 
102   /** Minor versions starting with this number have faked index key */
103   static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
104 
105   protected HFileContext hfileContext;
106 
/**
 * Opens a HFile. You must load the index before you can use it by calling
 * {@link #loadFileInfo()}.
 *
 * <p>The constructor validates the trailer's major and minor version, builds
 * the filesystem block reader, and then walks the load-on-open section in its
 * on-disk order: data root index, meta root index, file info, and finally any
 * remaining blocks, which are kept in {@code loadOnOpenBlocks}. When the cache
 * configuration asks for prefetch-on-open, a background task is submitted that
 * reads every block located before the load-on-open section with caching
 * requested.
 *
 * @param path Path to HFile.
 * @param trailer File trailer.
 * @param fsdis input stream.
 * @param size Length of the stream.
 * @param cacheConf Cache configuration.
 * @param hfs filesystem wrapper handed to the block reader
 * @param conf configuration; also stored on this reader
 * @throws IOException if version validation fails or the load-on-open section
 *           cannot be read
 */
public HFileReaderV2(final Path path, final FixedFileTrailer trailer,
    final FSDataInputStreamWrapper fsdis, final long size, final CacheConfig cacheConf,
    final HFileSystem hfs, final Configuration conf) throws IOException {
  super(path, trailer, size, cacheConf, hfs, conf);
  this.conf = conf;
  trailer.expectMajorVersion(getMajorVersion());
  validateMinorVersion(path, trailer.getMinorVersion());
  // NOTE(review): includesMemstoreTS has not been read from the file info yet at this
  // point, so the context is built with its initial value; the block reader is told the
  // real value further down via setIncludesMemstoreTS.
  this.hfileContext = createHFileContext(fsdis, fileSize, hfs, path, trailer);
  HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis, fileSize, hfs, path,
      hfileContext);
  this.fsBlockReader = fsBlockReaderV2; // upcast

  // Comparator class name is stored in the trailer in version 2.
  comparator = trailer.createComparator();
  dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
      trailer.getNumDataIndexLevels(), this);
  metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
      KeyValue.RAW_COMPARATOR, 1);

  // Parse load-on-open data: everything between the load-on-open offset and the trailer.
  HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
      trailer.getLoadOnOpenDataOffset(),
      fileSize - trailer.getTrailerSize());

  // Data index. We also read statistics about the block index written after
  // the root level.
  dataBlockIndexReader.readMultiLevelIndexRoot(
      blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
      trailer.getDataIndexCount());

  // Meta index.
  metaBlockIndexReader.readRootIndex(
      blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
      trailer.getMetaIndexCount());

  // File info
  fileInfo = new FileInfo();
  fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
  lastKey = fileInfo.get(FileInfo.LASTKEY);
  avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
  avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
  byte [] keyValueFormatVersion =
      fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
  // The file carries memstore timestamps only when it says so explicitly.
  includesMemstoreTS = keyValueFormatVersion != null &&
      Bytes.toInt(keyValueFormatVersion) ==
          HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
  fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
  if (includesMemstoreTS) {
    decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
  }

  // Read data block encoding algorithm name from file info.
  dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
  fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);

  // Store all other load-on-open blocks for further consumption.
  HFileBlock b;
  while ((b = blockIter.nextBlock()) != null) {
    loadOnOpenBlocks.add(b);
  }

  // Prefetch file blocks upon open if requested
  if (cacheConf.shouldPrefetchOnOpen()) {
    PrefetchExecutor.request(path, new Runnable() {
      @Override
      public void run() {
        try {
          long offset = 0;
          // Stop at the start of the load-on-open section. readBlock() rejects every
          // offset at or beyond it, so walking up to the trailer would end each
          // complete prefetch with a swallowed "out of range" IOException.
          long end = getTrailer().getLoadOnOpenDataOffset();
          HFileBlock prevBlock = null;
          while (offset < end) {
            if (Thread.interrupted()) {
              break;
            }
            // Pass the size learned from the previous block when we have one; -1 means
            // "unknown" for the very first read.
            long onDiskSize = -1;
            if (prevBlock != null) {
              onDiskSize = prevBlock.getNextBlockOnDiskSizeWithHeader();
            }
            // cacheBlock=true; pread, isCompaction and updateCacheMetrics are all false.
            HFileBlock block = readBlock(offset, onDiskSize, true, false, false, false, null);
            prevBlock = block;
            offset += block.getOnDiskSizeWithHeader();
          }
        } catch (IOException e) {
          // IOExceptions are probably due to region closes (relocation, etc.)
          if (LOG.isTraceEnabled()) {
            LOG.trace("Exception encountered while prefetching " + path + ":", e);
          }
        } catch (Exception e) {
          // Other exceptions are interesting
          LOG.warn("Exception encountered while prefetching " + path + ":", e);
        } finally {
          PrefetchExecutor.complete(path);
        }
      }
    });
  }
}
216 
217   protected HFileContext createHFileContext(FSDataInputStreamWrapper fsdis, long fileSize,
218       HFileSystem hfs, Path path, FixedFileTrailer trailer) throws IOException {
219     return new HFileContextBuilder()
220       .withIncludesMvcc(this.includesMemstoreTS)
221       .withCompression(this.compressAlgo)
222       .withHBaseCheckSum(trailer.getMinorVersion() >= MINOR_VERSION_WITH_CHECKSUM)
223       .build();
224   }
225 
226   /**
227    * Create a Scanner on this file. No seeks or reads are done on creation. Call
228    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
229    * nothing to clean up in a Scanner. Letting go of your references to the
230    * scanner is sufficient.
231    *
232    * @param cacheBlocks True if we should cache blocks read in by this scanner.
233    * @param pread Use positional read rather than seek+read if true (pread is
234    *          better for random reads, seek+read is better scanning).
235    * @param isCompaction is scanner being used for a compaction?
236    * @return Scanner on this file.
237    */
238    @Override
239    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
240       final boolean isCompaction) {
241     if (dataBlockEncoder.useEncodedScanner()) {
242       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
243           hfileContext);
244     }
245 
246     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
247   }
248 
249   /**
250    * @param metaBlockName
251    * @param cacheBlock Add block to cache, if found
252    * @return block wrapped in a ByteBuffer, with header skipped
253    * @throws IOException
254    */
255   @Override
256   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
257       throws IOException {
258     if (trailer.getMetaIndexCount() == 0) {
259       return null; // there are no meta blocks
260     }
261     if (metaBlockIndexReader == null) {
262       throw new IOException("Meta index not loaded");
263     }
264 
265     byte[] mbname = Bytes.toBytes(metaBlockName);
266     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
267         mbname.length);
268     if (block == -1)
269       return null;
270     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
271 
272     // Per meta key from any given file, synchronize reads for said block. This
273     // is OK to do for meta blocks because the meta block index is always
274     // single-level.
275     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
276       // Check cache for block. If found return.
277       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
278       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
279           DataBlockEncoding.NONE, BlockType.META);
280 
281       cacheBlock &= cacheConf.shouldCacheDataOnRead();
282       if (cacheConf.isBlockCacheEnabled()) {
283         HFileBlock cachedBlock =
284           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false, true);
285         if (cachedBlock != null) {
286           assert cachedBlock.isUnpacked() : "Packed block leak.";
287           // Return a distinct 'shallow copy' of the block,
288           // so pos does not get messed by the scanner
289           return cachedBlock.getBufferWithoutHeader();
290         }
291         // Cache Miss, please load.
292       }
293 
294       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
295           blockSize, -1, true).unpack(hfileContext, fsBlockReader);
296 
297       // Cache the block
298       if (cacheBlock) {
299         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
300             cacheConf.isInMemory());
301       }
302 
303       return metaBlock.getBufferWithoutHeader();
304     }
305   }
306 
307   /**
308    * Read in a file block.
309    * @param dataBlockOffset offset to read.
310    * @param onDiskBlockSize size of the block
311    * @param cacheBlock
312    * @param pread Use positional read instead of seek+read (positional is
313    *          better doing random reads whereas seek+read is better scanning).
314    * @param isCompaction is this block being read as part of a compaction
315    * @param expectedBlockType the block type we are expecting to read with this
316    *          read operation, or null to read whatever block type is available
317    *          and avoid checking (that might reduce caching efficiency of
318    *          encoded data blocks)
319    * @return Block wrapped in a ByteBuffer.
320    * @throws IOException
321    */
322   @Override
323   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
324       final boolean cacheBlock, boolean pread, final boolean isCompaction,
325       final boolean updateCacheMetrics, BlockType expectedBlockType)
326       throws IOException {
327     if (dataBlockIndexReader == null) {
328       throw new IOException("Block index not loaded");
329     }
330     if (dataBlockOffset < 0
331         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
332       throw new IOException("Requested block is out of range: "
333           + dataBlockOffset + ", lastDataBlockOffset: "
334           + trailer.getLastDataBlockOffset());
335     }
336     // For any given block from any given file, synchronize reads for said
337     // block.
338     // Without a cache, this synchronizing is needless overhead, but really
339     // the other choice is to duplicate work (which the cache would prevent you
340     // from doing).
341 
342     BlockCacheKey cacheKey =
343         new BlockCacheKey(name, dataBlockOffset,
344             dataBlockEncoder.getDataBlockEncoding(),
345             expectedBlockType);
346 
347     boolean useLock = false;
348     IdLock.Entry lockEntry = null;
349     TraceScope traceScope = Trace.startSpan("HFileReaderV2.readBlock");
350     try {
351       while (true) {
352         if (useLock) {
353           lockEntry = offsetLock.getLockEntry(dataBlockOffset);
354         }
355 
356         // Check cache for block. If found return.
357         if (cacheConf.isBlockCacheEnabled()) {
358           // Try and get the block from the block cache. If the useLock variable is true then this
359           // is the second time through the loop and it should not be counted as a block cache miss.
360           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, 
361             cacheBlock, useLock, updateCacheMetrics);
362           if (cachedBlock != null) {
363             if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
364               cachedBlock = cachedBlock.unpack(hfileContext, fsBlockReader);
365             }
366             if (Trace.isTracing()) {
367               traceScope.getSpan().addTimelineAnnotation("blockCacheHit");
368             }
369             assert cachedBlock.isUnpacked() : "Packed block leak.";
370             if (cachedBlock.getBlockType().isData()) {
371               HFile.dataBlockReadCnt.incrementAndGet();
372 
373               // Validate encoding type for data blocks. We include encoding
374               // type in the cache key, and we expect it to match on a cache hit.
375               if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
376                 throw new IOException("Cached block under key " + cacheKey + " "
377                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
378                   + dataBlockEncoder.getDataBlockEncoding() + ")");
379               }
380             }
381             return cachedBlock;
382           }
383           // Carry on, please load.
384         }
385         if (!useLock) {
386           // check cache again with lock
387           useLock = true;
388           continue;
389         }
390         if (Trace.isTracing()) {
391           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
392         }
393         // Load block from filesystem.
394         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
395             pread);
396         validateBlockType(hfileBlock, expectedBlockType);
397         HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
398         BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
399 
400         // Cache the block if necessary
401         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
402           cacheConf.getBlockCache().cacheBlock(cacheKey,
403             cacheConf.shouldCacheCompressed(category) ? hfileBlock : unpacked,
404             cacheConf.isInMemory());
405         }
406 
407         if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
408           HFile.dataBlockReadCnt.incrementAndGet();
409         }
410 
411         return unpacked;
412       }
413     } finally {
414       traceScope.close();
415       if (lockEntry != null) {
416         offsetLock.releaseLockEntry(lockEntry);
417       }
418     }
419   }
420 
421   @Override
422   public boolean hasMVCCInfo() {
423     return includesMemstoreTS && decodeMemstoreTS;
424   }
425 
426   /**
427    * Compares the actual type of a block retrieved from cache or disk with its
428    * expected type and throws an exception in case of a mismatch. Expected
429    * block type of {@link BlockType#DATA} is considered to match the actual
430    * block type [@link {@link BlockType#ENCODED_DATA} as well.
431    * @param block a block retrieved from cache or disk
432    * @param expectedBlockType the expected block type, or null to skip the
433    *          check
434    */
435   private void validateBlockType(HFileBlock block,
436       BlockType expectedBlockType) throws IOException {
437     if (expectedBlockType == null) {
438       return;
439     }
440     BlockType actualBlockType = block.getBlockType();
441     if (actualBlockType == BlockType.ENCODED_DATA &&
442         expectedBlockType == BlockType.DATA) {
443       // We consider DATA to match ENCODED_DATA for the purpose of this
444       // verification.
445       return;
446     }
447     if (actualBlockType != expectedBlockType) {
448       throw new IOException("Expected block type " + expectedBlockType + ", " +
449           "but got " + actualBlockType + ": " + block);
450     }
451   }
452 
453   /**
454    * @return Last key in the file. May be null if file has no entries. Note that
455    *         this is not the last row key, but rather the byte form of the last
456    *         KeyValue.
457    */
458   @Override
459   public byte[] getLastKey() {
460     return dataBlockIndexReader.isEmpty() ? null : lastKey;
461   }
462 
463   /**
464    * @return Midkey for this file. We work with block boundaries only so
465    *         returned midkey is an approximation only.
466    * @throws IOException
467    */
468   @Override
469   public byte[] midkey() throws IOException {
470     return dataBlockIndexReader.midkey();
471   }
472 
473   @Override
474   public void close() throws IOException {
475     close(cacheConf.shouldEvictOnClose());
476   }
477 
478   public void close(boolean evictOnClose) throws IOException {
479     PrefetchExecutor.cancel(path);
480     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
481       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
482       if (LOG.isTraceEnabled()) {
483         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
484           + " block(s)");
485       }
486     }
487     fsBlockReader.closeStreams();
488   }
489 
490   /** For testing */
491   @Override
492   HFileBlock.FSReader getUncachedBlockReader() {
493     return fsBlockReader;
494   }
495 
496 
497   protected abstract static class AbstractScannerV2
498       extends AbstractHFileReader.Scanner {
499     protected HFileBlock block;
500 
501     /**
502      * The next indexed key is to keep track of the indexed key of the next data block.
503      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
504      * current data block is the last data block.
505      *
506      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
507      */
508     protected byte[] nextIndexedKey;
509 
510     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
511         final boolean pread, final boolean isCompaction) {
512       super(r, cacheBlocks, pread, isCompaction);
513     }
514 
515     /**
516      * An internal API function. Seek to the given key, optionally rewinding to
517      * the first key of the block before doing the seek.
518      *
519      * @param key key byte array
520      * @param offset key offset in the key byte array
521      * @param length key length
522      * @param rewind whether to rewind to the first key of the block before
523      *        doing the seek. If this is false, we are assuming we never go
524      *        back, otherwise the result is undefined.
525      * @return -1 if the key is earlier than the first key of the file,
526      *         0 if we are at the given key, 1 if we are past the given key
527      *         -2 if the key is earlier than the first key of the file while
528      *         using a faked index key
529      * @throws IOException
530      */
531     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
532         throws IOException {
533       HFileBlockIndex.BlockIndexReader indexReader =
534           reader.getDataBlockIndexReader();
535       BlockWithScanInfo blockWithScanInfo =
536         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
537             cacheBlocks, pread, isCompaction);
538       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
539         // This happens if the key e.g. falls before the beginning of the file.
540         return -1;
541       }
542       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
543           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
544     }
545 
546     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
547 
548     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
549         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
550         throws IOException;
551 
552     @Override
553     public int seekTo(byte[] key, int offset, int length) throws IOException {
554       // Always rewind to the first key of the block, because the given key
555       // might be before or after the current key.
556       return seekTo(key, offset, length, true);
557     }
558 
559     @Override
560     public int reseekTo(byte[] key, int offset, int length) throws IOException {
561       int compared;
562       if (isSeeked()) {
563         compared = compareKey(reader.getComparator(), key, offset, length);
564         if (compared < 1) {
565           // If the required key is less than or equal to current key, then
566           // don't do anything.
567           return compared;
568         } else {
569           if (this.nextIndexedKey != null &&
570               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
571                reader.getComparator().compareFlatKey(key, offset, length,
572                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
573             // The reader shall continue to scan the current data block instead of querying the
574             // block index as long as it knows the target key is strictly smaller than
575             // the next indexed key or the current data block is the last data block.
576             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
577                 false, key, offset, length, false);
578           }
579         }
580       }
581       // Don't rewind on a reseek operation, because reseek implies that we are
582       // always going forward in the file.
583       return seekTo(key, offset, length, false);
584     }
585 
586     @Override
587     public boolean seekBefore(byte[] key, int offset, int length)
588         throws IOException {
589       HFileBlock seekToBlock =
590           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
591               block, cacheBlocks, pread, isCompaction);
592       if (seekToBlock == null) {
593         return false;
594       }
595       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
596 
597       if (reader.getComparator().compareFlatKey(firstKey.array(),
598           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) >= 0)
599       {
600         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
601         // The key we are interested in
602         if (previousBlockOffset == -1) {
603           // we have a 'problem', the key we want is the first of the file.
604           return false;
605         }
606 
607         // It is important that we compute and pass onDiskSize to the block
608         // reader so that it does not have to read the header separately to
609         // figure out the size.
610         seekToBlock = reader.readBlock(previousBlockOffset,
611             seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
612             pread, isCompaction, true, BlockType.DATA);
613         // TODO shortcut: seek forward in this block to the last key of the
614         // block.
615       }
616       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
617       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
618       return true;
619     }
620 
621 
622     /**
623      * Scans blocks in the "scanned" section of the {@link HFile} until the next
624      * data block is found.
625      *
626      * @return the next block, or null if there are no more data blocks
627      * @throws IOException
628      */
629     protected HFileBlock readNextDataBlock() throws IOException {
630       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
631       if (block == null)
632         return null;
633 
634       HFileBlock curBlock = block;
635 
636       do {
637         if (curBlock.getOffset() >= lastDataBlockOffset)
638           return null;
639 
640         if (curBlock.getOffset() < 0) {
641           throw new IOException("Invalid block file offset: " + block);
642         }
643 
644         // We are reading the next block without block type validation, because
645         // it might turn out to be a non-data block.
646         curBlock = reader.readBlock(curBlock.getOffset()
647             + curBlock.getOnDiskSizeWithHeader(),
648             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
649             isCompaction, true, null);
650       } while (!curBlock.getBlockType().isData());
651 
652       return curBlock;
653     }
654     /**
655      * Compare the given key against the current key
656      * @param comparator
657      * @param key
658      * @param offset
659      * @param length
660      * @return -1 is the passed key is smaller than the current key, 0 if equal and 1 if greater
661      */
662     public abstract int compareKey(KVComparator comparator, byte[] key, int offset,
663         int length);
664   }
665 
666   /**
667    * Implementation of {@link HFileScanner} interface.
668    */
669   protected static class ScannerV2 extends AbstractScannerV2 {
670     private HFileReaderV2 reader;
671 
672     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
673         final boolean pread, final boolean isCompaction) {
674       super(r, cacheBlocks, pread, isCompaction);
675       this.reader = r;
676     }
677 
678     @Override
679     public KeyValue getKeyValue() {
680       if (!isSeeked())
681         return null;
682 
683       KeyValue ret = new KeyValue(blockBuffer.array(), blockBuffer.arrayOffset()
684           + blockBuffer.position(), getCellBufSize());
685       if (this.reader.shouldIncludeMemstoreTS()) {
686         ret.setMvccVersion(currMemstoreTS);
687       }
688       return ret;
689     }
690 
691     protected int getCellBufSize() {
692       return KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
693     }
694 
695     @Override
696     public ByteBuffer getKey() {
697       assertSeeked();
698       return ByteBuffer.wrap(
699           blockBuffer.array(),
700           blockBuffer.arrayOffset() + blockBuffer.position()
701               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
702     }
703 
704     @Override
705     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
706       return comparator.compareFlatKey(key, offset, length, blockBuffer.array(),
707           blockBuffer.arrayOffset() + blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen);
708     }
709 
710     @Override
711     public ByteBuffer getValue() {
712       assertSeeked();
713       return ByteBuffer.wrap(
714           blockBuffer.array(),
715           blockBuffer.arrayOffset() + blockBuffer.position()
716               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
717     }
718 
719     protected void setNonSeekedState() {
720       block = null;
721       blockBuffer = null;
722       currKeyLen = 0;
723       currValueLen = 0;
724       currMemstoreTS = 0;
725       currMemstoreTSLen = 0;
726     }
727 
728     /**
729      * Go to the next key/value in the block section. Loads the next block if
730      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
731      * be called.
732      *
733      * @return true if successfully navigated to the next key/value
734      */
735     @Override
736     public boolean next() throws IOException {
737       assertSeeked();
738 
739       try {
740         blockBuffer.position(getNextCellStartPosition());
741       } catch (IllegalArgumentException e) {
742         LOG.error("Current pos = " + blockBuffer.position()
743             + "; currKeyLen = " + currKeyLen + "; currValLen = "
744             + currValueLen + "; block limit = " + blockBuffer.limit()
745             + "; HFile name = " + reader.getName()
746             + "; currBlock currBlockOffset = " + block.getOffset());
747         throw e;
748       }
749 
750       if (blockBuffer.remaining() <= 0) {
751         long lastDataBlockOffset =
752             reader.getTrailer().getLastDataBlockOffset();
753 
754         if (block.getOffset() >= lastDataBlockOffset) {
755           setNonSeekedState();
756           return false;
757         }
758 
759         // read the next block
760         HFileBlock nextBlock = readNextDataBlock();
761         if (nextBlock == null) {
762           setNonSeekedState();
763           return false;
764         }
765 
766         updateCurrBlock(nextBlock);
767         return true;
768       }
769 
770       // We are still in the same block.
771       readKeyValueLen();
772       return true;
773     }
774 
775     protected int getNextCellStartPosition() {
776       return blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen
777           + currMemstoreTSLen;
778     }
779 
780     /**
781      * Positions this scanner at the start of the file.
782      *
783      * @return false if empty file; i.e. a call to next would return false and
784      *         the current key and value are undefined.
785      * @throws IOException
786      */
787     @Override
788     public boolean seekTo() throws IOException {
789       if (reader == null) {
790         return false;
791       }
792 
793       if (reader.getTrailer().getEntryCount() == 0) {
794         // No data blocks.
795         return false;
796       }
797 
798       long firstDataBlockOffset =
799           reader.getTrailer().getFirstDataBlockOffset();
800       if (block != null && block.getOffset() == firstDataBlockOffset) {
801         blockBuffer.rewind();
802         readKeyValueLen();
803         return true;
804       }
805 
806       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
807           isCompaction, true, BlockType.DATA);
808       if (block.getOffset() < 0) {
809         throw new IOException("Invalid block offset: " + block.getOffset());
810       }
811       updateCurrBlock(block);
812       return true;
813     }
814 
815     @Override
816     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
817         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
818         throws IOException {
819       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
820         updateCurrBlock(seekToBlock);
821       } else if (rewind) {
822         blockBuffer.rewind();
823       }
824 
825       // Update the nextIndexedKey
826       this.nextIndexedKey = nextIndexedKey;
827       return blockSeek(key, offset, length, seekBefore);
828     }
829 
830     /**
831      * Updates the current block to be the given {@link HFileBlock}. Seeks to
832      * the the first key/value pair.
833      *
834      * @param newBlock the block to make current
835      */
836     protected void updateCurrBlock(HFileBlock newBlock) {
837       block = newBlock;
838 
839       // sanity check
840       if (block.getBlockType() != BlockType.DATA) {
841         throw new IllegalStateException("ScannerV2 works only on data " +
842             "blocks, got " + block.getBlockType() + "; " +
843             "fileName=" + reader.name + ", " +
844             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
845             "isCompaction=" + isCompaction);
846       }
847 
848       blockBuffer = block.getBufferWithoutHeader();
849       readKeyValueLen();
850       blockFetches++;
851 
852       // Reset the next indexed key
853       this.nextIndexedKey = null;
854     }
855 
856     protected void readKeyValueLen() {
857       blockBuffer.mark();
858       currKeyLen = blockBuffer.getInt();
859       currValueLen = blockBuffer.getInt();
860       ByteBufferUtils.skip(blockBuffer, currKeyLen + currValueLen);
861       readMvccVersion();
862       if (currKeyLen < 0 || currValueLen < 0
863           || currKeyLen > blockBuffer.limit()
864           || currValueLen > blockBuffer.limit()) {
865         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
866             + " or currValueLen " + currValueLen + ". Block offset: "
867             + block.getOffset() + ", block length: " + blockBuffer.limit()
868             + ", position: " + blockBuffer.position() + " (without header).");
869       }
870       blockBuffer.reset();
871     }
872 
873     protected void readMvccVersion() {
874       if (this.reader.shouldIncludeMemstoreTS()) {
875         if (this.reader.decodeMemstoreTS) {
876           try {
877             currMemstoreTS = Bytes.readVLong(blockBuffer.array(), blockBuffer.arrayOffset()
878                 + blockBuffer.position());
879             currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
880           } catch (Exception e) {
881             throw new RuntimeException("Error reading memstore timestamp", e);
882           }
883         } else {
884           currMemstoreTS = 0;
885           currMemstoreTSLen = 1;
886         }
887       }
888     }
889 
890     /**
891      * Within a loaded block, seek looking for the last key that is smaller
892      * than (or equal to?) the key we are interested in.
893      *
894      * A note on the seekBefore: if you have seekBefore = true, AND the first
895      * key in the block = key, then you'll get thrown exceptions. The caller has
896      * to check for that case and load the previous block as appropriate.
897      *
898      * @param key the key to find
899      * @param seekBefore find the key before the given key in case of exact
900      *          match.
901      * @return 0 in case of an exact key match, 1 in case of an inexact match,
902      *         -2 in case of an inexact match and furthermore, the input key less
903      *         than the first key of current block(e.g. using a faked index key)
904      */
905     protected int blockSeek(byte[] key, int offset, int length,
906         boolean seekBefore) {
907       int klen, vlen;
908       long memstoreTS = 0;
909       int memstoreTSLen = 0;
910       int lastKeyValueSize = -1;
911       do {
912         blockBuffer.mark();
913         klen = blockBuffer.getInt();
914         vlen = blockBuffer.getInt();
915         blockBuffer.reset();
916         if (this.reader.shouldIncludeMemstoreTS()) {
917           if (this.reader.decodeMemstoreTS) {
918             try {
919               int memstoreTSOffset = blockBuffer.arrayOffset()
920                   + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
921               memstoreTS = Bytes.readVLong(blockBuffer.array(),
922                   memstoreTSOffset);
923               memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
924             } catch (Exception e) {
925               throw new RuntimeException("Error reading memstore timestamp", e);
926             }
927           } else {
928             memstoreTS = 0;
929             memstoreTSLen = 1;
930           }
931         }
932 
933         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
934             + KEY_VALUE_LEN_SIZE;
935         int comp = reader.getComparator().compareFlatKey(key, offset, length,
936             blockBuffer.array(), keyOffset, klen);
937 
938         if (comp == 0) {
939           if (seekBefore) {
940             if (lastKeyValueSize < 0) {
941               throw new IllegalStateException("blockSeek with seekBefore "
942                   + "at the first key of the block: key="
943                   + Bytes.toStringBinary(key) + ", blockOffset="
944                   + block.getOffset() + ", onDiskSize="
945                   + block.getOnDiskSizeWithHeader());
946             }
947             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
948             readKeyValueLen();
949             return 1; // non exact match.
950           }
951           currKeyLen = klen;
952           currValueLen = vlen;
953           if (this.reader.shouldIncludeMemstoreTS()) {
954             currMemstoreTS = memstoreTS;
955             currMemstoreTSLen = memstoreTSLen;
956           }
957           return 0; // indicate exact match
958         } else if (comp < 0) {
959           if (lastKeyValueSize > 0)
960             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
961           readKeyValueLen();
962           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
963               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
964             return HConstants.INDEX_KEY_MAGIC;
965           }
966           return 1;
967         }
968 
969         // The size of this key/value tuple, including key/value length fields.
970         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
971         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
972       } while (blockBuffer.remaining() > 0);
973 
974       // Seek to the last key we successfully read. This will happen if this is
975       // the last key/value pair in the file, in which case the following call
976       // to next() has to return false.
977       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
978       readKeyValueLen();
979       return 1; // didn't exactly find it.
980     }
981 
982     @Override
983     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
984       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
985       // It is safe to manipulate this buffer because we own the buffer object.
986       buffer.rewind();
987       int klen = buffer.getInt();
988       buffer.getInt();
989       ByteBuffer keyBuff = buffer.slice();
990       keyBuff.limit(klen);
991       keyBuff.rewind();
992       return keyBuff;
993     }
994 
995     @Override
996     public String getKeyString() {
997       return Bytes.toStringBinary(blockBuffer.array(),
998           blockBuffer.arrayOffset() + blockBuffer.position()
999               + KEY_VALUE_LEN_SIZE, currKeyLen);
1000     }
1001 
1002     @Override
1003     public String getValueString() {
1004       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
1005           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
1006           currValueLen);
1007     }
1008   }
1009 
1010   /**
1011    * ScannerV2 that operates on encoded data blocks.
1012    */
1013   protected static class EncodedScannerV2 extends AbstractScannerV2 {
1014     private final HFileBlockDecodingContext decodingCtx;
1015     private final DataBlockEncoder.EncodedSeeker seeker;
1016     private final DataBlockEncoder dataBlockEncoder;
1017     protected final HFileContext meta;
1018 
1019     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
1020         boolean pread, boolean isCompaction, HFileContext meta) {
1021       super(reader, cacheBlocks, pread, isCompaction);
1022       DataBlockEncoding encoding = reader.dataBlockEncoder.getDataBlockEncoding();
1023       dataBlockEncoder = encoding.getEncoder();
1024       decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(meta);
1025       seeker = dataBlockEncoder.createSeeker(
1026         reader.getComparator(), decodingCtx);
1027       this.meta = meta;
1028     }
1029 
1030     @Override
1031     public boolean isSeeked(){
1032       return this.block != null;
1033     }
1034 
1035     /**
1036      * Updates the current block to be the given {@link HFileBlock}. Seeks to
1037      * the the first key/value pair.
1038      *
1039      * @param newBlock the block to make current
1040      * @throws CorruptHFileException
1041      */
1042     private void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1043       block = newBlock;
1044 
1045       // sanity checks
1046       if (block.getBlockType() != BlockType.ENCODED_DATA) {
1047         throw new IllegalStateException(
1048             "EncodedScanner works only on encoded data blocks");
1049       }
1050       short dataBlockEncoderId = block.getDataBlockEncodingId();
1051       if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1052         String encoderCls = dataBlockEncoder.getClass().getName();
1053         throw new CorruptHFileException("Encoder " + encoderCls
1054           + " doesn't support data block encoding "
1055           + DataBlockEncoding.getNameFromId(dataBlockEncoderId));
1056       }
1057 
1058       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
1059       blockFetches++;
1060 
1061       // Reset the next indexed key
1062       this.nextIndexedKey = null;
1063     }
1064 
1065     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
1066       ByteBuffer origBlock = newBlock.getBufferReadOnly();
1067       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
1068           origBlock.arrayOffset() + newBlock.headerSize() +
1069           DataBlockEncoding.ID_SIZE,
1070           newBlock.getUncompressedSizeWithoutHeader() -
1071           DataBlockEncoding.ID_SIZE).slice();
1072       return encodedBlock;
1073     }
1074 
1075     @Override
1076     public boolean seekTo() throws IOException {
1077       if (reader == null) {
1078         return false;
1079       }
1080 
1081       if (reader.getTrailer().getEntryCount() == 0) {
1082         // No data blocks.
1083         return false;
1084       }
1085 
1086       long firstDataBlockOffset =
1087           reader.getTrailer().getFirstDataBlockOffset();
1088       if (block != null && block.getOffset() == firstDataBlockOffset) {
1089         seeker.rewind();
1090         return true;
1091       }
1092 
1093       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1094           isCompaction, true, BlockType.DATA);
1095       if (block.getOffset() < 0) {
1096         throw new IOException("Invalid block offset: " + block.getOffset());
1097       }
1098       updateCurrentBlock(block);
1099       return true;
1100     }
1101 
1102     @Override
1103     public boolean next() throws IOException {
1104       boolean isValid = seeker.next();
1105       if (!isValid) {
1106         block = readNextDataBlock();
1107         isValid = block != null;
1108         if (isValid) {
1109           updateCurrentBlock(block);
1110         }
1111       }
1112       return isValid;
1113     }
1114 
1115     @Override
1116     public ByteBuffer getKey() {
1117       assertValidSeek();
1118       return seeker.getKeyDeepCopy();
1119     }
1120 
1121     @Override
1122     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
1123       return seeker.compareKey(comparator, key, offset, length);
1124     }
1125 
1126     @Override
1127     public ByteBuffer getValue() {
1128       assertValidSeek();
1129       return seeker.getValueShallowCopy();
1130     }
1131 
1132     @Override
1133     public KeyValue getKeyValue() {
1134       if (block == null) {
1135         return null;
1136       }
1137       return seeker.getKeyValue();
1138     }
1139 
1140     @Override
1141     public String getKeyString() {
1142       ByteBuffer keyBuffer = getKey();
1143       return Bytes.toStringBinary(keyBuffer.array(),
1144           keyBuffer.arrayOffset(), keyBuffer.limit());
1145     }
1146 
1147     @Override
1148     public String getValueString() {
1149       ByteBuffer valueBuffer = getValue();
1150       return Bytes.toStringBinary(valueBuffer.array(),
1151           valueBuffer.arrayOffset(), valueBuffer.limit());
1152     }
1153 
1154     private void assertValidSeek() {
1155       if (block == null) {
1156         throw new NotSeekedException();
1157       }
1158     }
1159 
1160     @Override
1161     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1162       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1163     }
1164 
1165     @Override
1166     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1167         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1168         throws IOException  {
1169       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1170         updateCurrentBlock(seekToBlock);
1171       } else if (rewind) {
1172         seeker.rewind();
1173       }
1174       this.nextIndexedKey = nextIndexedKey;
1175       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1176     }
1177   }
1178 
1179   /**
1180    * Returns a buffer with the Bloom filter metadata. The caller takes
1181    * ownership of the buffer.
1182    */
1183   @Override
1184   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1185     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1186   }
1187 
1188   @Override
1189   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1190     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1191   }
1192 
1193   private DataInput getBloomFilterMetadata(BlockType blockType)
1194   throws IOException {
1195     if (blockType != BlockType.GENERAL_BLOOM_META &&
1196         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1197       throw new RuntimeException("Block Type: " + blockType.toString() +
1198           " is not supported") ;
1199     }
1200 
1201     for (HFileBlock b : loadOnOpenBlocks)
1202       if (b.getBlockType() == blockType)
1203         return b.getByteStream();
1204     return null;
1205   }
1206 
1207   @Override
1208   public boolean isFileInfoLoaded() {
1209     return true; // We load file info in constructor in version 2.
1210   }
1211 
1212   /**
1213    * Validates that the minor version is within acceptable limits.
1214    * Otherwise throws an Runtime exception
1215    */
1216   private void validateMinorVersion(Path path, int minorVersion) {
1217     if (minorVersion < MIN_MINOR_VERSION ||
1218         minorVersion > MAX_MINOR_VERSION) {
1219       String msg = "Minor version for path " + path + 
1220                    " is expected to be between " +
1221                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1222                    " but is found to be " + minorVersion;
1223       LOG.error(msg);
1224       throw new RuntimeException(msg);
1225     }
1226   }
1227 
/** @return the major version this reader reports, which is always 2 */
@Override
public int getMajorVersion() {
  final int majorVersion = 2;
  return majorVersion;
}
1232 
1233   @Override
1234   public HFileContext getFileContext() {
1235     return hfileContext;
1236   }
1237 
1238   /**
1239    * Returns false if block prefetching was requested for this file and has
1240    * not completed, true otherwise
1241    */
1242   @VisibleForTesting
1243   boolean prefetchComplete() {
1244     return PrefetchExecutor.isCompleted(path);
1245   }
1246 }