View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import com.google.common.annotations.VisibleForTesting;
21  import org.apache.hadoop.hbase.util.ByteStringer;
22  import com.google.protobuf.InvalidProtocolBufferException;
23  
24  import org.apache.hadoop.hbase.classification.InterfaceAudience;
25  import org.apache.hadoop.hbase.classification.InterfaceStability;
26  import org.apache.hadoop.hbase.Cell;
27  import org.apache.hadoop.hbase.KeyValue;
28  import org.apache.hadoop.hbase.KeyValueUtil;
29  import org.apache.hadoop.hbase.exceptions.DeserializationException;
30  import org.apache.hadoop.hbase.protobuf.generated.FilterProtos;
31  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Pair;
34  
35  import java.util.ArrayList;
36  import java.util.Arrays;
37  import java.util.List;
38  
39  /**
40   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
41   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
42   * a byte array with 0 or 1 as its values:
43   * <ul>
44   *   <li>
45   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
46   *         must match
47   *   </li>
48   *   <li>
49   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
50   *         position can be different from the one in provided row key
51   *   </li>
52   * </ul>
53   *
54   *
55   * Example:
56   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
57   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
58   *
59   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
60   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
61   * row key = "????_99_????_01" (one can use any value instead of "?")
62   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
63   *
64   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
65   *
66   */
67  @InterfaceAudience.Public
68  @InterfaceStability.Evolving
69  public class FuzzyRowFilter extends FilterBase {
70    private List<Pair<byte[], byte[]>> fuzzyKeysData;
71    private boolean done = false;
72  
73    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
74      Pair<byte[], byte[]> p;
75      for (int i = 0; i < fuzzyKeysData.size(); i++) {
76        p = fuzzyKeysData.get(i);
77        if (p.getFirst().length != p.getSecond().length) {
78          Pair<String, String> readable = new Pair<String, String>(
79            Bytes.toStringBinary(p.getFirst()),
80            Bytes.toStringBinary(p.getSecond()));
81          throw new IllegalArgumentException("Fuzzy pair lengths do not match: " + readable);
82        }
83      }
84      this.fuzzyKeysData = fuzzyKeysData;
85    }
86  
87    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
88    @Override
89    public ReturnCode filterKeyValue(Cell kv) {
90      // TODO add getRow() equivalent to Cell or change satisfies to take b[],o,l style args.
91      KeyValue v = KeyValueUtil.ensureKeyValue(kv);
92  
93      byte[] rowKey = v.getRow();
94      // assigning "worst" result first and looking for better options
95      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
96      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
97        SatisfiesCode satisfiesCode =
98                satisfies(isReversed(), rowKey, fuzzyData.getFirst(), fuzzyData.getSecond());
99        if (satisfiesCode == SatisfiesCode.YES) {
100         return ReturnCode.INCLUDE;
101       }
102 
103       if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
104         bestOption = SatisfiesCode.NEXT_EXISTS;
105       }
106     }
107 
108     if (bestOption == SatisfiesCode.NEXT_EXISTS) {
109       return ReturnCode.SEEK_NEXT_USING_HINT;
110     }
111 
112     // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
113     done = true;
114     return ReturnCode.NEXT_ROW;
115   }
116 
117   @Override
118   public Cell getNextCellHint(Cell currentKV) {
119     // TODO make matching Column a cell method or CellUtil method.
120     KeyValue v = KeyValueUtil.ensureKeyValue(currentKV);
121 
122     byte[] rowKey = v.getRow();
123     byte[] nextRowKey = null;
124     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
125     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
126       byte[] nextRowKeyCandidate = getNextForFuzzyRule(isReversed(), rowKey,
127               fuzzyData.getFirst(), fuzzyData.getSecond());
128       if (nextRowKeyCandidate == null) {
129         continue;
130       }
131       if (nextRowKey == null ||
132         (reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) > 0) ||
133         (!reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0)) {
134         nextRowKey = nextRowKeyCandidate;
135       }
136     }
137 
138     if (!reversed && nextRowKey == null) {
139       // Should never happen for forward scanners; logic in filterKeyValue should return NO_NEXT.
140       // Can happen in reversed scanner when currentKV is just before the next possible match; in
141       // this case, fall back on scanner simply calling KeyValueHeap.next()
142       // TODO: is there a better way than throw exception? (stop the scanner?)
143       throw new IllegalStateException("No next row key that satisfies fuzzy exists when" +
144                                          " getNextKeyHint() is invoked." +
145                                          " Filter: " + this.toString() +
146                                          " currentKV: " + currentKV.toString());
147     }
148 
149     return nextRowKey == null ? null : KeyValue.createFirstOnRow(nextRowKey);
150   }
151 
152   @Override
153   public boolean filterAllRemaining() {
154     return done;
155   }
156 
157   /**
158    * @return The filter serialized using pb
159    */
160   public byte [] toByteArray() {
161     FilterProtos.FuzzyRowFilter.Builder builder =
162       FilterProtos.FuzzyRowFilter.newBuilder();
163     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
164       BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
165       bbpBuilder.setFirst(ByteStringer.wrap(fuzzyData.getFirst()));
166       bbpBuilder.setSecond(ByteStringer.wrap(fuzzyData.getSecond()));
167       builder.addFuzzyKeysData(bbpBuilder);
168     }
169     return builder.build().toByteArray();
170   }
171 
172   /**
173    * @param pbBytes A pb serialized {@link FuzzyRowFilter} instance
174    * @return An instance of {@link FuzzyRowFilter} made from <code>bytes</code>
175    * @throws DeserializationException
176    * @see #toByteArray
177    */
178   public static FuzzyRowFilter parseFrom(final byte [] pbBytes)
179   throws DeserializationException {
180     FilterProtos.FuzzyRowFilter proto;
181     try {
182       proto = FilterProtos.FuzzyRowFilter.parseFrom(pbBytes);
183     } catch (InvalidProtocolBufferException e) {
184       throw new DeserializationException(e);
185     }
186     int count = proto.getFuzzyKeysDataCount();
187     ArrayList<Pair<byte[], byte[]>> fuzzyKeysData= new ArrayList<Pair<byte[], byte[]>>(count);
188     for (int i = 0; i < count; ++i) {
189       BytesBytesPair current = proto.getFuzzyKeysData(i);
190       byte[] keyBytes = current.getFirst().toByteArray();
191       byte[] keyMeta = current.getSecond().toByteArray();
192       fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
193     }
194     return new FuzzyRowFilter(fuzzyKeysData);
195   }
196 
197   @Override
198   public String toString() {
199     final StringBuilder sb = new StringBuilder();
200     sb.append("FuzzyRowFilter");
201     sb.append("{fuzzyKeysData=");
202     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
203       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
204       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
205     }
206     sb.append("}, ");
207     return sb.toString();
208   }
209 
210   // Utility methods
211 
212   static enum SatisfiesCode {
213     /** row satisfies fuzzy rule */
214     YES,
215     /** row doesn't satisfy fuzzy rule, but there's possible greater row that does */
216     NEXT_EXISTS,
217     /** row doesn't satisfy fuzzy rule and there's no greater row that does */
218     NO_NEXT
219   }
220 
221   @VisibleForTesting
222   static SatisfiesCode satisfies(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
223     return satisfies(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
224   }
225 
226   @VisibleForTesting
227   static SatisfiesCode satisfies(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
228                                  byte[] fuzzyKeyMeta) {
229     return satisfies(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
230   }
231 
232   private static SatisfiesCode satisfies(boolean reverse, byte[] row, int offset, int length,
233                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
234     if (row == null) {
235       // do nothing, let scan to proceed
236       return SatisfiesCode.YES;
237     }
238 
239     Order order = Order.orderFor(reverse);
240     boolean nextRowKeyCandidateExists = false;
241 
242     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
243       // First, checking if this position is fixed and not equals the given one
244       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
245       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
246       if (fixedByteIncorrect) {
247         // in this case there's another row that satisfies fuzzy rule and bigger than this row
248         if (nextRowKeyCandidateExists) {
249           return SatisfiesCode.NEXT_EXISTS;
250         }
251 
252         // If this row byte is less than fixed then there's a byte array bigger than
253         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
254         // this row is simply bigger than any byte array that satisfies the fuzzy rule
255         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
256         if (rowByteLessThanFixed && !reverse) {
257           return SatisfiesCode.NEXT_EXISTS;
258         } else if (!rowByteLessThanFixed && reverse) {
259           return SatisfiesCode.NEXT_EXISTS;
260         } else {
261           return SatisfiesCode.NO_NEXT;
262         }
263       }
264 
265       // Second, checking if this position is not fixed and byte value is not the biggest. In this
266       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
267       // bigger byte array that satisfies the rule we need to just increase this byte
268       // (see the code of getNextForFuzzyRule below) by one.
269       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
270       //       bigger one that satisfies the rule as it can't be increased.
271       if (fuzzyKeyMeta[i] == 1 && !order.isMax(fuzzyKeyBytes[i])) {
272         nextRowKeyCandidateExists = true;
273       }
274     }
275 
276     return SatisfiesCode.YES;
277   }
278 
279   @VisibleForTesting
280   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
281     return getNextForFuzzyRule(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
282   }
283 
284   @VisibleForTesting
285   static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
286                                     byte[] fuzzyKeyMeta) {
287     return getNextForFuzzyRule(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
288   }
289 
290   /** Abstracts directional comparisons based on scan direction. */
291   private enum Order {
292     ASC {
293       public boolean lt(int lhs, int rhs) {
294         return lhs < rhs;
295       }
296       public boolean gt(int lhs, int rhs) {
297         return lhs > rhs;
298       }
299       public byte inc(byte val) {
300         // TODO: what about over/underflow?
301         return (byte) (val + 1);
302       }
303       public boolean isMax(byte val) {
304         return val == (byte) 0xff;
305       }
306       public byte min() {
307         return 0;
308       }
309     },
310     DESC {
311       public boolean lt(int lhs, int rhs) {
312         return lhs > rhs;
313       }
314       public boolean gt(int lhs, int rhs) {
315         return lhs < rhs;
316       }
317       public byte inc(byte val) {
318         // TODO: what about over/underflow?
319         return (byte) (val - 1);
320       }
321       public boolean isMax(byte val) {
322         return val == 0;
323       }
324       public byte min() {
325         return (byte) 0xFF;
326       }
327     };
328 
329     public static Order orderFor(boolean reverse) {
330       return reverse ? DESC : ASC;
331     }
332 
333     /** Returns true when {@code lhs < rhs}. */
334     public abstract boolean lt(int lhs, int rhs);
335     /** Returns true when {@code lhs > rhs}. */
336     public abstract boolean gt(int lhs, int rhs);
337     /** Returns {@code val} incremented by 1. */
338     public abstract byte inc(byte val);
339     /** Return true when {@code val} is the maximum value */
340     public abstract boolean isMax(byte val);
341     /** Return the minimum value according to this ordering scheme. */
342     public abstract byte min();
343   }
344 
345   /**
346    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
347    *         null otherwise
348    */
349   private static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, int offset, int length,
350                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
351     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
352     // the given one we do the following:
353     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
354     // 2. if during the first step given row did not increase, then we increase the value at
355     //    the first "non-fixed" position (where it is not maximum already)
356 
357     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
358     // values than otherwise.
359     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
360                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
361     if (reverse && length > fuzzyKeyBytes.length) {
362       // we need trailing 0xff's instead of trailing 0x00's
363       for (int i = fuzzyKeyBytes.length; i < result.length; i++) {
364         result[i] = (byte) 0xFF;
365       }
366     }
367     int toInc = -1;
368     final Order order = Order.orderFor(reverse);
369 
370     boolean increased = false;
371     for (int i = 0; i < result.length; i++) {
372       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
373         result[i] = row[offset + i];
374         if (!order.isMax(row[i])) {
375           // this is "non-fixed" position and is not at max value, hence we can increase it
376           toInc = i;
377         }
378       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
379         if (order.lt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
380           // if setting value for any fixed position increased the original array,
381           // we are OK
382           increased = true;
383           break;
384         }
385 
386         if (order.gt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
387           // if setting value for any fixed position makes array "smaller", then just stop:
388           // in case we found some non-fixed position to increase we will do it, otherwise
389           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
390           break;
391         }
392       }
393     }
394 
395     if (!increased) {
396       if (toInc < 0) {
397         return null;
398       }
399       result[toInc] = order.inc(result[toInc]);
400 
401       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
402       // that found "next" row key is the smallest possible
403       for (int i = toInc + 1; i < result.length; i++) {
404         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
405           result[i] = order.min();
406         }
407       }
408     }
409 
410     return result;
411   }
412 
413   /**
414    * @return true if and only if the fields of the filter that are serialized
415    * are equal to the corresponding fields in other.  Used for testing.
416    */
417   boolean areSerializedFieldsEqual(Filter o) {
418     if (o == this) return true;
419     if (!(o instanceof FuzzyRowFilter)) return false;
420 
421     FuzzyRowFilter other = (FuzzyRowFilter)o;
422     if (this.fuzzyKeysData.size() != other.fuzzyKeysData.size()) return false;
423     for (int i = 0; i < fuzzyKeysData.size(); ++i) {
424       Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
425       Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
426       if (!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
427         && Bytes.equals(thisData.getSecond(), otherData.getSecond()))) {
428         return false;
429       }
430     }
431     return true;
432   }
433 }