1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.zookeeper;
20
21 import java.io.IOException;
22 import java.lang.management.ManagementFactory;
23 import java.security.SecureRandom;
24 import java.util.ArrayList;
25 import java.util.LinkedList;
26 import java.util.List;
27 import java.util.Random;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.hbase.classification.InterfaceAudience;
32 import org.apache.hadoop.hbase.util.Bytes;
33 import org.apache.hadoop.hbase.util.RetryCounter;
34 import org.apache.hadoop.hbase.util.RetryCounterFactory;
35 import org.apache.zookeeper.AsyncCallback;
36 import org.apache.zookeeper.CreateMode;
37 import org.apache.zookeeper.KeeperException;
38 import org.apache.zookeeper.Op;
39 import org.apache.zookeeper.OpResult;
40 import org.apache.zookeeper.Watcher;
41 import org.apache.zookeeper.ZooDefs;
42 import org.apache.zookeeper.ZooKeeper;
43 import org.apache.zookeeper.ZooKeeper.States;
44 import org.apache.zookeeper.data.ACL;
45 import org.apache.zookeeper.data.Stat;
46 import org.apache.zookeeper.proto.CreateRequest;
47 import org.apache.zookeeper.proto.SetDataRequest;
48 import org.cloudera.htrace.Trace;
49 import org.cloudera.htrace.TraceScope;
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 @InterfaceAudience.Private
75 public class RecoverableZooKeeper {
76 private static final Log LOG = LogFactory.getLog(RecoverableZooKeeper.class);
77
78 private ZooKeeper zk;
79 private final RetryCounterFactory retryCounterFactory;
80
81 private final String identifier;
82 private final byte[] id;
83 private Watcher watcher;
84 private int sessionTimeout;
85 private String quorumServers;
86 private final Random salter;
87
88
89
90
91
92
93
94
95
96 private static final byte MAGIC =(byte) 0XFF;
97 private static final int MAGIC_SIZE = Bytes.SIZEOF_BYTE;
98 private static final int ID_LENGTH_OFFSET = MAGIC_SIZE;
99 private static final int ID_LENGTH_SIZE = Bytes.SIZEOF_INT;
100
/**
 * Convenience constructor: forwards to the six-argument constructor with a
 * {@code null} identifier.
 *
 * @param quorumServers ZooKeeper connect string
 * @param sessionTimeout session timeout handed to the ZooKeeper client
 * @param watcher watcher handed to the ZooKeeper client
 * @param maxRetries number of retries after the initial attempt
 * @param retryIntervalMillis interval handed to the retry counter factory
 * @throws IOException declared for callers; see the six-argument constructor
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value = "DE_MIGHT_IGNORE",
    justification = "None. Its always been this way.")
public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher,
    int maxRetries, int retryIntervalMillis) throws IOException {
  this(quorumServers, sessionTimeout, watcher, maxRetries, retryIntervalMillis, null);
}
109
/**
 * Creates the wrapper and makes a best-effort first attempt to open the
 * underlying ZooKeeper client.
 *
 * @param quorumServers ZooKeeper connect string
 * @param sessionTimeout session timeout handed to the ZooKeeper client
 * @param watcher watcher handed to the ZooKeeper client
 * @param maxRetries number of retries after the initial attempt
 * @param retryIntervalMillis interval handed to the retry counter factory
 * @param identifier identifier of this process; when {@code null} or empty the
 *        runtime MXBean name is used instead
 * @throws IOException declared for callers; a failure to open the initial
 *         connection is NOT propagated (the client is created later by checkZk())
 */
public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
    Watcher watcher, int maxRetries, int retryIntervalMillis, String identifier)
    throws IOException {
  // NOTE(review): presumably the factory counts total attempts, hence the +1
  // for the initial try -- confirm against RetryCounterFactory.
  this.retryCounterFactory =
      new RetryCounterFactory(maxRetries + 1, retryIntervalMillis);

  if (identifier == null || identifier.length() == 0) {
    // Fall back to the JVM's runtime name as the process identifier.
    identifier = ManagementFactory.getRuntimeMXBean().getName();
  }
  LOG.info("Process identifier=" + identifier +
      " connecting to ZooKeeper ensemble=" + quorumServers);
  this.identifier = identifier;
  this.id = Bytes.toBytes(identifier);

  this.watcher = watcher;
  this.sessionTimeout = sessionTimeout;
  this.quorumServers = quorumServers;

  // The salt source is a final field used by appendMetaData; it must be
  // assigned on every path, not only when the initial connect fails.
  this.salter = new SecureRandom();

  try {
    checkZk();
  } catch (Exception x) {
    // Deliberately best-effort: every operation calls checkZk() again, so a
    // failed initial connect is retried lazily instead of failing construction.
    LOG.debug("Initial ZooKeeper connection attempt failed; will retry lazily", x);
  }
}
132
133
134
135
136
137
138
139 protected synchronized ZooKeeper checkZk() throws KeeperException {
140 if (this.zk == null) {
141 try {
142 this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
143 } catch (IOException ex) {
144 LOG.warn("Unable to create ZooKeeper Connection", ex);
145 throw new KeeperException.OperationTimeoutException();
146 }
147 }
148 return zk;
149 }
150
151 public synchronized void reconnectAfterExpiration()
152 throws IOException, KeeperException, InterruptedException {
153 if (zk != null) {
154 LOG.info("Closing dead ZooKeeper connection, session" +
155 " was: 0x"+Long.toHexString(zk.getSessionId()));
156 zk.close();
157
158 zk = null;
159 }
160 checkZk();
161 LOG.info("Recreated a ZooKeeper, session" +
162 " is: 0x"+Long.toHexString(zk.getSessionId()));
163 }
164
165
166
167
168
169
170 public void delete(String path, int version)
171 throws InterruptedException, KeeperException {
172 TraceScope traceScope = null;
173 try {
174 traceScope = Trace.startSpan("RecoverableZookeeper.delete");
175 RetryCounter retryCounter = retryCounterFactory.create();
176 boolean isRetry = false;
177 while (true) {
178 try {
179 checkZk().delete(path, version);
180 return;
181 } catch (KeeperException e) {
182 switch (e.code()) {
183 case NONODE:
184 if (isRetry) {
185 LOG.info("Node " + path + " already deleted. Assuming a " +
186 "previous attempt succeeded.");
187 return;
188 }
189 LOG.warn("Node " + path + " already deleted, retry=" + isRetry);
190 throw e;
191
192 case CONNECTIONLOSS:
193 case SESSIONEXPIRED:
194 case OPERATIONTIMEOUT:
195 retryOrThrow(retryCounter, e, "delete");
196 break;
197
198 default:
199 throw e;
200 }
201 }
202 retryCounter.sleepUntilNextRetry();
203 isRetry = true;
204 }
205 } finally {
206 if (traceScope != null) traceScope.close();
207 }
208 }
209
210
211
212
213
214 public Stat exists(String path, Watcher watcher)
215 throws KeeperException, InterruptedException {
216 TraceScope traceScope = null;
217 try {
218 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
219 RetryCounter retryCounter = retryCounterFactory.create();
220 while (true) {
221 try {
222 return checkZk().exists(path, watcher);
223 } catch (KeeperException e) {
224 switch (e.code()) {
225 case CONNECTIONLOSS:
226 case SESSIONEXPIRED:
227 case OPERATIONTIMEOUT:
228 retryOrThrow(retryCounter, e, "exists");
229 break;
230
231 default:
232 throw e;
233 }
234 }
235 retryCounter.sleepUntilNextRetry();
236 }
237 } finally {
238 if (traceScope != null) traceScope.close();
239 }
240 }
241
242
243
244
245
246 public Stat exists(String path, boolean watch)
247 throws KeeperException, InterruptedException {
248 TraceScope traceScope = null;
249 try {
250 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
251 RetryCounter retryCounter = retryCounterFactory.create();
252 while (true) {
253 try {
254 return checkZk().exists(path, watch);
255 } catch (KeeperException e) {
256 switch (e.code()) {
257 case CONNECTIONLOSS:
258 case SESSIONEXPIRED:
259 case OPERATIONTIMEOUT:
260 retryOrThrow(retryCounter, e, "exists");
261 break;
262
263 default:
264 throw e;
265 }
266 }
267 retryCounter.sleepUntilNextRetry();
268 }
269 } finally {
270 if (traceScope != null) traceScope.close();
271 }
272 }
273
274 private void retryOrThrow(RetryCounter retryCounter, KeeperException e,
275 String opName) throws KeeperException {
276 LOG.warn("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e);
277 if (!retryCounter.shouldRetry()) {
278 LOG.error("ZooKeeper " + opName + " failed after "
279 + retryCounter.getMaxAttempts() + " attempts");
280 throw e;
281 }
282 }
283
284
285
286
287
288 public List<String> getChildren(String path, Watcher watcher)
289 throws KeeperException, InterruptedException {
290 TraceScope traceScope = null;
291 try {
292 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
293 RetryCounter retryCounter = retryCounterFactory.create();
294 while (true) {
295 try {
296 return checkZk().getChildren(path, watcher);
297 } catch (KeeperException e) {
298 switch (e.code()) {
299 case CONNECTIONLOSS:
300 case SESSIONEXPIRED:
301 case OPERATIONTIMEOUT:
302 retryOrThrow(retryCounter, e, "getChildren");
303 break;
304
305 default:
306 throw e;
307 }
308 }
309 retryCounter.sleepUntilNextRetry();
310 }
311 } finally {
312 if (traceScope != null) traceScope.close();
313 }
314 }
315
316
317
318
319
320 public List<String> getChildren(String path, boolean watch)
321 throws KeeperException, InterruptedException {
322 TraceScope traceScope = null;
323 try {
324 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
325 RetryCounter retryCounter = retryCounterFactory.create();
326 while (true) {
327 try {
328 return checkZk().getChildren(path, watch);
329 } catch (KeeperException e) {
330 switch (e.code()) {
331 case CONNECTIONLOSS:
332 case SESSIONEXPIRED:
333 case OPERATIONTIMEOUT:
334 retryOrThrow(retryCounter, e, "getChildren");
335 break;
336
337 default:
338 throw e;
339 }
340 }
341 retryCounter.sleepUntilNextRetry();
342 }
343 } finally {
344 if (traceScope != null) traceScope.close();
345 }
346 }
347
348
349
350
351
352 public byte[] getData(String path, Watcher watcher, Stat stat)
353 throws KeeperException, InterruptedException {
354 TraceScope traceScope = null;
355 try {
356 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
357 RetryCounter retryCounter = retryCounterFactory.create();
358 while (true) {
359 try {
360 byte[] revData = checkZk().getData(path, watcher, stat);
361 return this.removeMetaData(revData);
362 } catch (KeeperException e) {
363 switch (e.code()) {
364 case CONNECTIONLOSS:
365 case SESSIONEXPIRED:
366 case OPERATIONTIMEOUT:
367 retryOrThrow(retryCounter, e, "getData");
368 break;
369
370 default:
371 throw e;
372 }
373 }
374 retryCounter.sleepUntilNextRetry();
375 }
376 } finally {
377 if (traceScope != null) traceScope.close();
378 }
379 }
380
381
382
383
384
385 public byte[] getData(String path, boolean watch, Stat stat)
386 throws KeeperException, InterruptedException {
387 TraceScope traceScope = null;
388 try {
389 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
390 RetryCounter retryCounter = retryCounterFactory.create();
391 while (true) {
392 try {
393 byte[] revData = checkZk().getData(path, watch, stat);
394 return this.removeMetaData(revData);
395 } catch (KeeperException e) {
396 switch (e.code()) {
397 case CONNECTIONLOSS:
398 case SESSIONEXPIRED:
399 case OPERATIONTIMEOUT:
400 retryOrThrow(retryCounter, e, "getData");
401 break;
402
403 default:
404 throw e;
405 }
406 }
407 retryCounter.sleepUntilNextRetry();
408 }
409 } finally {
410 if (traceScope != null) traceScope.close();
411 }
412 }
413
414
415
416
417
418
419
420 public Stat setData(String path, byte[] data, int version)
421 throws KeeperException, InterruptedException {
422 TraceScope traceScope = null;
423 try {
424 traceScope = Trace.startSpan("RecoverableZookeeper.setData");
425 RetryCounter retryCounter = retryCounterFactory.create();
426 byte[] newData = appendMetaData(data);
427 boolean isRetry = false;
428 while (true) {
429 try {
430 return checkZk().setData(path, newData, version);
431 } catch (KeeperException e) {
432 switch (e.code()) {
433 case CONNECTIONLOSS:
434 case SESSIONEXPIRED:
435 case OPERATIONTIMEOUT:
436 retryOrThrow(retryCounter, e, "setData");
437 break;
438 case BADVERSION:
439 if (isRetry) {
440
441 try{
442 Stat stat = new Stat();
443 byte[] revData = checkZk().getData(path, false, stat);
444 if(Bytes.compareTo(revData, newData) == 0) {
445
446 return stat;
447 }
448 } catch(KeeperException keeperException){
449
450 throw keeperException;
451 }
452 }
453
454 default:
455 throw e;
456 }
457 }
458 retryCounter.sleepUntilNextRetry();
459 isRetry = true;
460 }
461 } finally {
462 if (traceScope != null) traceScope.close();
463 }
464 }
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481 public String create(String path, byte[] data, List<ACL> acl,
482 CreateMode createMode)
483 throws KeeperException, InterruptedException {
484 TraceScope traceScope = null;
485 try {
486 traceScope = Trace.startSpan("RecoverableZookeeper.create");
487 byte[] newData = appendMetaData(data);
488 switch (createMode) {
489 case EPHEMERAL:
490 case PERSISTENT:
491 return createNonSequential(path, newData, acl, createMode);
492
493 case EPHEMERAL_SEQUENTIAL:
494 case PERSISTENT_SEQUENTIAL:
495 return createSequential(path, newData, acl, createMode);
496
497 default:
498 throw new IllegalArgumentException("Unrecognized CreateMode: " +
499 createMode);
500 }
501 } finally {
502 if (traceScope != null) traceScope.close();
503 }
504 }
505
506 private String createNonSequential(String path, byte[] data, List<ACL> acl,
507 CreateMode createMode) throws KeeperException, InterruptedException {
508 RetryCounter retryCounter = retryCounterFactory.create();
509 boolean isRetry = false;
510 while (true) {
511 try {
512 return checkZk().create(path, data, acl, createMode);
513 } catch (KeeperException e) {
514 switch (e.code()) {
515 case NODEEXISTS:
516 if (isRetry) {
517
518
519
520 byte[] currentData = checkZk().getData(path, false, null);
521 if (currentData != null &&
522 Bytes.compareTo(currentData, data) == 0) {
523
524 return path;
525 }
526 LOG.error("Node " + path + " already exists with " +
527 Bytes.toStringBinary(currentData) + ", could not write " +
528 Bytes.toStringBinary(data));
529 throw e;
530 }
531 LOG.info("Node " + path + " already exists and this is not a " +
532 "retry");
533 throw e;
534
535 case CONNECTIONLOSS:
536 case SESSIONEXPIRED:
537 case OPERATIONTIMEOUT:
538 retryOrThrow(retryCounter, e, "create");
539 break;
540
541 default:
542 throw e;
543 }
544 }
545 retryCounter.sleepUntilNextRetry();
546 isRetry = true;
547 }
548 }
549
550 private String createSequential(String path, byte[] data,
551 List<ACL> acl, CreateMode createMode)
552 throws KeeperException, InterruptedException {
553 RetryCounter retryCounter = retryCounterFactory.create();
554 boolean first = true;
555 String newPath = path+this.identifier;
556 while (true) {
557 try {
558 if (!first) {
559
560 String previousResult = findPreviousSequentialNode(newPath);
561 if (previousResult != null) {
562 return previousResult;
563 }
564 }
565 first = false;
566 return checkZk().create(newPath, data, acl, createMode);
567 } catch (KeeperException e) {
568 switch (e.code()) {
569 case CONNECTIONLOSS:
570 case SESSIONEXPIRED:
571 case OPERATIONTIMEOUT:
572 retryOrThrow(retryCounter, e, "create");
573 break;
574
575 default:
576 throw e;
577 }
578 }
579 retryCounter.sleepUntilNextRetry();
580 }
581 }
582
583
584
585
586 private Iterable<Op> prepareZKMulti(Iterable<Op> ops)
587 throws UnsupportedOperationException {
588 if(ops == null) return null;
589
590 List<Op> preparedOps = new LinkedList<Op>();
591 for (Op op : ops) {
592 if (op.getType() == ZooDefs.OpCode.create) {
593 CreateRequest create = (CreateRequest)op.toRequestRecord();
594 preparedOps.add(Op.create(create.getPath(), appendMetaData(create.getData()),
595 create.getAcl(), create.getFlags()));
596 } else if (op.getType() == ZooDefs.OpCode.delete) {
597
598 preparedOps.add(op);
599 } else if (op.getType() == ZooDefs.OpCode.setData) {
600 SetDataRequest setData = (SetDataRequest)op.toRequestRecord();
601 preparedOps.add(Op.setData(setData.getPath(), appendMetaData(setData.getData()),
602 setData.getVersion()));
603 } else {
604 throw new UnsupportedOperationException("Unexpected ZKOp type: " + op.getClass().getName());
605 }
606 }
607 return preparedOps;
608 }
609
610
611
612
613 public List<OpResult> multi(Iterable<Op> ops)
614 throws KeeperException, InterruptedException {
615 TraceScope traceScope = null;
616 try {
617 traceScope = Trace.startSpan("RecoverableZookeeper.multi");
618 RetryCounter retryCounter = retryCounterFactory.create();
619 Iterable<Op> multiOps = prepareZKMulti(ops);
620 while (true) {
621 try {
622 return checkZk().multi(multiOps);
623 } catch (KeeperException e) {
624 switch (e.code()) {
625 case CONNECTIONLOSS:
626 case SESSIONEXPIRED:
627 case OPERATIONTIMEOUT:
628 retryOrThrow(retryCounter, e, "multi");
629 break;
630
631 default:
632 throw e;
633 }
634 }
635 retryCounter.sleepUntilNextRetry();
636 }
637 } finally {
638 if (traceScope != null) traceScope.close();
639 }
640 }
641
642 private String findPreviousSequentialNode(String path)
643 throws KeeperException, InterruptedException {
644 int lastSlashIdx = path.lastIndexOf('/');
645 assert(lastSlashIdx != -1);
646 String parent = path.substring(0, lastSlashIdx);
647 String nodePrefix = path.substring(lastSlashIdx+1);
648
649 List<String> nodes = checkZk().getChildren(parent, false);
650 List<String> matching = filterByPrefix(nodes, nodePrefix);
651 for (String node : matching) {
652 String nodePath = parent + "/" + node;
653 Stat stat = checkZk().exists(nodePath, false);
654 if (stat != null) {
655 return nodePath;
656 }
657 }
658 return null;
659 }
660
661 public byte[] removeMetaData(byte[] data) {
662 if(data == null || data.length == 0) {
663 return data;
664 }
665
666 byte magic = data[0];
667 if(magic != MAGIC) {
668 return data;
669 }
670
671 int idLength = Bytes.toInt(data, ID_LENGTH_OFFSET);
672 int dataLength = data.length-MAGIC_SIZE-ID_LENGTH_SIZE-idLength;
673 int dataOffset = MAGIC_SIZE+ID_LENGTH_SIZE+idLength;
674
675 byte[] newData = new byte[dataLength];
676 System.arraycopy(data, dataOffset, newData, 0, dataLength);
677 return newData;
678 }
679
680 private byte[] appendMetaData(byte[] data) {
681 if(data == null || data.length == 0){
682 return data;
683 }
684 byte[] salt = Bytes.toBytes(salter.nextLong());
685 int idLength = id.length + salt.length;
686 byte[] newData = new byte[MAGIC_SIZE+ID_LENGTH_SIZE+idLength+data.length];
687 int pos = 0;
688 pos = Bytes.putByte(newData, pos, MAGIC);
689 pos = Bytes.putInt(newData, pos, idLength);
690 pos = Bytes.putBytes(newData, pos, id, 0, id.length);
691 pos = Bytes.putBytes(newData, pos, salt, 0, salt.length);
692 pos = Bytes.putBytes(newData, pos, data, 0, data.length);
693 return newData;
694 }
695
696 public synchronized long getSessionId() {
697 return zk == null ? null : zk.getSessionId();
698 }
699
700 public synchronized void close() throws InterruptedException {
701 if (zk != null) zk.close();
702 }
703
704 public synchronized States getState() {
705 return zk == null ? null : zk.getState();
706 }
707
708 public synchronized ZooKeeper getZooKeeper() {
709 return zk;
710 }
711
712 public synchronized byte[] getSessionPasswd() {
713 return zk == null ? null : zk.getSessionPasswd();
714 }
715
716 public void sync(String path, AsyncCallback.VoidCallback cb, Object ctx) throws KeeperException {
717 checkZk().sync(path, null, null);
718 }
719
720
721
722
723
724
725
726
727
728
729 private static List<String> filterByPrefix(List<String> nodes,
730 String... prefixes) {
731 List<String> lockChildren = new ArrayList<String>();
732 for (String child : nodes){
733 for (String prefix : prefixes){
734 if (child.startsWith(prefix)){
735 lockChildren.add(child);
736 break;
737 }
738 }
739 }
740 return lockChildren;
741 }
742
743 public String getIdentifier() {
744 return identifier;
745 }
746 }