001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.ha;
020
021 import java.io.IOException;
022 import java.util.Arrays;
023 import java.util.List;
024 import java.util.concurrent.CountDownLatch;
025 import java.util.concurrent.TimeUnit;
026 import java.util.concurrent.locks.Lock;
027 import java.util.concurrent.locks.ReentrantLock;
028
029 import org.apache.commons.logging.Log;
030 import org.apache.commons.logging.LogFactory;
031 import org.apache.hadoop.HadoopIllegalArgumentException;
032 import org.apache.hadoop.classification.InterfaceAudience;
033 import org.apache.hadoop.classification.InterfaceStability;
034 import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
035 import org.apache.hadoop.util.StringUtils;
036 import org.apache.zookeeper.data.ACL;
037 import org.apache.zookeeper.KeeperException;
038 import org.apache.zookeeper.Watcher;
039 import org.apache.zookeeper.WatchedEvent;
040 import org.apache.zookeeper.Watcher.Event;
041 import org.apache.zookeeper.ZKUtil;
042 import org.apache.zookeeper.ZooKeeper;
043 import org.apache.zookeeper.CreateMode;
044 import org.apache.zookeeper.AsyncCallback.*;
045 import org.apache.zookeeper.data.Stat;
046 import org.apache.zookeeper.KeeperException.Code;
047
048 import com.google.common.annotations.VisibleForTesting;
049 import com.google.common.base.Preconditions;
050
051 /**
052 *
053 * This class implements a simple library to perform leader election on top of
054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
055 * can be performed by atomically creating an ephemeral lock file (znode) on
056 * Zookeeper. The service instance that successfully creates the znode becomes
057 * active and the rest become standbys. <br/>
058 * This election mechanism is only efficient for a small number of election
059 * candidates (on the order of tens) because contention on a single znode by a
060 * large number of candidates can result in Zookeeper overload. <br/>
061 * The elector does not guarantee fencing (protection of shared resources) among
062 * service instances. After it has notified an instance about becoming a leader,
063 * then that instance must ensure that it meets the service consistency
064 * requirements. If it cannot do so, then it is recommended to quit the
065 * election. The application implements the {@link ActiveStandbyElectorCallback}
066 * to interact with the elector
067 */
068 @InterfaceAudience.Private
069 @InterfaceStability.Evolving
070 public class ActiveStandbyElector implements StatCallback, StringCallback {
071
072 /**
073 * Callback interface to interact with the ActiveStandbyElector object. <br/>
074 * The application will be notified with a callback only on state changes
075 * (i.e. there will never be successive calls to becomeActive without an
076 * intermediate call to enterNeutralMode). <br/>
077 * The callbacks will be running on Zookeeper client library threads. The
078 * application should return from these callbacks quickly so as not to impede
079 * Zookeeper client library performance and notifications. The app will
080 * typically remember the state change and return from the callback. It will
081 * then proceed with implementing actions around that state change. It is
082 * possible to be called back again while these actions are in flight and the
083 * app should handle this scenario.
084 */
085 public interface ActiveStandbyElectorCallback {
086 /**
087 * This method is called when the app becomes the active leader.
088 * If the service fails to become active, it should throw
089 * ServiceFailedException. This will cause the elector to
090 * sleep for a short period, then re-join the election.
091 *
092 * Callback implementations are expected to manage their own
093 * timeouts (e.g. when making an RPC to a remote node).
094 */
095 void becomeActive() throws ServiceFailedException;
096
097 /**
098 * This method is called when the app becomes a standby
099 */
100 void becomeStandby();
101
102 /**
103 * If the elector gets disconnected from Zookeeper and does not know about
104 * the lock state, then it will notify the service via the enterNeutralMode
105 * interface. The service may choose to ignore this or stop doing state
106 * changing operations. Upon reconnection, the elector verifies the leader
107 * status and calls back on the becomeActive and becomeStandby app
108 * interfaces. <br/>
109 * Zookeeper disconnects can happen due to network issues or loss of
110 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
111 * split-brain issues. In such situations it might be prudent to call
112 * becomeStandby too. However, such state change operations might be
113 * expensive and enterNeutralMode can help guard against doing that for
114 * transient issues.
115 */
116 void enterNeutralMode();
117
118 /**
119 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
120 * errors or Zookeeper persistent unavailability) then notifyFatalError is
121 * called to notify the app about it.
122 */
123 void notifyFatalError(String errorMessage);
124
125 /**
126 * If an old active has failed, rather than exited gracefully, then
127 * the new active may need to take some fencing actions against it
128 * before proceeding with failover.
129 *
130 * @param oldActiveData the application data provided by the prior active
131 */
132 void fenceOldActive(byte[] oldActiveData);
133 }
134
135 /**
136 * Name of the lock znode used by the library. Protected for access in test
137 * classes
138 */
139 @VisibleForTesting
140 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
141 @VisibleForTesting
142 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";
143
144 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
145
146 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;
147
148 private static enum ConnectionState {
149 DISCONNECTED, CONNECTED, TERMINATED
150 };
151
/**
 * Election state last reported to the application: INIT before any callback
 * (and again after a reset), otherwise the state matching the most recent
 * becomeActive / becomeStandby / enterNeutralMode callback.
 */
enum State {
  INIT,
  ACTIVE,
  STANDBY,
  NEUTRAL
}
155
156 private State state = State.INIT;
157 private int createRetryCount = 0;
158 private int statRetryCount = 0;
159 private ZooKeeper zkClient;
160 private WatcherWithClientRef watcher;
161 private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
162
163 private final ActiveStandbyElectorCallback appClient;
164 private final String zkHostPort;
165 private final int zkSessionTimeout;
166 private final List<ACL> zkAcl;
167 private final List<ZKAuthInfo> zkAuthInfo;
168 private byte[] appData;
169 private final String zkLockFilePath;
170 private final String zkBreadCrumbPath;
171 private final String znodeWorkingDir;
172 private final int maxRetryNum;
173
174 private Lock sessionReestablishLockForTests = new ReentrantLock();
175 private boolean wantToBeInElection;
176
177 /**
178 * Create a new ActiveStandbyElector object <br/>
179 * The elector is created by providing to it the Zookeeper configuration, the
180 * parent znode under which to create the znode and a reference to the
181 * callback interface. <br/>
182 * The parent znode name must be the same for all service instances and
183 * different across services. <br/>
184 * After the leader has been lost, a new leader will be elected after the
185 * session timeout expires. Hence, the app must set this parameter based on
186 * its needs for failure response time. The session timeout must be greater
187 * than the Zookeeper disconnect timeout and is recommended to be 3X that
188 * value to enable Zookeeper to retry transient disconnections. Setting a very
189 * short session timeout may result in frequent transitions between active and
190 * standby states during issues like network outages/GS pauses.
191 *
192 * @param zookeeperHostPorts
193 * ZooKeeper hostPort for all ZooKeeper servers
194 * @param zookeeperSessionTimeout
195 * ZooKeeper session timeout
196 * @param parentZnodeName
197 * znode under which to create the lock
198 * @param acl
199 * ZooKeeper ACL's
200 * @param authInfo a list of authentication credentials to add to the
201 * ZK connection
202 * @param app
203 * reference to callback interface object
204 * @throws IOException
205 * @throws HadoopIllegalArgumentException
206 */
207 public ActiveStandbyElector(String zookeeperHostPorts,
208 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
209 List<ZKAuthInfo> authInfo,
210 ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
211 HadoopIllegalArgumentException, KeeperException {
212 if (app == null || acl == null || parentZnodeName == null
213 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
214 throw new HadoopIllegalArgumentException("Invalid argument");
215 }
216 zkHostPort = zookeeperHostPorts;
217 zkSessionTimeout = zookeeperSessionTimeout;
218 zkAcl = acl;
219 zkAuthInfo = authInfo;
220 appClient = app;
221 znodeWorkingDir = parentZnodeName;
222 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
223 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
224 this.maxRetryNum = maxRetryNum;
225
226 // createConnection for future API calls
227 createConnection();
228 }
229
230 /**
231 * To participate in election, the app will call joinElection. The result will
232 * be notified by a callback on either the becomeActive or becomeStandby app
233 * interfaces. <br/>
234 * After this the elector will automatically monitor the leader status and
235 * perform re-election if necessary<br/>
236 * The app could potentially start off in standby mode and ignore the
237 * becomeStandby call.
238 *
239 * @param data
240 * to be set by the app. non-null data must be set.
241 * @throws HadoopIllegalArgumentException
242 * if valid data is not supplied
243 */
244 public synchronized void joinElection(byte[] data)
245 throws HadoopIllegalArgumentException {
246
247 if (data == null) {
248 throw new HadoopIllegalArgumentException("data cannot be null");
249 }
250
251 if (wantToBeInElection) {
252 LOG.info("Already in election. Not re-connecting.");
253 return;
254 }
255
256 appData = new byte[data.length];
257 System.arraycopy(data, 0, appData, 0, data.length);
258
259 LOG.debug("Attempting active election for " + this);
260 joinElectionInternal();
261 }
262
263 /**
264 * @return true if the configured parent znode exists
265 */
266 public synchronized boolean parentZNodeExists()
267 throws IOException, InterruptedException {
268 Preconditions.checkState(zkClient != null);
269 try {
270 return zkClient.exists(znodeWorkingDir, false) != null;
271 } catch (KeeperException e) {
272 throw new IOException("Couldn't determine existence of znode '" +
273 znodeWorkingDir + "'", e);
274 }
275 }
276
277 /**
278 * Utility function to ensure that the configured base znode exists.
279 * This recursively creates the znode as well as all of its parents.
280 */
281 public synchronized void ensureParentZNode()
282 throws IOException, InterruptedException {
283 Preconditions.checkState(!wantToBeInElection,
284 "ensureParentZNode() may not be called while in the election");
285
286 String pathParts[] = znodeWorkingDir.split("/");
287 Preconditions.checkArgument(pathParts.length >= 1 &&
288 pathParts[0].isEmpty(),
289 "Invalid path: %s", znodeWorkingDir);
290
291 StringBuilder sb = new StringBuilder();
292 for (int i = 1; i < pathParts.length; i++) {
293 sb.append("/").append(pathParts[i]);
294 String prefixPath = sb.toString();
295 LOG.debug("Ensuring existence of " + prefixPath);
296 try {
297 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
298 } catch (KeeperException e) {
299 if (isNodeExists(e.code())) {
300 // This is OK - just ensuring existence.
301 continue;
302 } else {
303 throw new IOException("Couldn't create " + prefixPath, e);
304 }
305 }
306 }
307
308 LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
309 }
310
311 /**
312 * Clear all of the state held within the parent ZNode.
313 * This recursively deletes everything within the znode as well as the
314 * parent znode itself. It should only be used when it's certain that
315 * no electors are currently participating in the election.
316 */
317 public synchronized void clearParentZNode()
318 throws IOException, InterruptedException {
319 Preconditions.checkState(!wantToBeInElection,
320 "clearParentZNode() may not be called while in the election");
321
322 try {
323 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
324
325 zkDoWithRetries(new ZKAction<Void>() {
326 @Override
327 public Void run() throws KeeperException, InterruptedException {
328 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
329 return null;
330 }
331 });
332 } catch (KeeperException e) {
333 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
334 e);
335 }
336 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
337 }
338
339
340 /**
341 * Any service instance can drop out of the election by calling quitElection.
342 * <br/>
343 * This will lose any leader status, if held, and stop monitoring of the lock
344 * node. <br/>
345 * If the instance wants to participate in election again, then it needs to
346 * call joinElection(). <br/>
347 * This allows service instances to take themselves out of rotation for known
348 * impending unavailable states (e.g. long GC pause or software upgrade).
349 *
350 * @param needFence true if the underlying daemon may need to be fenced
351 * if a failover occurs due to dropping out of the election.
352 */
353 public synchronized void quitElection(boolean needFence) {
354 LOG.info("Yielding from election");
355 if (!needFence && state == State.ACTIVE) {
356 // If active is gracefully going back to standby mode, remove
357 // our permanent znode so no one fences us.
358 tryDeleteOwnBreadCrumbNode();
359 }
360 reset();
361 wantToBeInElection = false;
362 }
363
364 /**
365 * Exception thrown when there is no active leader
366 */
367 public static class ActiveNotFoundException extends Exception {
368 private static final long serialVersionUID = 3505396722342846462L;
369 }
370
371 /**
372 * get data set by the active leader
373 *
374 * @return data set by the active instance
375 * @throws ActiveNotFoundException
376 * when there is no active leader
377 * @throws KeeperException
378 * other zookeeper operation errors
379 * @throws InterruptedException
380 * @throws IOException
381 * when ZooKeeper connection could not be established
382 */
383 public synchronized byte[] getActiveData() throws ActiveNotFoundException,
384 KeeperException, InterruptedException, IOException {
385 try {
386 if (zkClient == null) {
387 createConnection();
388 }
389 Stat stat = new Stat();
390 return getDataWithRetries(zkLockFilePath, false, stat);
391 } catch(KeeperException e) {
392 Code code = e.code();
393 if (isNodeDoesNotExist(code)) {
394 // handle the commonly expected cases that make sense for us
395 throw new ActiveNotFoundException();
396 } else {
397 throw e;
398 }
399 }
400 }
401
402 /**
403 * interface implementation of Zookeeper callback for create
404 */
405 @Override
406 public synchronized void processResult(int rc, String path, Object ctx,
407 String name) {
408 if (isStaleClient(ctx)) return;
409 LOG.debug("CreateNode result: " + rc + " for path: " + path
410 + " connectionState: " + zkConnectionState +
411 " for " + this);
412
413 Code code = Code.get(rc);
414 if (isSuccess(code)) {
415 // we successfully created the znode. we are the leader. start monitoring
416 if (becomeActive()) {
417 monitorActiveStatus();
418 } else {
419 reJoinElectionAfterFailureToBecomeActive();
420 }
421 return;
422 }
423
424 if (isNodeExists(code)) {
425 if (createRetryCount == 0) {
426 // znode exists and we did not retry the operation. so a different
427 // instance has created it. become standby and monitor lock.
428 becomeStandby();
429 }
430 // if we had retried then the znode could have been created by our first
431 // attempt to the server (that we lost) and this node exists response is
432 // for the second attempt. verify this case via ephemeral node owner. this
433 // will happen on the callback for monitoring the lock.
434 monitorActiveStatus();
435 return;
436 }
437
438 String errorMessage = "Received create error from Zookeeper. code:"
439 + code.toString() + " for path " + path;
440 LOG.debug(errorMessage);
441
442 if (shouldRetry(code)) {
443 if (createRetryCount < maxRetryNum) {
444 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
445 ++createRetryCount;
446 createLockNodeAsync();
447 return;
448 }
449 errorMessage = errorMessage
450 + ". Not retrying further znode create connection errors.";
451 } else if (isSessionExpired(code)) {
452 // This isn't fatal - the client Watcher will re-join the election
453 LOG.warn("Lock acquisition failed because session was lost");
454 return;
455 }
456
457 fatalError(errorMessage);
458 }
459
460 /**
461 * interface implementation of Zookeeper callback for monitor (exists)
462 */
463 @Override
464 public synchronized void processResult(int rc, String path, Object ctx,
465 Stat stat) {
466 if (isStaleClient(ctx)) return;
467
468 assert wantToBeInElection :
469 "Got a StatNode result after quitting election";
470
471 LOG.debug("StatNode result: " + rc + " for path: " + path
472 + " connectionState: " + zkConnectionState + " for " + this);
473
474
475 Code code = Code.get(rc);
476 if (isSuccess(code)) {
477 // the following owner check completes verification in case the lock znode
478 // creation was retried
479 if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
480 // we own the lock znode. so we are the leader
481 if (!becomeActive()) {
482 reJoinElectionAfterFailureToBecomeActive();
483 }
484 } else {
485 // we dont own the lock znode. so we are a standby.
486 becomeStandby();
487 }
488 // the watch set by us will notify about changes
489 return;
490 }
491
492 if (isNodeDoesNotExist(code)) {
493 // the lock znode disappeared before we started monitoring it
494 enterNeutralMode();
495 joinElectionInternal();
496 return;
497 }
498
499 String errorMessage = "Received stat error from Zookeeper. code:"
500 + code.toString();
501 LOG.debug(errorMessage);
502
503 if (shouldRetry(code)) {
504 if (statRetryCount < maxRetryNum) {
505 ++statRetryCount;
506 monitorLockNodeAsync();
507 return;
508 }
509 errorMessage = errorMessage
510 + ". Not retrying further znode monitoring connection errors.";
511 } else if (isSessionExpired(code)) {
512 // This isn't fatal - the client Watcher will re-join the election
513 LOG.warn("Lock monitoring failed because session was lost");
514 return;
515 }
516
517 fatalError(errorMessage);
518 }
519
520 /**
521 * We failed to become active. Re-join the election, but
522 * sleep for a few seconds after terminating our existing
523 * session, so that other nodes have a chance to become active.
524 * The failure to become active is already logged inside
525 * becomeActive().
526 */
527 private void reJoinElectionAfterFailureToBecomeActive() {
528 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
529 }
530
531 /**
532 * interface implementation of Zookeeper watch events (connection and node),
533 * proxied by {@link WatcherWithClientRef}.
534 */
535 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
536 Event.EventType eventType = event.getType();
537 if (isStaleClient(zk)) return;
538 LOG.debug("Watcher event type: " + eventType + " with state:"
539 + event.getState() + " for path:" + event.getPath()
540 + " connectionState: " + zkConnectionState
541 + " for " + this);
542
543 if (eventType == Event.EventType.None) {
544 // the connection state has changed
545 switch (event.getState()) {
546 case SyncConnected:
547 LOG.info("Session connected.");
548 // if the listener was asked to move to safe state then it needs to
549 // be undone
550 ConnectionState prevConnectionState = zkConnectionState;
551 zkConnectionState = ConnectionState.CONNECTED;
552 if (prevConnectionState == ConnectionState.DISCONNECTED &&
553 wantToBeInElection) {
554 monitorActiveStatus();
555 }
556 break;
557 case Disconnected:
558 LOG.info("Session disconnected. Entering neutral mode...");
559
560 // ask the app to move to safe state because zookeeper connection
561 // is not active and we dont know our state
562 zkConnectionState = ConnectionState.DISCONNECTED;
563 enterNeutralMode();
564 break;
565 case Expired:
566 // the connection got terminated because of session timeout
567 // call listener to reconnect
568 LOG.info("Session expired. Entering neutral mode and rejoining...");
569 enterNeutralMode();
570 reJoinElection(0);
571 break;
572 case SaslAuthenticated:
573 LOG.info("Successfully authenticated to ZooKeeper using SASL.");
574 break;
575 default:
576 fatalError("Unexpected Zookeeper watch event state: "
577 + event.getState());
578 break;
579 }
580
581 return;
582 }
583
584 // a watch on lock path in zookeeper has fired. so something has changed on
585 // the lock. ideally we should check that the path is the same as the lock
586 // path but trusting zookeeper for now
587 String path = event.getPath();
588 if (path != null) {
589 switch (eventType) {
590 case NodeDeleted:
591 if (state == State.ACTIVE) {
592 enterNeutralMode();
593 }
594 joinElectionInternal();
595 break;
596 case NodeDataChanged:
597 monitorActiveStatus();
598 break;
599 default:
600 LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
601 monitorActiveStatus();
602 }
603
604 return;
605 }
606
607 // some unexpected error has occurred
608 fatalError("Unexpected watch error from Zookeeper");
609 }
610
611 /**
612 * Get a new zookeeper client instance. protected so that test class can
613 * inherit and pass in a mock object for zookeeper
614 *
615 * @return new zookeeper client instance
616 * @throws IOException
617 * @throws KeeperException zookeeper connectionloss exception
618 */
619 protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
620 KeeperException {
621
622 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
623 // may trigger the Connected event immediately. So, if we register the
624 // watcher after constructing ZooKeeper, we may miss that event. Instead,
625 // we construct the watcher first, and have it block any events it receives
626 // before we can set its ZooKeeper reference.
627 watcher = new WatcherWithClientRef();
628 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
629 watcher.setZooKeeperRef(zk);
630
631 // Wait for the asynchronous success/failure. This may throw an exception
632 // if we don't connect within the session timeout.
633 watcher.waitForZKConnectionEvent(zkSessionTimeout);
634
635 for (ZKAuthInfo auth : zkAuthInfo) {
636 zk.addAuthInfo(auth.getScheme(), auth.getAuth());
637 }
638 return zk;
639 }
640
641 private void fatalError(String errorMessage) {
642 LOG.fatal(errorMessage);
643 reset();
644 appClient.notifyFatalError(errorMessage);
645 }
646
647 private void monitorActiveStatus() {
648 assert wantToBeInElection;
649 LOG.debug("Monitoring active leader for " + this);
650 statRetryCount = 0;
651 monitorLockNodeAsync();
652 }
653
654 private void joinElectionInternal() {
655 Preconditions.checkState(appData != null,
656 "trying to join election without any app data");
657 if (zkClient == null) {
658 if (!reEstablishSession()) {
659 fatalError("Failed to reEstablish connection with ZooKeeper");
660 return;
661 }
662 }
663
664 createRetryCount = 0;
665 wantToBeInElection = true;
666 createLockNodeAsync();
667 }
668
669 private void reJoinElection(int sleepTime) {
670 LOG.info("Trying to re-establish ZK session");
671
672 // Some of the test cases rely on expiring the ZK sessions and
673 // ensuring that the other node takes over. But, there's a race
674 // where the original lease holder could reconnect faster than the other
675 // thread manages to take the lock itself. This lock allows the
676 // tests to block the reconnection. It's a shame that this leaked
677 // into non-test code, but the lock is only acquired here so will never
678 // be contended.
679 sessionReestablishLockForTests.lock();
680 try {
681 terminateConnection();
682 sleepFor(sleepTime);
683 // Should not join election even before the SERVICE is reported
684 // as HEALTHY from ZKFC monitoring.
685 if (appData != null) {
686 joinElectionInternal();
687 } else {
688 LOG.info("Not joining election since service has not yet been " +
689 "reported as healthy.");
690 }
691 } finally {
692 sessionReestablishLockForTests.unlock();
693 }
694 }
695
696 /**
697 * Sleep for the given number of milliseconds.
698 * This is non-static, and separated out, so that unit tests
699 * can override the behavior not to sleep.
700 */
701 @VisibleForTesting
702 protected void sleepFor(int sleepMs) {
703 if (sleepMs > 0) {
704 try {
705 Thread.sleep(sleepMs);
706 } catch (InterruptedException e) {
707 Thread.currentThread().interrupt();
708 }
709 }
710 }
711
712 @VisibleForTesting
713 void preventSessionReestablishmentForTests() {
714 sessionReestablishLockForTests.lock();
715 }
716
717 @VisibleForTesting
718 void allowSessionReestablishmentForTests() {
719 sessionReestablishLockForTests.unlock();
720 }
721
722 @VisibleForTesting
723 synchronized long getZKSessionIdForTests() {
724 if (zkClient != null) {
725 return zkClient.getSessionId();
726 } else {
727 return -1;
728 }
729 }
730
731 @VisibleForTesting
732 synchronized State getStateForTests() {
733 return state;
734 }
735
736 private boolean reEstablishSession() {
737 int connectionRetryCount = 0;
738 boolean success = false;
739 while(!success && connectionRetryCount < maxRetryNum) {
740 LOG.debug("Establishing zookeeper connection for " + this);
741 try {
742 createConnection();
743 success = true;
744 } catch(IOException e) {
745 LOG.warn(e);
746 sleepFor(5000);
747 } catch(KeeperException e) {
748 LOG.warn(e);
749 sleepFor(5000);
750 }
751 ++connectionRetryCount;
752 }
753 return success;
754 }
755
756 private void createConnection() throws IOException, KeeperException {
757 if (zkClient != null) {
758 try {
759 zkClient.close();
760 } catch (InterruptedException e) {
761 throw new IOException("Interrupted while closing ZK",
762 e);
763 }
764 zkClient = null;
765 watcher = null;
766 }
767 zkClient = getNewZooKeeper();
768 LOG.debug("Created new connection for " + this);
769 }
770
771 @InterfaceAudience.Private
772 public synchronized void terminateConnection() {
773 if (zkClient == null) {
774 return;
775 }
776 LOG.debug("Terminating ZK connection for " + this);
777 ZooKeeper tempZk = zkClient;
778 zkClient = null;
779 watcher = null;
780 try {
781 tempZk.close();
782 } catch(InterruptedException e) {
783 LOG.warn(e);
784 }
785 zkConnectionState = ConnectionState.TERMINATED;
786 wantToBeInElection = false;
787 }
788
789 private void reset() {
790 state = State.INIT;
791 terminateConnection();
792 }
793
794 private boolean becomeActive() {
795 assert wantToBeInElection;
796 if (state == State.ACTIVE) {
797 // already active
798 return true;
799 }
800 try {
801 Stat oldBreadcrumbStat = fenceOldActive();
802 writeBreadCrumbNode(oldBreadcrumbStat);
803
804 LOG.debug("Becoming active for " + this);
805 appClient.becomeActive();
806 state = State.ACTIVE;
807 return true;
808 } catch (Exception e) {
809 LOG.warn("Exception handling the winning of election", e);
810 // Caller will handle quitting and rejoining the election.
811 return false;
812 }
813 }
814
815 /**
816 * Write the "ActiveBreadCrumb" node, indicating that this node may need
817 * to be fenced on failover.
818 * @param oldBreadcrumbStat
819 */
820 private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
821 throws KeeperException, InterruptedException {
822 Preconditions.checkState(appData != null, "no appdata");
823
824 LOG.info("Writing znode " + zkBreadCrumbPath +
825 " to indicate that the local node is the most recent active...");
826 if (oldBreadcrumbStat == null) {
827 // No previous active, just create the node
828 createWithRetries(zkBreadCrumbPath, appData, zkAcl,
829 CreateMode.PERSISTENT);
830 } else {
831 // There was a previous active, update the node
832 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
833 }
834 }
835
836 /**
837 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
838 * active status.
839 * If this fails, it will simply warn, since the graceful release behavior
840 * is only an optimization.
841 */
842 private void tryDeleteOwnBreadCrumbNode() {
843 assert state == State.ACTIVE;
844 LOG.info("Deleting bread-crumb of active node...");
845
846 // Sanity check the data. This shouldn't be strictly necessary,
847 // but better to play it safe.
848 Stat stat = new Stat();
849 byte[] data = null;
850 try {
851 data = zkClient.getData(zkBreadCrumbPath, false, stat);
852
853 if (!Arrays.equals(data, appData)) {
854 throw new IllegalStateException(
855 "We thought we were active, but in fact " +
856 "the active znode had the wrong data: " +
857 StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
858 }
859
860 deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
861 } catch (Exception e) {
862 LOG.warn("Unable to delete our own bread-crumb of being active at " +
863 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
864 "Expecting to be fenced by the next active.");
865 }
866 }
867
868 /**
869 * If there is a breadcrumb node indicating that another node may need
870 * fencing, try to fence that node.
871 * @return the Stat of the breadcrumb node that was read, or null
872 * if no breadcrumb node existed
873 */
874 private Stat fenceOldActive() throws InterruptedException, KeeperException {
875 final Stat stat = new Stat();
876 byte[] data;
877 LOG.info("Checking for any old active which needs to be fenced...");
878 try {
879 data = zkDoWithRetries(new ZKAction<byte[]>() {
880 @Override
881 public byte[] run() throws KeeperException, InterruptedException {
882 return zkClient.getData(zkBreadCrumbPath, false, stat);
883 }
884 });
885 } catch (KeeperException ke) {
886 if (isNodeDoesNotExist(ke.code())) {
887 LOG.info("No old node to fence");
888 return null;
889 }
890
891 // If we failed to read for any other reason, then likely we lost
892 // our session, or we don't have permissions, etc. In any case,
893 // we probably shouldn't become active, and failing the whole
894 // thing is the best bet.
895 throw ke;
896 }
897
898 LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
899 if (Arrays.equals(data, appData)) {
900 LOG.info("But old node has our own data, so don't need to fence it.");
901 } else {
902 appClient.fenceOldActive(data);
903 }
904 return stat;
905 }
906
907 private void becomeStandby() {
908 if (state != State.STANDBY) {
909 LOG.debug("Becoming standby for " + this);
910 state = State.STANDBY;
911 appClient.becomeStandby();
912 }
913 }
914
915 private void enterNeutralMode() {
916 if (state != State.NEUTRAL) {
917 LOG.debug("Entering neutral mode for " + this);
918 state = State.NEUTRAL;
919 appClient.enterNeutralMode();
920 }
921 }
922
923 private void createLockNodeAsync() {
924 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
925 this, zkClient);
926 }
927
928 private void monitorLockNodeAsync() {
929 zkClient.exists(zkLockFilePath,
930 watcher, this,
931 zkClient);
932 }
933
934 private String createWithRetries(final String path, final byte[] data,
935 final List<ACL> acl, final CreateMode mode)
936 throws InterruptedException, KeeperException {
937 return zkDoWithRetries(new ZKAction<String>() {
938 @Override
939 public String run() throws KeeperException, InterruptedException {
940 return zkClient.create(path, data, acl, mode);
941 }
942 });
943 }
944
945 private byte[] getDataWithRetries(final String path, final boolean watch,
946 final Stat stat) throws InterruptedException, KeeperException {
947 return zkDoWithRetries(new ZKAction<byte[]>() {
948 @Override
949 public byte[] run() throws KeeperException, InterruptedException {
950 return zkClient.getData(path, watch, stat);
951 }
952 });
953 }
954
955 private Stat setDataWithRetries(final String path, final byte[] data,
956 final int version) throws InterruptedException, KeeperException {
957 return zkDoWithRetries(new ZKAction<Stat>() {
958 @Override
959 public Stat run() throws KeeperException, InterruptedException {
960 return zkClient.setData(path, data, version);
961 }
962 });
963 }
964
965 private void deleteWithRetries(final String path, final int version)
966 throws KeeperException, InterruptedException {
967 zkDoWithRetries(new ZKAction<Void>() {
968 @Override
969 public Void run() throws KeeperException, InterruptedException {
970 zkClient.delete(path, version);
971 return null;
972 }
973 });
974 }
975
976 private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
977 InterruptedException {
978 int retry = 0;
979 while (true) {
980 try {
981 return action.run();
982 } catch (KeeperException ke) {
983 if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
984 continue;
985 }
986 throw ke;
987 }
988 }
989 }
990
991 private interface ZKAction<T> {
992 T run() throws KeeperException, InterruptedException;
993 }
994
995 /**
996 * The callbacks and watchers pass a reference to the ZK client
997 * which made the original call. We don't want to take action
998 * based on any callbacks from prior clients after we quit
999 * the election.
1000 * @param ctx the ZK client passed into the watcher
1001 * @return true if it matches the current client
1002 */
1003 private synchronized boolean isStaleClient(Object ctx) {
1004 Preconditions.checkNotNull(ctx);
1005 if (zkClient != (ZooKeeper)ctx) {
1006 LOG.warn("Ignoring stale result from old client with sessionId " +
1007 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
1008 return true;
1009 }
1010 return false;
1011 }
1012
1013 /**
1014 * Watcher implementation which keeps a reference around to the
1015 * original ZK connection, and passes it back along with any
1016 * events.
1017 */
1018 private final class WatcherWithClientRef implements Watcher {
1019 private ZooKeeper zk;
1020
1021 /**
1022 * Latch fired whenever any event arrives. This is used in order
1023 * to wait for the Connected event when the client is first created.
1024 */
1025 private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
1026
1027 /**
1028 * Latch used to wait until the reference to ZooKeeper is set.
1029 */
1030 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
1031
1032 /**
1033 * Waits for the next event from ZooKeeper to arrive.
1034 *
1035 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
1036 * @throws KeeperException if the connection attempt times out. This will
1037 * be a ZooKeeper ConnectionLoss exception code.
1038 * @throws IOException if interrupted while connecting to ZooKeeper
1039 */
1040 private void waitForZKConnectionEvent(int connectionTimeoutMs)
1041 throws KeeperException, IOException {
1042 try {
1043 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
1044 LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
1045 + connectionTimeoutMs + " milliseconds");
1046 zk.close();
1047 throw KeeperException.create(Code.CONNECTIONLOSS);
1048 }
1049 } catch (InterruptedException e) {
1050 Thread.currentThread().interrupt();
1051 throw new IOException(
1052 "Interrupted when connecting to zookeeper server", e);
1053 }
1054 }
1055
1056 private void setZooKeeperRef(ZooKeeper zk) {
1057 Preconditions.checkState(this.zk == null,
1058 "zk already set -- must be set exactly once");
1059 this.zk = zk;
1060 hasSetZooKeeper.countDown();
1061 }
1062
1063 @Override
1064 public void process(WatchedEvent event) {
1065 hasReceivedEvent.countDown();
1066 try {
1067 hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS);
1068 ActiveStandbyElector.this.processWatchEvent(
1069 zk, event);
1070 } catch (Throwable t) {
1071 fatalError(
1072 "Failed to process watcher event " + event + ": " +
1073 StringUtils.stringifyException(t));
1074 }
1075 }
1076 }
1077
1078 private static boolean isSuccess(Code code) {
1079 return (code == Code.OK);
1080 }
1081
1082 private static boolean isNodeExists(Code code) {
1083 return (code == Code.NODEEXISTS);
1084 }
1085
1086 private static boolean isNodeDoesNotExist(Code code) {
1087 return (code == Code.NONODE);
1088 }
1089
1090 private static boolean isSessionExpired(Code code) {
1091 return (code == Code.SESSIONEXPIRED);
1092 }
1093
1094 private static boolean shouldRetry(Code code) {
1095 return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
1096 }
1097
1098 @Override
1099 public String toString() {
1100 return "elector id=" + System.identityHashCode(this) +
1101 " appData=" +
1102 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
1103 " cb=" + appClient;
1104 }
1105 }