001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.ha;
020    
021    import java.io.IOException;
022    import java.util.Arrays;
023    import java.util.List;
024    import java.util.concurrent.CountDownLatch;
025    import java.util.concurrent.TimeUnit;
026    import java.util.concurrent.locks.Lock;
027    import java.util.concurrent.locks.ReentrantLock;
028    
029    import org.apache.commons.logging.Log;
030    import org.apache.commons.logging.LogFactory;
031    import org.apache.hadoop.HadoopIllegalArgumentException;
032    import org.apache.hadoop.classification.InterfaceAudience;
033    import org.apache.hadoop.classification.InterfaceStability;
034    import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
035    import org.apache.hadoop.util.StringUtils;
036    import org.apache.zookeeper.data.ACL;
037    import org.apache.zookeeper.KeeperException;
038    import org.apache.zookeeper.Watcher;
039    import org.apache.zookeeper.WatchedEvent;
040    import org.apache.zookeeper.Watcher.Event;
041    import org.apache.zookeeper.ZKUtil;
042    import org.apache.zookeeper.ZooKeeper;
043    import org.apache.zookeeper.CreateMode;
044    import org.apache.zookeeper.AsyncCallback.*;
045    import org.apache.zookeeper.data.Stat;
046    import org.apache.zookeeper.KeeperException.Code;
047    
048    import com.google.common.annotations.VisibleForTesting;
049    import com.google.common.base.Preconditions;
050    
051    /**
052     * 
053     * This class implements a simple library to perform leader election on top of
054     * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
055     * can be performed by atomically creating an ephemeral lock file (znode) on
056     * Zookeeper. The service instance that successfully creates the znode becomes
057     * active and the rest become standbys. <br/>
058     * This election mechanism is only efficient for small number of election
059     * candidates (order of 10's) because contention on single znode by a large
060     * number of candidates can result in Zookeeper overload. <br/>
061     * The elector does not guarantee fencing (protection of shared resources) among
062     * service instances. After it has notified an instance about becoming a leader,
063     * then that instance must ensure that it meets the service consistency
064     * requirements. If it cannot do so, then it is recommended to quit the
065     * election. The application implements the {@link ActiveStandbyElectorCallback}
066     * to interact with the elector
067     */
068    @InterfaceAudience.Private
069    @InterfaceStability.Evolving
070    public class ActiveStandbyElector implements StatCallback, StringCallback {
071    
072      /**
073       * Callback interface to interact with the ActiveStandbyElector object. <br/>
074       * The application will be notified with a callback only on state changes
075       * (i.e. there will never be successive calls to becomeActive without an
076       * intermediate call to enterNeutralMode). <br/>
077       * The callbacks will be running on Zookeeper client library threads. The
078       * application should return from these callbacks quickly so as not to impede
079       * Zookeeper client library performance and notifications. The app will
080       * typically remember the state change and return from the callback. It will
081       * then proceed with implementing actions around that state change. It is
082       * possible to be called back again while these actions are in flight and the
083       * app should handle this scenario.
084       */
085      public interface ActiveStandbyElectorCallback {
086        /**
087         * This method is called when the app becomes the active leader.
088         * If the service fails to become active, it should throw
089         * ServiceFailedException. This will cause the elector to
090         * sleep for a short period, then re-join the election.
091         * 
092         * Callback implementations are expected to manage their own
093         * timeouts (e.g. when making an RPC to a remote node).
094         */
095        void becomeActive() throws ServiceFailedException;
096    
097        /**
098         * This method is called when the app becomes a standby
099         */
100        void becomeStandby();
101    
102        /**
103         * If the elector gets disconnected from Zookeeper and does not know about
104         * the lock state, then it will notify the service via the enterNeutralMode
105         * interface. The service may choose to ignore this or stop doing state
106         * changing operations. Upon reconnection, the elector verifies the leader
107         * status and calls back on the becomeActive and becomeStandby app
108         * interfaces. <br/>
109         * Zookeeper disconnects can happen due to network issues or loss of
110         * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
111         * split-brain issues. In such situations it might be prudent to call
112         * becomeStandby too. However, such state change operations might be
113         * expensive and enterNeutralMode can help guard against doing that for
114         * transient issues.
115         */
116        void enterNeutralMode();
117    
118        /**
119         * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
120         * errors or Zookeeper persistent unavailability) then notifyFatalError is
121         * called to notify the app about it.
122         */
123        void notifyFatalError(String errorMessage);
124    
125        /**
126         * If an old active has failed, rather than exited gracefully, then
127         * the new active may need to take some fencing actions against it
128         * before proceeding with failover.
129         * 
130         * @param oldActiveData the application data provided by the prior active
131         */
132        void fenceOldActive(byte[] oldActiveData);
133      }
134    
135      /**
136       * Name of the lock znode used by the library. Protected for access in test
137       * classes
138       */
139      @VisibleForTesting
140      protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
141      @VisibleForTesting
142      protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";
143    
144      public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
145    
146      private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;
147    
148      private static enum ConnectionState {
149        DISCONNECTED, CONNECTED, TERMINATED
150      };
151    
/**
 * Election role currently held by this elector. INIT is the starting
 * value and the value restored by reset().
 */
enum State {
  INIT,
  ACTIVE,
  STANDBY,
  NEUTRAL
}
155    
156      private State state = State.INIT;
157      private int createRetryCount = 0;
158      private int statRetryCount = 0;
159      private ZooKeeper zkClient;
160      private WatcherWithClientRef watcher;
161      private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
162    
163      private final ActiveStandbyElectorCallback appClient;
164      private final String zkHostPort;
165      private final int zkSessionTimeout;
166      private final List<ACL> zkAcl;
167      private final List<ZKAuthInfo> zkAuthInfo;
168      private byte[] appData;
169      private final String zkLockFilePath;
170      private final String zkBreadCrumbPath;
171      private final String znodeWorkingDir;
172      private final int maxRetryNum;
173    
174      private Lock sessionReestablishLockForTests = new ReentrantLock();
175      private boolean wantToBeInElection;
176      
/**
 * Builds an elector and immediately opens its Zookeeper connection. <br/>
 * The caller supplies the Zookeeper configuration, the parent znode under
 * which the lock znode is created, and the callback to notify. <br/>
 * All instances of one service must use the same parent znode, and
 * different services must use different ones. <br/>
 * When a leader is lost, a replacement is only elected once the session
 * timeout has expired, so the timeout should reflect the required failure
 * response time. It must exceed the Zookeeper disconnect timeout; 3X that
 * value is recommended so that Zookeeper can ride out transient
 * disconnections. A very short timeout may cause frequent active/standby
 * flapping during network outages or GC pauses.
 *
 * @param zookeeperHostPorts
 *          ZooKeeper hostPort for all ZooKeeper servers
 * @param zookeeperSessionTimeout
 *          ZooKeeper session timeout; must be positive
 * @param parentZnodeName
 *          znode under which to create the lock
 * @param acl
 *          ZooKeeper ACL's
 * @param authInfo a list of authentication credentials to add to the
 *                 ZK connection
 * @param app
 *          reference to callback interface object
 * @param maxRetryNum
 *          maximum number of retries for znode create/stat operations and
 *          of attempts to re-establish the session
 * @throws IOException if the initial connection cannot be created
 * @throws HadoopIllegalArgumentException if app, acl, parentZnodeName or
 *           zookeeperHostPorts is null, or the timeout is not positive
 * @throws KeeperException if the initial connection cannot be created
 */
public ActiveStandbyElector(String zookeeperHostPorts,
    int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
    List<ZKAuthInfo> authInfo,
    ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
    HadoopIllegalArgumentException, KeeperException {
  boolean argumentsValid = app != null
      && acl != null
      && parentZnodeName != null
      && zookeeperHostPorts != null
      && zookeeperSessionTimeout > 0;
  if (!argumentsValid) {
    throw new HadoopIllegalArgumentException("Invalid argument");
  }

  this.appClient = app;
  this.zkHostPort = zookeeperHostPorts;
  this.zkSessionTimeout = zookeeperSessionTimeout;
  this.zkAcl = acl;
  this.zkAuthInfo = authInfo;
  this.maxRetryNum = maxRetryNum;

  // Both election znodes live directly under the parent znode.
  this.znodeWorkingDir = parentZnodeName;
  this.zkLockFilePath = this.znodeWorkingDir + "/" + LOCK_FILENAME;
  this.zkBreadCrumbPath = this.znodeWorkingDir + "/" + BREADCRUMB_FILENAME;

  // Connect now so that later API calls have a client to use.
  createConnection();
}
229    
230      /**
231       * To participate in election, the app will call joinElection. The result will
232       * be notified by a callback on either the becomeActive or becomeStandby app
233       * interfaces. <br/>
234       * After this the elector will automatically monitor the leader status and
235       * perform re-election if necessary<br/>
236       * The app could potentially start off in standby mode and ignore the
237       * becomeStandby call.
238       * 
239       * @param data
240       *          to be set by the app. non-null data must be set.
241       * @throws HadoopIllegalArgumentException
242       *           if valid data is not supplied
243       */
244      public synchronized void joinElection(byte[] data)
245          throws HadoopIllegalArgumentException {
246        
247        if (data == null) {
248          throw new HadoopIllegalArgumentException("data cannot be null");
249        }
250        
251        if (wantToBeInElection) {
252          LOG.info("Already in election. Not re-connecting.");
253          return;
254        }
255    
256        appData = new byte[data.length];
257        System.arraycopy(data, 0, appData, 0, data.length);
258    
259        LOG.debug("Attempting active election for " + this);
260        joinElectionInternal();
261      }
262      
263      /**
264       * @return true if the configured parent znode exists
265       */
266      public synchronized boolean parentZNodeExists()
267          throws IOException, InterruptedException {
268        Preconditions.checkState(zkClient != null);
269        try {
270          return zkClient.exists(znodeWorkingDir, false) != null;
271        } catch (KeeperException e) {
272          throw new IOException("Couldn't determine existence of znode '" +
273              znodeWorkingDir + "'", e);
274        }
275      }
276    
277      /**
278       * Utility function to ensure that the configured base znode exists.
279       * This recursively creates the znode as well as all of its parents.
280       */
281      public synchronized void ensureParentZNode()
282          throws IOException, InterruptedException {
283        Preconditions.checkState(!wantToBeInElection,
284            "ensureParentZNode() may not be called while in the election");
285    
286        String pathParts[] = znodeWorkingDir.split("/");
287        Preconditions.checkArgument(pathParts.length >= 1 &&
288            pathParts[0].isEmpty(),
289            "Invalid path: %s", znodeWorkingDir);
290        
291        StringBuilder sb = new StringBuilder();
292        for (int i = 1; i < pathParts.length; i++) {
293          sb.append("/").append(pathParts[i]);
294          String prefixPath = sb.toString();
295          LOG.debug("Ensuring existence of " + prefixPath);
296          try {
297            createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
298          } catch (KeeperException e) {
299            if (isNodeExists(e.code())) {
300              // This is OK - just ensuring existence.
301              continue;
302            } else {
303              throw new IOException("Couldn't create " + prefixPath, e);
304            }
305          }
306        }
307        
308        LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
309      }
310      
311      /**
312       * Clear all of the state held within the parent ZNode.
313       * This recursively deletes everything within the znode as well as the
314       * parent znode itself. It should only be used when it's certain that
315       * no electors are currently participating in the election.
316       */
317      public synchronized void clearParentZNode()
318          throws IOException, InterruptedException {
319        Preconditions.checkState(!wantToBeInElection,
320            "clearParentZNode() may not be called while in the election");
321    
322        try {
323          LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
324    
325          zkDoWithRetries(new ZKAction<Void>() {
326            @Override
327            public Void run() throws KeeperException, InterruptedException {
328              ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
329              return null;
330            }
331          });
332        } catch (KeeperException e) {
333          throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
334              e);
335        }
336        LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
337      }
338    
339    
340      /**
341       * Any service instance can drop out of the election by calling quitElection. 
342       * <br/>
343       * This will lose any leader status, if held, and stop monitoring of the lock
344       * node. <br/>
345       * If the instance wants to participate in election again, then it needs to
346       * call joinElection(). <br/>
347       * This allows service instances to take themselves out of rotation for known
348       * impending unavailable states (e.g. long GC pause or software upgrade).
349       * 
350       * @param needFence true if the underlying daemon may need to be fenced
351       * if a failover occurs due to dropping out of the election.
352       */
353      public synchronized void quitElection(boolean needFence) {
354        LOG.info("Yielding from election");
355        if (!needFence && state == State.ACTIVE) {
356          // If active is gracefully going back to standby mode, remove
357          // our permanent znode so no one fences us.
358          tryDeleteOwnBreadCrumbNode();
359        }
360        reset();
361        wantToBeInElection = false;
362      }
363    
364      /**
365       * Exception thrown when there is no active leader
366       */
367      public static class ActiveNotFoundException extends Exception {
368        private static final long serialVersionUID = 3505396722342846462L;
369      }
370    
371      /**
372       * get data set by the active leader
373       * 
374       * @return data set by the active instance
375       * @throws ActiveNotFoundException
376       *           when there is no active leader
377       * @throws KeeperException
378       *           other zookeeper operation errors
379       * @throws InterruptedException
380       * @throws IOException
381       *           when ZooKeeper connection could not be established
382       */
383      public synchronized byte[] getActiveData() throws ActiveNotFoundException,
384          KeeperException, InterruptedException, IOException {
385        try {
386          if (zkClient == null) {
387            createConnection();
388          }
389          Stat stat = new Stat();
390          return getDataWithRetries(zkLockFilePath, false, stat);
391        } catch(KeeperException e) {
392          Code code = e.code();
393          if (isNodeDoesNotExist(code)) {
394            // handle the commonly expected cases that make sense for us
395            throw new ActiveNotFoundException();
396          } else {
397            throw e;
398          }
399        }
400      }
401    
402      /**
403       * interface implementation of Zookeeper callback for create
404       */
405      @Override
406      public synchronized void processResult(int rc, String path, Object ctx,
407          String name) {
408        if (isStaleClient(ctx)) return;
409        LOG.debug("CreateNode result: " + rc + " for path: " + path
410            + " connectionState: " + zkConnectionState +
411            "  for " + this);
412    
413        Code code = Code.get(rc);
414        if (isSuccess(code)) {
415          // we successfully created the znode. we are the leader. start monitoring
416          if (becomeActive()) {
417            monitorActiveStatus();
418          } else {
419            reJoinElectionAfterFailureToBecomeActive();
420          }
421          return;
422        }
423    
424        if (isNodeExists(code)) {
425          if (createRetryCount == 0) {
426            // znode exists and we did not retry the operation. so a different
427            // instance has created it. become standby and monitor lock.
428            becomeStandby();
429          }
430          // if we had retried then the znode could have been created by our first
431          // attempt to the server (that we lost) and this node exists response is
432          // for the second attempt. verify this case via ephemeral node owner. this
433          // will happen on the callback for monitoring the lock.
434          monitorActiveStatus();
435          return;
436        }
437    
438        String errorMessage = "Received create error from Zookeeper. code:"
439            + code.toString() + " for path " + path;
440        LOG.debug(errorMessage);
441    
442        if (shouldRetry(code)) {
443          if (createRetryCount < maxRetryNum) {
444            LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
445            ++createRetryCount;
446            createLockNodeAsync();
447            return;
448          }
449          errorMessage = errorMessage
450              + ". Not retrying further znode create connection errors.";
451        } else if (isSessionExpired(code)) {
452          // This isn't fatal - the client Watcher will re-join the election
453          LOG.warn("Lock acquisition failed because session was lost");
454          return;
455        }
456    
457        fatalError(errorMessage);
458      }
459    
460      /**
461       * interface implementation of Zookeeper callback for monitor (exists)
462       */
463      @Override
464      public synchronized void processResult(int rc, String path, Object ctx,
465          Stat stat) {
466        if (isStaleClient(ctx)) return;
467        
468        assert wantToBeInElection :
469            "Got a StatNode result after quitting election";
470        
471        LOG.debug("StatNode result: " + rc + " for path: " + path
472            + " connectionState: " + zkConnectionState + " for " + this);
473            
474    
475        Code code = Code.get(rc);
476        if (isSuccess(code)) {
477          // the following owner check completes verification in case the lock znode
478          // creation was retried
479          if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
480            // we own the lock znode. so we are the leader
481            if (!becomeActive()) {
482              reJoinElectionAfterFailureToBecomeActive();
483            }
484          } else {
485            // we dont own the lock znode. so we are a standby.
486            becomeStandby();
487          }
488          // the watch set by us will notify about changes
489          return;
490        }
491    
492        if (isNodeDoesNotExist(code)) {
493          // the lock znode disappeared before we started monitoring it
494          enterNeutralMode();
495          joinElectionInternal();
496          return;
497        }
498    
499        String errorMessage = "Received stat error from Zookeeper. code:"
500            + code.toString();
501        LOG.debug(errorMessage);
502    
503        if (shouldRetry(code)) {
504          if (statRetryCount < maxRetryNum) {
505            ++statRetryCount;
506            monitorLockNodeAsync();
507            return;
508          }
509          errorMessage = errorMessage
510              + ". Not retrying further znode monitoring connection errors.";
511        } else if (isSessionExpired(code)) {
512          // This isn't fatal - the client Watcher will re-join the election
513          LOG.warn("Lock monitoring failed because session was lost");
514          return;
515        }
516    
517        fatalError(errorMessage);
518      }
519    
520      /**
521       * We failed to become active. Re-join the election, but
522       * sleep for a few seconds after terminating our existing
523       * session, so that other nodes have a chance to become active.
524       * The failure to become active is already logged inside
525       * becomeActive().
526       */
527      private void reJoinElectionAfterFailureToBecomeActive() {
528        reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
529      }
530    
531      /**
532       * interface implementation of Zookeeper watch events (connection and node),
533       * proxied by {@link WatcherWithClientRef}.
534       */
535      synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
536        Event.EventType eventType = event.getType();
537        if (isStaleClient(zk)) return;
538        LOG.debug("Watcher event type: " + eventType + " with state:"
539            + event.getState() + " for path:" + event.getPath()
540            + " connectionState: " + zkConnectionState
541            + " for " + this);
542    
543        if (eventType == Event.EventType.None) {
544          // the connection state has changed
545          switch (event.getState()) {
546          case SyncConnected:
547            LOG.info("Session connected.");
548            // if the listener was asked to move to safe state then it needs to
549            // be undone
550            ConnectionState prevConnectionState = zkConnectionState;
551            zkConnectionState = ConnectionState.CONNECTED;
552            if (prevConnectionState == ConnectionState.DISCONNECTED &&
553                wantToBeInElection) {
554              monitorActiveStatus();
555            }
556            break;
557          case Disconnected:
558            LOG.info("Session disconnected. Entering neutral mode...");
559    
560            // ask the app to move to safe state because zookeeper connection
561            // is not active and we dont know our state
562            zkConnectionState = ConnectionState.DISCONNECTED;
563            enterNeutralMode();
564            break;
565          case Expired:
566            // the connection got terminated because of session timeout
567            // call listener to reconnect
568            LOG.info("Session expired. Entering neutral mode and rejoining...");
569            enterNeutralMode();
570            reJoinElection(0);
571            break;
572          case SaslAuthenticated:
573            LOG.info("Successfully authenticated to ZooKeeper using SASL.");
574            break;
575          default:
576            fatalError("Unexpected Zookeeper watch event state: "
577                + event.getState());
578            break;
579          }
580    
581          return;
582        }
583    
584        // a watch on lock path in zookeeper has fired. so something has changed on
585        // the lock. ideally we should check that the path is the same as the lock
586        // path but trusting zookeeper for now
587        String path = event.getPath();
588        if (path != null) {
589          switch (eventType) {
590          case NodeDeleted:
591            if (state == State.ACTIVE) {
592              enterNeutralMode();
593            }
594            joinElectionInternal();
595            break;
596          case NodeDataChanged:
597            monitorActiveStatus();
598            break;
599          default:
600            LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
601            monitorActiveStatus();
602          }
603    
604          return;
605        }
606    
607        // some unexpected error has occurred
608        fatalError("Unexpected watch error from Zookeeper");
609      }
610    
611      /**
612       * Get a new zookeeper client instance. protected so that test class can
613       * inherit and pass in a mock object for zookeeper
614       * 
615       * @return new zookeeper client instance
616       * @throws IOException
617       * @throws KeeperException zookeeper connectionloss exception
618       */
619      protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
620          KeeperException {
621        
622        // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
623        // may trigger the Connected event immediately. So, if we register the
624        // watcher after constructing ZooKeeper, we may miss that event. Instead,
625        // we construct the watcher first, and have it block any events it receives
626        // before we can set its ZooKeeper reference.
627        watcher = new WatcherWithClientRef();
628        ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
629        watcher.setZooKeeperRef(zk);
630    
631        // Wait for the asynchronous success/failure. This may throw an exception
632        // if we don't connect within the session timeout.
633        watcher.waitForZKConnectionEvent(zkSessionTimeout);
634        
635        for (ZKAuthInfo auth : zkAuthInfo) {
636          zk.addAuthInfo(auth.getScheme(), auth.getAuth());
637        }
638        return zk;
639      }
640    
641      private void fatalError(String errorMessage) {
642        LOG.fatal(errorMessage);
643        reset();
644        appClient.notifyFatalError(errorMessage);
645      }
646    
647      private void monitorActiveStatus() {
648        assert wantToBeInElection;
649        LOG.debug("Monitoring active leader for " + this);
650        statRetryCount = 0;
651        monitorLockNodeAsync();
652      }
653    
654      private void joinElectionInternal() {
655        Preconditions.checkState(appData != null,
656            "trying to join election without any app data");
657        if (zkClient == null) {
658          if (!reEstablishSession()) {
659            fatalError("Failed to reEstablish connection with ZooKeeper");
660            return;
661          }
662        }
663    
664        createRetryCount = 0;
665        wantToBeInElection = true;
666        createLockNodeAsync();
667      }
668    
669      private void reJoinElection(int sleepTime) {
670        LOG.info("Trying to re-establish ZK session");
671        
672        // Some of the test cases rely on expiring the ZK sessions and
673        // ensuring that the other node takes over. But, there's a race
674        // where the original lease holder could reconnect faster than the other
675        // thread manages to take the lock itself. This lock allows the
676        // tests to block the reconnection. It's a shame that this leaked
677        // into non-test code, but the lock is only acquired here so will never
678        // be contended.
679        sessionReestablishLockForTests.lock();
680        try {
681          terminateConnection();
682          sleepFor(sleepTime);
683          // Should not join election even before the SERVICE is reported
684          // as HEALTHY from ZKFC monitoring.
685          if (appData != null) {
686            joinElectionInternal();
687          } else {
688            LOG.info("Not joining election since service has not yet been " +
689                "reported as healthy.");
690          }
691        } finally {
692          sessionReestablishLockForTests.unlock();
693        }
694      }
695    
696      /**
697       * Sleep for the given number of milliseconds.
698       * This is non-static, and separated out, so that unit tests
699       * can override the behavior not to sleep.
700       */
701      @VisibleForTesting
702      protected void sleepFor(int sleepMs) {
703        if (sleepMs > 0) {
704          try {
705            Thread.sleep(sleepMs);
706          } catch (InterruptedException e) {
707            Thread.currentThread().interrupt();
708          }
709        }
710      }
711    
712      @VisibleForTesting
713      void preventSessionReestablishmentForTests() {
714        sessionReestablishLockForTests.lock();
715      }
716      
717      @VisibleForTesting
718      void allowSessionReestablishmentForTests() {
719        sessionReestablishLockForTests.unlock();
720      }
721      
722      @VisibleForTesting
723      synchronized long getZKSessionIdForTests() {
724        if (zkClient != null) {
725          return zkClient.getSessionId();
726        } else {
727          return -1;
728        }
729      }
730      
731      @VisibleForTesting
732      synchronized State getStateForTests() {
733        return state;
734      }
735    
736      private boolean reEstablishSession() {
737        int connectionRetryCount = 0;
738        boolean success = false;
739        while(!success && connectionRetryCount < maxRetryNum) {
740          LOG.debug("Establishing zookeeper connection for " + this);
741          try {
742            createConnection();
743            success = true;
744          } catch(IOException e) {
745            LOG.warn(e);
746            sleepFor(5000);
747          } catch(KeeperException e) {
748            LOG.warn(e);
749            sleepFor(5000);
750          }
751          ++connectionRetryCount;
752        }
753        return success;
754      }
755    
756      private void createConnection() throws IOException, KeeperException {
757        if (zkClient != null) {
758          try {
759            zkClient.close();
760          } catch (InterruptedException e) {
761            throw new IOException("Interrupted while closing ZK",
762                e);
763          }
764          zkClient = null;
765          watcher = null;
766        }
767        zkClient = getNewZooKeeper();
768        LOG.debug("Created new connection for " + this);
769      }
770    
771      @InterfaceAudience.Private
772      public synchronized void terminateConnection() {
773        if (zkClient == null) {
774          return;
775        }
776        LOG.debug("Terminating ZK connection for " + this);
777        ZooKeeper tempZk = zkClient;
778        zkClient = null;
779        watcher = null;
780        try {
781          tempZk.close();
782        } catch(InterruptedException e) {
783          LOG.warn(e);
784        }
785        zkConnectionState = ConnectionState.TERMINATED;
786        wantToBeInElection = false;
787      }
788    
789      private void reset() {
790        state = State.INIT;
791        terminateConnection();
792      }
793    
794      private boolean becomeActive() {
795        assert wantToBeInElection;
796        if (state == State.ACTIVE) {
797          // already active
798          return true;
799        }
800        try {
801          Stat oldBreadcrumbStat = fenceOldActive();
802          writeBreadCrumbNode(oldBreadcrumbStat);
803          
804          LOG.debug("Becoming active for " + this);
805          appClient.becomeActive();
806          state = State.ACTIVE;
807          return true;
808        } catch (Exception e) {
809          LOG.warn("Exception handling the winning of election", e);
810          // Caller will handle quitting and rejoining the election.
811          return false;
812        }
813      }
814    
815      /**
816       * Write the "ActiveBreadCrumb" node, indicating that this node may need
817       * to be fenced on failover.
818       * @param oldBreadcrumbStat 
819       */
820      private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
821          throws KeeperException, InterruptedException {
822        Preconditions.checkState(appData != null, "no appdata");
823        
824        LOG.info("Writing znode " + zkBreadCrumbPath +
825            " to indicate that the local node is the most recent active...");
826        if (oldBreadcrumbStat == null) {
827          // No previous active, just create the node
828          createWithRetries(zkBreadCrumbPath, appData, zkAcl,
829            CreateMode.PERSISTENT);
830        } else {
831          // There was a previous active, update the node
832          setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
833        }
834      }
835      
836      /**
837       * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
838       * active status.
839       * If this fails, it will simply warn, since the graceful release behavior
840       * is only an optimization.
841       */
842      private void tryDeleteOwnBreadCrumbNode() {
843        assert state == State.ACTIVE;
844        LOG.info("Deleting bread-crumb of active node...");
845        
846        // Sanity check the data. This shouldn't be strictly necessary,
847        // but better to play it safe.
848        Stat stat = new Stat();
849        byte[] data = null;
850        try {
851          data = zkClient.getData(zkBreadCrumbPath, false, stat);
852    
853          if (!Arrays.equals(data, appData)) {
854            throw new IllegalStateException(
855                "We thought we were active, but in fact " +
856                "the active znode had the wrong data: " +
857                StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
858          }
859          
860          deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
861        } catch (Exception e) {
862          LOG.warn("Unable to delete our own bread-crumb of being active at " +
863              zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
864              "Expecting to be fenced by the next active.");
865        }
866      }
867    
868      /**
869       * If there is a breadcrumb node indicating that another node may need
870       * fencing, try to fence that node.
871       * @return the Stat of the breadcrumb node that was read, or null
872       * if no breadcrumb node existed
873       */
874      private Stat fenceOldActive() throws InterruptedException, KeeperException {
875        final Stat stat = new Stat();
876        byte[] data;
877        LOG.info("Checking for any old active which needs to be fenced...");
878        try {
879          data = zkDoWithRetries(new ZKAction<byte[]>() {
880            @Override
881            public byte[] run() throws KeeperException, InterruptedException {
882              return zkClient.getData(zkBreadCrumbPath, false, stat);
883            }
884          });
885        } catch (KeeperException ke) {
886          if (isNodeDoesNotExist(ke.code())) {
887            LOG.info("No old node to fence");
888            return null;
889          }
890          
891          // If we failed to read for any other reason, then likely we lost
892          // our session, or we don't have permissions, etc. In any case,
893          // we probably shouldn't become active, and failing the whole
894          // thing is the best bet.
895          throw ke;
896        }
897    
898        LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
899        if (Arrays.equals(data, appData)) {
900          LOG.info("But old node has our own data, so don't need to fence it.");
901        } else {
902          appClient.fenceOldActive(data);
903        }
904        return stat;
905      }
906    
907      private void becomeStandby() {
908        if (state != State.STANDBY) {
909          LOG.debug("Becoming standby for " + this);
910          state = State.STANDBY;
911          appClient.becomeStandby();
912        }
913      }
914    
915      private void enterNeutralMode() {
916        if (state != State.NEUTRAL) {
917          LOG.debug("Entering neutral mode for " + this);
918          state = State.NEUTRAL;
919          appClient.enterNeutralMode();
920        }
921      }
922    
923      private void createLockNodeAsync() {
924        zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
925            this, zkClient);
926      }
927    
928      private void monitorLockNodeAsync() {
929        zkClient.exists(zkLockFilePath, 
930            watcher, this,
931            zkClient);
932      }
933    
934      private String createWithRetries(final String path, final byte[] data,
935          final List<ACL> acl, final CreateMode mode)
936          throws InterruptedException, KeeperException {
937        return zkDoWithRetries(new ZKAction<String>() {
938          @Override
939          public String run() throws KeeperException, InterruptedException {
940            return zkClient.create(path, data, acl, mode);
941          }
942        });
943      }
944    
945      private byte[] getDataWithRetries(final String path, final boolean watch,
946          final Stat stat) throws InterruptedException, KeeperException {
947        return zkDoWithRetries(new ZKAction<byte[]>() {
948          @Override
949          public byte[] run() throws KeeperException, InterruptedException {
950            return zkClient.getData(path, watch, stat);
951          }
952        });
953      }
954    
955      private Stat setDataWithRetries(final String path, final byte[] data,
956          final int version) throws InterruptedException, KeeperException {
957        return zkDoWithRetries(new ZKAction<Stat>() {
958          @Override
959          public Stat run() throws KeeperException, InterruptedException {
960            return zkClient.setData(path, data, version);
961          }
962        });
963      }
964      
965      private void deleteWithRetries(final String path, final int version)
966          throws KeeperException, InterruptedException {
967        zkDoWithRetries(new ZKAction<Void>() {
968          @Override
969          public Void run() throws KeeperException, InterruptedException {
970            zkClient.delete(path, version);
971            return null;
972          }
973        });
974      }
975    
976      private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
977          InterruptedException {
978        int retry = 0;
979        while (true) {
980          try {
981            return action.run();
982          } catch (KeeperException ke) {
983            if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
984              continue;
985            }
986            throw ke;
987          }
988        }
989      }
990    
991      private interface ZKAction<T> {
992        T run() throws KeeperException, InterruptedException; 
993      }
994      
995      /**
996       * The callbacks and watchers pass a reference to the ZK client
997       * which made the original call. We don't want to take action
998       * based on any callbacks from prior clients after we quit
999       * the election.
1000       * @param ctx the ZK client passed into the watcher
1001       * @return true if it matches the current client
1002       */
1003      private synchronized boolean isStaleClient(Object ctx) {
1004        Preconditions.checkNotNull(ctx);
1005        if (zkClient != (ZooKeeper)ctx) {
1006          LOG.warn("Ignoring stale result from old client with sessionId " +
1007              String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
1008          return true;
1009        }
1010        return false;
1011      }
1012    
1013      /**
1014       * Watcher implementation which keeps a reference around to the
1015       * original ZK connection, and passes it back along with any
1016       * events.
1017       */
1018      private final class WatcherWithClientRef implements Watcher {
1019        private ZooKeeper zk;
1020        
1021        /**
1022         * Latch fired whenever any event arrives. This is used in order
1023         * to wait for the Connected event when the client is first created.
1024         */
1025        private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
1026    
1027        /**
1028         * Latch used to wait until the reference to ZooKeeper is set.
1029         */
1030        private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
1031    
1032        /**
1033         * Waits for the next event from ZooKeeper to arrive.
1034         * 
1035         * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
1036         * @throws KeeperException if the connection attempt times out. This will
1037         * be a ZooKeeper ConnectionLoss exception code.
1038         * @throws IOException if interrupted while connecting to ZooKeeper
1039         */
1040        private void waitForZKConnectionEvent(int connectionTimeoutMs)
1041            throws KeeperException, IOException {
1042          try {
1043            if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
1044              LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
1045                  + connectionTimeoutMs + " milliseconds");
1046              zk.close();
1047              throw KeeperException.create(Code.CONNECTIONLOSS);
1048            }
1049          } catch (InterruptedException e) {
1050            Thread.currentThread().interrupt();
1051            throw new IOException(
1052                "Interrupted when connecting to zookeeper server", e);
1053          }
1054        }
1055    
1056        private void setZooKeeperRef(ZooKeeper zk) {
1057          Preconditions.checkState(this.zk == null,
1058              "zk already set -- must be set exactly once");
1059          this.zk = zk;
1060          hasSetZooKeeper.countDown();
1061        }
1062    
1063        @Override
1064        public void process(WatchedEvent event) {
1065          hasReceivedEvent.countDown();
1066          try {
1067            hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS);
1068            ActiveStandbyElector.this.processWatchEvent(
1069                zk, event);
1070          } catch (Throwable t) {
1071            fatalError(
1072                "Failed to process watcher event " + event + ": " +
1073                StringUtils.stringifyException(t));
1074          }
1075        }
1076      }
1077    
1078      private static boolean isSuccess(Code code) {
1079        return (code == Code.OK);
1080      }
1081    
1082      private static boolean isNodeExists(Code code) {
1083        return (code == Code.NODEEXISTS);
1084      }
1085    
1086      private static boolean isNodeDoesNotExist(Code code) {
1087        return (code == Code.NONODE);
1088      }
1089      
1090      private static boolean isSessionExpired(Code code) {
1091        return (code == Code.SESSIONEXPIRED);
1092      }
1093    
1094      private static boolean shouldRetry(Code code) {
1095        return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
1096      }
1097      
1098      @Override
1099      public String toString() {
1100        return "elector id=" + System.identityHashCode(this) +
1101          " appData=" +
1102          ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 
1103          " cb=" + appClient;
1104      }
1105    }