001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.ha;
020
021import java.io.IOException;
022import java.util.Arrays;
023import java.util.List;
024import java.util.concurrent.CountDownLatch;
025import java.util.concurrent.TimeUnit;
026import java.util.concurrent.locks.Lock;
027import java.util.concurrent.locks.ReentrantLock;
028
029import org.apache.commons.logging.Log;
030import org.apache.commons.logging.LogFactory;
031import org.apache.hadoop.HadoopIllegalArgumentException;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.classification.InterfaceStability;
034import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
035import org.apache.hadoop.util.StringUtils;
036import org.apache.zookeeper.data.ACL;
037import org.apache.zookeeper.KeeperException;
038import org.apache.zookeeper.Watcher;
039import org.apache.zookeeper.WatchedEvent;
040import org.apache.zookeeper.Watcher.Event;
041import org.apache.zookeeper.ZKUtil;
042import org.apache.zookeeper.ZooKeeper;
043import org.apache.zookeeper.CreateMode;
044import org.apache.zookeeper.AsyncCallback.*;
045import org.apache.zookeeper.data.Stat;
046import org.apache.zookeeper.KeeperException.Code;
047
048import com.google.common.annotations.VisibleForTesting;
049import com.google.common.base.Preconditions;
050
051/**
052 * 
053 * This class implements a simple library to perform leader election on top of
054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
055 * can be performed by atomically creating an ephemeral lock file (znode) on
056 * Zookeeper. The service instance that successfully creates the znode becomes
057 * active and the rest become standbys. <br/>
 * This election mechanism is only efficient for a small number of election
 * candidates (on the order of tens) because contention on a single znode by
 * a large number of candidates can result in Zookeeper overload. <br/>
061 * The elector does not guarantee fencing (protection of shared resources) among
062 * service instances. After it has notified an instance about becoming a leader,
063 * then that instance must ensure that it meets the service consistency
064 * requirements. If it cannot do so, then it is recommended to quit the
 * election. The application implements the {@link ActiveStandbyElectorCallback}
 * interface to interact with the elector.
067 */
068@InterfaceAudience.Private
069@InterfaceStability.Evolving
070public class ActiveStandbyElector implements StatCallback, StringCallback {
071
  /**
   * Callback interface to interact with the ActiveStandbyElector object. <br/>
   * The application will be notified with a callback only on state changes
   * (i.e. there will never be successive calls to becomeActive without an
   * intermediate call to enterNeutralMode). <br/>
   * The callbacks will be running on Zookeeper client library threads. The
   * application should return from these callbacks quickly so as not to impede
   * Zookeeper client library performance and notifications. The app will
   * typically remember the state change and return from the callback. It will
   * then proceed with implementing actions around that state change. It is
   * possible to be called back again while these actions are in flight and the
   * app should handle this scenario.
   */
  public interface ActiveStandbyElectorCallback {
    /**
     * This method is called when the app becomes the active leader.
     * If the service fails to become active, it should throw
     * ServiceFailedException. This will cause the elector to
     * sleep for a short period, then re-join the election.
     * 
     * Callback implementations are expected to manage their own
     * timeouts (e.g. when making an RPC to a remote node).
     *
     * @throws ServiceFailedException
     *           if the service could not transition to the active state
     */
    void becomeActive() throws ServiceFailedException;

    /**
     * This method is called when the app becomes a standby.
     */
    void becomeStandby();

    /**
     * If the elector gets disconnected from Zookeeper and does not know about
     * the lock state, then it will notify the service via the enterNeutralMode
     * interface. The service may choose to ignore this or stop doing state
     * changing operations. Upon reconnection, the elector verifies the leader
     * status and calls back on the becomeActive and becomeStandby app
     * interfaces. <br/>
     * Zookeeper disconnects can happen due to network issues or loss of
     * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
     * split-brain issues. In such situations it might be prudent to call
     * becomeStandby too. However, such state change operations might be
     * expensive and enterNeutralMode can help guard against doing that for
     * transient issues.
     */
    void enterNeutralMode();

    /**
     * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
     * errors or Zookeeper persistent unavailability) then notifyFatalError is
     * called to notify the app about it. The elector logs the message, resets
     * its state and terminates its ZooKeeper connection before making this
     * call.
     *
     * @param errorMessage
     *          human-readable description of the fatal condition
     */
    void notifyFatalError(String errorMessage);

    /**
     * If an old active has failed, rather than exited gracefully, then
     * the new active may need to take some fencing actions against it
     * before proceeding with failover.
     * 
     * @param oldActiveData the application data provided by the prior active
     */
    void fenceOldActive(byte[] oldActiveData);
  }
134
  /**
   * Name of the lock znode used by the library. Protected for access in test
   * classes.
   */
  @VisibleForTesting
  protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
  /**
   * Name of the "breadcrumb" znode, which lives under the same parent znode
   * as the lock and is written when this elector becomes active. Protected
   * for access in test classes.
   */
  @VisibleForTesting
  protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";

  /** Logger used by the elector. */
  public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);

  /**
   * Time, in milliseconds, to sleep before re-joining the election after a
   * failed attempt to become active.
   */
  private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;
147
  /**
   * The elector's view of its ZooKeeper connection. CONNECTED and
   * DISCONNECTED are recorded when the corresponding watch events arrive;
   * TERMINATED is the initial value and is restored whenever the elector
   * closes its own connection.
   */
  private static enum ConnectionState {
    DISCONNECTED, CONNECTED, TERMINATED
  };
151
  /**
   * Election role of this elector. INIT is the initial value and the value
   * restored by reset(); ACTIVE is recorded once the application's
   * becomeActive() callback has returned successfully.
   * NOTE(review): STANDBY and NEUTRAL are presumably recorded by
   * becomeStandby() and enterNeutralMode(), which are not visible in this
   * part of the file -- confirm.
   */
  static enum State {
    INIT, ACTIVE, STANDBY, NEUTRAL
  };
155
  /** Current election role; starts as INIT and is set back to INIT by reset(). */
  private State state = State.INIT;
  /** Number of retries of the lock znode creation; zeroed by joinElectionInternal(). */
  private int createRetryCount = 0;
  /** Number of retries of the lock znode stat; zeroed by monitorActiveStatus(). */
  private int statRetryCount = 0;
  /** Current ZooKeeper client, or null while no connection is held. */
  private ZooKeeper zkClient;
  /** Watcher handed to the current client; replaced by getNewZooKeeper(). */
  private WatcherWithClientRef watcher;
  /** Last known connection state; TERMINATED until a connection event says otherwise. */
  private ConnectionState zkConnectionState = ConnectionState.TERMINATED;

  /** Application callbacks supplied to the constructor. */
  private final ActiveStandbyElectorCallback appClient;
  /** ZooKeeper connect string passed to the ZooKeeper client constructor. */
  private final String zkHostPort;
  /** Session timeout passed to the client; also used as the connect wait time. */
  private final int zkSessionTimeout;
  /** ACLs applied to the znodes this elector creates. */
  private final List<ACL> zkAcl;
  /** Credentials added to every new ZooKeeper client via addAuthInfo(). */
  private final List<ZKAuthInfo> zkAuthInfo;
  /** Private copy of the data passed to joinElection(); checked for null before joining. */
  private byte[] appData;
  /** Full path of the lock znode: znodeWorkingDir + "/" + LOCK_FILENAME. */
  private final String zkLockFilePath;
  /** Full path of the breadcrumb znode: znodeWorkingDir + "/" + BREADCRUMB_FILENAME. */
  private final String zkBreadCrumbPath;
  /** Parent znode under which the lock and breadcrumb znodes live. */
  private final String znodeWorkingDir;
  /** Upper bound for create/stat retries and for connection attempts. */
  private final int maxRetryNum;

  /** Held by reJoinElection() while it runs; tests take it to block session re-establishment. */
  private Lock sessionReestablishLockForTests = new ReentrantLock();
  /** Set by joinElectionInternal(); cleared by terminateConnection() and quitElection(). */
  private boolean wantToBeInElection;
176  
177  /**
178   * Create a new ActiveStandbyElector object <br/>
179   * The elector is created by providing to it the Zookeeper configuration, the
180   * parent znode under which to create the znode and a reference to the
181   * callback interface. <br/>
182   * The parent znode name must be the same for all service instances and
183   * different across services. <br/>
184   * After the leader has been lost, a new leader will be elected after the
185   * session timeout expires. Hence, the app must set this parameter based on
186   * its needs for failure response time. The session timeout must be greater
187   * than the Zookeeper disconnect timeout and is recommended to be 3X that
188   * value to enable Zookeeper to retry transient disconnections. Setting a very
189   * short session timeout may result in frequent transitions between active and
190   * standby states during issues like network outages/GS pauses.
191   * 
192   * @param zookeeperHostPorts
193   *          ZooKeeper hostPort for all ZooKeeper servers
194   * @param zookeeperSessionTimeout
195   *          ZooKeeper session timeout
196   * @param parentZnodeName
197   *          znode under which to create the lock
198   * @param acl
199   *          ZooKeeper ACL's
200   * @param authInfo a list of authentication credentials to add to the
201   *                 ZK connection
202   * @param app
203   *          reference to callback interface object
204   * @throws IOException
205   * @throws HadoopIllegalArgumentException
206   */
207  public ActiveStandbyElector(String zookeeperHostPorts,
208      int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
209      List<ZKAuthInfo> authInfo,
210      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
211      HadoopIllegalArgumentException, KeeperException {
212    if (app == null || acl == null || parentZnodeName == null
213        || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
214      throw new HadoopIllegalArgumentException("Invalid argument");
215    }
216    zkHostPort = zookeeperHostPorts;
217    zkSessionTimeout = zookeeperSessionTimeout;
218    zkAcl = acl;
219    zkAuthInfo = authInfo;
220    appClient = app;
221    znodeWorkingDir = parentZnodeName;
222    zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
223    zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
224    this.maxRetryNum = maxRetryNum;
225
226    // createConnection for future API calls
227    createConnection();
228  }
229
230  /**
231   * To participate in election, the app will call joinElection. The result will
232   * be notified by a callback on either the becomeActive or becomeStandby app
233   * interfaces. <br/>
234   * After this the elector will automatically monitor the leader status and
235   * perform re-election if necessary<br/>
236   * The app could potentially start off in standby mode and ignore the
237   * becomeStandby call.
238   * 
239   * @param data
240   *          to be set by the app. non-null data must be set.
241   * @throws HadoopIllegalArgumentException
242   *           if valid data is not supplied
243   */
244  public synchronized void joinElection(byte[] data)
245      throws HadoopIllegalArgumentException {
246    
247    if (data == null) {
248      throw new HadoopIllegalArgumentException("data cannot be null");
249    }
250    
251    if (wantToBeInElection) {
252      LOG.info("Already in election. Not re-connecting.");
253      return;
254    }
255
256    appData = new byte[data.length];
257    System.arraycopy(data, 0, appData, 0, data.length);
258
259    LOG.debug("Attempting active election for " + this);
260    joinElectionInternal();
261  }
262  
263  /**
264   * @return true if the configured parent znode exists
265   */
266  public synchronized boolean parentZNodeExists()
267      throws IOException, InterruptedException {
268    Preconditions.checkState(zkClient != null);
269    try {
270      return zkClient.exists(znodeWorkingDir, false) != null;
271    } catch (KeeperException e) {
272      throw new IOException("Couldn't determine existence of znode '" +
273          znodeWorkingDir + "'", e);
274    }
275  }
276
277  /**
278   * Utility function to ensure that the configured base znode exists.
279   * This recursively creates the znode as well as all of its parents.
280   */
281  public synchronized void ensureParentZNode()
282      throws IOException, InterruptedException {
283    Preconditions.checkState(!wantToBeInElection,
284        "ensureParentZNode() may not be called while in the election");
285
286    String pathParts[] = znodeWorkingDir.split("/");
287    Preconditions.checkArgument(pathParts.length >= 1 &&
288        pathParts[0].isEmpty(),
289        "Invalid path: %s", znodeWorkingDir);
290    
291    StringBuilder sb = new StringBuilder();
292    for (int i = 1; i < pathParts.length; i++) {
293      sb.append("/").append(pathParts[i]);
294      String prefixPath = sb.toString();
295      LOG.debug("Ensuring existence of " + prefixPath);
296      try {
297        createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
298      } catch (KeeperException e) {
299        if (isNodeExists(e.code())) {
300          // This is OK - just ensuring existence.
301          continue;
302        } else {
303          throw new IOException("Couldn't create " + prefixPath, e);
304        }
305      }
306    }
307    
308    LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
309  }
310  
311  /**
312   * Clear all of the state held within the parent ZNode.
313   * This recursively deletes everything within the znode as well as the
314   * parent znode itself. It should only be used when it's certain that
315   * no electors are currently participating in the election.
316   */
317  public synchronized void clearParentZNode()
318      throws IOException, InterruptedException {
319    Preconditions.checkState(!wantToBeInElection,
320        "clearParentZNode() may not be called while in the election");
321
322    try {
323      LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
324
325      zkDoWithRetries(new ZKAction<Void>() {
326        @Override
327        public Void run() throws KeeperException, InterruptedException {
328          ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
329          return null;
330        }
331      });
332    } catch (KeeperException e) {
333      throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
334          e);
335    }
336    LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
337  }
338
339
340  /**
341   * Any service instance can drop out of the election by calling quitElection. 
342   * <br/>
343   * This will lose any leader status, if held, and stop monitoring of the lock
344   * node. <br/>
345   * If the instance wants to participate in election again, then it needs to
346   * call joinElection(). <br/>
347   * This allows service instances to take themselves out of rotation for known
348   * impending unavailable states (e.g. long GC pause or software upgrade).
349   * 
350   * @param needFence true if the underlying daemon may need to be fenced
351   * if a failover occurs due to dropping out of the election.
352   */
353  public synchronized void quitElection(boolean needFence) {
354    LOG.info("Yielding from election");
355    if (!needFence && state == State.ACTIVE) {
356      // If active is gracefully going back to standby mode, remove
357      // our permanent znode so no one fences us.
358      tryDeleteOwnBreadCrumbNode();
359    }
360    reset();
361    wantToBeInElection = false;
362  }
363
  /**
   * Exception thrown when there is no active leader. It is raised by
   * getActiveData() when ZooKeeper reports that the lock znode does not
   * exist.
   */
  public static class ActiveNotFoundException extends Exception {
    private static final long serialVersionUID = 3505396722342846462L;
  }
370
371  /**
372   * get data set by the active leader
373   * 
374   * @return data set by the active instance
375   * @throws ActiveNotFoundException
376   *           when there is no active leader
377   * @throws KeeperException
378   *           other zookeeper operation errors
379   * @throws InterruptedException
380   * @throws IOException
381   *           when ZooKeeper connection could not be established
382   */
383  public synchronized byte[] getActiveData() throws ActiveNotFoundException,
384      KeeperException, InterruptedException, IOException {
385    try {
386      if (zkClient == null) {
387        createConnection();
388      }
389      Stat stat = new Stat();
390      return getDataWithRetries(zkLockFilePath, false, stat);
391    } catch(KeeperException e) {
392      Code code = e.code();
393      if (isNodeDoesNotExist(code)) {
394        // handle the commonly expected cases that make sense for us
395        throw new ActiveNotFoundException();
396      } else {
397        throw e;
398      }
399    }
400  }
401
402  /**
403   * interface implementation of Zookeeper callback for create
404   */
405  @Override
406  public synchronized void processResult(int rc, String path, Object ctx,
407      String name) {
408    if (isStaleClient(ctx)) return;
409    LOG.debug("CreateNode result: " + rc + " for path: " + path
410        + " connectionState: " + zkConnectionState +
411        "  for " + this);
412
413    Code code = Code.get(rc);
414    if (isSuccess(code)) {
415      // we successfully created the znode. we are the leader. start monitoring
416      if (becomeActive()) {
417        monitorActiveStatus();
418      } else {
419        reJoinElectionAfterFailureToBecomeActive();
420      }
421      return;
422    }
423
424    if (isNodeExists(code)) {
425      if (createRetryCount == 0) {
426        // znode exists and we did not retry the operation. so a different
427        // instance has created it. become standby and monitor lock.
428        becomeStandby();
429      }
430      // if we had retried then the znode could have been created by our first
431      // attempt to the server (that we lost) and this node exists response is
432      // for the second attempt. verify this case via ephemeral node owner. this
433      // will happen on the callback for monitoring the lock.
434      monitorActiveStatus();
435      return;
436    }
437
438    String errorMessage = "Received create error from Zookeeper. code:"
439        + code.toString() + " for path " + path;
440    LOG.debug(errorMessage);
441
442    if (shouldRetry(code)) {
443      if (createRetryCount < maxRetryNum) {
444        LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
445        ++createRetryCount;
446        createLockNodeAsync();
447        return;
448      }
449      errorMessage = errorMessage
450          + ". Not retrying further znode create connection errors.";
451    } else if (isSessionExpired(code)) {
452      // This isn't fatal - the client Watcher will re-join the election
453      LOG.warn("Lock acquisition failed because session was lost");
454      return;
455    }
456
457    fatalError(errorMessage);
458  }
459
460  /**
461   * interface implementation of Zookeeper callback for monitor (exists)
462   */
463  @Override
464  public synchronized void processResult(int rc, String path, Object ctx,
465      Stat stat) {
466    if (isStaleClient(ctx)) return;
467    
468    assert wantToBeInElection :
469        "Got a StatNode result after quitting election";
470    
471    LOG.debug("StatNode result: " + rc + " for path: " + path
472        + " connectionState: " + zkConnectionState + " for " + this);
473        
474
475    Code code = Code.get(rc);
476    if (isSuccess(code)) {
477      // the following owner check completes verification in case the lock znode
478      // creation was retried
479      if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
480        // we own the lock znode. so we are the leader
481        if (!becomeActive()) {
482          reJoinElectionAfterFailureToBecomeActive();
483        }
484      } else {
485        // we dont own the lock znode. so we are a standby.
486        becomeStandby();
487      }
488      // the watch set by us will notify about changes
489      return;
490    }
491
492    if (isNodeDoesNotExist(code)) {
493      // the lock znode disappeared before we started monitoring it
494      enterNeutralMode();
495      joinElectionInternal();
496      return;
497    }
498
499    String errorMessage = "Received stat error from Zookeeper. code:"
500        + code.toString();
501    LOG.debug(errorMessage);
502
503    if (shouldRetry(code)) {
504      if (statRetryCount < maxRetryNum) {
505        ++statRetryCount;
506        monitorLockNodeAsync();
507        return;
508      }
509      errorMessage = errorMessage
510          + ". Not retrying further znode monitoring connection errors.";
511    } else if (isSessionExpired(code)) {
512      // This isn't fatal - the client Watcher will re-join the election
513      LOG.warn("Lock monitoring failed because session was lost");
514      return;
515    }
516
517    fatalError(errorMessage);
518  }
519
520  /**
521   * We failed to become active. Re-join the election, but
522   * sleep for a few seconds after terminating our existing
523   * session, so that other nodes have a chance to become active.
524   * The failure to become active is already logged inside
525   * becomeActive().
526   */
527  private void reJoinElectionAfterFailureToBecomeActive() {
528    reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
529  }
530
  /**
   * interface implementation of Zookeeper watch events (connection and node),
   * proxied by {@link WatcherWithClientRef}. <br/>
   * Events with type None describe a change of the connection state; all
   * other events with a non-null path are treated as changes of the lock
   * znode. Anything else is reported as a fatal error.
   *
   * @param zk the client the event belongs to; events for which
   *          isStaleClient(zk) returns true are dropped
   * @param event the watch event to handle
   */
  synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
    Event.EventType eventType = event.getType();
    if (isStaleClient(zk)) return;
    LOG.debug("Watcher event type: " + eventType + " with state:"
        + event.getState() + " for path:" + event.getPath()
        + " connectionState: " + zkConnectionState
        + " for " + this);

    if (eventType == Event.EventType.None) {
      // the connection state has changed
      switch (event.getState()) {
      case SyncConnected:
        LOG.info("Session connected.");
        // if we are coming back from a disconnect while still wanting to be
        // in the election, the leader status has to be verified again; the
        // previous state is captured before it is overwritten
        ConnectionState prevConnectionState = zkConnectionState;
        zkConnectionState = ConnectionState.CONNECTED;
        if (prevConnectionState == ConnectionState.DISCONNECTED &&
            wantToBeInElection) {
          monitorActiveStatus();
        }
        break;
      case Disconnected:
        LOG.info("Session disconnected. Entering neutral mode...");

        // ask the app to move to safe state because zookeeper connection
        // is not active and we dont know our state
        zkConnectionState = ConnectionState.DISCONNECTED;
        enterNeutralMode();
        break;
      case Expired:
        // the connection got terminated because of session timeout.
        // reJoinElection(0) terminates the old connection and rejoins
        // without sleeping (sleepFor ignores non-positive durations)
        LOG.info("Session expired. Entering neutral mode and rejoining...");
        enterNeutralMode();
        reJoinElection(0);
        break;
      case SaslAuthenticated:
        // informational only; no state change
        LOG.info("Successfully authenticated to ZooKeeper using SASL.");
        break;
      default:
        fatalError("Unexpected Zookeeper watch event state: "
            + event.getState());
        break;
      }

      return;
    }

    // a watch on lock path in zookeeper has fired. so something has changed on
    // the lock. ideally we should check that the path is the same as the lock
    // path but trusting zookeeper for now
    String path = event.getPath();
    if (path != null) {
      switch (eventType) {
      case NodeDeleted:
        // the lock is free again: an active instance no longer knows its
        // status, and every instance tries to take the lock
        if (state == State.ACTIVE) {
          enterNeutralMode();
        }
        joinElectionInternal();
        break;
      case NodeDataChanged:
        // re-stat the lock znode to find out who holds it now
        monitorActiveStatus();
        break;
      default:
        LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
        monitorActiveStatus();
      }

      return;
    }

    // some unexpected error has occurred
    fatalError("Unexpected watch error from Zookeeper");
  }
610
611  /**
612   * Get a new zookeeper client instance. protected so that test class can
613   * inherit and pass in a mock object for zookeeper
614   * 
615   * @return new zookeeper client instance
616   * @throws IOException
617   * @throws KeeperException zookeeper connectionloss exception
618   */
619  protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
620      KeeperException {
621    
622    // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
623    // may trigger the Connected event immediately. So, if we register the
624    // watcher after constructing ZooKeeper, we may miss that event. Instead,
625    // we construct the watcher first, and have it block any events it receives
626    // before we can set its ZooKeeper reference.
627    watcher = new WatcherWithClientRef();
628    ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
629    watcher.setZooKeeperRef(zk);
630
631    // Wait for the asynchronous success/failure. This may throw an exception
632    // if we don't connect within the session timeout.
633    watcher.waitForZKConnectionEvent(zkSessionTimeout);
634    
635    for (ZKAuthInfo auth : zkAuthInfo) {
636      zk.addAuthInfo(auth.getScheme(), auth.getAuth());
637    }
638    return zk;
639  }
640
641  private void fatalError(String errorMessage) {
642    LOG.fatal(errorMessage);
643    reset();
644    appClient.notifyFatalError(errorMessage);
645  }
646
647  private void monitorActiveStatus() {
648    assert wantToBeInElection;
649    LOG.debug("Monitoring active leader for " + this);
650    statRetryCount = 0;
651    monitorLockNodeAsync();
652  }
653
654  private void joinElectionInternal() {
655    Preconditions.checkState(appData != null,
656        "trying to join election without any app data");
657    if (zkClient == null) {
658      if (!reEstablishSession()) {
659        fatalError("Failed to reEstablish connection with ZooKeeper");
660        return;
661      }
662    }
663
664    createRetryCount = 0;
665    wantToBeInElection = true;
666    createLockNodeAsync();
667  }
668
669  private void reJoinElection(int sleepTime) {
670    LOG.info("Trying to re-establish ZK session");
671    
672    // Some of the test cases rely on expiring the ZK sessions and
673    // ensuring that the other node takes over. But, there's a race
674    // where the original lease holder could reconnect faster than the other
675    // thread manages to take the lock itself. This lock allows the
676    // tests to block the reconnection. It's a shame that this leaked
677    // into non-test code, but the lock is only acquired here so will never
678    // be contended.
679    sessionReestablishLockForTests.lock();
680    try {
681      terminateConnection();
682      sleepFor(sleepTime);
683      // Should not join election even before the SERVICE is reported
684      // as HEALTHY from ZKFC monitoring.
685      if (appData != null) {
686        joinElectionInternal();
687      } else {
688        LOG.info("Not joining election since service has not yet been " +
689            "reported as healthy.");
690      }
691    } finally {
692      sessionReestablishLockForTests.unlock();
693    }
694  }
695
696  /**
697   * Sleep for the given number of milliseconds.
698   * This is non-static, and separated out, so that unit tests
699   * can override the behavior not to sleep.
700   */
701  @VisibleForTesting
702  protected void sleepFor(int sleepMs) {
703    if (sleepMs > 0) {
704      try {
705        Thread.sleep(sleepMs);
706      } catch (InterruptedException e) {
707        Thread.currentThread().interrupt();
708      }
709    }
710  }
711
712  @VisibleForTesting
713  void preventSessionReestablishmentForTests() {
714    sessionReestablishLockForTests.lock();
715  }
716  
717  @VisibleForTesting
718  void allowSessionReestablishmentForTests() {
719    sessionReestablishLockForTests.unlock();
720  }
721  
722  @VisibleForTesting
723  synchronized long getZKSessionIdForTests() {
724    if (zkClient != null) {
725      return zkClient.getSessionId();
726    } else {
727      return -1;
728    }
729  }
730  
731  @VisibleForTesting
732  synchronized State getStateForTests() {
733    return state;
734  }
735
736  private boolean reEstablishSession() {
737    int connectionRetryCount = 0;
738    boolean success = false;
739    while(!success && connectionRetryCount < maxRetryNum) {
740      LOG.debug("Establishing zookeeper connection for " + this);
741      try {
742        createConnection();
743        success = true;
744      } catch(IOException e) {
745        LOG.warn(e);
746        sleepFor(5000);
747      } catch(KeeperException e) {
748        LOG.warn(e);
749        sleepFor(5000);
750      }
751      ++connectionRetryCount;
752    }
753    return success;
754  }
755
756  private void createConnection() throws IOException, KeeperException {
757    if (zkClient != null) {
758      try {
759        zkClient.close();
760      } catch (InterruptedException e) {
761        throw new IOException("Interrupted while closing ZK",
762            e);
763      }
764      zkClient = null;
765      watcher = null;
766    }
767    zkClient = getNewZooKeeper();
768    LOG.debug("Created new connection for " + this);
769  }
770
771  @InterfaceAudience.Private
772  public synchronized void terminateConnection() {
773    if (zkClient == null) {
774      return;
775    }
776    LOG.debug("Terminating ZK connection for " + this);
777    ZooKeeper tempZk = zkClient;
778    zkClient = null;
779    watcher = null;
780    try {
781      tempZk.close();
782    } catch(InterruptedException e) {
783      LOG.warn(e);
784    }
785    zkConnectionState = ConnectionState.TERMINATED;
786    wantToBeInElection = false;
787  }
788
789  private void reset() {
790    state = State.INIT;
791    terminateConnection();
792  }
793
794  private boolean becomeActive() {
795    assert wantToBeInElection;
796    if (state == State.ACTIVE) {
797      // already active
798      return true;
799    }
800    try {
801      Stat oldBreadcrumbStat = fenceOldActive();
802      writeBreadCrumbNode(oldBreadcrumbStat);
803      
804      LOG.debug("Becoming active for " + this);
805      appClient.becomeActive();
806      state = State.ACTIVE;
807      return true;
808    } catch (Exception e) {
809      LOG.warn("Exception handling the winning of election", e);
810      // Caller will handle quitting and rejoining the election.
811      return false;
812    }
813  }
814
815  /**
816   * Write the "ActiveBreadCrumb" node, indicating that this node may need
817   * to be fenced on failover.
818   * @param oldBreadcrumbStat 
819   */
820  private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
821      throws KeeperException, InterruptedException {
822    Preconditions.checkState(appData != null, "no appdata");
823    
824    LOG.info("Writing znode " + zkBreadCrumbPath +
825        " to indicate that the local node is the most recent active...");
826    if (oldBreadcrumbStat == null) {
827      // No previous active, just create the node
828      createWithRetries(zkBreadCrumbPath, appData, zkAcl,
829        CreateMode.PERSISTENT);
830    } else {
831      // There was a previous active, update the node
832      setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
833    }
834  }
835  
836  /**
837   * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
838   * active status.
839   * If this fails, it will simply warn, since the graceful release behavior
840   * is only an optimization.
841   */
842  private void tryDeleteOwnBreadCrumbNode() {
843    assert state == State.ACTIVE;
844    LOG.info("Deleting bread-crumb of active node...");
845    
846    // Sanity check the data. This shouldn't be strictly necessary,
847    // but better to play it safe.
848    Stat stat = new Stat();
849    byte[] data = null;
850    try {
851      data = zkClient.getData(zkBreadCrumbPath, false, stat);
852
853      if (!Arrays.equals(data, appData)) {
854        throw new IllegalStateException(
855            "We thought we were active, but in fact " +
856            "the active znode had the wrong data: " +
857            StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
858      }
859      
860      deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
861    } catch (Exception e) {
862      LOG.warn("Unable to delete our own bread-crumb of being active at " +
863          zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
864          "Expecting to be fenced by the next active.");
865    }
866  }
867
868  /**
869   * If there is a breadcrumb node indicating that another node may need
870   * fencing, try to fence that node.
871   * @return the Stat of the breadcrumb node that was read, or null
872   * if no breadcrumb node existed
873   */
874  private Stat fenceOldActive() throws InterruptedException, KeeperException {
875    final Stat stat = new Stat();
876    byte[] data;
877    LOG.info("Checking for any old active which needs to be fenced...");
878    try {
879      data = zkDoWithRetries(new ZKAction<byte[]>() {
880        @Override
881        public byte[] run() throws KeeperException, InterruptedException {
882          return zkClient.getData(zkBreadCrumbPath, false, stat);
883        }
884      });
885    } catch (KeeperException ke) {
886      if (isNodeDoesNotExist(ke.code())) {
887        LOG.info("No old node to fence");
888        return null;
889      }
890      
891      // If we failed to read for any other reason, then likely we lost
892      // our session, or we don't have permissions, etc. In any case,
893      // we probably shouldn't become active, and failing the whole
894      // thing is the best bet.
895      throw ke;
896    }
897
898    LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
899    if (Arrays.equals(data, appData)) {
900      LOG.info("But old node has our own data, so don't need to fence it.");
901    } else {
902      appClient.fenceOldActive(data);
903    }
904    return stat;
905  }
906
907  private void becomeStandby() {
908    if (state != State.STANDBY) {
909      LOG.debug("Becoming standby for " + this);
910      state = State.STANDBY;
911      appClient.becomeStandby();
912    }
913  }
914
915  private void enterNeutralMode() {
916    if (state != State.NEUTRAL) {
917      LOG.debug("Entering neutral mode for " + this);
918      state = State.NEUTRAL;
919      appClient.enterNeutralMode();
920    }
921  }
922
923  private void createLockNodeAsync() {
924    zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
925        this, zkClient);
926  }
927
928  private void monitorLockNodeAsync() {
929    zkClient.exists(zkLockFilePath, 
930        watcher, this,
931        zkClient);
932  }
933
934  private String createWithRetries(final String path, final byte[] data,
935      final List<ACL> acl, final CreateMode mode)
936      throws InterruptedException, KeeperException {
937    return zkDoWithRetries(new ZKAction<String>() {
938      @Override
939      public String run() throws KeeperException, InterruptedException {
940        return zkClient.create(path, data, acl, mode);
941      }
942    });
943  }
944
945  private byte[] getDataWithRetries(final String path, final boolean watch,
946      final Stat stat) throws InterruptedException, KeeperException {
947    return zkDoWithRetries(new ZKAction<byte[]>() {
948      @Override
949      public byte[] run() throws KeeperException, InterruptedException {
950        return zkClient.getData(path, watch, stat);
951      }
952    });
953  }
954
955  private Stat setDataWithRetries(final String path, final byte[] data,
956      final int version) throws InterruptedException, KeeperException {
957    return zkDoWithRetries(new ZKAction<Stat>() {
958      @Override
959      public Stat run() throws KeeperException, InterruptedException {
960        return zkClient.setData(path, data, version);
961      }
962    });
963  }
964  
965  private void deleteWithRetries(final String path, final int version)
966      throws KeeperException, InterruptedException {
967    zkDoWithRetries(new ZKAction<Void>() {
968      @Override
969      public Void run() throws KeeperException, InterruptedException {
970        zkClient.delete(path, version);
971        return null;
972      }
973    });
974  }
975
976  private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
977      InterruptedException {
978    int retry = 0;
979    while (true) {
980      try {
981        return action.run();
982      } catch (KeeperException ke) {
983        if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
984          continue;
985        }
986        throw ke;
987      }
988    }
989  }
990
991  private interface ZKAction<T> {
992    T run() throws KeeperException, InterruptedException; 
993  }
994  
995  /**
996   * The callbacks and watchers pass a reference to the ZK client
997   * which made the original call. We don't want to take action
998   * based on any callbacks from prior clients after we quit
999   * the election.
1000   * @param ctx the ZK client passed into the watcher
1001   * @return true if it matches the current client
1002   */
1003  private synchronized boolean isStaleClient(Object ctx) {
1004    Preconditions.checkNotNull(ctx);
1005    if (zkClient != (ZooKeeper)ctx) {
1006      LOG.warn("Ignoring stale result from old client with sessionId " +
1007          String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
1008      return true;
1009    }
1010    return false;
1011  }
1012
1013  /**
1014   * Watcher implementation which keeps a reference around to the
1015   * original ZK connection, and passes it back along with any
1016   * events.
1017   */
1018  private final class WatcherWithClientRef implements Watcher {
1019    private ZooKeeper zk;
1020    
1021    /**
1022     * Latch fired whenever any event arrives. This is used in order
1023     * to wait for the Connected event when the client is first created.
1024     */
1025    private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
1026
1027    /**
1028     * Latch used to wait until the reference to ZooKeeper is set.
1029     */
1030    private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
1031
1032    /**
1033     * Waits for the next event from ZooKeeper to arrive.
1034     * 
1035     * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
1036     * @throws KeeperException if the connection attempt times out. This will
1037     * be a ZooKeeper ConnectionLoss exception code.
1038     * @throws IOException if interrupted while connecting to ZooKeeper
1039     */
1040    private void waitForZKConnectionEvent(int connectionTimeoutMs)
1041        throws KeeperException, IOException {
1042      try {
1043        if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
1044          LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
1045              + connectionTimeoutMs + " milliseconds");
1046          zk.close();
1047          throw KeeperException.create(Code.CONNECTIONLOSS);
1048        }
1049      } catch (InterruptedException e) {
1050        Thread.currentThread().interrupt();
1051        throw new IOException(
1052            "Interrupted when connecting to zookeeper server", e);
1053      }
1054    }
1055
1056    private void setZooKeeperRef(ZooKeeper zk) {
1057      Preconditions.checkState(this.zk == null,
1058          "zk already set -- must be set exactly once");
1059      this.zk = zk;
1060      hasSetZooKeeper.countDown();
1061    }
1062
1063    @Override
1064    public void process(WatchedEvent event) {
1065      hasReceivedEvent.countDown();
1066      try {
1067        hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS);
1068        ActiveStandbyElector.this.processWatchEvent(
1069            zk, event);
1070      } catch (Throwable t) {
1071        fatalError(
1072            "Failed to process watcher event " + event + ": " +
1073            StringUtils.stringifyException(t));
1074      }
1075    }
1076  }
1077
1078  private static boolean isSuccess(Code code) {
1079    return (code == Code.OK);
1080  }
1081
1082  private static boolean isNodeExists(Code code) {
1083    return (code == Code.NODEEXISTS);
1084  }
1085
1086  private static boolean isNodeDoesNotExist(Code code) {
1087    return (code == Code.NONODE);
1088  }
1089  
1090  private static boolean isSessionExpired(Code code) {
1091    return (code == Code.SESSIONEXPIRED);
1092  }
1093
1094  private static boolean shouldRetry(Code code) {
1095    return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
1096  }
1097  
1098  @Override
1099  public String toString() {
1100    return "elector id=" + System.identityHashCode(this) +
1101      " appData=" +
1102      ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 
1103      " cb=" + appClient;
1104  }
1105
1106  public String getHAZookeeperConnectionState() {
1107    return this.zkConnectionState.name();
1108  }
1109}