001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.ha; 020 021import java.io.IOException; 022import java.util.Arrays; 023import java.util.List; 024import java.util.concurrent.CountDownLatch; 025import java.util.concurrent.TimeUnit; 026import java.util.concurrent.locks.Lock; 027import java.util.concurrent.locks.ReentrantLock; 028 029import org.apache.commons.logging.Log; 030import org.apache.commons.logging.LogFactory; 031import org.apache.hadoop.HadoopIllegalArgumentException; 032import org.apache.hadoop.classification.InterfaceAudience; 033import org.apache.hadoop.classification.InterfaceStability; 034import org.apache.hadoop.util.ZKUtil.ZKAuthInfo; 035import org.apache.hadoop.util.StringUtils; 036import org.apache.zookeeper.data.ACL; 037import org.apache.zookeeper.KeeperException; 038import org.apache.zookeeper.Watcher; 039import org.apache.zookeeper.WatchedEvent; 040import org.apache.zookeeper.Watcher.Event; 041import org.apache.zookeeper.ZKUtil; 042import org.apache.zookeeper.ZooKeeper; 043import org.apache.zookeeper.CreateMode; 044import org.apache.zookeeper.AsyncCallback.*; 045import org.apache.zookeeper.data.Stat; 046import org.apache.zookeeper.KeeperException.Code; 047 048import com.google.common.annotations.VisibleForTesting; 049import com.google.common.base.Preconditions; 050 051/** 052 * 053 * This class implements a simple library to perform leader election on top of 054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election 055 * can be performed by atomically creating an ephemeral lock file (znode) on 056 * Zookeeper. The service instance that successfully creates the znode becomes 057 * active and the rest become standbys. <br/> 058 * This election mechanism is only efficient for small number of election 059 * candidates (order of 10's) because contention on single znode by a large 060 * number of candidates can result in Zookeeper overload. <br/> 061 * The elector does not guarantee fencing (protection of shared resources) among 062 * service instances. After it has notified an instance about becoming a leader, 063 * then that instance must ensure that it meets the service consistency 064 * requirements. If it cannot do so, then it is recommended to quit the 065 * election. The application implements the {@link ActiveStandbyElectorCallback} 066 * to interact with the elector 067 */ 068@InterfaceAudience.Private 069@InterfaceStability.Evolving 070public class ActiveStandbyElector implements StatCallback, StringCallback { 071 072 /** 073 * Callback interface to interact with the ActiveStandbyElector object. <br/> 074 * The application will be notified with a callback only on state changes 075 * (i.e. there will never be successive calls to becomeActive without an 076 * intermediate call to enterNeutralMode). <br/> 077 * The callbacks will be running on Zookeeper client library threads. The 078 * application should return from these callbacks quickly so as not to impede 079 * Zookeeper client library performance and notifications. The app will 080 * typically remember the state change and return from the callback. It will 081 * then proceed with implementing actions around that state change. It is 082 * possible to be called back again while these actions are in flight and the 083 * app should handle this scenario. 084 */ 085 public interface ActiveStandbyElectorCallback { 086 /** 087 * This method is called when the app becomes the active leader. 088 * If the service fails to become active, it should throw 089 * ServiceFailedException. This will cause the elector to 090 * sleep for a short period, then re-join the election. 091 * 092 * Callback implementations are expected to manage their own 093 * timeouts (e.g. when making an RPC to a remote node). 094 */ 095 void becomeActive() throws ServiceFailedException; 096 097 /** 098 * This method is called when the app becomes a standby 099 */ 100 void becomeStandby(); 101 102 /** 103 * If the elector gets disconnected from Zookeeper and does not know about 104 * the lock state, then it will notify the service via the enterNeutralMode 105 * interface. The service may choose to ignore this or stop doing state 106 * changing operations. Upon reconnection, the elector verifies the leader 107 * status and calls back on the becomeActive and becomeStandby app 108 * interfaces. <br/> 109 * Zookeeper disconnects can happen due to network issues or loss of 110 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against 111 * split-brain issues. In such situations it might be prudent to call 112 * becomeStandby too. However, such state change operations might be 113 * expensive and enterNeutralMode can help guard against doing that for 114 * transient issues. 115 */ 116 void enterNeutralMode(); 117 118 /** 119 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper 120 * errors or Zookeeper persistent unavailability) then notifyFatalError is 121 * called to notify the app about it. 122 */ 123 void notifyFatalError(String errorMessage); 124 125 /** 126 * If an old active has failed, rather than exited gracefully, then 127 * the new active may need to take some fencing actions against it 128 * before proceeding with failover. 129 * 130 * @param oldActiveData the application data provided by the prior active 131 */ 132 void fenceOldActive(byte[] oldActiveData); 133 } 134 135 /** 136 * Name of the lock znode used by the library. Protected for access in test 137 * classes 138 */ 139 @VisibleForTesting 140 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock"; 141 @VisibleForTesting 142 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb"; 143 144 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class); 145 146 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000; 147 148 private static enum ConnectionState { 149 DISCONNECTED, CONNECTED, TERMINATED 150 }; 151 152 static enum State { 153 INIT, ACTIVE, STANDBY, NEUTRAL 154 }; 155 156 private State state = State.INIT; 157 private int createRetryCount = 0; 158 private int statRetryCount = 0; 159 private ZooKeeper zkClient; 160 private WatcherWithClientRef watcher; 161 private ConnectionState zkConnectionState = ConnectionState.TERMINATED; 162 163 private final ActiveStandbyElectorCallback appClient; 164 private final String zkHostPort; 165 private final int zkSessionTimeout; 166 private final List<ACL> zkAcl; 167 private final List<ZKAuthInfo> zkAuthInfo; 168 private byte[] appData; 169 private final String zkLockFilePath; 170 private final String zkBreadCrumbPath; 171 private final String znodeWorkingDir; 172 private final int maxRetryNum; 173 174 private Lock sessionReestablishLockForTests = new ReentrantLock(); 175 private boolean wantToBeInElection; 176 177 /** 178 * Create a new ActiveStandbyElector object <br/> 179 * The elector is created by providing to it the Zookeeper configuration, the 180 * parent znode under which to create the znode and a reference to the 181 * callback interface. <br/> 182 * The parent znode name must be the same for all service instances and 183 * different across services. <br/> 184 * After the leader has been lost, a new leader will be elected after the 185 * session timeout expires. Hence, the app must set this parameter based on 186 * its needs for failure response time. The session timeout must be greater 187 * than the Zookeeper disconnect timeout and is recommended to be 3X that 188 * value to enable Zookeeper to retry transient disconnections. Setting a very 189 * short session timeout may result in frequent transitions between active and 190 * standby states during issues like network outages/GS pauses. 191 * 192 * @param zookeeperHostPorts 193 * ZooKeeper hostPort for all ZooKeeper servers 194 * @param zookeeperSessionTimeout 195 * ZooKeeper session timeout 196 * @param parentZnodeName 197 * znode under which to create the lock 198 * @param acl 199 * ZooKeeper ACL's 200 * @param authInfo a list of authentication credentials to add to the 201 * ZK connection 202 * @param app 203 * reference to callback interface object 204 * @throws IOException 205 * @throws HadoopIllegalArgumentException 206 */ 207 public ActiveStandbyElector(String zookeeperHostPorts, 208 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl, 209 List<ZKAuthInfo> authInfo, 210 ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException, 211 HadoopIllegalArgumentException, KeeperException { 212 if (app == null || acl == null || parentZnodeName == null 213 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { 214 throw new HadoopIllegalArgumentException("Invalid argument"); 215 } 216 zkHostPort = zookeeperHostPorts; 217 zkSessionTimeout = zookeeperSessionTimeout; 218 zkAcl = acl; 219 zkAuthInfo = authInfo; 220 appClient = app; 221 znodeWorkingDir = parentZnodeName; 222 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME; 223 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; 224 this.maxRetryNum = maxRetryNum; 225 226 // createConnection for future API calls 227 createConnection(); 228 } 229 230 /** 231 * To participate in election, the app will call joinElection. The result will 232 * be notified by a callback on either the becomeActive or becomeStandby app 233 * interfaces. <br/> 234 * After this the elector will automatically monitor the leader status and 235 * perform re-election if necessary<br/> 236 * The app could potentially start off in standby mode and ignore the 237 * becomeStandby call. 238 * 239 * @param data 240 * to be set by the app. non-null data must be set. 241 * @throws HadoopIllegalArgumentException 242 * if valid data is not supplied 243 */ 244 public synchronized void joinElection(byte[] data) 245 throws HadoopIllegalArgumentException { 246 247 if (data == null) { 248 throw new HadoopIllegalArgumentException("data cannot be null"); 249 } 250 251 if (wantToBeInElection) { 252 LOG.info("Already in election. Not re-connecting."); 253 return; 254 } 255 256 appData = new byte[data.length]; 257 System.arraycopy(data, 0, appData, 0, data.length); 258 259 LOG.debug("Attempting active election for " + this); 260 joinElectionInternal(); 261 } 262 263 /** 264 * @return true if the configured parent znode exists 265 */ 266 public synchronized boolean parentZNodeExists() 267 throws IOException, InterruptedException { 268 Preconditions.checkState(zkClient != null); 269 try { 270 return zkClient.exists(znodeWorkingDir, false) != null; 271 } catch (KeeperException e) { 272 throw new IOException("Couldn't determine existence of znode '" + 273 znodeWorkingDir + "'", e); 274 } 275 } 276 277 /** 278 * Utility function to ensure that the configured base znode exists. 279 * This recursively creates the znode as well as all of its parents. 280 */ 281 public synchronized void ensureParentZNode() 282 throws IOException, InterruptedException { 283 Preconditions.checkState(!wantToBeInElection, 284 "ensureParentZNode() may not be called while in the election"); 285 286 String pathParts[] = znodeWorkingDir.split("/"); 287 Preconditions.checkArgument(pathParts.length >= 1 && 288 pathParts[0].isEmpty(), 289 "Invalid path: %s", znodeWorkingDir); 290 291 StringBuilder sb = new StringBuilder(); 292 for (int i = 1; i < pathParts.length; i++) { 293 sb.append("/").append(pathParts[i]); 294 String prefixPath = sb.toString(); 295 LOG.debug("Ensuring existence of " + prefixPath); 296 try { 297 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT); 298 } catch (KeeperException e) { 299 if (isNodeExists(e.code())) { 300 // This is OK - just ensuring existence. 301 continue; 302 } else { 303 throw new IOException("Couldn't create " + prefixPath, e); 304 } 305 } 306 } 307 308 LOG.info("Successfully created " + znodeWorkingDir + " in ZK."); 309 } 310 311 /** 312 * Clear all of the state held within the parent ZNode. 313 * This recursively deletes everything within the znode as well as the 314 * parent znode itself. It should only be used when it's certain that 315 * no electors are currently participating in the election. 316 */ 317 public synchronized void clearParentZNode() 318 throws IOException, InterruptedException { 319 Preconditions.checkState(!wantToBeInElection, 320 "clearParentZNode() may not be called while in the election"); 321 322 try { 323 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); 324 325 zkDoWithRetries(new ZKAction<Void>() { 326 @Override 327 public Void run() throws KeeperException, InterruptedException { 328 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir); 329 return null; 330 } 331 }); 332 } catch (KeeperException e) { 333 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir, 334 e); 335 } 336 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK."); 337 } 338 339 340 /** 341 * Any service instance can drop out of the election by calling quitElection. 342 * <br/> 343 * This will lose any leader status, if held, and stop monitoring of the lock 344 * node. <br/> 345 * If the instance wants to participate in election again, then it needs to 346 * call joinElection(). <br/> 347 * This allows service instances to take themselves out of rotation for known 348 * impending unavailable states (e.g. long GC pause or software upgrade). 349 * 350 * @param needFence true if the underlying daemon may need to be fenced 351 * if a failover occurs due to dropping out of the election. 352 */ 353 public synchronized void quitElection(boolean needFence) { 354 LOG.info("Yielding from election"); 355 if (!needFence && state == State.ACTIVE) { 356 // If active is gracefully going back to standby mode, remove 357 // our permanent znode so no one fences us. 358 tryDeleteOwnBreadCrumbNode(); 359 } 360 reset(); 361 wantToBeInElection = false; 362 } 363 364 /** 365 * Exception thrown when there is no active leader 366 */ 367 public static class ActiveNotFoundException extends Exception { 368 private static final long serialVersionUID = 3505396722342846462L; 369 } 370 371 /** 372 * get data set by the active leader 373 * 374 * @return data set by the active instance 375 * @throws ActiveNotFoundException 376 * when there is no active leader 377 * @throws KeeperException 378 * other zookeeper operation errors 379 * @throws InterruptedException 380 * @throws IOException 381 * when ZooKeeper connection could not be established 382 */ 383 public synchronized byte[] getActiveData() throws ActiveNotFoundException, 384 KeeperException, InterruptedException, IOException { 385 try { 386 if (zkClient == null) { 387 createConnection(); 388 } 389 Stat stat = new Stat(); 390 return getDataWithRetries(zkLockFilePath, false, stat); 391 } catch(KeeperException e) { 392 Code code = e.code(); 393 if (isNodeDoesNotExist(code)) { 394 // handle the commonly expected cases that make sense for us 395 throw new ActiveNotFoundException(); 396 } else { 397 throw e; 398 } 399 } 400 } 401 402 /** 403 * interface implementation of Zookeeper callback for create 404 */ 405 @Override 406 public synchronized void processResult(int rc, String path, Object ctx, 407 String name) { 408 if (isStaleClient(ctx)) return; 409 LOG.debug("CreateNode result: " + rc + " for path: " + path 410 + " connectionState: " + zkConnectionState + 411 " for " + this); 412 413 Code code = Code.get(rc); 414 if (isSuccess(code)) { 415 // we successfully created the znode. we are the leader. start monitoring 416 if (becomeActive()) { 417 monitorActiveStatus(); 418 } else { 419 reJoinElectionAfterFailureToBecomeActive(); 420 } 421 return; 422 } 423 424 if (isNodeExists(code)) { 425 if (createRetryCount == 0) { 426 // znode exists and we did not retry the operation. so a different 427 // instance has created it. become standby and monitor lock. 428 becomeStandby(); 429 } 430 // if we had retried then the znode could have been created by our first 431 // attempt to the server (that we lost) and this node exists response is 432 // for the second attempt. verify this case via ephemeral node owner. this 433 // will happen on the callback for monitoring the lock. 434 monitorActiveStatus(); 435 return; 436 } 437 438 String errorMessage = "Received create error from Zookeeper. code:" 439 + code.toString() + " for path " + path; 440 LOG.debug(errorMessage); 441 442 if (shouldRetry(code)) { 443 if (createRetryCount < maxRetryNum) { 444 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); 445 ++createRetryCount; 446 createLockNodeAsync(); 447 return; 448 } 449 errorMessage = errorMessage 450 + ". Not retrying further znode create connection errors."; 451 } else if (isSessionExpired(code)) { 452 // This isn't fatal - the client Watcher will re-join the election 453 LOG.warn("Lock acquisition failed because session was lost"); 454 return; 455 } 456 457 fatalError(errorMessage); 458 } 459 460 /** 461 * interface implementation of Zookeeper callback for monitor (exists) 462 */ 463 @Override 464 public synchronized void processResult(int rc, String path, Object ctx, 465 Stat stat) { 466 if (isStaleClient(ctx)) return; 467 468 assert wantToBeInElection : 469 "Got a StatNode result after quitting election"; 470 471 LOG.debug("StatNode result: " + rc + " for path: " + path 472 + " connectionState: " + zkConnectionState + " for " + this); 473 474 475 Code code = Code.get(rc); 476 if (isSuccess(code)) { 477 // the following owner check completes verification in case the lock znode 478 // creation was retried 479 if (stat.getEphemeralOwner() == zkClient.getSessionId()) { 480 // we own the lock znode. so we are the leader 481 if (!becomeActive()) { 482 reJoinElectionAfterFailureToBecomeActive(); 483 } 484 } else { 485 // we dont own the lock znode. so we are a standby. 486 becomeStandby(); 487 } 488 // the watch set by us will notify about changes 489 return; 490 } 491 492 if (isNodeDoesNotExist(code)) { 493 // the lock znode disappeared before we started monitoring it 494 enterNeutralMode(); 495 joinElectionInternal(); 496 return; 497 } 498 499 String errorMessage = "Received stat error from Zookeeper. code:" 500 + code.toString(); 501 LOG.debug(errorMessage); 502 503 if (shouldRetry(code)) { 504 if (statRetryCount < maxRetryNum) { 505 ++statRetryCount; 506 monitorLockNodeAsync(); 507 return; 508 } 509 errorMessage = errorMessage 510 + ". Not retrying further znode monitoring connection errors."; 511 } else if (isSessionExpired(code)) { 512 // This isn't fatal - the client Watcher will re-join the election 513 LOG.warn("Lock monitoring failed because session was lost"); 514 return; 515 } 516 517 fatalError(errorMessage); 518 } 519 520 /** 521 * We failed to become active. Re-join the election, but 522 * sleep for a few seconds after terminating our existing 523 * session, so that other nodes have a chance to become active. 524 * The failure to become active is already logged inside 525 * becomeActive(). 526 */ 527 private void reJoinElectionAfterFailureToBecomeActive() { 528 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE); 529 } 530 531 /** 532 * interface implementation of Zookeeper watch events (connection and node), 533 * proxied by {@link WatcherWithClientRef}. 534 */ 535 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) { 536 Event.EventType eventType = event.getType(); 537 if (isStaleClient(zk)) return; 538 LOG.debug("Watcher event type: " + eventType + " with state:" 539 + event.getState() + " for path:" + event.getPath() 540 + " connectionState: " + zkConnectionState 541 + " for " + this); 542 543 if (eventType == Event.EventType.None) { 544 // the connection state has changed 545 switch (event.getState()) { 546 case SyncConnected: 547 LOG.info("Session connected."); 548 // if the listener was asked to move to safe state then it needs to 549 // be undone 550 ConnectionState prevConnectionState = zkConnectionState; 551 zkConnectionState = ConnectionState.CONNECTED; 552 if (prevConnectionState == ConnectionState.DISCONNECTED && 553 wantToBeInElection) { 554 monitorActiveStatus(); 555 } 556 break; 557 case Disconnected: 558 LOG.info("Session disconnected. Entering neutral mode..."); 559 560 // ask the app to move to safe state because zookeeper connection 561 // is not active and we dont know our state 562 zkConnectionState = ConnectionState.DISCONNECTED; 563 enterNeutralMode(); 564 break; 565 case Expired: 566 // the connection got terminated because of session timeout 567 // call listener to reconnect 568 LOG.info("Session expired. Entering neutral mode and rejoining..."); 569 enterNeutralMode(); 570 reJoinElection(0); 571 break; 572 case SaslAuthenticated: 573 LOG.info("Successfully authenticated to ZooKeeper using SASL."); 574 break; 575 default: 576 fatalError("Unexpected Zookeeper watch event state: " 577 + event.getState()); 578 break; 579 } 580 581 return; 582 } 583 584 // a watch on lock path in zookeeper has fired. so something has changed on 585 // the lock. ideally we should check that the path is the same as the lock 586 // path but trusting zookeeper for now 587 String path = event.getPath(); 588 if (path != null) { 589 switch (eventType) { 590 case NodeDeleted: 591 if (state == State.ACTIVE) { 592 enterNeutralMode(); 593 } 594 joinElectionInternal(); 595 break; 596 case NodeDataChanged: 597 monitorActiveStatus(); 598 break; 599 default: 600 LOG.debug("Unexpected node event: " + eventType + " for path: " + path); 601 monitorActiveStatus(); 602 } 603 604 return; 605 } 606 607 // some unexpected error has occurred 608 fatalError("Unexpected watch error from Zookeeper"); 609 } 610 611 /** 612 * Get a new zookeeper client instance. protected so that test class can 613 * inherit and pass in a mock object for zookeeper 614 * 615 * @return new zookeeper client instance 616 * @throws IOException 617 * @throws KeeperException zookeeper connectionloss exception 618 */ 619 protected synchronized ZooKeeper getNewZooKeeper() throws IOException, 620 KeeperException { 621 622 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and 623 // may trigger the Connected event immediately. So, if we register the 624 // watcher after constructing ZooKeeper, we may miss that event. Instead, 625 // we construct the watcher first, and have it block any events it receives 626 // before we can set its ZooKeeper reference. 627 watcher = new WatcherWithClientRef(); 628 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); 629 watcher.setZooKeeperRef(zk); 630 631 // Wait for the asynchronous success/failure. This may throw an exception 632 // if we don't connect within the session timeout. 633 watcher.waitForZKConnectionEvent(zkSessionTimeout); 634 635 for (ZKAuthInfo auth : zkAuthInfo) { 636 zk.addAuthInfo(auth.getScheme(), auth.getAuth()); 637 } 638 return zk; 639 } 640 641 private void fatalError(String errorMessage) { 642 LOG.fatal(errorMessage); 643 reset(); 644 appClient.notifyFatalError(errorMessage); 645 } 646 647 private void monitorActiveStatus() { 648 assert wantToBeInElection; 649 LOG.debug("Monitoring active leader for " + this); 650 statRetryCount = 0; 651 monitorLockNodeAsync(); 652 } 653 654 private void joinElectionInternal() { 655 Preconditions.checkState(appData != null, 656 "trying to join election without any app data"); 657 if (zkClient == null) { 658 if (!reEstablishSession()) { 659 fatalError("Failed to reEstablish connection with ZooKeeper"); 660 return; 661 } 662 } 663 664 createRetryCount = 0; 665 wantToBeInElection = true; 666 createLockNodeAsync(); 667 } 668 669 private void reJoinElection(int sleepTime) { 670 LOG.info("Trying to re-establish ZK session"); 671 672 // Some of the test cases rely on expiring the ZK sessions and 673 // ensuring that the other node takes over. But, there's a race 674 // where the original lease holder could reconnect faster than the other 675 // thread manages to take the lock itself. This lock allows the 676 // tests to block the reconnection. It's a shame that this leaked 677 // into non-test code, but the lock is only acquired here so will never 678 // be contended. 679 sessionReestablishLockForTests.lock(); 680 try { 681 terminateConnection(); 682 sleepFor(sleepTime); 683 // Should not join election even before the SERVICE is reported 684 // as HEALTHY from ZKFC monitoring. 685 if (appData != null) { 686 joinElectionInternal(); 687 } else { 688 LOG.info("Not joining election since service has not yet been " + 689 "reported as healthy."); 690 } 691 } finally { 692 sessionReestablishLockForTests.unlock(); 693 } 694 } 695 696 /** 697 * Sleep for the given number of milliseconds. 698 * This is non-static, and separated out, so that unit tests 699 * can override the behavior not to sleep. 700 */ 701 @VisibleForTesting 702 protected void sleepFor(int sleepMs) { 703 if (sleepMs > 0) { 704 try { 705 Thread.sleep(sleepMs); 706 } catch (InterruptedException e) { 707 Thread.currentThread().interrupt(); 708 } 709 } 710 } 711 712 @VisibleForTesting 713 void preventSessionReestablishmentForTests() { 714 sessionReestablishLockForTests.lock(); 715 } 716 717 @VisibleForTesting 718 void allowSessionReestablishmentForTests() { 719 sessionReestablishLockForTests.unlock(); 720 } 721 722 @VisibleForTesting 723 synchronized long getZKSessionIdForTests() { 724 if (zkClient != null) { 725 return zkClient.getSessionId(); 726 } else { 727 return -1; 728 } 729 } 730 731 @VisibleForTesting 732 synchronized State getStateForTests() { 733 return state; 734 } 735 736 private boolean reEstablishSession() { 737 int connectionRetryCount = 0; 738 boolean success = false; 739 while(!success && connectionRetryCount < maxRetryNum) { 740 LOG.debug("Establishing zookeeper connection for " + this); 741 try { 742 createConnection(); 743 success = true; 744 } catch(IOException e) { 745 LOG.warn(e); 746 sleepFor(5000); 747 } catch(KeeperException e) { 748 LOG.warn(e); 749 sleepFor(5000); 750 } 751 ++connectionRetryCount; 752 } 753 return success; 754 } 755 756 private void createConnection() throws IOException, KeeperException { 757 if (zkClient != null) { 758 try { 759 zkClient.close(); 760 } catch (InterruptedException e) { 761 throw new IOException("Interrupted while closing ZK", 762 e); 763 } 764 zkClient = null; 765 watcher = null; 766 } 767 zkClient = getNewZooKeeper(); 768 LOG.debug("Created new connection for " + this); 769 } 770 771 @InterfaceAudience.Private 772 public synchronized void terminateConnection() { 773 if (zkClient == null) { 774 return; 775 } 776 LOG.debug("Terminating ZK connection for " + this); 777 ZooKeeper tempZk = zkClient; 778 zkClient = null; 779 watcher = null; 780 try { 781 tempZk.close(); 782 } catch(InterruptedException e) { 783 LOG.warn(e); 784 } 785 zkConnectionState = ConnectionState.TERMINATED; 786 wantToBeInElection = false; 787 } 788 789 private void reset() { 790 state = State.INIT; 791 terminateConnection(); 792 } 793 794 private boolean becomeActive() { 795 assert wantToBeInElection; 796 if (state == State.ACTIVE) { 797 // already active 798 return true; 799 } 800 try { 801 Stat oldBreadcrumbStat = fenceOldActive(); 802 writeBreadCrumbNode(oldBreadcrumbStat); 803 804 LOG.debug("Becoming active for " + this); 805 appClient.becomeActive(); 806 state = State.ACTIVE; 807 return true; 808 } catch (Exception e) { 809 LOG.warn("Exception handling the winning of election", e); 810 // Caller will handle quitting and rejoining the election. 811 return false; 812 } 813 } 814 815 /** 816 * Write the "ActiveBreadCrumb" node, indicating that this node may need 817 * to be fenced on failover. 818 * @param oldBreadcrumbStat 819 */ 820 private void writeBreadCrumbNode(Stat oldBreadcrumbStat) 821 throws KeeperException, InterruptedException { 822 Preconditions.checkState(appData != null, "no appdata"); 823 824 LOG.info("Writing znode " + zkBreadCrumbPath + 825 " to indicate that the local node is the most recent active..."); 826 if (oldBreadcrumbStat == null) { 827 // No previous active, just create the node 828 createWithRetries(zkBreadCrumbPath, appData, zkAcl, 829 CreateMode.PERSISTENT); 830 } else { 831 // There was a previous active, update the node 832 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion()); 833 } 834 } 835 836 /** 837 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up 838 * active status. 839 * If this fails, it will simply warn, since the graceful release behavior 840 * is only an optimization. 841 */ 842 private void tryDeleteOwnBreadCrumbNode() { 843 assert state == State.ACTIVE; 844 LOG.info("Deleting bread-crumb of active node..."); 845 846 // Sanity check the data. This shouldn't be strictly necessary, 847 // but better to play it safe. 848 Stat stat = new Stat(); 849 byte[] data = null; 850 try { 851 data = zkClient.getData(zkBreadCrumbPath, false, stat); 852 853 if (!Arrays.equals(data, appData)) { 854 throw new IllegalStateException( 855 "We thought we were active, but in fact " + 856 "the active znode had the wrong data: " + 857 StringUtils.byteToHexString(data) + " (stat=" + stat + ")"); 858 } 859 860 deleteWithRetries(zkBreadCrumbPath, stat.getVersion()); 861 } catch (Exception e) { 862 LOG.warn("Unable to delete our own bread-crumb of being active at " + 863 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " + 864 "Expecting to be fenced by the next active."); 865 } 866 } 867 868 /** 869 * If there is a breadcrumb node indicating that another node may need 870 * fencing, try to fence that node. 871 * @return the Stat of the breadcrumb node that was read, or null 872 * if no breadcrumb node existed 873 */ 874 private Stat fenceOldActive() throws InterruptedException, KeeperException { 875 final Stat stat = new Stat(); 876 byte[] data; 877 LOG.info("Checking for any old active which needs to be fenced..."); 878 try { 879 data = zkDoWithRetries(new ZKAction<byte[]>() { 880 @Override 881 public byte[] run() throws KeeperException, InterruptedException { 882 return zkClient.getData(zkBreadCrumbPath, false, stat); 883 } 884 }); 885 } catch (KeeperException ke) { 886 if (isNodeDoesNotExist(ke.code())) { 887 LOG.info("No old node to fence"); 888 return null; 889 } 890 891 // If we failed to read for any other reason, then likely we lost 892 // our session, or we don't have permissions, etc. In any case, 893 // we probably shouldn't become active, and failing the whole 894 // thing is the best bet. 895 throw ke; 896 } 897 898 LOG.info("Old node exists: " + StringUtils.byteToHexString(data)); 899 if (Arrays.equals(data, appData)) { 900 LOG.info("But old node has our own data, so don't need to fence it."); 901 } else { 902 appClient.fenceOldActive(data); 903 } 904 return stat; 905 } 906 907 private void becomeStandby() { 908 if (state != State.STANDBY) { 909 LOG.debug("Becoming standby for " + this); 910 state = State.STANDBY; 911 appClient.becomeStandby(); 912 } 913 } 914 915 private void enterNeutralMode() { 916 if (state != State.NEUTRAL) { 917 LOG.debug("Entering neutral mode for " + this); 918 state = State.NEUTRAL; 919 appClient.enterNeutralMode(); 920 } 921 } 922 923 private void createLockNodeAsync() { 924 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, 925 this, zkClient); 926 } 927 928 private void monitorLockNodeAsync() { 929 zkClient.exists(zkLockFilePath, 930 watcher, this, 931 zkClient); 932 } 933 934 private String createWithRetries(final String path, final byte[] data, 935 final List<ACL> acl, final CreateMode mode) 936 throws InterruptedException, KeeperException { 937 return zkDoWithRetries(new ZKAction<String>() { 938 @Override 939 public String run() throws KeeperException, InterruptedException { 940 return zkClient.create(path, data, acl, mode); 941 } 942 }); 943 } 944 945 private byte[] getDataWithRetries(final String path, final boolean watch, 946 final Stat stat) throws InterruptedException, KeeperException { 947 return zkDoWithRetries(new ZKAction<byte[]>() { 948 @Override 949 public byte[] run() throws KeeperException, InterruptedException { 950 return zkClient.getData(path, watch, stat); 951 } 952 }); 953 } 954 955 private Stat setDataWithRetries(final String path, final byte[] data, 956 final int version) throws InterruptedException, KeeperException { 957 return zkDoWithRetries(new ZKAction<Stat>() { 958 @Override 959 public Stat run() throws KeeperException, InterruptedException { 960 return zkClient.setData(path, data, version); 961 } 962 }); 963 } 964 965 private void deleteWithRetries(final String path, final int version) 966 throws KeeperException, InterruptedException { 967 zkDoWithRetries(new ZKAction<Void>() { 968 @Override 969 public Void run() throws KeeperException, InterruptedException { 970 zkClient.delete(path, version); 971 return null; 972 } 973 }); 974 } 975 976 private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException, 977 InterruptedException { 978 int retry = 0; 979 while (true) { 980 try { 981 return action.run(); 982 } catch (KeeperException ke) { 983 if (shouldRetry(ke.code()) && ++retry < maxRetryNum) { 984 continue; 985 } 986 throw ke; 987 } 988 } 989 } 990 991 private interface ZKAction<T> { 992 T run() throws KeeperException, InterruptedException; 993 } 994 995 /** 996 * The callbacks and watchers pass a reference to the ZK client 997 * which made the original call. We don't want to take action 998 * based on any callbacks from prior clients after we quit 999 * the election. 1000 * @param ctx the ZK client passed into the watcher 1001 * @return true if it matches the current client 1002 */ 1003 private synchronized boolean isStaleClient(Object ctx) { 1004 Preconditions.checkNotNull(ctx); 1005 if (zkClient != (ZooKeeper)ctx) { 1006 LOG.warn("Ignoring stale result from old client with sessionId " + 1007 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId())); 1008 return true; 1009 } 1010 return false; 1011 } 1012 1013 /** 1014 * Watcher implementation which keeps a reference around to the 1015 * original ZK connection, and passes it back along with any 1016 * events. 1017 */ 1018 private final class WatcherWithClientRef implements Watcher { 1019 private ZooKeeper zk; 1020 1021 /** 1022 * Latch fired whenever any event arrives. This is used in order 1023 * to wait for the Connected event when the client is first created. 1024 */ 1025 private CountDownLatch hasReceivedEvent = new CountDownLatch(1); 1026 1027 /** 1028 * Latch used to wait until the reference to ZooKeeper is set. 1029 */ 1030 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); 1031 1032 /** 1033 * Waits for the next event from ZooKeeper to arrive. 1034 * 1035 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds 1036 * @throws KeeperException if the connection attempt times out. This will 1037 * be a ZooKeeper ConnectionLoss exception code. 1038 * @throws IOException if interrupted while connecting to ZooKeeper 1039 */ 1040 private void waitForZKConnectionEvent(int connectionTimeoutMs) 1041 throws KeeperException, IOException { 1042 try { 1043 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { 1044 LOG.error("Connection timed out: couldn't connect to ZooKeeper in " 1045 + connectionTimeoutMs + " milliseconds"); 1046 zk.close(); 1047 throw KeeperException.create(Code.CONNECTIONLOSS); 1048 } 1049 } catch (InterruptedException e) { 1050 Thread.currentThread().interrupt(); 1051 throw new IOException( 1052 "Interrupted when connecting to zookeeper server", e); 1053 } 1054 } 1055 1056 private void setZooKeeperRef(ZooKeeper zk) { 1057 Preconditions.checkState(this.zk == null, 1058 "zk already set -- must be set exactly once"); 1059 this.zk = zk; 1060 hasSetZooKeeper.countDown(); 1061 } 1062 1063 @Override 1064 public void process(WatchedEvent event) { 1065 hasReceivedEvent.countDown(); 1066 try { 1067 hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS); 1068 ActiveStandbyElector.this.processWatchEvent( 1069 zk, event); 1070 } catch (Throwable t) { 1071 fatalError( 1072 "Failed to process watcher event " + event + ": " + 1073 StringUtils.stringifyException(t)); 1074 } 1075 } 1076 } 1077 1078 private static boolean isSuccess(Code code) { 1079 return (code == Code.OK); 1080 } 1081 1082 private static boolean isNodeExists(Code code) { 1083 return (code == Code.NODEEXISTS); 1084 } 1085 1086 private static boolean isNodeDoesNotExist(Code code) { 1087 return (code == Code.NONODE); 1088 } 1089 1090 private static boolean isSessionExpired(Code code) { 1091 return (code == Code.SESSIONEXPIRED); 1092 } 1093 1094 private static boolean shouldRetry(Code code) { 1095 return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT; 1096 } 1097 1098 @Override 1099 public String toString() { 1100 return "elector id=" + System.identityHashCode(this) + 1101 " appData=" + 1102 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 1103 " cb=" + appClient; 1104 } 1105 1106 public String getHAZookeeperConnectionState() { 1107 return this.zkConnectionState.name(); 1108 } 1109}