001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.ha;
019
020 import java.io.IOException;
021 import java.io.PrintStream;
022 import java.util.ArrayList;
023 import java.util.Arrays;
024 import java.util.Collection;
025 import java.util.Map;
026
027 import org.apache.commons.cli.Options;
028 import org.apache.commons.cli.CommandLine;
029 import org.apache.commons.cli.GnuParser;
030 import org.apache.commons.cli.ParseException;
031 import org.apache.commons.logging.Log;
032 import org.apache.commons.logging.LogFactory;
033
034 import org.apache.hadoop.classification.InterfaceAudience;
035 import org.apache.hadoop.conf.Configuration;
036 import org.apache.hadoop.conf.Configured;
037 import org.apache.hadoop.fs.CommonConfigurationKeys;
038 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
039 import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
040 import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
041 import org.apache.hadoop.util.Tool;
042 import org.apache.hadoop.util.ToolRunner;
043
044 import com.google.common.base.Preconditions;
045 import com.google.common.collect.ImmutableMap;
046
047 /**
048 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
050 * mode, or to trigger a health-check.
051 */
052 @InterfaceAudience.Private
053
054 public abstract class HAAdmin extends Configured implements Tool {
055
056 private static final String FORCEFENCE = "forcefence";
057 private static final String FORCEACTIVE = "forceactive";
058
059 /**
060 * Undocumented flag which allows an administrator to use manual failover
061 * state transitions even when auto-failover is enabled. This is an unsafe
062 * operation, which is why it is not documented in the usage below.
063 */
064 private static final String FORCEMANUAL = "forcemanual";
065 private static final Log LOG = LogFactory.getLog(HAAdmin.class);
066
067 private int rpcTimeoutForChecks = -1;
068
069 protected final static Map<String, UsageInfo> USAGE =
070 ImmutableMap.<String, UsageInfo>builder()
071 .put("-transitionToActive",
072 new UsageInfo("<serviceId> [--"+FORCEACTIVE+"]", "Transitions the service into Active state"))
073 .put("-transitionToStandby",
074 new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
075 .put("-failover",
076 new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
077 "Failover from the first service to the second.\n" +
078 "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
079 "Try to failover to the target service even if it is not ready if the " +
080 FORCEACTIVE + " option is used."))
081 .put("-getServiceState",
082 new UsageInfo("<serviceId>", "Returns the state of the service"))
083 .put("-checkHealth",
084 new UsageInfo("<serviceId>",
085 "Requests that the service perform a health check.\n" +
086 "The HAAdmin tool will exit with a non-zero exit code\n" +
087 "if the check fails."))
088 .put("-help",
089 new UsageInfo("<command>", "Displays help on the specified command"))
090 .build();
091
092 /** Output stream for errors, for use in tests */
093 protected PrintStream errOut = System.err;
094 protected PrintStream out = System.out;
095 private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
096
/**
 * Creates the tool without supplying a configuration; the superclass
 * no-argument constructor is invoked implicitly.
 */
protected HAAdmin() {
}

/**
 * Creates the tool and hands the given configuration to the superclass.
 *
 * @param conf configuration passed on to the superclass constructor
 */
protected HAAdmin(Configuration conf) {
  super(conf);
}
104
105 protected abstract HAServiceTarget resolveTarget(String string);
106
107 protected Collection<String> getTargetIds(String targetNodeToActivate) {
108 return new ArrayList<String>(
109 Arrays.asList(new String[]{targetNodeToActivate}));
110 }
111
112 protected String getUsageString() {
113 return "Usage: HAAdmin";
114 }
115
116 protected void printUsage(PrintStream errOut) {
117 errOut.println(getUsageString());
118 for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
119 String cmd = e.getKey();
120 UsageInfo usage = e.getValue();
121
122 errOut.println(" [" + cmd + " " + usage.args + "]");
123 }
124 errOut.println();
125 ToolRunner.printGenericCommandUsage(errOut);
126 }
127
128 private static void printUsage(PrintStream errOut, String cmd) {
129 UsageInfo usage = USAGE.get(cmd);
130 if (usage == null) {
131 throw new RuntimeException("No usage for cmd " + cmd);
132 }
133 errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
134 }
135
136 private int transitionToActive(final CommandLine cmd)
137 throws IOException, ServiceFailedException {
138 String[] argv = cmd.getArgs();
139 if (argv.length != 1) {
140 errOut.println("transitionToActive: incorrect number of arguments");
141 printUsage(errOut, "-transitionToActive");
142 return -1;
143 }
144 /* returns true if other target node is active or some exception occurred
145 and forceActive was not set */
146 if(!cmd.hasOption(FORCEACTIVE)) {
147 if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
148 return -1;
149 }
150 }
151 HAServiceTarget target = resolveTarget(argv[0]);
152 if (!checkManualStateManagementOK(target)) {
153 return -1;
154 }
155 HAServiceProtocol proto = target.getProxy(
156 getConf(), 0);
157 HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
158 return 0;
159 }
160
161 /**
162 * Checks whether other target node is active or not
163 * @param targetNodeToActivate
164 * @return true if other target node is active or some other exception
165 * occurred and forceActive was set otherwise false
166 * @throws IOException
167 */
168 private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
169 throws IOException {
170 Collection<String> targetIds = getTargetIds(targetNodeToActivate);
171 if(targetIds == null) {
172 errOut.println("transitionToActive: No target node in the "
173 + "current configuration");
174 printUsage(errOut, "-transitionToActive");
175 return true;
176 }
177 targetIds.remove(targetNodeToActivate);
178 for(String targetId : targetIds) {
179 HAServiceTarget target = resolveTarget(targetId);
180 if (!checkManualStateManagementOK(target)) {
181 return true;
182 }
183 try {
184 HAServiceProtocol proto = target.getProxy(getConf(), 5000);
185 if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
186 errOut.println("transitionToActive: Node " + targetId +" is already active");
187 printUsage(errOut, "-transitionToActive");
188 return true;
189 }
190 } catch (Exception e) {
191 //If forceActive switch is false then return true
192 if(!forceActive) {
193 errOut.println("Unexpected error occurred " + e.getMessage());
194 printUsage(errOut, "-transitionToActive");
195 return true;
196 }
197 }
198 }
199 return false;
200 }
201
202 private int transitionToStandby(final CommandLine cmd)
203 throws IOException, ServiceFailedException {
204 String[] argv = cmd.getArgs();
205 if (argv.length != 1) {
206 errOut.println("transitionToStandby: incorrect number of arguments");
207 printUsage(errOut, "-transitionToStandby");
208 return -1;
209 }
210
211 HAServiceTarget target = resolveTarget(argv[0]);
212 if (!checkManualStateManagementOK(target)) {
213 return -1;
214 }
215 HAServiceProtocol proto = target.getProxy(
216 getConf(), 0);
217 HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
218 return 0;
219 }
220 /**
221 * Ensure that we are allowed to manually manage the HA state of the target
222 * service. If automatic failover is configured, then the automatic
223 * failover controllers should be doing state management, and it is generally
224 * an error to use the HAAdmin command line to do so.
225 *
226 * @param target the target to check
227 * @return true if manual state management is allowed
228 */
229 private boolean checkManualStateManagementOK(HAServiceTarget target) {
230 if (target.isAutoFailoverEnabled()) {
231 if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
232 errOut.println(
233 "Automatic failover is enabled for " + target + "\n" +
234 "Refusing to manually manage HA state, since it may cause\n" +
235 "a split-brain scenario or other incorrect state.\n" +
236 "If you are very sure you know what you are doing, please \n" +
237 "specify the " + FORCEMANUAL + " flag.");
238 return false;
239 } else {
240 LOG.warn("Proceeding with manual HA state management even though\n" +
241 "automatic failover is enabled for " + target);
242 return true;
243 }
244 }
245 return true;
246 }
247
248 private StateChangeRequestInfo createReqInfo() {
249 return new StateChangeRequestInfo(requestSource);
250 }
251
252 private int failover(CommandLine cmd)
253 throws IOException, ServiceFailedException {
254 boolean forceFence = cmd.hasOption(FORCEFENCE);
255 boolean forceActive = cmd.hasOption(FORCEACTIVE);
256
257 int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
258 final String[] args = cmd.getArgs();
259
260 if (numOpts > 3 || args.length != 2) {
261 errOut.println("failover: incorrect arguments");
262 printUsage(errOut, "-failover");
263 return -1;
264 }
265
266 HAServiceTarget fromNode = resolveTarget(args[0]);
267 HAServiceTarget toNode = resolveTarget(args[1]);
268
269 // Check that auto-failover is consistently configured for both nodes.
270 Preconditions.checkState(
271 fromNode.isAutoFailoverEnabled() ==
272 toNode.isAutoFailoverEnabled(),
273 "Inconsistent auto-failover configs between %s and %s!",
274 fromNode, toNode);
275
276 if (fromNode.isAutoFailoverEnabled()) {
277 if (forceFence || forceActive) {
278 // -forceActive doesn't make sense with auto-HA, since, if the node
279 // is not healthy, then its ZKFC will immediately quit the election
280 // again the next time a health check runs.
281 //
282 // -forceFence doesn't seem to have any real use cases with auto-HA
283 // so it isn't implemented.
284 errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
285 "supported with auto-failover enabled.");
286 return -1;
287 }
288 return gracefulFailoverThroughZKFCs(toNode);
289 }
290
291 FailoverController fc = new FailoverController(getConf(),
292 requestSource);
293
294 try {
295 fc.failover(fromNode, toNode, forceFence, forceActive);
296 out.println("Failover from "+args[0]+" to "+args[1]+" successful");
297 } catch (FailoverFailedException ffe) {
298 errOut.println("Failover failed: " + ffe.getLocalizedMessage());
299 return -1;
300 }
301 return 0;
302 }
303
304
305 /**
306 * Initiate a graceful failover by talking to the target node's ZKFC.
307 * This sends an RPC to the ZKFC, which coordinates the failover.
308 *
309 * @param toNode the node to fail to
310 * @return status code (0 for success)
311 * @throws IOException if failover does not succeed
312 */
313 private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
314 throws IOException {
315
316 int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
317 ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
318 try {
319 proxy.gracefulFailover();
320 out.println("Failover to " + toNode + " successful");
321 } catch (ServiceFailedException sfe) {
322 errOut.println("Failover failed: " + sfe.getLocalizedMessage());
323 return -1;
324 }
325
326 return 0;
327 }
328
329 private int checkHealth(final CommandLine cmd)
330 throws IOException, ServiceFailedException {
331 String[] argv = cmd.getArgs();
332 if (argv.length != 1) {
333 errOut.println("checkHealth: incorrect number of arguments");
334 printUsage(errOut, "-checkHealth");
335 return -1;
336 }
337 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
338 getConf(), rpcTimeoutForChecks);
339 try {
340 HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
341 } catch (HealthCheckFailedException e) {
342 errOut.println("Health check failed: " + e.getLocalizedMessage());
343 return -1;
344 }
345 return 0;
346 }
347
348 private int getServiceState(final CommandLine cmd)
349 throws IOException, ServiceFailedException {
350 String[] argv = cmd.getArgs();
351 if (argv.length != 1) {
352 errOut.println("getServiceState: incorrect number of arguments");
353 printUsage(errOut, "-getServiceState");
354 return -1;
355 }
356
357 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
358 getConf(), rpcTimeoutForChecks);
359 out.println(proto.getServiceStatus().getState());
360 return 0;
361 }
362
363 /**
364 * Return the serviceId as is, we are assuming it was
365 * given as a service address of form <host:ipcport>.
366 */
367 protected String getServiceAddr(String serviceId) {
368 return serviceId;
369 }
370
371 @Override
372 public void setConf(Configuration conf) {
373 super.setConf(conf);
374 if (conf != null) {
375 rpcTimeoutForChecks = conf.getInt(
376 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
377 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
378 }
379 }
380
381 @Override
382 public int run(String[] argv) throws Exception {
383 try {
384 return runCmd(argv);
385 } catch (IllegalArgumentException iae) {
386 errOut.println("Illegal argument: " + iae.getLocalizedMessage());
387 return -1;
388 } catch (IOException ioe) {
389 errOut.println("Operation failed: " + ioe.getLocalizedMessage());
390 if (LOG.isDebugEnabled()) {
391 LOG.debug("Operation failed", ioe);
392 }
393 return -1;
394 }
395 }
396
397 protected int runCmd(String[] argv) throws Exception {
398 if (argv.length < 1) {
399 printUsage(errOut);
400 return -1;
401 }
402
403 String cmd = argv[0];
404
405 if (!cmd.startsWith("-")) {
406 errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
407 printUsage(errOut);
408 return -1;
409 }
410
411 if (!USAGE.containsKey(cmd)) {
412 errOut.println(cmd.substring(1) + ": Unknown command");
413 printUsage(errOut);
414 return -1;
415 }
416
417 Options opts = new Options();
418
419 // Add command-specific options
420 if ("-failover".equals(cmd)) {
421 addFailoverCliOpts(opts);
422 }
423 if("-transitionToActive".equals(cmd)) {
424 addTransitionToActiveCliOpts(opts);
425 }
426 // Mutative commands take FORCEMANUAL option
427 if ("-transitionToActive".equals(cmd) ||
428 "-transitionToStandby".equals(cmd) ||
429 "-failover".equals(cmd)) {
430 opts.addOption(FORCEMANUAL, false,
431 "force manual control even if auto-failover is enabled");
432 }
433
434 CommandLine cmdLine = parseOpts(cmd, opts, argv);
435 if (cmdLine == null) {
436 // error already printed
437 return -1;
438 }
439
440 if (cmdLine.hasOption(FORCEMANUAL)) {
441 if (!confirmForceManual()) {
442 LOG.fatal("Aborted");
443 return -1;
444 }
445 // Instruct the NNs to honor this request even if they're
446 // configured for manual failover.
447 requestSource = RequestSource.REQUEST_BY_USER_FORCED;
448 }
449
450 if ("-transitionToActive".equals(cmd)) {
451 return transitionToActive(cmdLine);
452 } else if ("-transitionToStandby".equals(cmd)) {
453 return transitionToStandby(cmdLine);
454 } else if ("-failover".equals(cmd)) {
455 return failover(cmdLine);
456 } else if ("-getServiceState".equals(cmd)) {
457 return getServiceState(cmdLine);
458 } else if ("-checkHealth".equals(cmd)) {
459 return checkHealth(cmdLine);
460 } else if ("-help".equals(cmd)) {
461 return help(argv);
462 } else {
463 // we already checked command validity above, so getting here
464 // would be a coding error
465 throw new AssertionError("Should not get here, command: " + cmd);
466 }
467 }
468
469 private boolean confirmForceManual() throws IOException {
470 return ToolRunner.confirmPrompt(
471 "You have specified the " + FORCEMANUAL + " flag. This flag is " +
472 "dangerous, as it can induce a split-brain scenario that WILL " +
473 "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
474 "\n" +
475 "It is recommended not to use this flag, but instead to shut down the " +
476 "cluster and disable automatic failover if you prefer to manually " +
477 "manage your HA state.\n" +
478 "\n" +
479 "You may abort safely by answering 'n' or hitting ^C now.\n" +
480 "\n" +
481 "Are you sure you want to continue?");
482 }
483
484 /**
485 * Add CLI options which are specific to the failover command and no
486 * others.
487 */
488 private void addFailoverCliOpts(Options failoverOpts) {
489 failoverOpts.addOption(FORCEFENCE, false, "force fencing");
490 failoverOpts.addOption(FORCEACTIVE, false, "force failover");
491 // Don't add FORCEMANUAL, since that's added separately for all commands
492 // that change state.
493 }
494
495 /**
496 * Add CLI options which are specific to the transitionToActive command and
497 * no others.
498 */
499 private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
500 transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
501 }
502
503 private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
504 try {
505 // Strip off the first arg, since that's just the command name
506 argv = Arrays.copyOfRange(argv, 1, argv.length);
507 return new GnuParser().parse(opts, argv);
508 } catch (ParseException pe) {
509 errOut.println(cmdName.substring(1) +
510 ": incorrect arguments");
511 printUsage(errOut, cmdName);
512 return null;
513 }
514 }
515
516 private int help(String[] argv) {
517 if (argv.length == 1) { // only -help
518 printUsage(out);
519 return 0;
520 } else if (argv.length != 2) {
521 printUsage(errOut, "-help");
522 return -1;
523 }
524 String cmd = argv[1];
525 if (!cmd.startsWith("-")) {
526 cmd = "-" + cmd;
527 }
528 UsageInfo usageInfo = USAGE.get(cmd);
529 if (usageInfo == null) {
530 errOut.println(cmd + ": Unknown command");
531 printUsage(errOut);
532 return -1;
533 }
534
535 out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
536 return 0;
537 }
538
539 protected static class UsageInfo {
540 public final String args;
541 public final String help;
542
543 public UsageInfo(String args, String help) {
544 this.args = args;
545 this.help = help;
546 }
547 }
548 }