/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.util.ExitUtil.terminate;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;

import static org.apache.hadoop.util.ExitUtil.terminate;

import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;

/**
 * Manages a collection of Journals. None of the methods are synchronized, it is
 * assumed that FSEditLog methods, that use this class, use proper
 * synchronization.
 */
public class JournalSet implements JournalManager {

  // NOTE(review): the logger is created for FSEditLog.class, so messages from
  // this class are emitted under FSEditLog's log category -- presumably
  // intentional; confirm before changing.
  static final Log LOG = LogFactory.getLog(FSEditLog.class);

  /**
   * Orders input streams by ascending first transaction ID; streams that
   * share a first transaction ID are ordered by descending last transaction
   * ID (note the swapped arguments in the second comparison).
   */
  static final public Comparator<EditLogInputStream>
    EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
      @Override
      public int compare(EditLogInputStream a, EditLogInputStream b) {
        return ComparisonChain.start().
          compare(a.getFirstTxId(), b.getFirstTxId()).
          compare(b.getLastTxId(), a.getLastTxId()).
          result();
      }
    };

  /**
   * Container for a JournalManager paired with its currently
   * active stream.
   *
   * If a Journal gets disabled due to an error writing to its
   * stream, then the stream will be aborted and set to null.
   */
  static class JournalAndStream implements CheckableNameNodeResource {
    private final JournalManager journal;
    private boolean disabled = false;
    private EditLogOutputStream stream;
    private final boolean required;

    /**
     * @param manager the journal manager to wrap
     * @param required value later reported by {@link #isRequired()}
     */
    public JournalAndStream(JournalManager manager, boolean required) {
      this.journal = manager;
      this.required = required;
    }

    /**
     * Opens a new output stream on the wrapped journal and clears the
     * disabled flag.
     *
     * @param txId transaction ID handed to the journal's startLogSegment
     * @throws IOException if the journal cannot start the segment
     * @throws IllegalStateException if a stream is already open
     */
    public void startLogSegment(long txId) throws IOException {
      Preconditions.checkState(stream == null);
      disabled = false;
      stream = journal.startLogSegment(txId);
    }

    /**
     * Closes the stream, also sets it to null.
     *
     * If closing throws, the stream reference is left in place, so a later
     * {@link #abort()} still has a stream to act on.
     */
    public void closeStream() throws IOException {
      if (stream == null) return;
      stream.close();
      stream = null;
    }

    /**
     * Close the Journal and Stream
     */
    public void close() throws IOException {
      closeStream();

      journal.close();
    }

    /**
     * Aborts the stream, also sets it to null.
     *
     * A failure to abort is logged and otherwise ignored; the stream
     * reference is dropped either way.
     */
    public void abort() {
      if (stream == null) return;
      try {
        stream.abort();
      } catch (IOException ioe) {
        LOG.error("Unable to abort stream " + stream, ioe);
      }
      stream = null;
    }

    /** @return true while a stream is held (i.e. it is non-null). */
    boolean isActive() {
      return stream != null;
    }

    /**
     * Should be used outside JournalSet only for testing.
     */
    EditLogOutputStream getCurrentStream() {
      return stream;
    }

    @Override
    public String toString() {
      return "JournalAndStream(mgr=" + journal +
        ", " + "stream=" + stream + ")";
    }

    /** Replaces the current stream without any checks; test hook. */
    void setCurrentStreamForTests(EditLogOutputStream stream) {
      this.stream = stream;
    }

    /** @return the wrapped journal manager. */
    JournalManager getManager() {
      return journal;
    }

    /** @return the current value of the disabled flag. */
    boolean isDisabled() {
      return disabled;
    }

    private void setDisabled(boolean disabled) {
      this.disabled = disabled;
    }

    /** A journal counts as an available resource while it is not disabled. */
    @Override
    public boolean isResourceAvailable() {
      return !isDisabled();
    }

    @Override
    public boolean isRequired() {
      return required;
    }
  }

  // COW implementation is necessary since some users (eg the web ui) call
  // getAllJournalStreams() and then iterate. Since this is rarely
  // mutated, there is no performance concern.
  private final List<JournalAndStream> journals =
      new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
  final int minimumRedundantJournals;

  /**
   * @param minimumRedundantResources stored as
   *        {@link #minimumRedundantJournals} and passed to
   *        NameNodeResourcePolicy.areResourcesAvailable whenever journal
   *        availability is evaluated
   */
  JournalSet(int minimumRedundantResources) {
    this.minimumRedundantJournals = minimumRedundantResources;
  }

  /** Not supported on the set. */
  @Override
  public void format(NamespaceInfo nsInfo) throws IOException {
    // The iteration is done by FSEditLog itself
    throw new UnsupportedOperationException();
  }

  /** Not supported on the set. */
  @Override
  public boolean hasSomeData() throws IOException {
    // This is called individually on the underlying journals,
    // not on the JournalSet.
    throw new UnsupportedOperationException();
  }


  /**
   * Starts a log segment on every journal in the set.
   *
   * @param txId transaction ID passed to each journal
   * @return a new stream that applies each operation to the journals of
   *         this set
   * @throws IOException if too many journals fail (see
   *         {@link #mapJournalsAndReportErrors})
   */
  @Override
  public EditLogOutputStream startLogSegment(final long txId) throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.startLogSegment(txId);
      }
    }, "starting log segment " + txId);
    return new JournalSetOutputStream();
  }

  /**
   * Closes the stream of every active journal and then finalizes the segment
   * on its manager. Journals without an active stream are skipped.
   */
  @Override
  public void finalizeLogSegment(final long firstTxId, final long lastTxId)
      throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        if (jas.isActive()) {
          jas.closeStream();
          jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
        }
      }
    }, "finalize log segment " + firstTxId + ", " + lastTxId);
  }

  /** Closes the stream and the journal manager of every journal in the set. */
  @Override
  public void close() throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.close();
      }
    }, "close journal");
  }

  /**
   * In this function, we get a bunch of streams from all of our JournalManager
   * objects. Then we add these to the collection one by one.
   *
   * Disabled journals are skipped, and a journal whose manager throws an
   * IOException is logged and skipped.
   *
   * @param streams The collection to add the streams to. It may or
   *                may not be sorted-- this is up to the caller.
   * @param fromTxId The transaction ID to start looking for streams at
   * @param inProgressOk Should we consider unfinalized streams?
   * @param forReading Whether or not the caller intends to read from
   *                   the returned streams.
   */
  @Override
  public void selectInputStreams(Collection<EditLogInputStream> streams,
      long fromTxId, boolean inProgressOk, boolean forReading) throws IOException {
    final PriorityQueue<EditLogInputStream> allStreams =
        new PriorityQueue<EditLogInputStream>(64,
            EDIT_LOG_INPUT_STREAM_COMPARATOR);
    for (JournalAndStream jas : journals) {
      if (jas.isDisabled()) {
        LOG.info("Skipping jas " + jas + " since it's disabled");
        continue;
      }
      try {
        jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk,
            forReading);
      } catch (IOException ioe) {
        LOG.warn("Unable to determine input streams from " + jas.getManager() +
            ". Skipping.", ioe);
      }
    }
    chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
  }

  /**
   * Drains {@code allStreams} and, for each run of streams sharing a first
   * transaction ID, adds one RedundantEditLogInputStream wrapping that run to
   * {@code outStreams}.
   *
   * @param outStreams collection receiving the grouped streams
   * @param allStreams queue to drain; it is empty when this method returns
   *                   normally
   * @param fromTxId passed to each RedundantEditLogInputStream
   * @throws RuntimeException if the queue yields a stream whose first
   *         transaction ID is lower than that of the group being accumulated
   */
  public static void chainAndMakeRedundantStreams(
      Collection<EditLogInputStream> outStreams,
      PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
    // We want to group together all the streams that start on the same start
    // transaction ID. To do this, we maintain an accumulator (acc) of all
    // the streams we've seen at a given start transaction ID. When we see a
    // higher start transaction ID, we select a stream from the accumulator and
    // clear it. Then we begin accumulating streams with the new, higher start
    // transaction ID.
    LinkedList<EditLogInputStream> acc =
        new LinkedList<EditLogInputStream>();
    EditLogInputStream elis;
    while ((elis = allStreams.poll()) != null) {
      if (acc.isEmpty()) {
        acc.add(elis);
      } else {
        long accFirstTxId = acc.get(0).getFirstTxId();
        if (accFirstTxId == elis.getFirstTxId()) {
          acc.add(elis);
        } else if (accFirstTxId < elis.getFirstTxId()) {
          outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
          acc.clear();
          acc.add(elis);
        } else if (accFirstTxId > elis.getFirstTxId()) {
          throw new RuntimeException("sorted set invariants violated! " +
              "Got stream with first txid " + elis.getFirstTxId() +
              ", but the last firstTxId was " + accFirstTxId);
        }
      }
    }
    // Flush whatever group was still being accumulated when the queue ran dry.
    if (!acc.isEmpty()) {
      outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
      acc.clear();
    }
  }

  /**
   * Returns true if there are no journals, all redundant journals are disabled,
   * or any required journals are disabled.
   *
   * @return True if there are no journals, all redundant journals are
   * disabled, or any required journals are disabled.
   */
  public boolean isEmpty() {
    return !NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals);
  }

  /**
   * Called when some journals experience an error in some operation.
   * Each listed journal has its stream aborted and is marked disabled.
   *
   * @param badJournals journals to disable; null or empty is a no-op
   */
  private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
    if (badJournals == null || badJournals.isEmpty()) {
      return; // nothing to do
    }

    for (JournalAndStream j : badJournals) {
      LOG.error("Disabling journal " + j);
      j.abort();
      j.setDisabled(true);
    }
  }

  /**
   * Implementations of this interface encapsulate operations that can be
   * iteratively applied on all the journals. For example see
   * {@link JournalSet#mapJournalsAndReportErrors}.
   */
  private interface JournalClosure {
    /**
     * The operation on JournalAndStream.
     * @param jas Object on which operations are performed.
     * @throws IOException
     */
    public void apply(JournalAndStream jas) throws IOException;
  }

  /**
   * Apply the given operation across all of the journal managers, disabling
   * any non-required journal for which the closure throws. A failure on a
   * required journal aborts every active journal and calls terminate.
   * @param closure {@link JournalClosure} object encapsulating the operation.
   * @param status message used for logging errors (e.g. "opening journal")
   * @throws IOException If, after the failed journals have been disabled,
   *         NameNodeResourcePolicy reports that not enough journals remain
   *         available.
   */
  private void mapJournalsAndReportErrors(
      JournalClosure closure, String status) throws IOException{

    List<JournalAndStream> badJAS = Lists.newLinkedList();
    for (JournalAndStream jas : journals) {
      try {
        closure.apply(jas);
      } catch (Throwable t) {
        if (jas.isRequired()) {
          final String msg = "Error: " + status + " failed for required journal ("
            + jas + ")";
          LOG.fatal(msg, t);
          // If we fail on *any* of the required journals, then we must not
          // continue on any of the other journals. Abort them to ensure that
          // retry behavior doesn't allow them to keep going in any way.
          abortAllJournals();
          // the current policy is to shutdown the NN on errors to shared edits
          // dir. There are many code paths to shared edits failures - syncs,
          // roll of edits etc. All of them go through this common function
          // where the isRequired() check is made. Applying exit policy here
          // to catch all code paths.
          terminate(1, msg);
        } else {
          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
          badJAS.add(jas);
        }
      }
    }
    disableAndReportErrorOnJournals(badJAS);
    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals)) {
      String message = status + " failed for too many journals";
      LOG.error("Error: " + message);
      throw new IOException(message);
    }
  }

  /**
   * Abort all of the underlying streams.
   */
  private void abortAllJournals() {
    for (JournalAndStream jas : journals) {
      if (jas.isActive()) {
        jas.abort();
      }
    }
  }

  /**
   * An implementation of EditLogOutputStream that applies a requested method on
   * all the journals that are currently active.
   */
  private class JournalSetOutputStream extends EditLogOutputStream {

    JournalSetOutputStream() throws IOException {
      super();
    }

    @Override
    public void write(final FSEditLogOp op)
        throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().write(op);
          }
        }
      }, "write op");
    }

    @Override
    public void writeRaw(final byte[] data, final int offset, final int length)
        throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().writeRaw(data, offset, length);
          }
        }
      }, "write bytes");
    }

    @Override
    public void create() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().create();
          }
        }
      }, "create");
    }

    // closeStream() and abort() on JournalAndStream are themselves no-ops
    // when no stream is held, so close() and abort() below need no
    // isActive() guard.
    @Override
    public void close() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.closeStream();
        }
      }, "close");
    }

    @Override
    public void abort() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.abort();
        }
      }, "abort");
    }

    @Override
    public void setReadyToFlush() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().setReadyToFlush();
          }
        }
      }, "setReadyToFlush");
    }

    @Override
    protected void flushAndSync(final boolean durable) throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().flushAndSync(durable);
          }
        }
      }, "flushAndSync");
    }

    @Override
    public void flush() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().flush();
          }
        }
      }, "flush");
    }

    /** @return true as soon as any active stream asks for a forced sync. */
    @Override
    public boolean shouldForceSync() {
      for (JournalAndStream js : journals) {
        if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
          return true;
        }
      }
      return false;
    }

    /**
     * @return the sync count of the first active stream in iteration order,
     *         or 0 when no journal is active
     */
    @Override
    protected long getNumSync() {
      for (JournalAndStream jas : journals) {
        if (jas.isActive()) {
          return jas.getCurrentStream().getNumSync();
        }
      }
      return 0;
    }
  }

  /**
   * Forwards the buffer capacity to every journal manager. This method does
   * not declare IOException, so a "too many journals failed" error is logged
   * (with its cause) instead of being propagated.
   */
  @Override
  public void setOutputBufferCapacity(final int size) {
    try {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.getManager().setOutputBufferCapacity(size);
        }
      }, "setOutputBufferCapacity");
    } catch (IOException e) {
      LOG.error("Error in setting outputbuffer capacity", e);
    }
  }

  /** @return the live internal journal list (not a copy). */
  List<JournalAndStream> getAllJournalStreams() {
    return journals;
  }

  /** @return a new list holding the manager of every journal in the set. */
  List<JournalManager> getJournalManagers() {
    List<JournalManager> jList = new ArrayList<JournalManager>();
    for (JournalAndStream j : journals) {
      jList.add(j.getManager());
    }
    return jList;
  }

  /** Wraps the manager in a JournalAndStream and appends it to the set. */
  void add(JournalManager j, boolean required) {
    JournalAndStream jas = new JournalAndStream(j, required);
    journals.add(jas);
  }

  /**
   * Removes the first journal whose manager equals {@code j}, aborting its
   * stream first. Does nothing when no journal matches.
   */
  void remove(JournalManager j) {
    JournalAndStream jasToRemove = null;
    for (JournalAndStream jas: journals) {
      if (jas.getManager().equals(j)) {
        jasToRemove = jas;
        break;
      }
    }
    if (jasToRemove != null) {
      jasToRemove.abort();
      journals.remove(jasToRemove);
    }
  }

  @Override
  public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
      }
    }, "purgeLogsOlderThan " + minTxIdToKeep);
  }

  @Override
  public void recoverUnfinalizedSegments() throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.getManager().recoverUnfinalizedSegments();
      }
    }, "recoverUnfinalizedSegments");
  }

  /**
   * Return a manifest of what finalized edit logs are available. All available
   * edit logs are returned starting from the transaction id passed.
   *
   * Only journals whose manager is a FileJournalManager contribute logs; a
   * manager that fails to list its logs is logged and skipped.
   *
   * @param fromTxId Starting transaction id to read the logs.
   * @param forReading passed through unchanged to
   *        FileJournalManager#getRemoteEditLogs -- presumably whether the
   *        caller intends to read the listed logs; confirm against that
   *        method.
   * @return RemoteEditLogManifest object.
   */
  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId,
      boolean forReading) {
    // Collect RemoteEditLogs available from each FileJournalManager
    List<RemoteEditLog> allLogs = Lists.newArrayList();
    for (JournalAndStream j : journals) {
      if (j.getManager() instanceof FileJournalManager) {
        FileJournalManager fjm = (FileJournalManager)j.getManager();
        try {
          allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, forReading, false));
        } catch (Throwable t) {
          LOG.warn("Cannot list edit logs in " + fjm, t);
        }
      }
    }

    // Group logs by their starting txid
    ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
      Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
    long curStartTxId = fromTxId;

    List<RemoteEditLog> logs = Lists.newArrayList();
    while (true) {
      ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
      if (logGroup.isEmpty()) {
        // we have a gap in logs - for example because we recovered some old
        // storage directory with ancient logs. Clear out any logs we've
        // accumulated so far, and then skip to the next segment of logs
        // after the gap.
        SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
        startTxIds = startTxIds.tailSet(curStartTxId);
        if (startTxIds.isEmpty()) {
          break;
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Found gap in logs at " + curStartTxId + ": " +
                "not returning previous logs in manifest.");
          }
          logs.clear();
          curStartTxId = startTxIds.first();
          continue;
        }
      }

      // Find the one that extends the farthest forward
      RemoteEditLog bestLog = Collections.max(logGroup);
      logs.add(bestLog);
      // And then start looking from after that point
      curStartTxId = bestLog.getEndTxId() + 1;
    }
    RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Generated manifest for logs since " + fromTxId + ":"
          + ret);
    }
    return ret;
  }

  /**
   * Add sync times to the buffer.
   *
   * @return the total sync time of each active stream, each followed by a
   *         single space; empty when no journal is active
   */
  String getSyncTimes() {
    StringBuilder buf = new StringBuilder();
    for (JournalAndStream jas : journals) {
      if (jas.isActive()) {
        buf.append(jas.getCurrentStream().getTotalSyncTime());
        buf.append(" ");
      }
    }
    return buf.toString();
  }
}