/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.server;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.math.LongRange;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalNotFormattedException;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
import org.apache.hadoop.hdfs.util.BestEffortLongFile;
import org.apache.hadoop.hdfs.util.PersistentLongFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StopWatch;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.protobuf.TextFormat;

/**
 * A JournalNode can manage journals for several clusters at once.
 * Each such journal is entirely independent despite being hosted by
 * the same JVM.
 */
public class Journal implements Closeable {
  static final Log LOG = LogFactory.getLog(Journal.class);


  // Current writing state
  private EditLogOutputStream curSegment;
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;
  private long nextTxId = HdfsConstants.INVALID_TXID;
  private long highestWrittenTxId = 0;

  private final String journalId;

  private final JNStorage storage;

  /**
   * When a new writer comes along, it asks each node to promise
   * to ignore requests from any previous writer, as identified
   * by epoch number. In order to make such a promise, the epoch
   * number of that writer is stored persistently on disk.
   */
  private PersistentLongFile lastPromisedEpoch;

  /**
   * Each IPC that comes from a given client contains a serial number
   * which only increases from the client's perspective. Whenever
   * we switch epochs, we reset this back to -1. Whenever an IPC
   * comes from a client, we ensure that it is strictly higher
   * than any previous IPC. This guards against any bugs in the IPC
   * layer that would re-order IPCs or cause a stale retry from an old
   * request to resurface and confuse things.
   */
  private long currentEpochIpcSerial = -1;

  /**
   * The epoch number of the last writer to actually write a transaction.
   * This is used to differentiate log segments after a crash at the very
   * beginning of a segment. See the 'testNewerVersionOfSegmentWins'
   * test case.
   */
  private PersistentLongFile lastWriterEpoch;

  /**
   * Lower-bound on the last committed transaction ID. This is not
   * depended upon for correctness, but acts as a sanity check
   * during the recovery procedures, and as a visibility mark
   * for clients reading in-progress logs.
   */
  private BestEffortLongFile committedTxnId;

  public static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
  public static final String LAST_WRITER_EPOCH = "last-writer-epoch";
  private static final String COMMITTED_TXID_FILENAME = "committed-txid";

  private final FileJournalManager fjm;

  private final JournalMetrics metrics;

  /**
   * Time threshold for sync calls, beyond which a warning should be logged to the console.
   */
  private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;

  Journal(Configuration conf, File logDir, String journalId,
      StartupOption startOpt, StorageErrorReporter errorReporter)
      throws IOException {
    storage = new JNStorage(conf, logDir, startOpt, errorReporter);
    this.journalId = journalId;

    refreshCachedData();

    this.fjm = storage.getJournalManager();

    this.metrics = JournalMetrics.create(this);

    EditLogFile latest = scanStorageForLatestEdits();
    if (latest != null) {
      highestWrittenTxId = latest.getLastTxId();
    }
  }

  /**
   * Reload any data that may have been cached. This is necessary
   * when we first load the Journal, but also after any formatting
   * operation, since the cached data is no longer relevant.
   */
  private synchronized void refreshCachedData() {
    IOUtils.closeStream(committedTxnId);

    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    this.lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    this.lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    this.committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);
  }

  /**
   * Scan the local storage directory, and return the segment containing
   * the highest transaction.
   * @return the EditLogFile with the highest transaction ID, or null
   *         if no files exist.
   */
  private synchronized EditLogFile scanStorageForLatestEdits() throws IOException {
    if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
      return null;
    }

    LOG.info("Scanning storage " + fjm);
    List<EditLogFile> files = fjm.getLogFiles(0);

    while (!files.isEmpty()) {
      EditLogFile latestLog = files.remove(files.size() - 1);
      latestLog.scanLog();
      LOG.info("Latest log is " + latestLog);
      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
        // the log contains no transactions
        LOG.warn("Latest log " + latestLog + " has no transactions. " +
            "Moving it aside and looking for a previous log");
        latestLog.moveAsideEmptyFile();
      } else {
        return latestLog;
      }
    }

    LOG.info("No files in " + fjm);
    return null;
  }

  /**
   * Format the local storage with the given namespace.
   */
  void format(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo);
    LOG.info("Formatting " + this + " with namespace info: " +
        nsInfo);
    storage.format(nsInfo);
    refreshCachedData();
  }

  /**
   * Unlock and release resources.
   */
  @Override // Closeable
  public void close() throws IOException {
    storage.close();
    IOUtils.closeStream(committedTxnId);
    IOUtils.closeStream(curSegment);
  }

  JNStorage getStorage() {
    return storage;
  }

  String getJournalId() {
    return journalId;
  }

  /**
   * @return the last epoch for which this node has promised to reject
   *         any writer with a lower epoch, or 0 if no promises have been made.
   */
  synchronized long getLastPromisedEpoch() throws IOException {
    checkFormatted();
    return lastPromisedEpoch.get();
  }

  public synchronized long getLastWriterEpoch() throws IOException {
    checkFormatted();
    return lastWriterEpoch.get();
  }

  synchronized long getCommittedTxnIdForTests() throws IOException {
    return committedTxnId.get();
  }

  synchronized long getCurrentLagTxns() throws IOException {
    long committed = committedTxnId.get();
    if (committed == 0) {
      return 0;
    }

    return Math.max(committed - highestWrittenTxId, 0L);
  }

  synchronized long getHighestWrittenTxId() {
    return highestWrittenTxId;
  }

  @VisibleForTesting
  JournalMetrics getMetricsForTests() {
    return metrics;
  }

  /**
   * Try to create a new epoch for this journal.
   * @param nsInfo the namespace, which is verified for consistency or used to
   *     format, if the Journal has not yet been written to.
   * @param epoch the epoch to start
   * @return the status information necessary to begin recovery
   * @throws IOException if the node has already made a promise to another
   *     writer with a higher epoch number, if the namespace is inconsistent,
   *     or if a disk error occurs.
   */
  synchronized NewEpochResponseProto newEpoch(
      NamespaceInfo nsInfo, long epoch) throws IOException {

    checkFormatted();
    storage.checkConsistentNamespace(nsInfo);

    // Check that the new epoch being proposed is in fact newer than
    // any other that we've promised.
    if (epoch <= getLastPromisedEpoch()) {
      throw new IOException("Proposed epoch " + epoch + " <= last promise " +
          getLastPromisedEpoch());
    }

    updateLastPromisedEpoch(epoch);
    abortCurSegment();

    NewEpochResponseProto.Builder builder =
        NewEpochResponseProto.newBuilder();

    EditLogFile latestFile = scanStorageForLatestEdits();

    if (latestFile != null) {
      builder.setLastSegmentTxId(latestFile.getFirstTxId());
    }

    return builder.build();
  }

  private void updateLastPromisedEpoch(long newEpoch) throws IOException {
    LOG.info("Updating lastPromisedEpoch from " + lastPromisedEpoch.get() +
        " to " + newEpoch + " for client " + Server.getRemoteIp());
    lastPromisedEpoch.set(newEpoch);

    // Since we have a new writer, reset the IPC serial - it will start
    // counting again from 0 for this writer.
    currentEpochIpcSerial = -1;
  }

  private void abortCurSegment() throws IOException {
    if (curSegment == null) {
      return;
    }

    curSegment.abort();
    curSegment = null;
    curSegmentTxId = HdfsConstants.INVALID_TXID;
  }

  /**
   * Write a batch of edits to the journal.
   * {@see QJournalProtocol#journal(RequestInfo, long, long, int, byte[])}
   */
  synchronized void journal(RequestInfo reqInfo,
      long segmentTxId, long firstTxnId,
      int numTxns, byte[] records) throws IOException {
    checkFormatted();
    checkWriteRequest(reqInfo);

    checkSync(curSegment != null,
        "Can't write, no segment open");

    if (curSegmentTxId != segmentTxId) {
      // Sanity check: it is possible that the writer will fail IPCs
      // on both the finalize() and then the start() of the next segment.
      // This could cause us to continue writing to an old segment
      // instead of rolling to a new one, which breaks one of the
      // invariants in the design. If it happens, abort the segment
      // and throw an exception.
      JournalOutOfSyncException e = new JournalOutOfSyncException(
          "Writer out of sync: it thinks it is writing segment " + segmentTxId
          + " but current segment is " + curSegmentTxId);
      abortCurSegment();
      throw e;
    }

    checkSync(nextTxId == firstTxnId,
        "Can't write txid " + firstTxnId + " expecting nextTxId=" + nextTxId);

    long lastTxnId = firstTxnId + numTxns - 1;
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId);
    }

    // If the edit has already been marked as committed, we know
    // it has been fsynced on a quorum of other nodes, and we are
    // "catching up" with the rest. Hence we do not need to fsync.
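    // For example, if committedTxnId is already 150 and this batch ends at
    // txid 140, a quorum of nodes has durably stored these edits, so this
    // node can safely skip the fsync below.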
    boolean isLagging = lastTxnId <= committedTxnId.get();
    boolean shouldFsync = !isLagging;

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    StopWatch sw = new StopWatch();
    sw.start();
    curSegment.flush(shouldFsync);
    sw.stop();

    long nanoSeconds = sw.now();
    metrics.addSync(
        TimeUnit.MICROSECONDS.convert(nanoSeconds, TimeUnit.NANOSECONDS));
    long milliSeconds = TimeUnit.MILLISECONDS.convert(
        nanoSeconds, TimeUnit.NANOSECONDS);

    if (milliSeconds > WARN_SYNC_MILLIS_THRESHOLD) {
      LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId +
          " took " + milliSeconds + "ms");
    }

    if (isLagging) {
      // This batch of edits has already been committed on a quorum of other
      // nodes. So, we are in "catch up" mode. This gets its own metric.
      metrics.batchesWrittenWhileLagging.incr(1);
    }

    metrics.batchesWritten.incr(1);
    metrics.bytesWritten.incr(records.length);
    metrics.txnsWritten.incr(numTxns);

    highestWrittenTxId = lastTxnId;
    nextTxId = lastTxnId + 1;
  }

  public void heartbeat(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);
  }

  /**
   * Ensure that the given request is coming from the correct writer and
   * in order.
   * @param reqInfo the request info
   * @throws IOException if the request is invalid.
   */
  private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
    // Invariant 25 from ZAB paper
    if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is less than the last promised epoch " +
          lastPromisedEpoch.get());
    } else if (reqInfo.getEpoch() > lastPromisedEpoch.get()) {
      // A newer client has arrived. Fence any previous writers by updating
      // the promise.
      updateLastPromisedEpoch(reqInfo.getEpoch());
    }

    // Ensure that the IPCs are arriving in-order as expected.
    checkSync(reqInfo.getIpcSerialNumber() > currentEpochIpcSerial,
        "IPC serial %s from client %s was not higher than prior highest " +
        "IPC serial %s", reqInfo.getIpcSerialNumber(),
        Server.getRemoteIp(),
        currentEpochIpcSerial);
    currentEpochIpcSerial = reqInfo.getIpcSerialNumber();

    if (reqInfo.hasCommittedTxId()) {
      Preconditions.checkArgument(
          reqInfo.getCommittedTxId() >= committedTxnId.get(),
          "Client trying to move committed txid backward from " +
          committedTxnId.get() + " to " + reqInfo.getCommittedTxId());

      committedTxnId.set(reqInfo.getCommittedTxId());
    }
  }

  private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);

    if (reqInfo.getEpoch() != lastWriterEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is not the current writer epoch " +
          lastWriterEpoch.get());
    }
  }

  public synchronized boolean isFormatted() {
    return storage.isFormatted();
  }

  private void checkFormatted() throws JournalNotFormattedException {
    if (!isFormatted()) {
      throw new JournalNotFormattedException("Journal " +
          storage.getSingularStorageDir() + " not formatted");
    }
  }

  /**
   * @throws JournalOutOfSyncException if the given expression is not true.
   *         The message of the exception is formatted using the 'msg' and
   *         'formatArgs' parameters.
   */
  private void checkSync(boolean expression, String msg,
      Object... formatArgs) throws JournalOutOfSyncException {
    if (!expression) {
      throw new JournalOutOfSyncException(String.format(msg, formatArgs));
    }
  }

  /**
   * @throws AssertionError if the given expression is not true.
   *         The message of the exception is formatted using the 'msg' and
   *         'formatArgs' parameters.
   *
   * This should be used in preference to Java's built-in assert in
   * non-performance-critical paths, where a failure of this invariant
   * might cause the protocol to lose data.
   */
  private void alwaysAssert(boolean expression, String msg,
      Object... formatArgs) {
    if (!expression) {
      throw new AssertionError(String.format(msg, formatArgs));
    }
  }

  /**
   * Start a new segment at the given txid. The previous segment
   * must have already been finalized.
   */
  public synchronized void startLogSegment(RequestInfo reqInfo, long txid,
      int layoutVersion) throws IOException {
    assert fjm != null;
    checkFormatted();
    checkRequest(reqInfo);

    if (curSegment != null) {
      LOG.warn("Client is requesting a new log segment " + txid +
          " though we are already writing " + curSegment + ". " +
          "Aborting the current segment in order to begin the new one.");
      // The writer may have lost a connection to us and is now
      // re-connecting after the connection came back.
      // We should abort our own old segment.
      abortCurSegment();
    }

    // Paranoid sanity check: we should never overwrite a finalized log file.
    // Additionally, if it's in-progress, it should have at most 1 transaction.
    // This can happen if the writer crashes exactly at the start of a segment.
    EditLogFile existing = fjm.getLogFile(txid);
    if (existing != null) {
      if (!existing.isInProgress()) {
        throw new IllegalStateException("Already have a finalized segment " +
            existing + " beginning at " + txid);
      }

      // If it's in-progress, it should only contain one transaction,
      // because the "startLogSegment" transaction is written alone at the
      // start of each segment.
      existing.scanLog();
      if (existing.getLastTxId() != existing.getFirstTxId()) {
        throw new IllegalStateException("The log file " +
            existing + " seems to contain valid transactions");
      }
    }

    long curLastWriterEpoch = lastWriterEpoch.get();
    if (curLastWriterEpoch != reqInfo.getEpoch()) {
      LOG.info("Updating lastWriterEpoch from " + curLastWriterEpoch +
          " to " + reqInfo.getEpoch() + " for client " +
          Server.getRemoteIp());
      lastWriterEpoch.set(reqInfo.getEpoch());
    }

    // The fact that we are starting a segment at this txid indicates
    // that any previous recovery for this same segment was aborted.
    // Otherwise, no writer would have started writing. So, we can
    // remove the record of the older segment here.
    purgePaxosDecision(txid);

    curSegment = fjm.startLogSegment(txid, layoutVersion);
    curSegmentTxId = txid;
    nextTxId = txid;
  }

  /**
   * Finalize the log segment at the given transaction ID.
   */
  public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
      long endTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    boolean needsValidation = true;

    // Finalizing the log that the writer was just writing.
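    // In that case the segment is still open in this JVM, so the in-memory
    // nextTxId is authoritative and we can skip re-scanning the file on disk.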
    if (startTxId == curSegmentTxId) {
      if (curSegment != null) {
        curSegment.close();
        curSegment = null;
        curSegmentTxId = HdfsConstants.INVALID_TXID;
      }

      checkSync(nextTxId == endTxId + 1,
          "Trying to finalize in-progress log segment %s to end at " +
          "txid %s but only written up to txid %s",
          startTxId, endTxId, nextTxId - 1);
      // No need to validate the edit log if the client is finalizing
      // the log segment that it was just writing to.
      needsValidation = false;
    }

    FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
    if (elf == null) {
      throw new JournalOutOfSyncException("No log file to finalize at " +
          "transaction ID " + startTxId);
    }

    if (elf.isInProgress()) {
      if (needsValidation) {
        LOG.info("Validating log segment " + elf.getFile() + " about to be " +
            "finalized");
        elf.scanLog();

        checkSync(elf.getLastTxId() == endTxId,
            "Trying to finalize in-progress log segment %s to end at " +
            "txid %s but log %s on disk only contains up to txid %s",
            startTxId, endTxId, elf.getFile(), elf.getLastTxId());
      }
      fjm.finalizeLogSegment(startTxId, endTxId);
    } else {
      Preconditions.checkArgument(endTxId == elf.getLastTxId(),
          "Trying to re-finalize already finalized log " +
          elf + " with different endTxId " + endTxId);
    }

    // Once a log segment is finalized, its length will never be re-decided.
    // During recovery, we treat a finalized segment the same as an accepted
    // recovery. Thus, we no longer need to keep track of the previously-
    // accepted decision. The existence of the finalized log segment is enough.
    purgePaxosDecision(elf.getFirstTxId());
  }

  /**
   * @see JournalManager#purgeLogsOlderThan(long)
   */
  public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
      long minTxIdToKeep) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    storage.purgeDataOlderThan(minTxIdToKeep);
  }

  /**
   * Remove the previously-recorded 'accepted recovery' information
   * for a given log segment, once it is no longer necessary.
   * @param segmentTxId the transaction ID to purge
   * @throws IOException if the file could not be deleted
   */
  private void purgePaxosDecision(long segmentTxId) throws IOException {
    File paxosFile = storage.getPaxosFile(segmentTxId);
    if (paxosFile.exists()) {
      if (!paxosFile.delete()) {
        throw new IOException("Unable to delete paxos file " + paxosFile);
      }
    }
  }

  /**
   * @see QJournalProtocol#getEditLogManifest(String, long, boolean)
   */
  public RemoteEditLogManifest getEditLogManifest(long sinceTxId,
      boolean inProgressOk) throws IOException {
    // No need to checkRequest() here - anyone may ask for the list
    // of segments.
    checkFormatted();

    List<RemoteEditLog> logs = fjm.getRemoteEditLogs(sinceTxId, inProgressOk);

    if (inProgressOk) {
      RemoteEditLog log = null;
      for (Iterator<RemoteEditLog> iter = logs.iterator(); iter.hasNext();) {
        log = iter.next();
        if (log.isInProgress()) {
          iter.remove();
          break;
        }
      }
      if (log != null && log.isInProgress()) {
        logs.add(new RemoteEditLog(log.getStartTxId(), getHighestWrittenTxId(),
            true));
      }
    }

    return new RemoteEditLogManifest(logs);
  }

  /**
   * @return the current state of the given segment, or null if the
   *         segment does not exist.
   */
  @VisibleForTesting
  SegmentStateProto getSegmentInfo(long segmentTxId)
      throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
      return null;
    }
    if (elf.isInProgress()) {
      elf.scanLog();
    }
    if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
      LOG.info("Edit log file " + elf + " appears to be empty. " +
          "Moving it aside...");
      elf.moveAsideEmptyFile();
      return null;
    }
    SegmentStateProto ret = SegmentStateProto.newBuilder()
        .setStartTxId(segmentTxId)
        .setEndTxId(elf.getLastTxId())
        .setIsInProgress(elf.isInProgress())
        .build();
    LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " +
        TextFormat.shortDebugString(ret));
    return ret;
  }

  /**
   * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
   */
  public synchronized PrepareRecoveryResponseProto prepareRecovery(
      RequestInfo reqInfo, long segmentTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    PrepareRecoveryResponseProto.Builder builder =
        PrepareRecoveryResponseProto.newBuilder();

    PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
    completeHalfDoneAcceptRecovery(previouslyAccepted);

    SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
    boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();

    if (previouslyAccepted != null && !hasFinalizedSegment) {
      SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
      assert acceptedState.getEndTxId() == segInfo.getEndTxId() :
          "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n" +
          "on disk: " + TextFormat.shortDebugString(segInfo);

      builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
          .setSegmentState(previouslyAccepted.getSegmentState());
    } else {
      if (segInfo != null) {
        builder.setSegmentState(segInfo);
      }
    }

    builder.setLastWriterEpoch(lastWriterEpoch.get());
    if (committedTxnId.get() != HdfsConstants.INVALID_TXID) {
      builder.setLastCommittedTxId(committedTxnId.get());
    }

    PrepareRecoveryResponseProto resp = builder.build();
    LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(resp));
    return resp;
  }

  /**
   * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
   */
  public synchronized void acceptRecovery(RequestInfo reqInfo,
      SegmentStateProto segment, URL fromUrl)
      throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    long segmentTxId = segment.getStartTxId();

    // Basic sanity checks that the segment is well-formed and contains
    // at least one transaction.
    Preconditions.checkArgument(segment.getEndTxId() > 0 &&
        segment.getEndTxId() >= segmentTxId,
        "bad recovery state for segment %s: %s",
        segmentTxId, TextFormat.shortDebugString(segment));

    PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
    PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
        .setAcceptedInEpoch(reqInfo.getEpoch())
        .setSegmentState(segment)
        .build();

    // If we previously acted on acceptRecovery() from a higher-numbered writer,
    // this call is out of sync. We should never actually trigger this, since the
    // checkRequest() call above should filter non-increasing epoch numbers.
    if (oldData != null) {
      alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(),
          "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n",
          oldData, newData);
    }

    File syncedFile = null;

    SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
    if (currentSegment == null ||
        currentSegment.getEndTxId() != segment.getEndTxId()) {
      if (currentSegment == null) {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": no current segment in place");

        // Update the highest txid for lag metrics
        highestWrittenTxId = Math.max(segment.getEndTxId(),
            highestWrittenTxId);
      } else {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": old segment " + TextFormat.shortDebugString(currentSegment) +
            " is not the right length");

        // Paranoid sanity check: if the new log is shorter than the log we
        // currently have, we should not end up discarding any transactions
        // which are already committed.
        if (txnRange(currentSegment).containsLong(committedTxnId.get()) &&
            !txnRange(segment).containsLong(committedTxnId.get())) {
          throw new AssertionError(
              "Cannot replace segment " +
              TextFormat.shortDebugString(currentSegment) +
              " with new segment " +
              TextFormat.shortDebugString(segment) +
              ": would discard already-committed txn " +
              committedTxnId.get());
        }

        // Another paranoid check: we should not be asked to synchronize a log
        // on top of a finalized segment.
        alwaysAssert(currentSegment.getIsInProgress(),
            "Should never be asked to synchronize a different log on top of an " +
            "already-finalized segment");

        // If we're shortening the log, update our highest txid
        // used for lag metrics.
        if (txnRange(currentSegment).containsLong(highestWrittenTxId)) {
          highestWrittenTxId = segment.getEndTxId();
        }
      }
      syncedFile = syncLog(reqInfo, segment, fromUrl);

    } else {
      LOG.info("Skipping download of log " +
          TextFormat.shortDebugString(segment) +
          ": already have up-to-date logs");
    }

    // This is one of the few places in the protocol where we have a single
    // RPC that results in two distinct actions:
    //
    // - 1) Downloads the new log segment data (above)
    // - 2) Records the new Paxos data about the synchronized segment (below)
    //
    // These need to be treated as a transaction from the perspective
    // of any external process. We do this by treating the persistPaxosData()
    // success as the "commit" of an atomic transaction. If we fail before
    // this point, the downloaded edit log will only exist at a temporary
    // path, and thus not change any externally visible state. If we fail
    // after this point, then any future prepareRecovery() call will see
    // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
    // roll forward the rename of the referenced log file.
    //
    // See also: HDFS-3955
    //
    // The fault points here are exercised by the randomized fault injection
    // test case to ensure that this atomic "transaction" operates correctly.
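    //
    // The JournalFaultInjector hooks below bracket that commit point, so
    // tests can simulate a crash immediately before or after the Paxos data
    // reaches disk.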
    JournalFaultInjector.get().beforePersistPaxosData();
    persistPaxosData(segmentTxId, newData);
    JournalFaultInjector.get().afterPersistPaxosData();

    if (syncedFile != null) {
      FileUtil.replaceFile(syncedFile,
          storage.getInProgressEditLog(segmentTxId));
    }

    LOG.info("Accepted recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(newData));
  }

  private LongRange txnRange(SegmentStateProto seg) {
    Preconditions.checkArgument(seg.hasEndTxId(),
        "invalid segment: %s", seg);
    return new LongRange(seg.getStartTxId(), seg.getEndTxId());
  }

  /**
   * Synchronize a log segment from another JournalNode. The log is
   * downloaded from the provided URL into a temporary location on disk,
   * which is named based on the current request's epoch.
   *
   * @return the temporary location of the downloaded file
   */
  private File syncLog(RequestInfo reqInfo,
      final SegmentStateProto segment, final URL url) throws IOException {
    final File tmpFile = storage.getSyncLogTemporaryFile(
        segment.getStartTxId(), reqInfo.getEpoch());
    final List<File> localPaths = ImmutableList.of(tmpFile);

    LOG.info("Synchronizing log " +
        TextFormat.shortDebugString(segment) + " from " + url);
    SecurityUtil.doAsLoginUser(
        new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws IOException {
            // We may have lost our ticket since the last checkpoint; log in
            // again, just in case.
            if (UserGroupInformation.isSecurityEnabled()) {
              UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
            }

            boolean success = false;
            try {
              TransferFsImage.doGetUrl(url, localPaths, storage, true);
              assert tmpFile.exists();
              success = true;
            } finally {
              if (!success) {
                if (!tmpFile.delete()) {
                  LOG.warn("Failed to delete temporary file " + tmpFile);
                }
              }
            }
            return null;
          }
        });
    return tmpFile;
  }


  /**
   * In the case that the node crashes between downloading a log segment
   * and persisting the associated paxos recovery data, the log segment
   * will be left in its temporary location on disk. Given the paxos data,
   * we can check if this was indeed the case, and "roll forward"
   * the atomic operation.
   *
   * See the inline comments in
   * {@link #acceptRecovery(RequestInfo, SegmentStateProto, URL)} for more
   * details.
   *
   * @throws IOException if the temporary file cannot be renamed into
   *         place
   */
  private void completeHalfDoneAcceptRecovery(
      PersistedRecoveryPaxosData paxosData) throws IOException {
    if (paxosData == null) {
      return;
    }

    long segmentId = paxosData.getSegmentState().getStartTxId();
    long epoch = paxosData.getAcceptedInEpoch();

    File tmp = storage.getSyncLogTemporaryFile(segmentId, epoch);

    if (tmp.exists()) {
      File dst = storage.getInProgressEditLog(segmentId);
      LOG.info("Rolling forward previously half-completed synchronization: " +
          tmp + " -> " + dst);
      FileUtil.replaceFile(tmp, dst);
    }
  }

  /**
   * Retrieve the persisted data for recovering the given segment from disk.
   */
  private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId)
      throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    if (!f.exists()) {
      // Default instance has no fields filled in (they're optional)
      return null;
    }

    InputStream in = new FileInputStream(f);
    try {
      PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in);
      Preconditions.checkState(ret != null &&
          ret.getSegmentState().getStartTxId() == segmentTxId,
          "Bad persisted data for segment %s: %s",
          segmentTxId, ret);
      return ret;
    } finally {
      IOUtils.closeStream(in);
    }
  }

  /**
   * Persist data for recovering the given segment from disk.
   */
  private void persistPaxosData(long segmentTxId,
      PersistedRecoveryPaxosData newData) throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    boolean success = false;
    AtomicFileOutputStream fos = new AtomicFileOutputStream(f);
    try {
      newData.writeDelimitedTo(fos);
      fos.write('\n');
      // Write human-readable data after the protobuf. This is only
      // to assist in debugging -- it's not parsed at all.
      OutputStreamWriter writer = new OutputStreamWriter(fos, Charsets.UTF_8);

      writer.write(String.valueOf(newData));
      writer.write('\n');
      writer.flush();

      fos.flush();
      success = true;
    } finally {
      if (success) {
        IOUtils.closeStream(fos);
      } else {
        fos.abort();
      }
    }
  }

  synchronized void discardSegments(long startTxId) throws IOException {
    storage.getJournalManager().discardSegments(startTxId);
    // We delete all the segments after startTxId, so reset committedTxnId
    // accordingly.
    committedTxnId.set(startTxId - 1);
  }

  public synchronized void doPreUpgrade() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed. It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doPreUpgrade();
  }

  public synchronized void doUpgrade(StorageInfo sInfo) throws IOException {
    long oldCTime = storage.getCTime();
    storage.cTime = sInfo.cTime;
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = sInfo.layoutVersion;
    LOG.info("Starting upgrade of edits directory: "
        + ".\n   old LV = " + oldLV
        + "; old CTime = " + oldCTime
        + ".\n   new LV = " + storage.getLayoutVersion()
        + "; new CTime = " + storage.getCTime());
    storage.getJournalManager().doUpgrade(storage);
    storage.createPaxosDir();

    // Copy over the contents of the epoch data files to the new dir.
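    // The files involved are last-promised-epoch, last-writer-epoch and
    // committed-txid: each is read from the previous/ directory and
    // re-written under current/.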
    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    File previousDir = storage.getSingularStorageDir().getPreviousDir();

    PersistentLongFile prevLastPromisedEpoch = new PersistentLongFile(
        new File(previousDir, LAST_PROMISED_FILENAME), 0);
    PersistentLongFile prevLastWriterEpoch = new PersistentLongFile(
        new File(previousDir, LAST_WRITER_EPOCH), 0);
    BestEffortLongFile prevCommittedTxnId = new BestEffortLongFile(
        new File(previousDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    try {
      lastPromisedEpoch.set(prevLastPromisedEpoch.get());
      lastWriterEpoch.set(prevLastWriterEpoch.get());
      committedTxnId.set(prevCommittedTxnId.get());
    } finally {
      IOUtils.cleanup(LOG, prevCommittedTxnId);
    }
  }

  public synchronized void doFinalize() throws IOException {
    LOG.info("Finalizing upgrade for journal "
        + storage.getRoot() + "."
        + (storage.getLayoutVersion() == 0 ? "" :
            "\n   cur LV = " + storage.getLayoutVersion()
            + "; cur CTime = " + storage.getCTime()));
    storage.getJournalManager().doFinalize();
  }

  public Boolean canRollBack(StorageInfo storage, StorageInfo prevStorage,
      int targetLayoutVersion) throws IOException {
    return this.storage.getJournalManager().canRollBack(storage, prevStorage,
        targetLayoutVersion);
  }

  public synchronized void doRollback() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed. It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doRollback();
  }

  public Long getJournalCTime() throws IOException {
    return storage.getJournalManager().getJournalCTime();
  }
}
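
// Illustrative sketch only (not part of the class): the typical call sequence
// that the quorum journal writer drives against a Journal through the
// JournalNode RPC layer. The identifiers "reqInfo", "records", etc. are
// placeholders for values supplied by the caller.
//
//   journal.newEpoch(nsInfo, epoch);                         // fence older writers
//   journal.startLogSegment(reqInfo, txid, layoutVersion);   // open a new segment
//   journal.journal(reqInfo, txid, txid, numTxns, records);  // write batches of edits
//   journal.finalizeLogSegment(reqInfo, txid, lastTxid);     // seal the segment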