/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.server;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalNotFormattedException;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
import org.apache.hadoop.hdfs.util.BestEffortLongFile;
import org.apache.hadoop.hdfs.util.PersistentLongFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Range;
import com.google.common.collect.Ranges;
import com.google.protobuf.TextFormat;

/**
 * A JournalNode can manage journals for several clusters at once.
 * Each such journal, represented by an instance of this class, is
 * entirely independent despite being hosted by the same JVM.
 */
public class Journal implements Closeable {
  static final Log LOG = LogFactory.getLog(Journal.class);

  // Current writing state
  private EditLogOutputStream curSegment;
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;
  private long nextTxId = HdfsConstants.INVALID_TXID;
  private long highestWrittenTxId = 0;

  private final String journalId;

  private final JNStorage storage;

  /**
   * When a new writer comes along, it asks each node to promise
   * to ignore requests from any previous writer, as identified
   * by epoch number. In order to make such a promise, the epoch
   * number of that writer is stored persistently on disk.
   */
  private PersistentLongFile lastPromisedEpoch;

  /**
   * Each IPC that comes from a given client contains a serial number
   * which only increases from the client's perspective. Whenever
   * we switch epochs, we reset this back to -1. Whenever an IPC
   * comes from a client, we ensure that it is strictly higher
   * than any previous IPC. This guards against any bugs in the IPC
   * layer that would re-order IPCs or cause a stale retry from an old
   * request to resurface and confuse things.
   */
  private long currentEpochIpcSerial = -1;

  /**
   * The epoch number of the last writer to actually write a transaction.
   * This is used to differentiate log segments after a crash at the very
   * beginning of a segment. See the 'testNewerVersionOfSegmentWins'
   * test case.
   */
  private PersistentLongFile lastWriterEpoch;

  /**
   * Lower-bound on the last committed transaction ID. This is not
   * depended upon for correctness, but acts as a sanity check
   * during the recovery procedures, and as a visibility mark
   * for clients reading in-progress logs.
   */
  private BestEffortLongFile committedTxnId;

  public static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
  public static final String LAST_WRITER_EPOCH = "last-writer-epoch";
  private static final String COMMITTED_TXID_FILENAME = "committed-txid";

  private final FileJournalManager fjm;

  private final JournalMetrics metrics;

  /**
   * Time threshold for sync calls, beyond which a warning is logged.
   */
  private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;

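  /**
   * Create a Journal backed by the given log directory: initialize local
   * storage, load the persisted epoch and committed-txid files, and scan
   * the existing edit logs to determine the highest transaction already
   * written to disk.
   */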
  Journal(Configuration conf, File logDir, String journalId,
      StartupOption startOpt, StorageErrorReporter errorReporter)
      throws IOException {
    storage = new JNStorage(conf, logDir, startOpt, errorReporter);
    this.journalId = journalId;

    refreshCachedData();

    this.fjm = storage.getJournalManager();

    this.metrics = JournalMetrics.create(this);

    EditLogFile latest = scanStorageForLatestEdits();
    if (latest != null) {
      highestWrittenTxId = latest.getLastTxId();
    }
  }

  /**
   * Reload any data that may have been cached. This is necessary
   * when we first load the Journal, but also after any formatting
   * operation, since the cached data is no longer relevant.
   */
  private synchronized void refreshCachedData() {
    IOUtils.closeStream(committedTxnId);

    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    this.lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    this.lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    this.committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);
  }

  /**
   * Scan the local storage directory, and return the segment containing
   * the highest transaction.
   * @return the EditLogFile containing the highest transaction ID, or null
   * if no files exist.
   */
  private synchronized EditLogFile scanStorageForLatestEdits() throws IOException {
    if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
      return null;
    }

    LOG.info("Scanning storage " + fjm);
    List<EditLogFile> files = fjm.getLogFiles(0);

    while (!files.isEmpty()) {
      EditLogFile latestLog = files.remove(files.size() - 1);
      latestLog.scanLog();
      LOG.info("Latest log is " + latestLog);
      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
        // the log contains no transactions
        LOG.warn("Latest log " + latestLog + " has no transactions. " +
            "Moving it aside and looking for the previous log.");
        latestLog.moveAsideEmptyFile();
      } else {
        return latestLog;
      }
    }

    LOG.info("No files in " + fjm);
    return null;
  }

  /**
   * Format the local storage with the given namespace.
   */
  void format(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo);
    LOG.info("Formatting " + this + " with namespace info: " +
        nsInfo);
    storage.format(nsInfo);
    refreshCachedData();
  }

  /**
   * Unlock and release resources.
   */
  @Override // Closeable
  public void close() throws IOException {
    storage.close();
    IOUtils.closeStream(committedTxnId);
    IOUtils.closeStream(curSegment);
  }

  JNStorage getStorage() {
    return storage;
  }

  String getJournalId() {
    return journalId;
  }

  /**
   * @return the highest epoch this node has promised to, meaning that
   * requests from any writer with a lower epoch will be rejected,
   * or 0 if no promise has been made.
   */
  synchronized long getLastPromisedEpoch() throws IOException {
    checkFormatted();
    return lastPromisedEpoch.get();
  }

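  /**
   * @return the epoch of the last writer to actually write a transaction
   * to this journal.
   */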
  public synchronized long getLastWriterEpoch() throws IOException {
    checkFormatted();
    return lastWriterEpoch.get();
  }

  synchronized long getCommittedTxnIdForTests() throws IOException {
    return committedTxnId.get();
  }

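  /**
   * @return the number of transactions by which this node's highest written
   * txid lags behind the last known committed txid, or 0 if it is caught up.
   */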
  synchronized long getCurrentLagTxns() throws IOException {
    long committed = committedTxnId.get();
    if (committed == 0) {
      return 0;
    }

    return Math.max(committed - highestWrittenTxId, 0L);
  }

  synchronized long getHighestWrittenTxId() {
    return highestWrittenTxId;
  }

  @VisibleForTesting
  JournalMetrics getMetricsForTests() {
    return metrics;
  }

  /**
   * Try to create a new epoch for this journal.
   * @param nsInfo the namespace, which is verified for consistency or used to
   * format, if the Journal has not yet been written to.
   * @param epoch the epoch to start
   * @return the status information necessary to begin recovery
   * @throws IOException if the node has already made a promise to another
   * writer with a higher epoch number, if the namespace is inconsistent,
   * or if a disk error occurs.
   */
  synchronized NewEpochResponseProto newEpoch(
      NamespaceInfo nsInfo, long epoch) throws IOException {

    checkFormatted();
    storage.checkConsistentNamespace(nsInfo);

    // Check that the proposed epoch is in fact newer than any epoch
    // we have previously promised to.
    if (epoch <= getLastPromisedEpoch()) {
      throw new IOException("Proposed epoch " + epoch + " <= last promise " +
          getLastPromisedEpoch());
    }

    updateLastPromisedEpoch(epoch);
    abortCurSegment();

    NewEpochResponseProto.Builder builder =
        NewEpochResponseProto.newBuilder();

    EditLogFile latestFile = scanStorageForLatestEdits();

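    // Report the start txid of the segment containing the highest written
    // transaction, so the new writer knows which segment recovery must cover.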
    if (latestFile != null) {
      builder.setLastSegmentTxId(latestFile.getFirstTxId());
    }

    return builder.build();
  }

  private void updateLastPromisedEpoch(long newEpoch) throws IOException {
    LOG.info("Updating lastPromisedEpoch from " + lastPromisedEpoch.get() +
        " to " + newEpoch + " for client " + Server.getRemoteIp());
    lastPromisedEpoch.set(newEpoch);

    // Since we have a new writer, reset the IPC serial - it will start
    // counting again from 0 for this writer.
    currentEpochIpcSerial = -1;
  }

  private void abortCurSegment() throws IOException {
    if (curSegment == null) {
      return;
    }

    curSegment.abort();
    curSegment = null;
    curSegmentTxId = HdfsConstants.INVALID_TXID;
  }

  /**
   * Write a batch of edits to the journal.
   * @see QJournalProtocol#journal(RequestInfo, long, long, int, byte[])
   */
  synchronized void journal(RequestInfo reqInfo,
      long segmentTxId, long firstTxnId,
      int numTxns, byte[] records) throws IOException {
    checkFormatted();
    checkWriteRequest(reqInfo);

    checkSync(curSegment != null,
        "Can't write, no segment open");

    if (curSegmentTxId != segmentTxId) {
      // Sanity check: it is possible that the writer will fail IPCs
      // on both the finalize() and then the start() of the next segment.
      // This could cause us to continue writing to an old segment
      // instead of rolling to a new one, which breaks one of the
      // invariants in the design. If it happens, abort the segment
      // and throw an exception.
      JournalOutOfSyncException e = new JournalOutOfSyncException(
          "Writer out of sync: it thinks it is writing segment " + segmentTxId
          + " but current segment is " + curSegmentTxId);
      abortCurSegment();
      throw e;
    }

    checkSync(nextTxId == firstTxnId,
        "Can't write txid " + firstTxnId + " expecting nextTxId=" + nextTxId);

    long lastTxnId = firstTxnId + numTxns - 1;
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId);
    }

    // If the edit has already been marked as committed, we know
    // it has been fsynced on a quorum of other nodes, and we are
    // "catching up" with the rest. Hence we do not need to fsync.
    boolean isLagging = lastTxnId <= committedTxnId.get();
    boolean shouldFsync = !isLagging;

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    Stopwatch sw = new Stopwatch();
    sw.start();
    curSegment.flush(shouldFsync);
    sw.stop();

    metrics.addSync(sw.elapsedTime(TimeUnit.MICROSECONDS));
    if (sw.elapsedTime(TimeUnit.MILLISECONDS) > WARN_SYNC_MILLIS_THRESHOLD) {
      LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId +
               " took " + sw.elapsedTime(TimeUnit.MILLISECONDS) + "ms");
    }

    if (isLagging) {
      // This batch of edits has already been committed on a quorum of other
      // nodes. So, we are in "catch up" mode. This gets its own metric.
      metrics.batchesWrittenWhileLagging.incr(1);
    }

    metrics.batchesWritten.incr(1);
    metrics.bytesWritten.incr(records.length);
    metrics.txnsWritten.incr(numTxns);

    highestWrittenTxId = lastTxnId;
    nextTxId = lastTxnId + 1;
  }

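  /**
   * Handle a heartbeat from a writer. No data is written, but the request is
   * still validated, which also fences any older writer when the caller has
   * a newer epoch.
   */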
  public void heartbeat(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);
  }

  /**
   * Ensure that the given request is coming from the correct writer and in-order.
   * @param reqInfo the request info
   * @throws IOException if the request is invalid.
   */
  private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
    // Invariant 25 from ZAB paper
    if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is less than the last promised epoch " +
          lastPromisedEpoch.get());
    } else if (reqInfo.getEpoch() > lastPromisedEpoch.get()) {
      // A newer client has arrived. Fence any previous writers by updating
      // the promise.
      updateLastPromisedEpoch(reqInfo.getEpoch());
    }

    // Ensure that the IPCs are arriving in-order as expected.
    checkSync(reqInfo.getIpcSerialNumber() > currentEpochIpcSerial,
        "IPC serial %s from client %s was not higher than prior highest " +
        "IPC serial %s", reqInfo.getIpcSerialNumber(),
        Server.getRemoteIp(),
        currentEpochIpcSerial);
    currentEpochIpcSerial = reqInfo.getIpcSerialNumber();

    if (reqInfo.hasCommittedTxId()) {
      Preconditions.checkArgument(
          reqInfo.getCommittedTxId() >= committedTxnId.get(),
          "Client trying to move committed txid backward from " +
          committedTxnId.get() + " to " + reqInfo.getCommittedTxId());

      committedTxnId.set(reqInfo.getCommittedTxId());
    }
  }

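  /**
   * In addition to the checks performed by {@link #checkRequest(RequestInfo)},
   * verify that the caller's epoch matches the epoch of the last writer to
   * start a log segment, since only that writer may append edits.
   */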
  private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);

    if (reqInfo.getEpoch() != lastWriterEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is not the current writer epoch " +
          lastWriterEpoch.get());
    }
  }

  public synchronized boolean isFormatted() {
    return storage.isFormatted();
  }

  private void checkFormatted() throws JournalNotFormattedException {
    if (!isFormatted()) {
      throw new JournalNotFormattedException("Journal " +
          storage.getSingularStorageDir() + " not formatted");
    }
  }

  /**
   * @throws JournalOutOfSyncException if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   */
  private void checkSync(boolean expression, String msg,
      Object... formatArgs) throws JournalOutOfSyncException {
    if (!expression) {
      throw new JournalOutOfSyncException(String.format(msg, formatArgs));
    }
  }

  /**
   * @throws AssertionError if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   *
   * This should be used in preference to Java's built-in assert in
   * non-performance-critical paths, where a failure of this invariant
   * might cause the protocol to lose data.
   */
  private void alwaysAssert(boolean expression, String msg,
      Object... formatArgs) {
    if (!expression) {
      throw new AssertionError(String.format(msg, formatArgs));
    }
  }

  /**
   * Start a new segment at the given txid. The previous segment
   * must have already been finalized.
   */
  public synchronized void startLogSegment(RequestInfo reqInfo, long txid,
      int layoutVersion) throws IOException {
    assert fjm != null;
    checkFormatted();
    checkRequest(reqInfo);

    if (curSegment != null) {
      LOG.warn("Client is requesting a new log segment " + txid +
          " though we are already writing " + curSegment + ". " +
          "Aborting the current segment in order to begin the new one.");
      // The writer may have lost a connection to us and is now
      // re-connecting after the connection came back.
      // We should abort our own old segment.
      abortCurSegment();
    }

    // Paranoid sanity check: we should never overwrite a finalized log file.
    // Additionally, if a file is already in-progress, it should contain at
    // most one transaction: such a file can be left behind if the writer
    // crashed exactly at the start of a segment.
    EditLogFile existing = fjm.getLogFile(txid);
    if (existing != null) {
      if (!existing.isInProgress()) {
        throw new IllegalStateException("Already have a finalized segment " +
            existing + " beginning at " + txid);
      }

      // If it's in-progress, it should only contain one transaction,
      // because the "startLogSegment" transaction is written alone at the
      // start of each segment.
      existing.scanLog();
      if (existing.getLastTxId() != existing.getFirstTxId()) {
        throw new IllegalStateException("The log file " +
            existing + " seems to contain valid transactions");
      }
    }

    long curLastWriterEpoch = lastWriterEpoch.get();
    if (curLastWriterEpoch != reqInfo.getEpoch()) {
      LOG.info("Updating lastWriterEpoch from " + curLastWriterEpoch +
          " to " + reqInfo.getEpoch() + " for client " +
          Server.getRemoteIp());
      lastWriterEpoch.set(reqInfo.getEpoch());
    }

    // The fact that we are starting a segment at this txid indicates
    // that any previous recovery for this same segment was aborted.
    // Otherwise, no writer would have started writing. So, we can
    // remove the record of the older segment here.
    purgePaxosDecision(txid);

    curSegment = fjm.startLogSegment(txid, layoutVersion);
    curSegmentTxId = txid;
    nextTxId = txid;
  }

  /**
   * Finalize the log segment at the given transaction ID.
   */
  public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
      long endTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    boolean needsValidation = true;

    // Finalizing the log that the writer was just writing.
    if (startTxId == curSegmentTxId) {
      if (curSegment != null) {
        curSegment.close();
        curSegment = null;
        curSegmentTxId = HdfsConstants.INVALID_TXID;
      }

      checkSync(nextTxId == endTxId + 1,
          "Trying to finalize in-progress log segment %s to end at " +
          "txid %s but only written up to txid %s",
          startTxId, endTxId, nextTxId - 1);
      // No need to validate the edit log if the client is finalizing
      // the log segment that it was just writing to.
      needsValidation = false;
    }

    FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
    if (elf == null) {
      throw new JournalOutOfSyncException("No log file to finalize at " +
          "transaction ID " + startTxId);
    }

    if (elf.isInProgress()) {
      if (needsValidation) {
        LOG.info("Validating log segment " + elf.getFile() + " about to be " +
            "finalized");
        elf.scanLog();

        checkSync(elf.getLastTxId() == endTxId,
            "Trying to finalize in-progress log segment %s to end at " +
            "txid %s but log %s on disk only contains up to txid %s",
            startTxId, endTxId, elf.getFile(), elf.getLastTxId());
      }
      fjm.finalizeLogSegment(startTxId, endTxId);
    } else {
      Preconditions.checkArgument(endTxId == elf.getLastTxId(),
          "Trying to re-finalize already finalized log " +
              elf + " with different endTxId " + endTxId);
    }

    // Once logs are finalized, a different length will never be decided.
    // During recovery, we treat a finalized segment the same as an accepted
    // recovery. Thus, we no longer need to keep track of the previously-
    // accepted decision. The existence of the finalized log segment is enough.
    purgePaxosDecision(elf.getFirstTxId());
  }

  /**
   * @see JournalManager#purgeLogsOlderThan(long)
   */
  public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
      long minTxIdToKeep) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    storage.purgeDataOlderThan(minTxIdToKeep);
  }

  /**
   * Remove the previously-recorded 'accepted recovery' information
   * for a given log segment, once it is no longer necessary.
   * @param segmentTxId the transaction ID to purge
   * @throws IOException if the file could not be deleted
   */
  private void purgePaxosDecision(long segmentTxId) throws IOException {
    File paxosFile = storage.getPaxosFile(segmentTxId);
    if (paxosFile.exists()) {
      if (!paxosFile.delete()) {
        throw new IOException("Unable to delete paxos file " + paxosFile);
      }
    }
  }

  /**
   * @see QJournalProtocol#getEditLogManifest(String, long, boolean)
   */
  public RemoteEditLogManifest getEditLogManifest(long sinceTxId,
      boolean inProgressOk) throws IOException {
    // No need to checkRequest() here - anyone may ask for the list
    // of segments.
    checkFormatted();

    List<RemoteEditLog> logs = fjm.getRemoteEditLogs(sinceTxId, inProgressOk);

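    // If an in-progress segment was returned, replace it with an entry whose
    // end txid reflects the highest transaction actually written so far,
    // since the in-progress file itself does not record its final length.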
    if (inProgressOk) {
      RemoteEditLog log = null;
      for (Iterator<RemoteEditLog> iter = logs.iterator(); iter.hasNext();) {
        log = iter.next();
        if (log.isInProgress()) {
          iter.remove();
          break;
        }
      }
      if (log != null && log.isInProgress()) {
        logs.add(new RemoteEditLog(log.getStartTxId(), getHighestWrittenTxId(),
            true));
      }
    }

    return new RemoteEditLogManifest(logs);
  }

  /**
   * @return the current state of the given segment, or null if the
   * segment does not exist.
   */
  @VisibleForTesting
  SegmentStateProto getSegmentInfo(long segmentTxId)
      throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
      return null;
    }
    if (elf.isInProgress()) {
      elf.scanLog();
    }
    if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
      LOG.info("Edit log file " + elf + " appears to be empty. " +
          "Moving it aside...");
      elf.moveAsideEmptyFile();
      return null;
    }
    SegmentStateProto ret = SegmentStateProto.newBuilder()
        .setStartTxId(segmentTxId)
        .setEndTxId(elf.getLastTxId())
        .setIsInProgress(elf.isInProgress())
        .build();
    LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " +
        TextFormat.shortDebugString(ret));
    return ret;
  }

  /**
   * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
   */
  public synchronized PrepareRecoveryResponseProto prepareRecovery(
      RequestInfo reqInfo, long segmentTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    PrepareRecoveryResponseProto.Builder builder =
        PrepareRecoveryResponseProto.newBuilder();

    PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
    completeHalfDoneAcceptRecovery(previouslyAccepted);

    SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
    boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();

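    // A finalized segment on disk takes precedence over any previously
    // accepted recovery proposal: once a segment is finalized, its length
    // has already been agreed upon, so we report the on-disk state instead.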
    if (previouslyAccepted != null && !hasFinalizedSegment) {
      SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
      assert acceptedState.getEndTxId() == segInfo.getEndTxId() :
            "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n" +
            "on disk:       " + TextFormat.shortDebugString(segInfo);

      builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
        .setSegmentState(previouslyAccepted.getSegmentState());
    } else {
      if (segInfo != null) {
        builder.setSegmentState(segInfo);
      }
    }

    builder.setLastWriterEpoch(lastWriterEpoch.get());
    if (committedTxnId.get() != HdfsConstants.INVALID_TXID) {
      builder.setLastCommittedTxId(committedTxnId.get());
    }

    PrepareRecoveryResponseProto resp = builder.build();
    LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(resp));
    return resp;
  }

  /**
   * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
   */
  public synchronized void acceptRecovery(RequestInfo reqInfo,
      SegmentStateProto segment, URL fromUrl)
      throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    long segmentTxId = segment.getStartTxId();

    // Basic sanity checks that the segment is well-formed and contains
    // at least one transaction.
    Preconditions.checkArgument(segment.getEndTxId() > 0 &&
        segment.getEndTxId() >= segmentTxId,
        "bad recovery state for segment %s: %s",
        segmentTxId, TextFormat.shortDebugString(segment));

    PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
    PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
        .setAcceptedInEpoch(reqInfo.getEpoch())
        .setSegmentState(segment)
        .build();

    // If we previously acted on acceptRecovery() from a higher-numbered writer,
    // this call is out of sync. We should never actually trigger this, since the
    // checkRequest() call above should filter non-increasing epoch numbers.
    if (oldData != null) {
      alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(),
          "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n",
          oldData, newData);
    }

    File syncedFile = null;

    SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
    if (currentSegment == null ||
        currentSegment.getEndTxId() != segment.getEndTxId()) {
      if (currentSegment == null) {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": no current segment in place");

        // Update the highest txid for lag metrics
        highestWrittenTxId = Math.max(segment.getEndTxId(),
            highestWrittenTxId);
      } else {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": old segment " + TextFormat.shortDebugString(currentSegment) +
            " is not the right length");

        // Paranoid sanity check: if the new log is shorter than the log we
        // currently have, we should not end up discarding any transactions
        // which are already committed.
        if (txnRange(currentSegment).contains(committedTxnId.get()) &&
            !txnRange(segment).contains(committedTxnId.get())) {
          throw new AssertionError(
              "Cannot replace segment " +
              TextFormat.shortDebugString(currentSegment) +
              " with new segment " +
              TextFormat.shortDebugString(segment) +
              ": would discard already-committed txn " +
              committedTxnId.get());
        }

        // Another paranoid check: we should not be asked to synchronize a log
        // on top of a finalized segment.
        alwaysAssert(currentSegment.getIsInProgress(),
            "Should never be asked to synchronize a different log on top of an " +
            "already-finalized segment");

        // If we're shortening the log, update our highest txid
        // used for lag metrics.
        if (txnRange(currentSegment).contains(highestWrittenTxId)) {
          highestWrittenTxId = segment.getEndTxId();
        }
      }
      syncedFile = syncLog(reqInfo, segment, fromUrl);

    } else {
      LOG.info("Skipping download of log " +
          TextFormat.shortDebugString(segment) +
          ": already have up-to-date logs");
    }

    // This is one of the few places in the protocol where we have a single
    // RPC that results in two distinct actions:
    //
    // - 1) Downloads the new log segment data (above)
    // - 2) Records the new Paxos data about the synchronized segment (below)
    //
    // These need to be treated as a transaction from the perspective
    // of any external process. We do this by treating the persistPaxosData()
    // success as the "commit" of an atomic transaction. If we fail before
    // this point, the downloaded edit log will only exist at a temporary
    // path, and thus not change any externally visible state. If we fail
    // after this point, then any future prepareRecovery() call will see
    // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
    // roll forward the rename of the referenced log file.
    //
    // See also: HDFS-3955
    //
    // The fault points here are exercised by the randomized fault injection
    // test case to ensure that this atomic "transaction" operates correctly.
    JournalFaultInjector.get().beforePersistPaxosData();
    persistPaxosData(segmentTxId, newData);
    JournalFaultInjector.get().afterPersistPaxosData();

    if (syncedFile != null) {
      FileUtil.replaceFile(syncedFile,
          storage.getInProgressEditLog(segmentTxId));
    }

    LOG.info("Accepted recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(newData));
  }

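  /**
   * @return the closed (inclusive) range of transaction IDs covered by the
   * given segment. The segment must have a valid end txid.
   */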
  private Range<Long> txnRange(SegmentStateProto seg) {
    Preconditions.checkArgument(seg.hasEndTxId(),
        "invalid segment: %s", seg);
    return Ranges.closed(seg.getStartTxId(), seg.getEndTxId());
  }

  /**
   * Synchronize a log segment from another JournalNode. The log is
   * downloaded from the provided URL into a temporary location on disk,
   * which is named based on the current request's epoch.
   *
   * @return the temporary location of the downloaded file
   */
  private File syncLog(RequestInfo reqInfo,
      final SegmentStateProto segment, final URL url) throws IOException {
    final File tmpFile = storage.getSyncLogTemporaryFile(
        segment.getStartTxId(), reqInfo.getEpoch());
    final List<File> localPaths = ImmutableList.of(tmpFile);

    LOG.info("Synchronizing log " +
        TextFormat.shortDebugString(segment) + " from " + url);
    SecurityUtil.doAsLoginUser(
        new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws IOException {
            // We may have lost our ticket since the last checkpoint, so log
            // in again, just in case.
            if (UserGroupInformation.isSecurityEnabled()) {
              UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
            }

            boolean success = false;
            try {
              TransferFsImage.doGetUrl(url, localPaths, storage, true);
              assert tmpFile.exists();
              success = true;
            } finally {
              if (!success) {
                if (!tmpFile.delete()) {
                  LOG.warn("Failed to delete temporary file " + tmpFile);
                }
              }
            }
            return null;
          }
        });
    return tmpFile;
  }

  /**
   * If the node crashes between downloading a log segment and persisting
   * the associated paxos recovery data, the log segment will be left in
   * its temporary location on disk. Given the paxos data, we can check
   * whether this was indeed the case, and &quot;roll forward&quot;
   * the atomic operation.
   *
   * See the inline comments in
   * {@link #acceptRecovery(RequestInfo, SegmentStateProto, URL)} for more
   * details.
   *
   * @throws IOException if the temporary file cannot be renamed into place
   */
  private void completeHalfDoneAcceptRecovery(
      PersistedRecoveryPaxosData paxosData) throws IOException {
    if (paxosData == null) {
      return;
    }

    long segmentId = paxosData.getSegmentState().getStartTxId();
    long epoch = paxosData.getAcceptedInEpoch();

    File tmp = storage.getSyncLogTemporaryFile(segmentId, epoch);

    if (tmp.exists()) {
      File dst = storage.getInProgressEditLog(segmentId);
      LOG.info("Rolling forward previously half-completed synchronization: " +
          tmp + " -> " + dst);
      FileUtil.replaceFile(tmp, dst);
    }
  }

  /**
   * Retrieve the persisted recovery data for the given segment from disk.
   */
  private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId)
      throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    if (!f.exists()) {
      // Default instance has no fields filled in (they're optional)
      return null;
    }

    InputStream in = new FileInputStream(f);
    try {
      PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in);
      Preconditions.checkState(ret != null &&
          ret.getSegmentState().getStartTxId() == segmentTxId,
          "Bad persisted data for segment %s: %s",
          segmentTxId, ret);
      return ret;
    } finally {
      IOUtils.closeStream(in);
    }
  }

  /**
   * Persist the given recovery data for the given segment to disk.
   */
  private void persistPaxosData(long segmentTxId,
      PersistedRecoveryPaxosData newData) throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    boolean success = false;
    AtomicFileOutputStream fos = new AtomicFileOutputStream(f);
    try {
      newData.writeDelimitedTo(fos);
      fos.write('\n');
      // Write human-readable data after the protobuf. This is only
      // to assist in debugging -- it's not parsed at all.
      OutputStreamWriter writer = new OutputStreamWriter(fos, Charsets.UTF_8);

      writer.write(String.valueOf(newData));
      writer.write('\n');
      writer.flush();

      fos.flush();
      success = true;
    } finally {
      if (success) {
        IOUtils.closeStream(fos);
      } else {
        fos.abort();
      }
    }
  }

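  /**
   * Discard all local log segments whose transactions are at or beyond the
   * given transaction ID, and move committedTxnId back to just before it.
   */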
  synchronized void discardSegments(long startTxId) throws IOException {
    storage.getJournalManager().discardSegments(startTxId);
    // All segments from startTxId onward have been deleted, so reset
    // committedTxnId to the transaction just before them.
    committedTxnId.set(startTxId - 1);
  }

  public synchronized void doPreUpgrade() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed.  It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doPreUpgrade();
  }

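  /**
   * Complete the storage upgrade of this journal, then carry the epoch and
   * committed-txid state forward from the previous layout into the new
   * current directory.
   */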
  public synchronized void doUpgrade(StorageInfo sInfo) throws IOException {
    long oldCTime = storage.getCTime();
    storage.cTime = sInfo.cTime;
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = sInfo.layoutVersion;
    LOG.info("Starting upgrade of edits directory " + storage.getRoot()
        + ".\n   old LV = " + oldLV
        + "; old CTime = " + oldCTime
        + ".\n   new LV = " + storage.getLayoutVersion()
        + "; new CTime = " + storage.getCTime());
    storage.getJournalManager().doUpgrade(storage);
    storage.createPaxosDir();

    // Copy over the contents of the epoch data files to the new dir.
    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    File previousDir = storage.getSingularStorageDir().getPreviousDir();

    PersistentLongFile prevLastPromisedEpoch = new PersistentLongFile(
        new File(previousDir, LAST_PROMISED_FILENAME), 0);
    PersistentLongFile prevLastWriterEpoch = new PersistentLongFile(
        new File(previousDir, LAST_WRITER_EPOCH), 0);
    BestEffortLongFile prevCommittedTxnId = new BestEffortLongFile(
        new File(previousDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    try {
      lastPromisedEpoch.set(prevLastPromisedEpoch.get());
      lastWriterEpoch.set(prevLastWriterEpoch.get());
      committedTxnId.set(prevCommittedTxnId.get());
    } finally {
      IOUtils.cleanup(LOG, prevCommittedTxnId);
    }
  }

  public synchronized void doFinalize() throws IOException {
    LOG.info("Finalizing upgrade for journal "
          + storage.getRoot() + "."
          + (storage.getLayoutVersion() == 0 ? "" :
            "\n   cur LV = " + storage.getLayoutVersion()
            + "; cur CTime = " + storage.getCTime()));
    storage.getJournalManager().doFinalize();
  }

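  /**
   * @return whether the underlying storage can be rolled back to the given
   * previous layout version; delegates to the journal manager.
   */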
  public Boolean canRollBack(StorageInfo storage, StorageInfo prevStorage,
      int targetLayoutVersion) throws IOException {
    return this.storage.getJournalManager().canRollBack(storage, prevStorage,
        targetLayoutVersion);
  }

  public synchronized void doRollback() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed.  It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doRollback();
  }

  public Long getJournalCTime() throws IOException {
    return storage.getJournalManager().getJournalCTime();
  }
}