/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.server;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.math.LongRange;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalNotFormattedException;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
import org.apache.hadoop.hdfs.util.BestEffortLongFile;
import org.apache.hadoop.hdfs.util.PersistentLongFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StopWatch;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.protobuf.TextFormat;

/**
 * A JournalNode can manage journals for several clusters at once.
 * Each such journal is entirely independent despite being hosted by
 * the same JVM.
 */
public class Journal implements Closeable {
  static final Log LOG = LogFactory.getLog(Journal.class);

  // Current writing state
  private EditLogOutputStream curSegment;
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;
  private long nextTxId = HdfsConstants.INVALID_TXID;
  private long highestWrittenTxId = 0;

  private final String journalId;

  private final JNStorage storage;

  /**
   * When a new writer comes along, it asks each node to promise
   * to ignore requests from any previous writer, as identified
   * by epoch number. In order to make such a promise, the epoch
   * number of that writer is stored persistently on disk.
   */
  private PersistentLongFile lastPromisedEpoch;
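
  // Illustrative note (not part of the original source): the promise behaves
  // like a Paxos "prepare". For example, assuming this file currently holds 3:
  //
  //   newEpoch(nsInfo, 2)  -> IOException, since 2 <= last promise of 3
  //   newEpoch(nsInfo, 4)  -> promise recorded (file now holds 4), and any
  //                           open segment from the epoch-3 writer is aborted
  //
  // From then on, checkRequest() rejects any IPC whose epoch is below 4.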

  /**
   * Each IPC that comes from a given client contains a serial number
   * which only increases from the client's perspective. Whenever
   * we switch epochs, we reset this back to -1. Whenever an IPC
   * comes from a client, we ensure that it is strictly higher
   * than any previous IPC. This guards against any bugs in the IPC
   * layer that would re-order IPCs or cause a stale retry from an old
   * request to resurface and confuse things.
   */
  private long currentEpochIpcSerial = -1;

  /**
   * The epoch number of the last writer to actually write a transaction.
   * This is used to differentiate log segments after a crash at the very
   * beginning of a segment. See the 'testNewerVersionOfSegmentWins'
   * test case.
   */
  private PersistentLongFile lastWriterEpoch;

  /**
   * Lower-bound on the last committed transaction ID. This is not
   * depended upon for correctness, but acts as a sanity check
   * during the recovery procedures, and as a visibility mark
   * for clients reading in-progress logs.
   */
  private BestEffortLongFile committedTxnId;

  public static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
  public static final String LAST_WRITER_EPOCH = "last-writer-epoch";
  private static final String COMMITTED_TXID_FILENAME = "committed-txid";

  private final FileJournalManager fjm;

  private final JournalMetrics metrics;

  /**
   * Time threshold for sync calls, beyond which a warning is logged.
   */
  private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;

  Journal(Configuration conf, File logDir, String journalId,
      StartupOption startOpt, StorageErrorReporter errorReporter)
      throws IOException {
    storage = new JNStorage(conf, logDir, startOpt, errorReporter);
    this.journalId = journalId;

    refreshCachedData();

    this.fjm = storage.getJournalManager();

    this.metrics = JournalMetrics.create(this);

    EditLogFile latest = scanStorageForLatestEdits();
    if (latest != null) {
      highestWrittenTxId = latest.getLastTxId();
    }
  }

  /**
   * Reload any data that may have been cached. This is necessary
   * when we first load the Journal, but also after any formatting
   * operation, since the cached data is no longer relevant.
   */
  private synchronized void refreshCachedData() {
    IOUtils.closeStream(committedTxnId);

    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    this.lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    this.lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    this.committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);
  }

  /**
   * Scan the local storage directory, and return the segment containing
   * the highest transaction.
   * @return the EditLogFile containing the highest transaction, or null
   * if no files exist.
   */
  private synchronized EditLogFile scanStorageForLatestEdits() throws IOException {
    if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
      return null;
    }

    LOG.info("Scanning storage " + fjm);
    List<EditLogFile> files = fjm.getLogFiles(0);

    while (!files.isEmpty()) {
      EditLogFile latestLog = files.remove(files.size() - 1);
      latestLog.scanLog();
      LOG.info("Latest log is " + latestLog);
      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
        // the log contains no transactions
        LOG.warn("Latest log " + latestLog + " has no transactions. " +
            "moving it aside and looking for previous log");
        latestLog.moveAsideEmptyFile();
      } else {
        return latestLog;
      }
    }

    LOG.info("No files in " + fjm);
    return null;
  }

  /**
   * Format the local storage with the given namespace.
   */
  void format(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo);
    LOG.info("Formatting " + this + " with namespace info: " +
        nsInfo);
    storage.format(nsInfo);
    refreshCachedData();
  }

  /**
   * Unlock and release resources.
   */
  @Override // Closeable
  public void close() throws IOException {
    storage.close();
    IOUtils.closeStream(committedTxnId);
    IOUtils.closeStream(curSegment);
  }

  JNStorage getStorage() {
    return storage;
  }

  String getJournalId() {
    return journalId;
  }

  /**
   * @return the highest epoch this node has promised to honor (it will reject
   * requests from any writer with a lower epoch), or 0 if no promise has been
   * made.
   */
  synchronized long getLastPromisedEpoch() throws IOException {
    checkFormatted();
    return lastPromisedEpoch.get();
  }

  public synchronized long getLastWriterEpoch() throws IOException {
    checkFormatted();
    return lastWriterEpoch.get();
  }

  synchronized long getCommittedTxnIdForTests() throws IOException {
    return committedTxnId.get();
  }

  synchronized long getCurrentLagTxns() throws IOException {
    long committed = committedTxnId.get();
    if (committed == 0) {
      return 0;
    }

    return Math.max(committed - highestWrittenTxId, 0L);
  }
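
  // Illustrative note (not part of the original source): the lag is how far
  // this node's local log trails the quorum-committed transaction ID. For
  // example, with committedTxnId = 150 and highestWrittenTxId = 140 the lag
  // is 10; if this node is at or ahead of the committed ID, the lag is 0.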

  synchronized long getHighestWrittenTxId() {
    return highestWrittenTxId;
  }

  @VisibleForTesting
  JournalMetrics getMetricsForTests() {
    return metrics;
  }

  /**
   * Try to create a new epoch for this journal.
   * @param nsInfo the namespace, which is verified for consistency or used to
   * format, if the Journal has not yet been written to.
   * @param epoch the epoch to start
   * @return the status information necessary to begin recovery
   * @throws IOException if the node has already made a promise to another
   * writer with a higher epoch number, if the namespace is inconsistent,
   * or if a disk error occurs.
   */
  synchronized NewEpochResponseProto newEpoch(
      NamespaceInfo nsInfo, long epoch) throws IOException {

    checkFormatted();
    storage.checkConsistentNamespace(nsInfo);

    // Check that the new epoch being proposed is in fact newer than
    // any other that we've promised.
    if (epoch <= getLastPromisedEpoch()) {
      throw new IOException("Proposed epoch " + epoch + " <= last promise " +
          getLastPromisedEpoch());
    }

    updateLastPromisedEpoch(epoch);
    abortCurSegment();

    NewEpochResponseProto.Builder builder =
        NewEpochResponseProto.newBuilder();

    EditLogFile latestFile = scanStorageForLatestEdits();

    if (latestFile != null) {
      builder.setLastSegmentTxId(latestFile.getFirstTxId());
    }

    return builder.build();
  }
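
  // Illustrative note (not part of the original source): the response only
  // needs to identify the start of the most recent local segment; its exact
  // length is resolved later via prepareRecovery()/acceptRecovery(). For
  // example, if this node holds finalized segments [1-100] and [101-200] plus
  // an in-progress segment starting at 201 that contains at least one
  // transaction, newEpoch() returns lastSegmentTxId = 201.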

  private void updateLastPromisedEpoch(long newEpoch) throws IOException {
    LOG.info("Updating lastPromisedEpoch from " + lastPromisedEpoch.get() +
        " to " + newEpoch + " for client " + Server.getRemoteIp());
    lastPromisedEpoch.set(newEpoch);

    // Since we have a new writer, reset the IPC serial - it will start
    // counting again from 0 for this writer.
    currentEpochIpcSerial = -1;
  }

  private void abortCurSegment() throws IOException {
    if (curSegment == null) {
      return;
    }

    curSegment.abort();
    curSegment = null;
    curSegmentTxId = HdfsConstants.INVALID_TXID;
  }

  /**
   * Write a batch of edits to the journal.
   * {@link QJournalProtocol#journal(RequestInfo, long, long, int, byte[])}
   */
  synchronized void journal(RequestInfo reqInfo,
      long segmentTxId, long firstTxnId,
      int numTxns, byte[] records) throws IOException {
    checkFormatted();
    checkWriteRequest(reqInfo);

    checkSync(curSegment != null,
        "Can't write, no segment open");

    if (curSegmentTxId != segmentTxId) {
      // Sanity check: it is possible that the writer will fail IPCs
      // on both the finalize() of one segment and the start() of the next.
      // This could cause us to continue writing to an old segment
      // instead of rolling to a new one, which breaks one of the
      // invariants in the design. If it happens, abort the segment
      // and throw an exception.
      JournalOutOfSyncException e = new JournalOutOfSyncException(
          "Writer out of sync: it thinks it is writing segment " + segmentTxId
          + " but current segment is " + curSegmentTxId);
      abortCurSegment();
      throw e;
    }

    checkSync(nextTxId == firstTxnId,
        "Can't write txid " + firstTxnId + " expecting nextTxId=" + nextTxId);

    long lastTxnId = firstTxnId + numTxns - 1;
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId);
    }

    // If the edit has already been marked as committed, we know
    // it has been fsynced on a quorum of other nodes, and we are
    // "catching up" with the rest. Hence we do not need to fsync.
    boolean isLagging = lastTxnId <= committedTxnId.get();
    boolean shouldFsync = !isLagging;

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    StopWatch sw = new StopWatch();
    sw.start();
    curSegment.flush(shouldFsync);
    sw.stop();

    long nanoSeconds = sw.now();
    metrics.addSync(
        TimeUnit.MICROSECONDS.convert(nanoSeconds, TimeUnit.NANOSECONDS));
    long milliSeconds = TimeUnit.MILLISECONDS.convert(
        nanoSeconds, TimeUnit.NANOSECONDS);

    if (milliSeconds > WARN_SYNC_MILLIS_THRESHOLD) {
      LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId +
               " took " + milliSeconds + "ms");
    }

    if (isLagging) {
      // This batch of edits has already been committed on a quorum of other
      // nodes. So, we are in "catch up" mode. This gets its own metric.
      metrics.batchesWrittenWhileLagging.incr(1);
    }

    metrics.batchesWritten.incr(1);
    metrics.bytesWritten.incr(records.length);
    metrics.txnsWritten.incr(numTxns);

    highestWrittenTxId = lastTxnId;
    nextTxId = lastTxnId + 1;
  }
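
  // Illustrative note (not part of the original source): for a batch call such
  // as journal(reqInfo, segmentTxId=201, firstTxnId=205, numTxns=3, records),
  // the batch covers txids 205-207, so on success highestWrittenTxId becomes
  // 207 and nextTxId becomes 208. If committedTxnId is already >= 207 the node
  // is lagging and the flush above skips the fsync.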

  public void heartbeat(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);
  }

  /**
   * Ensure that the given request is coming from the correct writer and in-order.
   * @param reqInfo the request info
   * @throws IOException if the request is invalid.
   */
  private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
    // Invariant 25 from ZAB paper
    if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is less than the last promised epoch " +
          lastPromisedEpoch.get());
    } else if (reqInfo.getEpoch() > lastPromisedEpoch.get()) {
      // A newer client has arrived. Fence any previous writers by updating
      // the promise.
      updateLastPromisedEpoch(reqInfo.getEpoch());
    }

    // Ensure that the IPCs are arriving in-order as expected.
    checkSync(reqInfo.getIpcSerialNumber() > currentEpochIpcSerial,
        "IPC serial %s from client %s was not higher than prior highest " +
        "IPC serial %s", reqInfo.getIpcSerialNumber(),
        Server.getRemoteIp(),
        currentEpochIpcSerial);
    currentEpochIpcSerial = reqInfo.getIpcSerialNumber();

    if (reqInfo.hasCommittedTxId()) {
      Preconditions.checkArgument(
          reqInfo.getCommittedTxId() >= committedTxnId.get(),
          "Client trying to move committed txid backward from " +
          committedTxnId.get() + " to " + reqInfo.getCommittedTxId());

      committedTxnId.set(reqInfo.getCommittedTxId());
    }
  }
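
  // Illustrative note (not part of the original source): the serial check
  // above rejects stale retries. For example, within epoch 5, IPCs arriving
  // with serial numbers 1, 2, 3 are accepted in turn; a delayed retry of
  // serial 2 then fails checkSync() because 2 is not higher than the recorded
  // serial 3. When a writer with epoch 6 arrives, updateLastPromisedEpoch()
  // resets the recorded serial to -1, so the new writer's numbering starts
  // fresh.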

  private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);

    if (reqInfo.getEpoch() != lastWriterEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is not the current writer epoch " +
          lastWriterEpoch.get());
    }
  }

  public synchronized boolean isFormatted() {
    return storage.isFormatted();
  }

  private void checkFormatted() throws JournalNotFormattedException {
    if (!isFormatted()) {
      throw new JournalNotFormattedException("Journal " +
          storage.getSingularStorageDir() + " not formatted");
    }
  }

  /**
   * @throws JournalOutOfSyncException if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   */
  private void checkSync(boolean expression, String msg,
      Object... formatArgs) throws JournalOutOfSyncException {
    if (!expression) {
      throw new JournalOutOfSyncException(String.format(msg, formatArgs));
    }
  }

  /**
   * @throws AssertionError if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   *
   * This should be used in preference to Java's built-in assert in
   * non-performance-critical paths, where a failure of this invariant
   * might cause the protocol to lose data.
   */
  private void alwaysAssert(boolean expression, String msg,
      Object... formatArgs) {
    if (!expression) {
      throw new AssertionError(String.format(msg, formatArgs));
    }
  }

  /**
   * Start a new segment at the given txid. The previous segment
   * must have already been finalized.
   */
  public synchronized void startLogSegment(RequestInfo reqInfo, long txid,
      int layoutVersion) throws IOException {
    assert fjm != null;
    checkFormatted();
    checkRequest(reqInfo);

    if (curSegment != null) {
      LOG.warn("Client is requesting a new log segment " + txid +
          " though we are already writing " + curSegment + ". " +
          "Aborting the current segment in order to begin the new one.");
      // The writer may have lost a connection to us and is now
      // re-connecting after the connection came back.
      // We should abort our own old segment.
      abortCurSegment();
    }

    // Paranoid sanity check: we should never overwrite a finalized log file.
    // Additionally, if it's in-progress, it should have at most 1 transaction.
    // This can happen if the writer crashes exactly at the start of a segment.
    EditLogFile existing = fjm.getLogFile(txid);
    if (existing != null) {
      if (!existing.isInProgress()) {
        throw new IllegalStateException("Already have a finalized segment " +
            existing + " beginning at " + txid);
      }

      // If it's in-progress, it should only contain one transaction,
      // because the "startLogSegment" transaction is written alone at the
      // start of each segment.
      existing.scanLog();
      if (existing.getLastTxId() != existing.getFirstTxId()) {
        throw new IllegalStateException("The log file " +
            existing + " seems to contain valid transactions");
      }
    }

    long curLastWriterEpoch = lastWriterEpoch.get();
    if (curLastWriterEpoch != reqInfo.getEpoch()) {
      LOG.info("Updating lastWriterEpoch from " + curLastWriterEpoch +
          " to " + reqInfo.getEpoch() + " for client " +
          Server.getRemoteIp());
      lastWriterEpoch.set(reqInfo.getEpoch());
    }

    // The fact that we are starting a segment at this txid indicates
    // that any previous recovery for this same segment was aborted.
    // Otherwise, no writer would have started writing. So, we can
    // remove the record of the older segment here.
    purgePaxosDecision(txid);

    curSegment = fjm.startLogSegment(txid, layoutVersion);
    curSegmentTxId = txid;
    nextTxId = txid;
  }
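
  // Illustrative note (not part of the original source): the "at most one
  // transaction" check above relies on the fact that opening a segment writes
  // only the startLogSegment marker transaction before the writer's first
  // journal() call. So an in-progress file for txid 200 left behind by a
  // crashed writer scans as firstTxId == lastTxId == 200 and may safely be
  // replaced; a file scanning as 200-205 contains real transactions and
  // startLogSegment() refuses to clobber it.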

  /**
   * Finalize the log segment at the given transaction ID.
   */
  public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
      long endTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    boolean needsValidation = true;

    // Finalizing the log that the writer was just writing.
    if (startTxId == curSegmentTxId) {
      if (curSegment != null) {
        curSegment.close();
        curSegment = null;
        curSegmentTxId = HdfsConstants.INVALID_TXID;
      }

      checkSync(nextTxId == endTxId + 1,
          "Trying to finalize in-progress log segment %s to end at " +
          "txid %s but only written up to txid %s",
          startTxId, endTxId, nextTxId - 1);
      // No need to validate the edit log if the client is finalizing
      // the log segment that it was just writing to.
      needsValidation = false;
    }

    FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
    if (elf == null) {
      throw new JournalOutOfSyncException("No log file to finalize at " +
          "transaction ID " + startTxId);
    }

    if (elf.isInProgress()) {
      if (needsValidation) {
        LOG.info("Validating log segment " + elf.getFile() + " about to be " +
            "finalized");
        elf.scanLog();

        checkSync(elf.getLastTxId() == endTxId,
            "Trying to finalize in-progress log segment %s to end at " +
            "txid %s but log %s on disk only contains up to txid %s",
            startTxId, endTxId, elf.getFile(), elf.getLastTxId());
      }
      fjm.finalizeLogSegment(startTxId, endTxId);
    } else {
      Preconditions.checkArgument(endTxId == elf.getLastTxId(),
          "Trying to re-finalize already finalized log " +
              elf + " with different endTxId " + endTxId);
    }

    // Once logs are finalized, a different length will never be decided.
    // During recovery, we treat a finalized segment the same as an accepted
    // recovery. Thus, we no longer need to keep track of the previously-
    // accepted decision. The existence of the finalized log segment is enough.
    purgePaxosDecision(elf.getFirstTxId());
  }

  /**
   * @see JournalManager#purgeLogsOlderThan(long)
   */
  public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
      long minTxIdToKeep) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    storage.purgeDataOlderThan(minTxIdToKeep);
  }

  /**
   * Remove the previously-recorded 'accepted recovery' information
   * for a given log segment, once it is no longer necessary.
   * @param segmentTxId the transaction ID to purge
   * @throws IOException if the file could not be deleted
   */
  private void purgePaxosDecision(long segmentTxId) throws IOException {
    File paxosFile = storage.getPaxosFile(segmentTxId);
    if (paxosFile.exists()) {
      if (!paxosFile.delete()) {
        throw new IOException("Unable to delete paxos file " + paxosFile);
      }
    }
  }

  /**
   * @see QJournalProtocol#getEditLogManifest(String, long, boolean)
   */
  public RemoteEditLogManifest getEditLogManifest(long sinceTxId,
      boolean inProgressOk) throws IOException {
    // No need to checkRequest() here - anyone may ask for the list
    // of segments.
    checkFormatted();

    List<RemoteEditLog> logs = fjm.getRemoteEditLogs(sinceTxId, inProgressOk);

    if (inProgressOk) {
      RemoteEditLog log = null;
      for (Iterator<RemoteEditLog> iter = logs.iterator(); iter.hasNext();) {
        log = iter.next();
        if (log.isInProgress()) {
          iter.remove();
          break;
        }
      }
      if (log != null && log.isInProgress()) {
        logs.add(new RemoteEditLog(log.getStartTxId(), getHighestWrittenTxId(),
            true));
      }
    }

    return new RemoteEditLogManifest(logs);
  }
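
  // Illustrative note (not part of the original source): when inProgressOk is
  // set, the in-progress entry reported by FileJournalManager has no reliable
  // end txid, so it is replaced by one bounded at highestWrittenTxId. For
  // example, local segments [1-100], [101-200] and an in-progress segment
  // started at 201 with writes up to 230 produce a manifest of
  // [1-100], [101-200], [201-230].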

  /**
   * @return the current state of the given segment, or null if the
   * segment does not exist.
   */
  @VisibleForTesting
  SegmentStateProto getSegmentInfo(long segmentTxId)
      throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
      return null;
    }
    if (elf.isInProgress()) {
      elf.scanLog();
    }
    if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
      LOG.info("Edit log file " + elf + " appears to be empty. " +
          "Moving it aside...");
      elf.moveAsideEmptyFile();
      return null;
    }
    SegmentStateProto ret = SegmentStateProto.newBuilder()
        .setStartTxId(segmentTxId)
        .setEndTxId(elf.getLastTxId())
        .setIsInProgress(elf.isInProgress())
        .build();
    LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " +
        TextFormat.shortDebugString(ret));
    return ret;
  }

  /**
   * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
   */
  public synchronized PrepareRecoveryResponseProto prepareRecovery(
      RequestInfo reqInfo, long segmentTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    PrepareRecoveryResponseProto.Builder builder =
        PrepareRecoveryResponseProto.newBuilder();

    PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
    completeHalfDoneAcceptRecovery(previouslyAccepted);

    SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
    boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();

    if (previouslyAccepted != null && !hasFinalizedSegment) {
      SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
      assert acceptedState.getEndTxId() == segInfo.getEndTxId() :
            "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n" +
            "on disk:       " + TextFormat.shortDebugString(segInfo);

      builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
        .setSegmentState(previouslyAccepted.getSegmentState());
    } else {
      if (segInfo != null) {
        builder.setSegmentState(segInfo);
      }
    }

    builder.setLastWriterEpoch(lastWriterEpoch.get());
    if (committedTxnId.get() != HdfsConstants.INVALID_TXID) {
      builder.setLastCommittedTxId(committedTxnId.get());
    }

    PrepareRecoveryResponseProto resp = builder.build();
    LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(resp));
    return resp;
  }
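
  // Illustrative note (not part of the original source): a typical response
  // for segment 201 on a node that crashed mid-write might carry
  //   segmentState { startTxId: 201, endTxId: 230, isInProgress: true }
  //   lastWriterEpoch: 5, lastCommittedTxId: 225
  // while a node that had already accepted a recovery proposal in epoch 6 also
  // reports acceptedInEpoch: 6, which the recovering client uses in its
  // Paxos-style tie-break when choosing the segment to synchronize.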

  /**
   * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
   */
  public synchronized void acceptRecovery(RequestInfo reqInfo,
      SegmentStateProto segment, URL fromUrl)
      throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    long segmentTxId = segment.getStartTxId();

    // Basic sanity checks that the segment is well-formed and contains
    // at least one transaction.
    Preconditions.checkArgument(segment.getEndTxId() > 0 &&
        segment.getEndTxId() >= segmentTxId,
        "bad recovery state for segment %s: %s",
        segmentTxId, TextFormat.shortDebugString(segment));

    PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
    PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
        .setAcceptedInEpoch(reqInfo.getEpoch())
        .setSegmentState(segment)
        .build();

    // If we previously acted on acceptRecovery() from a higher-numbered writer,
    // this call is out of sync. We should never actually trigger this, since the
    // checkRequest() call above should filter non-increasing epoch numbers.
    if (oldData != null) {
      alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(),
          "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n",
          oldData, newData);
    }

    File syncedFile = null;

    SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
    if (currentSegment == null ||
        currentSegment.getEndTxId() != segment.getEndTxId()) {
      if (currentSegment == null) {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": no current segment in place");

        // Update the highest txid for lag metrics
        highestWrittenTxId = Math.max(segment.getEndTxId(),
            highestWrittenTxId);
      } else {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": old segment " + TextFormat.shortDebugString(currentSegment) +
            " is not the right length");

        // Paranoid sanity check: if the new log is shorter than the log we
        // currently have, we should not end up discarding any transactions
        // which are already committed.
        if (txnRange(currentSegment).containsLong(committedTxnId.get()) &&
            !txnRange(segment).containsLong(committedTxnId.get())) {
          throw new AssertionError(
              "Cannot replace segment " +
              TextFormat.shortDebugString(currentSegment) +
              " with new segment " +
              TextFormat.shortDebugString(segment) +
              ": would discard already-committed txn " +
              committedTxnId.get());
        }

        // Another paranoid check: we should not be asked to synchronize a log
        // on top of a finalized segment.
        alwaysAssert(currentSegment.getIsInProgress(),
            "Should never be asked to synchronize a different log on top of an " +
            "already-finalized segment");

        // If we're shortening the log, update our highest txid
        // used for lag metrics.
        if (txnRange(currentSegment).containsLong(highestWrittenTxId)) {
          highestWrittenTxId = segment.getEndTxId();
        }
      }
      syncedFile = syncLog(reqInfo, segment, fromUrl);

    } else {
      LOG.info("Skipping download of log " +
          TextFormat.shortDebugString(segment) +
          ": already have up-to-date logs");
    }

    // This is one of the few places in the protocol where we have a single
    // RPC that results in two distinct actions:
    //
    // - 1) Downloads the new log segment data (above)
    // - 2) Records the new Paxos data about the synchronized segment (below)
    //
    // These need to be treated as a transaction from the perspective
    // of any external process. We do this by treating the persistPaxosData()
    // success as the "commit" of an atomic transaction. If we fail before
    // this point, the downloaded edit log will only exist at a temporary
    // path, and thus not change any externally visible state. If we fail
    // after this point, then any future prepareRecovery() call will see
    // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
    // roll forward the rename of the referenced log file.
    //
    // See also: HDFS-3955
    //
    // The fault points here are exercised by the randomized fault injection
    // test case to ensure that this atomic "transaction" operates correctly.
    JournalFaultInjector.get().beforePersistPaxosData();
    persistPaxosData(segmentTxId, newData);
    JournalFaultInjector.get().afterPersistPaxosData();

    if (syncedFile != null) {
      FileUtil.replaceFile(syncedFile,
          storage.getInProgressEditLog(segmentTxId));
    }

    LOG.info("Accepted recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(newData));
  }
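
  // Illustrative note (not part of the original source): the crash cases for
  // the "atomic transaction" above play out as follows.
  //   - Crash after the download but before persistPaxosData(): the synced
  //     segment exists only at the temporary path returned by
  //     storage.getSyncLogTemporaryFile(), so no visible state has changed.
  //   - Crash after persistPaxosData() but before FileUtil.replaceFile(): the
  //     next prepareRecovery() reads the paxos record and
  //     completeHalfDoneAcceptRecovery() rolls the pending rename forward.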

  private LongRange txnRange(SegmentStateProto seg) {
    Preconditions.checkArgument(seg.hasEndTxId(),
        "invalid segment: %s", seg);
    return new LongRange(seg.getStartTxId(), seg.getEndTxId());
  }

  /**
   * Synchronize a log segment from another JournalNode. The log is
   * downloaded from the provided URL into a temporary location on disk,
   * which is named based on the current request's epoch.
   *
   * @return the temporary location of the downloaded file
   */
  private File syncLog(RequestInfo reqInfo,
      final SegmentStateProto segment, final URL url) throws IOException {
    final File tmpFile = storage.getSyncLogTemporaryFile(
        segment.getStartTxId(), reqInfo.getEpoch());
    final List<File> localPaths = ImmutableList.of(tmpFile);

    LOG.info("Synchronizing log " +
        TextFormat.shortDebugString(segment) + " from " + url);
    SecurityUtil.doAsLoginUser(
        new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws IOException {
            // We may have lost our ticket since the last checkpoint; log in again just in case.
            if (UserGroupInformation.isSecurityEnabled()) {
              UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
            }

            boolean success = false;
            try {
              TransferFsImage.doGetUrl(url, localPaths, storage, true);
              assert tmpFile.exists();
              success = true;
            } finally {
              if (!success) {
                if (!tmpFile.delete()) {
                  LOG.warn("Failed to delete temporary file " + tmpFile);
                }
              }
            }
            return null;
          }
        });
    return tmpFile;
  }

  /**
   * In the case the node crashes in between downloading a log segment
   * and persisting the associated paxos recovery data, the log segment
   * will be left in its temporary location on disk. Given the paxos data,
   * we can check if this was indeed the case, and &quot;roll forward&quot;
   * the atomic operation.
   *
   * See the inline comments in
   * {@link #acceptRecovery(RequestInfo, SegmentStateProto, URL)} for more
   * details.
   *
   * @throws IOException if the temporary file is unable to be renamed into
   * place
   */
  private void completeHalfDoneAcceptRecovery(
      PersistedRecoveryPaxosData paxosData) throws IOException {
    if (paxosData == null) {
      return;
    }

    long segmentId = paxosData.getSegmentState().getStartTxId();
    long epoch = paxosData.getAcceptedInEpoch();

    File tmp = storage.getSyncLogTemporaryFile(segmentId, epoch);

    if (tmp.exists()) {
      File dst = storage.getInProgressEditLog(segmentId);
      LOG.info("Rolling forward previously half-completed synchronization: " +
          tmp + " -> " + dst);
      FileUtil.replaceFile(tmp, dst);
    }
  }

  /**
   * Retrieve the persisted data for recovering the given segment from disk.
   */
  private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId)
      throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    if (!f.exists()) {
      // Default instance has no fields filled in (they're optional)
      return null;
    }

    InputStream in = new FileInputStream(f);
    try {
      PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in);
      Preconditions.checkState(ret != null &&
          ret.getSegmentState().getStartTxId() == segmentTxId,
          "Bad persisted data for segment %s: %s",
          segmentTxId, ret);
      return ret;
    } finally {
      IOUtils.closeStream(in);
    }
  }

  /**
   * Persist data for recovering the given segment from disk.
   */
  private void persistPaxosData(long segmentTxId,
      PersistedRecoveryPaxosData newData) throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    boolean success = false;
    AtomicFileOutputStream fos = new AtomicFileOutputStream(f);
    try {
      newData.writeDelimitedTo(fos);
      fos.write('\n');
      // Write human-readable data after the protobuf. This is only
      // to assist in debugging -- it's not parsed at all.
      OutputStreamWriter writer = new OutputStreamWriter(fos, Charsets.UTF_8);

      writer.write(String.valueOf(newData));
      writer.write('\n');
      writer.flush();

      fos.flush();
      success = true;
    } finally {
      if (success) {
        IOUtils.closeStream(fos);
      } else {
        fos.abort();
      }
    }
  }
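
  // Illustrative note (not part of the original source): the resulting paxos
  // file therefore contains the length-delimited binary protobuf first, then a
  // newline, then a human-readable dump roughly of the form
  //   segmentState { startTxId: 201 endTxId: 230 isInProgress: true }
  //   acceptedInEpoch: 6
  // Only the leading protobuf is read back by getPersistedPaxosData(); the
  // text form is purely a debugging aid.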

  synchronized void discardSegments(long startTxId) throws IOException {
    storage.getJournalManager().discardSegments(startTxId);
    // All segments from startTxId onward have been discarded, so reset committedTxnId accordingly.
    committedTxnId.set(startTxId - 1);
  }

  public synchronized void doPreUpgrade() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed.  It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doPreUpgrade();
  }

  public synchronized void doUpgrade(StorageInfo sInfo) throws IOException {
    long oldCTime = storage.getCTime();
    storage.cTime = sInfo.cTime;
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = sInfo.layoutVersion;
    LOG.info("Starting upgrade of edits directory " + storage.getRoot()
        + ".\n   old LV = " + oldLV
        + "; old CTime = " + oldCTime
        + ".\n   new LV = " + storage.getLayoutVersion()
        + "; new CTime = " + storage.getCTime());
    storage.getJournalManager().doUpgrade(storage);
    storage.createPaxosDir();

    // Copy over the contents of the epoch data files to the new dir.
    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    File previousDir = storage.getSingularStorageDir().getPreviousDir();

    PersistentLongFile prevLastPromisedEpoch = new PersistentLongFile(
        new File(previousDir, LAST_PROMISED_FILENAME), 0);
    PersistentLongFile prevLastWriterEpoch = new PersistentLongFile(
        new File(previousDir, LAST_WRITER_EPOCH), 0);
    BestEffortLongFile prevCommittedTxnId = new BestEffortLongFile(
        new File(previousDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    try {
      lastPromisedEpoch.set(prevLastPromisedEpoch.get());
      lastWriterEpoch.set(prevLastWriterEpoch.get());
      committedTxnId.set(prevCommittedTxnId.get());
    } finally {
      IOUtils.cleanup(LOG, prevCommittedTxnId);
    }
  }

  public synchronized void doFinalize() throws IOException {
    LOG.info("Finalizing upgrade for journal "
          + storage.getRoot() + "."
          + (storage.getLayoutVersion() == 0 ? "" :
            "\n   cur LV = " + storage.getLayoutVersion()
            + "; cur CTime = " + storage.getCTime()));
    storage.getJournalManager().doFinalize();
  }

  public Boolean canRollBack(StorageInfo storage, StorageInfo prevStorage,
      int targetLayoutVersion) throws IOException {
    return this.storage.getJournalManager().canRollBack(storage, prevStorage,
        targetLayoutVersion);
  }

  public synchronized void doRollback() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed.  It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doRollback();
  }

  public Long getJournalCTime() throws IOException {
    return storage.getJournalManager().getJournalCTime();
  }
}