001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.ExitUtil.terminate;
021
022import java.io.IOException;
023import java.util.ArrayList;
024import java.util.Collection;
025import java.util.Collections;
026import java.util.Comparator;
027import java.util.LinkedList;
028import java.util.List;
029import java.util.PriorityQueue;
030import java.util.SortedSet;
031import java.util.concurrent.CopyOnWriteArrayList;
032
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.classification.InterfaceAudience;
036import org.apache.hadoop.hdfs.server.common.Storage;
037import org.apache.hadoop.hdfs.server.common.StorageInfo;
038import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
039import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
040import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
041
042import static org.apache.hadoop.util.ExitUtil.terminate;
043
044import com.google.common.base.Preconditions;
045import com.google.common.collect.ComparisonChain;
046import com.google.common.collect.ImmutableList;
047import com.google.common.collect.ImmutableListMultimap;
048import com.google.common.collect.Lists;
049import com.google.common.collect.Multimaps;
050import com.google.common.collect.Sets;
051
/**
 * Manages a collection of Journals. With the exception of
 * getEditLogManifest(), none of the methods are synchronized; it is assumed
 * that the FSEditLog methods that use this class apply proper
 * synchronization.
 */
057public class JournalSet implements JournalManager {
058
059  static final Log LOG = LogFactory.getLog(FSEditLog.class);
060  
061  static final public Comparator<EditLogInputStream>
062    EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
063      @Override
064      public int compare(EditLogInputStream a, EditLogInputStream b) {
065        return ComparisonChain.start().
066          compare(a.getFirstTxId(), b.getFirstTxId()).
067          compare(b.getLastTxId(), a.getLastTxId()).
068          result();
069      }
070    };
071  
072  /**
073   * Container for a JournalManager paired with its currently
074   * active stream.
075   * 
076   * If a Journal gets disabled due to an error writing to its
077   * stream, then the stream will be aborted and set to null.
078   */
079  static class JournalAndStream implements CheckableNameNodeResource {
080    private final JournalManager journal;
081    private boolean disabled = false;
082    private EditLogOutputStream stream;
083    private final boolean required;
084    private final boolean shared;
085    
086    public JournalAndStream(JournalManager manager, boolean required,
087        boolean shared) {
088      this.journal = manager;
089      this.required = required;
090      this.shared = shared;
091    }
092
093    public void startLogSegment(long txId, int layoutVersion) throws IOException {
094      Preconditions.checkState(stream == null);
095      disabled = false;
096      stream = journal.startLogSegment(txId, layoutVersion);
097    }
098
099    /**
100     * Closes the stream, also sets it to null.
101     */
102    public void closeStream() throws IOException {
103      if (stream == null) return;
104      stream.close();
105      stream = null;
106    }
107
108    /**
109     * Close the Journal and Stream
110     */
111    public void close() throws IOException {
112      closeStream();
113
114      journal.close();
115    }
116    
117    /**
118     * Aborts the stream, also sets it to null.
119     */
120    public void abort() {
121      if (stream == null) return;
122      try {
123        stream.abort();
124      } catch (IOException ioe) {
125        LOG.error("Unable to abort stream " + stream, ioe);
126      }
127      stream = null;
128    }
129
130    boolean isActive() {
131      return stream != null;
132    }
133    
134    /**
135     * Should be used outside JournalSet only for testing.
136     */
137    EditLogOutputStream getCurrentStream() {
138      return stream;
139    }
140    
141    @Override
142    public String toString() {
143      return "JournalAndStream(mgr=" + journal +
144        ", " + "stream=" + stream + ")";
145    }
146
147    void setCurrentStreamForTests(EditLogOutputStream stream) {
148      this.stream = stream;
149    }
150    
151    JournalManager getManager() {
152      return journal;
153    }
154
155    boolean isDisabled() {
156      return disabled;
157    }
158
159    private void setDisabled(boolean disabled) {
160      this.disabled = disabled;
161    }
162    
163    @Override
164    public boolean isResourceAvailable() {
165      return !isDisabled();
166    }
167    
168    @Override
169    public boolean isRequired() {
170      return required;
171    }
172    
173    public boolean isShared() {
174      return shared;
175    }
176  }
177 
178  // COW implementation is necessary since some users (eg the web ui) call
179  // getAllJournalStreams() and then iterate. Since this is rarely
180  // mutated, there is no performance concern.
181  private final List<JournalAndStream> journals =
182      new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
183  final int minimumRedundantJournals;
184  
185  JournalSet(int minimumRedundantResources) {
186    this.minimumRedundantJournals = minimumRedundantResources;
187  }
188  
189  @Override
190  public void format(NamespaceInfo nsInfo) throws IOException {
191    // The operation is done by FSEditLog itself
192    throw new UnsupportedOperationException();
193  }
194
195  @Override
196  public boolean hasSomeData() throws IOException {
197    // This is called individually on the underlying journals,
198    // not on the JournalSet.
199    throw new UnsupportedOperationException();
200  }
201
202  
203  @Override
204  public EditLogOutputStream startLogSegment(final long txId,
205      final int layoutVersion) throws IOException {
206    mapJournalsAndReportErrors(new JournalClosure() {
207      @Override
208      public void apply(JournalAndStream jas) throws IOException {
209        jas.startLogSegment(txId, layoutVersion);
210      }
211    }, "starting log segment " + txId);
212    return new JournalSetOutputStream();
213  }
214  
215  @Override
216  public void finalizeLogSegment(final long firstTxId, final long lastTxId)
217      throws IOException {
218    mapJournalsAndReportErrors(new JournalClosure() {
219      @Override
220      public void apply(JournalAndStream jas) throws IOException {
221        if (jas.isActive()) {
222          jas.closeStream();
223          jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
224        }
225      }
226    }, "finalize log segment " + firstTxId + ", " + lastTxId);
227  }
228   
229  @Override
230  public void close() throws IOException {
231    mapJournalsAndReportErrors(new JournalClosure() {
232      @Override
233      public void apply(JournalAndStream jas) throws IOException {
234        jas.close();
235      }
236    }, "close journal");
237  }
238
239  /**
240   * In this function, we get a bunch of streams from all of our JournalManager
241   * objects.  Then we add these to the collection one by one.
242   * 
243   * @param streams          The collection to add the streams to.  It may or 
244   *                         may not be sorted-- this is up to the caller.
245   * @param fromTxId         The transaction ID to start looking for streams at
246   * @param inProgressOk     Should we consider unfinalized streams?
247   */
248  @Override
249  public void selectInputStreams(Collection<EditLogInputStream> streams,
250      long fromTxId, boolean inProgressOk) throws IOException {
251    final PriorityQueue<EditLogInputStream> allStreams = 
252        new PriorityQueue<EditLogInputStream>(64,
253            EDIT_LOG_INPUT_STREAM_COMPARATOR);
254    for (JournalAndStream jas : journals) {
255      if (jas.isDisabled()) {
256        LOG.info("Skipping jas " + jas + " since it's disabled");
257        continue;
258      }
259      try {
260        jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk);
261      } catch (IOException ioe) {
262        LOG.warn("Unable to determine input streams from " + jas.getManager() +
263            ". Skipping.", ioe);
264      }
265    }
266    chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
267  }
268  
269  public static void chainAndMakeRedundantStreams(
270      Collection<EditLogInputStream> outStreams,
271      PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
272    // We want to group together all the streams that start on the same start
273    // transaction ID.  To do this, we maintain an accumulator (acc) of all
274    // the streams we've seen at a given start transaction ID.  When we see a
275    // higher start transaction ID, we select a stream from the accumulator and
276    // clear it.  Then we begin accumulating streams with the new, higher start
277    // transaction ID.
278    LinkedList<EditLogInputStream> acc =
279        new LinkedList<EditLogInputStream>();
280    EditLogInputStream elis;
281    while ((elis = allStreams.poll()) != null) {
282      if (acc.isEmpty()) {
283        acc.add(elis);
284      } else {
285        long accFirstTxId = acc.get(0).getFirstTxId();
286        if (accFirstTxId == elis.getFirstTxId()) {
287          acc.add(elis);
288        } else if (accFirstTxId < elis.getFirstTxId()) {
289          outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
290          acc.clear();
291          acc.add(elis);
292        } else if (accFirstTxId > elis.getFirstTxId()) {
293          throw new RuntimeException("sorted set invariants violated!  " +
294              "Got stream with first txid " + elis.getFirstTxId() +
295              ", but the last firstTxId was " + accFirstTxId);
296        }
297      }
298    }
299    if (!acc.isEmpty()) {
300      outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
301      acc.clear();
302    }
303  }
304
305  /**
306   * Returns true if there are no journals, all redundant journals are disabled,
307   * or any required journals are disabled.
308   * 
309   * @return True if there no journals, all redundant journals are disabled,
310   * or any required journals are disabled.
311   */
312  public boolean isEmpty() {
313    return !NameNodeResourcePolicy.areResourcesAvailable(journals,
314        minimumRedundantJournals);
315  }
316  
317  /**
318   * Called when some journals experience an error in some operation.
319   */
320  private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
321    if (badJournals == null || badJournals.isEmpty()) {
322      return; // nothing to do
323    }
324 
325    for (JournalAndStream j : badJournals) {
326      LOG.error("Disabling journal " + j);
327      j.abort();
328      j.setDisabled(true);
329    }
330  }
331
332  /**
333   * Implementations of this interface encapsulate operations that can be
334   * iteratively applied on all the journals. For example see
335   * {@link JournalSet#mapJournalsAndReportErrors}.
336   */
337  private interface JournalClosure {
338    /**
339     * The operation on JournalAndStream.
340     * @param jas Object on which operations are performed.
341     * @throws IOException
342     */
343    public void apply(JournalAndStream jas) throws IOException;
344  }
345  
346  /**
347   * Apply the given operation across all of the journal managers, disabling
348   * any for which the closure throws an IOException.
349   * @param closure {@link JournalClosure} object encapsulating the operation.
350   * @param status message used for logging errors (e.g. "opening journal")
351   * @throws IOException If the operation fails on all the journals.
352   */
353  private void mapJournalsAndReportErrors(
354      JournalClosure closure, String status) throws IOException{
355
356    List<JournalAndStream> badJAS = Lists.newLinkedList();
357    for (JournalAndStream jas : journals) {
358      try {
359        closure.apply(jas);
360      } catch (Throwable t) {
361        if (jas.isRequired()) {
362          final String msg = "Error: " + status + " failed for required journal ("
363            + jas + ")";
364          LOG.fatal(msg, t);
365          // If we fail on *any* of the required journals, then we must not
366          // continue on any of the other journals. Abort them to ensure that
367          // retry behavior doesn't allow them to keep going in any way.
368          abortAllJournals();
369          // the current policy is to shutdown the NN on errors to shared edits
370          // dir. There are many code paths to shared edits failures - syncs,
371          // roll of edits etc. All of them go through this common function 
372          // where the isRequired() check is made. Applying exit policy here 
373          // to catch all code paths.
374          terminate(1, msg);
375        } else {
376          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
377          badJAS.add(jas);          
378        }
379      }
380    }
381    disableAndReportErrorOnJournals(badJAS);
382    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
383        minimumRedundantJournals)) {
384      String message = status + " failed for too many journals";
385      LOG.error("Error: " + message);
386      throw new IOException(message);
387    }
388  }
389  
390  /**
391   * Abort all of the underlying streams.
392   */
393  private void abortAllJournals() {
394    for (JournalAndStream jas : journals) {
395      if (jas.isActive()) {
396        jas.abort();
397      }
398    }
399  }
400
401  /**
402   * An implementation of EditLogOutputStream that applies a requested method on
403   * all the journals that are currently active.
404   */
405  private class JournalSetOutputStream extends EditLogOutputStream {
406
407    JournalSetOutputStream() throws IOException {
408      super();
409    }
410
411    @Override
412    public void write(final FSEditLogOp op)
413        throws IOException {
414      mapJournalsAndReportErrors(new JournalClosure() {
415        @Override
416        public void apply(JournalAndStream jas) throws IOException {
417          if (jas.isActive()) {
418            jas.getCurrentStream().write(op);
419          }
420        }
421      }, "write op");
422    }
423
424    @Override
425    public void writeRaw(final byte[] data, final int offset, final int length)
426        throws IOException {
427      mapJournalsAndReportErrors(new JournalClosure() {
428        @Override
429        public void apply(JournalAndStream jas) throws IOException {
430          if (jas.isActive()) {
431            jas.getCurrentStream().writeRaw(data, offset, length);
432          }
433        }
434      }, "write bytes");
435    }
436
437    @Override
438    public void create(final int layoutVersion) throws IOException {
439      mapJournalsAndReportErrors(new JournalClosure() {
440        @Override
441        public void apply(JournalAndStream jas) throws IOException {
442          if (jas.isActive()) {
443            jas.getCurrentStream().create(layoutVersion);
444          }
445        }
446      }, "create");
447    }
448
449    @Override
450    public void close() throws IOException {
451      mapJournalsAndReportErrors(new JournalClosure() {
452        @Override
453        public void apply(JournalAndStream jas) throws IOException {
454          jas.closeStream();
455        }
456      }, "close");
457    }
458
459    @Override
460    public void abort() throws IOException {
461      mapJournalsAndReportErrors(new JournalClosure() {
462        @Override
463        public void apply(JournalAndStream jas) throws IOException {
464          jas.abort();
465        }
466      }, "abort");
467    }
468
469    @Override
470    public void setReadyToFlush() throws IOException {
471      mapJournalsAndReportErrors(new JournalClosure() {
472        @Override
473        public void apply(JournalAndStream jas) throws IOException {
474          if (jas.isActive()) {
475            jas.getCurrentStream().setReadyToFlush();
476          }
477        }
478      }, "setReadyToFlush");
479    }
480
481    @Override
482    protected void flushAndSync(final boolean durable) throws IOException {
483      mapJournalsAndReportErrors(new JournalClosure() {
484        @Override
485        public void apply(JournalAndStream jas) throws IOException {
486          if (jas.isActive()) {
487            jas.getCurrentStream().flushAndSync(durable);
488          }
489        }
490      }, "flushAndSync");
491    }
492    
493    @Override
494    public void flush() throws IOException {
495      mapJournalsAndReportErrors(new JournalClosure() {
496        @Override
497        public void apply(JournalAndStream jas) throws IOException {
498          if (jas.isActive()) {
499            jas.getCurrentStream().flush();
500          }
501        }
502      }, "flush");
503    }
504    
505    @Override
506    public boolean shouldForceSync() {
507      for (JournalAndStream js : journals) {
508        if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
509          return true;
510        }
511      }
512      return false;
513    }
514    
515    @Override
516    protected long getNumSync() {
517      for (JournalAndStream jas : journals) {
518        if (jas.isActive()) {
519          return jas.getCurrentStream().getNumSync();
520        }
521      }
522      return 0;
523    }
524  }
525
526  @Override
527  public void setOutputBufferCapacity(final int size) {
528    try {
529      mapJournalsAndReportErrors(new JournalClosure() {
530        @Override
531        public void apply(JournalAndStream jas) throws IOException {
532            jas.getManager().setOutputBufferCapacity(size);
533        }
534      }, "setOutputBufferCapacity");
535    } catch (IOException e) {
536      LOG.error("Error in setting outputbuffer capacity");
537    }
538  }
539  
540  List<JournalAndStream> getAllJournalStreams() {
541    return journals;
542  }
543
544  List<JournalManager> getJournalManagers() {
545    List<JournalManager> jList = new ArrayList<JournalManager>();
546    for (JournalAndStream j : journals) {
547      jList.add(j.getManager());
548    }
549    return jList;
550  }
551  
552  void add(JournalManager j, boolean required) {
553    add(j, required, false);
554  }
555  
556  void add(JournalManager j, boolean required, boolean shared) {
557    JournalAndStream jas = new JournalAndStream(j, required, shared);
558    journals.add(jas);
559  }
560  
561  void remove(JournalManager j) {
562    JournalAndStream jasToRemove = null;
563    for (JournalAndStream jas: journals) {
564      if (jas.getManager().equals(j)) {
565        jasToRemove = jas;
566        break;
567      }
568    }
569    if (jasToRemove != null) {
570      jasToRemove.abort();
571      journals.remove(jasToRemove);
572    }
573  }
574
575  @Override
576  public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
577    mapJournalsAndReportErrors(new JournalClosure() {
578      @Override
579      public void apply(JournalAndStream jas) throws IOException {
580        jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
581      }
582    }, "purgeLogsOlderThan " + minTxIdToKeep);
583  }
584
585  @Override
586  public void recoverUnfinalizedSegments() throws IOException {
587    mapJournalsAndReportErrors(new JournalClosure() {
588      @Override
589      public void apply(JournalAndStream jas) throws IOException {
590        jas.getManager().recoverUnfinalizedSegments();
591      }
592    }, "recoverUnfinalizedSegments");
593  }
594  
595  /**
596   * Return a manifest of what finalized edit logs are available. All available
597   * edit logs are returned starting from the transaction id passed. If
598   * 'fromTxId' falls in the middle of a log, that log is returned as well.
599   * 
600   * @param fromTxId Starting transaction id to read the logs.
601   * @return RemoteEditLogManifest object.
602   */
603  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId) {
604    // Collect RemoteEditLogs available from each FileJournalManager
605    List<RemoteEditLog> allLogs = Lists.newArrayList();
606    for (JournalAndStream j : journals) {
607      if (j.getManager() instanceof FileJournalManager) {
608        FileJournalManager fjm = (FileJournalManager)j.getManager();
609        try {
610          allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, false));
611        } catch (Throwable t) {
612          LOG.warn("Cannot list edit logs in " + fjm, t);
613        }
614      }
615    }
616    
617    // Group logs by their starting txid
618    ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
619      Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
620    long curStartTxId = fromTxId;
621
622    List<RemoteEditLog> logs = Lists.newArrayList();
623    while (true) {
624      ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
625      if (logGroup.isEmpty()) {
626        // we have a gap in logs - for example because we recovered some old
627        // storage directory with ancient logs. Clear out any logs we've
628        // accumulated so far, and then skip to the next segment of logs
629        // after the gap.
630        SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
631        startTxIds = startTxIds.tailSet(curStartTxId);
632        if (startTxIds.isEmpty()) {
633          break;
634        } else {
635          if (LOG.isDebugEnabled()) {
636            LOG.debug("Found gap in logs at " + curStartTxId + ": " +
637                "not returning previous logs in manifest.");
638          }
639          logs.clear();
640          curStartTxId = startTxIds.first();
641          continue;
642        }
643      }
644
645      // Find the one that extends the farthest forward
646      RemoteEditLog bestLog = Collections.max(logGroup);
647      logs.add(bestLog);
648      // And then start looking from after that point
649      curStartTxId = bestLog.getEndTxId() + 1;
650    }
651    RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);
652    
653    if (LOG.isDebugEnabled()) {
654      LOG.debug("Generated manifest for logs since " + fromTxId + ":"
655          + ret);      
656    }
657    return ret;
658  }
659
660  /**
661   * Add sync times to the buffer.
662   */
663  String getSyncTimes() {
664    StringBuilder buf = new StringBuilder();
665    for (JournalAndStream jas : journals) {
666      if (jas.isActive()) {
667        buf.append(jas.getCurrentStream().getTotalSyncTime());
668        buf.append(" ");
669      }
670    }
671    return buf.toString();
672  }
673
674  @Override
675  public void discardSegments(long startTxId) throws IOException {
676    // This operation is handled by FSEditLog directly.
677    throw new UnsupportedOperationException();
678  }
679
680  @Override
681  public void doPreUpgrade() throws IOException {
682    // This operation is handled by FSEditLog directly.
683    throw new UnsupportedOperationException();
684  }
685
686  @Override
687  public void doUpgrade(Storage storage) throws IOException {
688    // This operation is handled by FSEditLog directly.
689    throw new UnsupportedOperationException();
690  }
691  
692  @Override
693  public void doFinalize() throws IOException {
694    // This operation is handled by FSEditLog directly.
695    throw new UnsupportedOperationException();
696  }
697
698  @Override
699  public boolean canRollBack(StorageInfo storage, StorageInfo prevStorage, int targetLayoutVersion) throws IOException {
700    // This operation is handled by FSEditLog directly.
701    throw new UnsupportedOperationException();
702  }
703
704  @Override
705  public void doRollback() throws IOException {
706    // This operation is handled by FSEditLog directly.
707    throw new UnsupportedOperationException();
708  }
709
710  @Override
711  public long getJournalCTime() throws IOException {
712    // This operation is handled by FSEditLog directly.
713    throw new UnsupportedOperationException();
714  }
715}