/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.util.ExitUtil.terminate;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;

import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;

/**
 * Manages a collection of Journals. None of the methods are synchronized;
 * it is assumed that the FSEditLog methods that use this class provide
 * proper synchronization.
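 *
 * <p>A rough usage sketch (the JournalManager instance, {@code op}, and
 * {@code lastTxId} below are illustrative placeholders, not part of this
 * class):
 * <pre>{@code
 *   JournalSet js = new JournalSet(1);  // require at least one healthy journal
 *   js.add(someJournalManager, true);   // register a required journal
 *   EditLogOutputStream out = js.startLogSegment(1);
 *   out.write(op);                      // fans out to all active journals
 *   out.setReadyToFlush();
 *   out.flush();
 *   js.finalizeLogSegment(1, lastTxId);
 * }</pre>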
 */
public class JournalSet implements JournalManager {

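  // Note: the logger is keyed on FSEditLog.class, so JournalSet messages
  // appear under FSEditLog's log category.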
  static final Log LOG = LogFactory.getLog(FSEditLog.class);

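  /**
   * Orders streams by ascending first txid; among streams with the same
   * first txid, the one with the higher last txid (i.e. covering more
   * transactions) sorts first.
   */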
  public static final Comparator<EditLogInputStream>
    EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
      @Override
      public int compare(EditLogInputStream a, EditLogInputStream b) {
        return ComparisonChain.start().
          compare(a.getFirstTxId(), b.getFirstTxId()).
          compare(b.getLastTxId(), a.getLastTxId()).
          result();
      }
    };

  /**
   * Container for a JournalManager paired with its currently
   * active stream.
   *
   * If a Journal gets disabled due to an error writing to its
   * stream, then the stream will be aborted and set to null.
   */
  static class JournalAndStream implements CheckableNameNodeResource {
    private final JournalManager journal;
    private boolean disabled = false;
    private EditLogOutputStream stream;
    private boolean required = false;

    public JournalAndStream(JournalManager manager, boolean required) {
      this.journal = manager;
      this.required = required;
    }

    public void startLogSegment(long txId) throws IOException {
      Preconditions.checkState(stream == null);
      disabled = false;
      stream = journal.startLogSegment(txId);
    }

    /**
     * Closes the stream and sets it to null.
     */
    public void closeStream() throws IOException {
      if (stream == null) return;
      stream.close();
      stream = null;
    }

    /**
     * Closes the journal and its stream.
     */
    public void close() throws IOException {
      closeStream();

      journal.close();
    }

    /**
     * Aborts the stream and sets it to null.
     */
    public void abort() {
      if (stream == null) return;
      try {
        stream.abort();
      } catch (IOException ioe) {
        LOG.error("Unable to abort stream " + stream, ioe);
      }
      stream = null;
    }

    boolean isActive() {
      return stream != null;
    }

    /**
     * Should be used outside JournalSet only for testing.
     */
    EditLogOutputStream getCurrentStream() {
      return stream;
    }

    @Override
    public String toString() {
      return "JournalAndStream(mgr=" + journal +
        ", stream=" + stream + ")";
    }

    void setCurrentStreamForTests(EditLogOutputStream stream) {
      this.stream = stream;
    }

    JournalManager getManager() {
      return journal;
    }

    boolean isDisabled() {
      return disabled;
    }

    private void setDisabled(boolean disabled) {
      this.disabled = disabled;
    }

    @Override
    public boolean isResourceAvailable() {
      return !isDisabled();
    }

    @Override
    public boolean isRequired() {
      return required;
    }
  }

  // COW implementation is necessary since some users (e.g. the web UI) call
  // getAllJournalStreams() and then iterate. Since this is rarely
  // mutated, there is no performance concern.
  private List<JournalAndStream> journals =
      new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
  final int minimumRedundantJournals;

  JournalSet(int minimumRedundantResources) {
    this.minimumRedundantJournals = minimumRedundantResources;
  }

  @Override
  public void format(NamespaceInfo nsInfo) throws IOException {
    // The iteration is done by FSEditLog itself
    throw new UnsupportedOperationException();
  }

  @Override
  public boolean hasSomeData() throws IOException {
    // This is called individually on the underlying journals,
    // not on the JournalSet.
    throw new UnsupportedOperationException();
  }

  @Override
  public EditLogOutputStream startLogSegment(final long txId) throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.startLogSegment(txId);
      }
    }, "starting log segment " + txId);
    return new JournalSetOutputStream();
  }

  @Override
  public void finalizeLogSegment(final long firstTxId, final long lastTxId)
      throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        if (jas.isActive()) {
          jas.closeStream();
          jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
        }
      }
    }, "finalize log segment " + firstTxId + ", " + lastTxId);
  }

  @Override
  public void close() throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.close();
      }
    }, "close journal");
  }

  /**
   * Gathers input streams from each of the underlying JournalManagers and
   * adds them, one by one, to the given collection.
   *
   * @param streams          The collection to add the streams to.  It may or
   *                         may not be sorted; this is up to the caller.
   * @param fromTxId         The transaction ID to start looking for streams at
   * @param inProgressOk     Should we consider unfinalized streams?
   * @param forReading       Whether or not the caller intends to read from
   *                         the returned streams.
   */
  @Override
  public void selectInputStreams(Collection<EditLogInputStream> streams,
      long fromTxId, boolean inProgressOk, boolean forReading) throws IOException {
    final PriorityQueue<EditLogInputStream> allStreams =
        new PriorityQueue<EditLogInputStream>(64,
            EDIT_LOG_INPUT_STREAM_COMPARATOR);
    for (JournalAndStream jas : journals) {
      if (jas.isDisabled()) {
        LOG.info("Skipping jas " + jas + " since it's disabled");
        continue;
      }
      try {
        jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk,
            forReading);
      } catch (IOException ioe) {
        LOG.warn("Unable to determine input streams from " + jas.getManager() +
            ". Skipping.", ioe);
      }
    }
    chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
  }

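  /**
   * Groups the streams in allStreams by their first transaction ID and wraps
   * each group in a {@link RedundantEditLogInputStream}, so that if one copy
   * of a log segment fails mid-read an equivalent copy can take over. For
   * example, streams covering txids [1-100], [1-100], and [101-200] become
   * two redundant streams: one for [1-100] (with a backup) and one for
   * [101-200].
   */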
  public static void chainAndMakeRedundantStreams(
      Collection<EditLogInputStream> outStreams,
      PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
    // We want to group together all the streams that start on the same start
    // transaction ID.  To do this, we maintain an accumulator (acc) of all
    // the streams we've seen at a given start transaction ID.  When we see a
    // higher start transaction ID, we select a stream from the accumulator and
    // clear it.  Then we begin accumulating streams with the new, higher start
    // transaction ID.
    LinkedList<EditLogInputStream> acc =
        new LinkedList<EditLogInputStream>();
    EditLogInputStream elis;
    while ((elis = allStreams.poll()) != null) {
      if (acc.isEmpty()) {
        acc.add(elis);
      } else {
        long accFirstTxId = acc.get(0).getFirstTxId();
        if (accFirstTxId == elis.getFirstTxId()) {
          acc.add(elis);
        } else if (accFirstTxId < elis.getFirstTxId()) {
          outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
          acc.clear();
          acc.add(elis);
        } else if (accFirstTxId > elis.getFirstTxId()) {
          throw new RuntimeException("sorted set invariants violated!  " +
              "Got stream with first txid " + elis.getFirstTxId() +
              ", but the last firstTxId was " + accFirstTxId);
        }
      }
    }
    if (!acc.isEmpty()) {
      outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
      acc.clear();
    }
  }

  /**
   * @return true if there are no journals, if all redundant journals are
   * disabled, or if any required journal is disabled.
   */
  public boolean isEmpty() {
    return !NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals);
  }

  /**
   * Called when some journals experience an error in some operation.
   */
  private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
    if (badJournals == null || badJournals.isEmpty()) {
      return; // nothing to do
    }

    for (JournalAndStream j : badJournals) {
      LOG.error("Disabling journal " + j);
      j.abort();
      j.setDisabled(true);
    }
  }

  /**
   * Implementations of this interface encapsulate operations that can be
   * iteratively applied on all the journals. For example see
   * {@link JournalSet#mapJournalsAndReportErrors}.
   */
  private interface JournalClosure {
    /**
     * The operation on JournalAndStream.
     * @param jas Object on which operations are performed.
     * @throws IOException If the operation fails.
     */
    public void apply(JournalAndStream jas) throws IOException;
  }

  /**
   * Apply the given operation across all of the journal managers, disabling
   * any for which the closure throws an exception, and terminating the
   * NameNode if the closure fails on a required journal.
   * @param closure {@link JournalClosure} object encapsulating the operation.
   * @param status message used for logging errors (e.g. "opening journal")
   * @throws IOException If the operation fails on too many journals.
   */
  private void mapJournalsAndReportErrors(
      JournalClosure closure, String status) throws IOException {

    List<JournalAndStream> badJAS = Lists.newLinkedList();
    for (JournalAndStream jas : journals) {
      try {
        closure.apply(jas);
      } catch (Throwable t) {
        if (jas.isRequired()) {
          final String msg = "Error: " + status + " failed for required journal ("
            + jas + ")";
          LOG.fatal(msg, t);
          // If we fail on *any* of the required journals, then we must not
          // continue on any of the other journals. Abort them to ensure that
          // retry behavior doesn't allow them to keep going in any way.
          abortAllJournals();
          // The current policy is to shut down the NN on errors to the shared
          // edits dir. There are many code paths to shared edits failures -
          // syncs, roll of edits etc. All of them go through this common
          // function where the isRequired() check is made. Applying exit
          // policy here to catch all code paths.
          terminate(1, msg);
        } else {
          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
          badJAS.add(jas);
        }
      }
    }
    disableAndReportErrorOnJournals(badJAS);
    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals)) {
      String message = status + " failed for too many journals";
      LOG.error("Error: " + message);
      throw new IOException(message);
    }
  }

  /**
   * Abort all of the underlying streams.
   */
  private void abortAllJournals() {
    for (JournalAndStream jas : journals) {
      if (jas.isActive()) {
        jas.abort();
      }
    }
  }

  /**
   * An implementation of EditLogOutputStream that applies a requested method on
   * all the journals that are currently active.
   */
  private class JournalSetOutputStream extends EditLogOutputStream {

    JournalSetOutputStream() throws IOException {
      super();
    }

    @Override
    public void write(final FSEditLogOp op)
        throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().write(op);
          }
        }
      }, "write op");
    }

    @Override
    public void writeRaw(final byte[] data, final int offset, final int length)
        throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().writeRaw(data, offset, length);
          }
        }
      }, "write bytes");
    }

    @Override
    public void create() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().create();
          }
        }
      }, "create");
    }

    @Override
    public void close() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.closeStream();
        }
      }, "close");
    }

    @Override
    public void abort() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.abort();
        }
      }, "abort");
    }

    @Override
    public void setReadyToFlush() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().setReadyToFlush();
          }
        }
      }, "setReadyToFlush");
    }

    @Override
    protected void flushAndSync(final boolean durable) throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().flushAndSync(durable);
          }
        }
      }, "flushAndSync");
    }

    @Override
    public void flush() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().flush();
          }
        }
      }, "flush");
    }

    @Override
    public boolean shouldForceSync() {
      for (JournalAndStream js : journals) {
        if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
          return true;
        }
      }
      return false;
    }

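    // All active streams receive the same sync calls through this class, so
    // the count from the first active stream should be representative of all
    // of them.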
    @Override
    protected long getNumSync() {
      for (JournalAndStream jas : journals) {
        if (jas.isActive()) {
          return jas.getCurrentStream().getNumSync();
        }
      }
      return 0;
    }
  }

  @Override
  public void setOutputBufferCapacity(final int size) {
    try {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          jas.getManager().setOutputBufferCapacity(size);
        }
      }, "setOutputBufferCapacity");
    } catch (IOException e) {
      LOG.error("Error in setting output buffer capacity", e);
    }
  }

  List<JournalAndStream> getAllJournalStreams() {
    return journals;
  }

  List<JournalManager> getJournalManagers() {
    List<JournalManager> jList = new ArrayList<JournalManager>();
    for (JournalAndStream j : journals) {
      jList.add(j.getManager());
    }
    return jList;
  }

  void add(JournalManager j, boolean required) {
    JournalAndStream jas = new JournalAndStream(j, required);
    journals.add(jas);
  }

  void remove(JournalManager j) {
    JournalAndStream jasToRemove = null;
    for (JournalAndStream jas : journals) {
      if (jas.getManager().equals(j)) {
        jasToRemove = jas;
        break;
      }
    }
    if (jasToRemove != null) {
      jasToRemove.abort();
      journals.remove(jasToRemove);
    }
  }

  @Override
  public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
      }
    }, "purgeLogsOlderThan " + minTxIdToKeep);
  }

  @Override
  public void recoverUnfinalizedSegments() throws IOException {
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.getManager().recoverUnfinalizedSegments();
      }
    }, "recoverUnfinalizedSegments");
  }

  /**
   * Return a manifest of what finalized edit logs are available. All available
   * edit logs are returned starting from the transaction id passed.
   *
   * @param fromTxId Starting transaction id to read the logs.
   * @return RemoteEditLogManifest object.
   */
  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId,
      boolean forReading) {
    // Collect RemoteEditLogs available from each FileJournalManager
    List<RemoteEditLog> allLogs = Lists.newArrayList();
    for (JournalAndStream j : journals) {
      if (j.getManager() instanceof FileJournalManager) {
        FileJournalManager fjm = (FileJournalManager)j.getManager();
        try {
          allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, forReading, false));
        } catch (Throwable t) {
          LOG.warn("Cannot list edit logs in " + fjm, t);
        }
      }
    }

    // Group logs by their starting txid
    ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
      Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
    long curStartTxId = fromTxId;

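    // Walk forward from fromTxId, at each step taking the log segment that
    // extends farthest; e.g. segments covering txids [1-100], [1-150], and
    // [151-200] produce the manifest [1-150, 151-200].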
    List<RemoteEditLog> logs = Lists.newArrayList();
    while (true) {
      ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
      if (logGroup.isEmpty()) {
        // we have a gap in logs - for example because we recovered some old
        // storage directory with ancient logs. Clear out any logs we've
        // accumulated so far, and then skip to the next segment of logs
        // after the gap.
        SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
        startTxIds = startTxIds.tailSet(curStartTxId);
        if (startTxIds.isEmpty()) {
          break;
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Found gap in logs at " + curStartTxId + ": " +
                "not returning previous logs in manifest.");
          }
          logs.clear();
          curStartTxId = startTxIds.first();
          continue;
        }
      }

      // Find the one that extends the farthest forward
      RemoteEditLog bestLog = Collections.max(logGroup);
      logs.add(bestLog);
      // And then start looking from after that point
      curStartTxId = bestLog.getEndTxId() + 1;
    }
    RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Generated manifest for logs since " + fromTxId + ":"
          + ret);
    }
    return ret;
  }

  /**
   * Return a string listing the total sync time of each active journal's
   * stream, separated by spaces.
   */
  String getSyncTimes() {
    StringBuilder buf = new StringBuilder();
    for (JournalAndStream jas : journals) {
      if (jas.isActive()) {
        buf.append(jas.getCurrentStream().getTotalSyncTime());
        buf.append(" ");
      }
    }
    return buf.toString();
  }
}