001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.Time.now;
021    
022    import java.io.File;
023    import java.io.FilterInputStream;
024    import java.io.IOException;
025    import java.io.InputStream;
026    import java.util.ArrayList;
027    import java.util.Arrays;
028    import java.util.EnumMap;
029    import java.util.List;
030    
031    import org.apache.commons.logging.Log;
032    import org.apache.commons.logging.LogFactory;
033    import org.apache.hadoop.classification.InterfaceAudience;
034    import org.apache.hadoop.classification.InterfaceStability;
035    import org.apache.hadoop.fs.FileSystem;
036    import org.apache.hadoop.hdfs.protocol.Block;
037    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
038    import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
039    import org.apache.hadoop.hdfs.protocol.LayoutVersion;
040    import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
041    import org.apache.hadoop.hdfs.protocol.LocatedBlock;
042    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
043    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
044    import org.apache.hadoop.hdfs.server.common.Storage;
045    import org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream.LogHeaderCorruptException;
046    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
047    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllocateBlockIdOp;
048    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllowSnapshotOp;
049    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.BlockListUpdatingOp;
050    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
051    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
052    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
053    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CreateSnapshotOp;
054    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
055    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteSnapshotOp;
056    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DisallowSnapshotOp;
057    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.GetDelegationTokenOp;
058    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
059    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ReassignLeaseOp;
060    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOldOp;
061    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOp;
062    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameSnapshotOp;
063    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenewDelegationTokenOp;
064    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV1Op;
065    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV2Op;
066    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetNSQuotaOp;
067    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
068    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
069    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
070    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
071    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
072    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
073    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateBlocksOp;
074    import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp;
075    import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
076    import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
077    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
078    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
079    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
080    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
081    import org.apache.hadoop.hdfs.util.Holder;
082    
083    import com.google.common.base.Joiner;
084    
085    @InterfaceAudience.Private
086    @InterfaceStability.Evolving
087    public class FSEditLogLoader {
  /** Logger used by this loader. */
  static final Log LOG = LogFactory.getLog(FSEditLogLoader.class.getName());
  // Minimum gap between two "replaying edit log" progress messages; it is
  // compared against differences of now() values in loadEditRecords.
  static long REPLAY_TRANSACTION_LOG_INTERVAL = 1000; // 1sec
  /** Namesystem whose in-memory state the replayed operations modify. */
  private final FSNamesystem fsNamesys;
  /**
   * Transaction id of the most recently replayed operation; seeded by the
   * constructor and advanced by loadEditRecords.
   */
  private long lastAppliedTxId;
092      
093      public FSEditLogLoader(FSNamesystem fsNamesys, long lastAppliedTxId) {
094        this.fsNamesys = fsNamesys;
095        this.lastAppliedTxId = lastAppliedTxId;
096      }
097      
098      /**
099       * Load an edit log, and apply the changes to the in-memory structure
100       * This is where we apply edits that we've been writing to disk all
101       * along.
102       */
103      long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId,
104          MetaRecoveryContext recovery) throws IOException {
105        StartupProgress prog = NameNode.getStartupProgress();
106        Step step = createStartupProgressStep(edits);
107        prog.beginStep(Phase.LOADING_EDITS, step);
108        fsNamesys.writeLock();
109        try {
110          long startTime = now();
111          FSImage.LOG.info("Start loading edits file " + edits.getName());
112          long numEdits = loadEditRecords(edits, false, 
113                                     expectedStartingTxId, recovery);
114          FSImage.LOG.info("Edits file " + edits.getName() 
115              + " of size " + edits.length() + " edits # " + numEdits 
116              + " loaded in " + (now()-startTime)/1000 + " seconds");
117          return numEdits;
118        } finally {
119          edits.close();
120          fsNamesys.writeUnlock();
121          prog.endStep(Phase.LOADING_EDITS, step);
122        }
123      }
124    
125      long loadEditRecords(EditLogInputStream in, boolean closeOnExit,
126                          long expectedStartingTxId, MetaRecoveryContext recovery)
127          throws IOException {
128        FSDirectory fsDir = fsNamesys.dir;
129    
130        EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
131          new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);
132    
133        if (LOG.isTraceEnabled()) {
134          LOG.trace("Acquiring write lock to replay edit log");
135        }
136    
137        fsNamesys.writeLock();
138        fsDir.writeLock();
139    
140        long recentOpcodeOffsets[] = new long[4];
141        Arrays.fill(recentOpcodeOffsets, -1);
142        
143        long expectedTxId = expectedStartingTxId;
144        long numEdits = 0;
145        long lastTxId = in.getLastTxId();
146        long numTxns = (lastTxId - expectedStartingTxId) + 1;
147        StartupProgress prog = NameNode.getStartupProgress();
148        Step step = createStartupProgressStep(in);
149        prog.setTotal(Phase.LOADING_EDITS, step, numTxns);
150        Counter counter = prog.getCounter(Phase.LOADING_EDITS, step);
151        long lastLogTime = now();
152        long lastInodeId = fsNamesys.getLastInodeId();
153        
154        try {
155          while (true) {
156            try {
157              FSEditLogOp op;
158              try {
159                op = in.readOp();
160                if (op == null) {
161                  break;
162                }
163              } catch (Throwable e) {
164                // Handle a problem with our input
165                check203UpgradeFailure(in.getVersion(), e);
166                String errorMessage =
167                  formatEditLogReplayError(in, recentOpcodeOffsets, expectedTxId);
168                FSImage.LOG.error(errorMessage, e);
169                if (recovery == null) {
170                   // We will only try to skip over problematic opcodes when in
171                   // recovery mode.
172                  throw new EditLogInputException(errorMessage, e, numEdits);
173                }
174                MetaRecoveryContext.editLogLoaderPrompt(
175                    "We failed to read txId " + expectedTxId,
176                    recovery, "skipping the bad section in the log");
177                in.resync();
178                continue;
179              }
180              recentOpcodeOffsets[(int)(numEdits % recentOpcodeOffsets.length)] =
181                in.getPosition();
182              if (op.hasTransactionId()) {
183                if (op.getTransactionId() > expectedTxId) { 
184                  MetaRecoveryContext.editLogLoaderPrompt("There appears " +
185                      "to be a gap in the edit log.  We expected txid " +
186                      expectedTxId + ", but got txid " +
187                      op.getTransactionId() + ".", recovery, "ignoring missing " +
188                      " transaction IDs");
189                } else if (op.getTransactionId() < expectedTxId) { 
190                  MetaRecoveryContext.editLogLoaderPrompt("There appears " +
191                      "to be an out-of-order edit in the edit log.  We " +
192                      "expected txid " + expectedTxId + ", but got txid " +
193                      op.getTransactionId() + ".", recovery,
194                      "skipping the out-of-order edit");
195                  continue;
196                }
197              }
198              try {
199                long inodeId = applyEditLogOp(op, fsDir, in.getVersion(), lastInodeId);
200                if (lastInodeId < inodeId) {
201                  lastInodeId = inodeId;
202                }
203              } catch (Throwable e) {
204                LOG.error("Encountered exception on operation " + op, e);
205                MetaRecoveryContext.editLogLoaderPrompt("Failed to " +
206                 "apply edit log operation " + op + ": error " +
207                 e.getMessage(), recovery, "applying edits");
208              }
209              // Now that the operation has been successfully decoded and
210              // applied, update our bookkeeping.
211              incrOpCount(op.opCode, opCounts, step, counter);
212              if (op.hasTransactionId()) {
213                lastAppliedTxId = op.getTransactionId();
214                expectedTxId = lastAppliedTxId + 1;
215              } else {
216                expectedTxId = lastAppliedTxId = expectedStartingTxId;
217              }
218              // log progress
219              if (op.hasTransactionId()) {
220                long now = now();
221                if (now - lastLogTime > REPLAY_TRANSACTION_LOG_INTERVAL) {
222                  long deltaTxId = lastAppliedTxId - expectedStartingTxId + 1;
223                  int percent = Math.round((float) deltaTxId / numTxns * 100);
224                  LOG.info("replaying edit log: " + deltaTxId + "/" + numTxns
225                      + " transactions completed. (" + percent + "%)");
226                  lastLogTime = now;
227                }
228              }
229              numEdits++;
230            } catch (MetaRecoveryContext.RequestStopException e) {
231              MetaRecoveryContext.LOG.warn("Stopped reading edit log at " +
232                  in.getPosition() + "/"  + in.length());
233              break;
234            }
235          }
236        } finally {
237          fsNamesys.resetLastInodeId(lastInodeId);
238          if(closeOnExit) {
239            in.close();
240          }
241          fsDir.writeUnlock();
242          fsNamesys.writeUnlock();
243    
244          if (LOG.isTraceEnabled()) {
245            LOG.trace("replaying edit log finished");
246          }
247    
248          if (FSImage.LOG.isDebugEnabled()) {
249            dumpOpCounts(opCounts);
250          }
251        }
252        return numEdits;
253      }
254      
255      // allocate and update last allocated inode id
256      private long getAndUpdateLastInodeId(long inodeIdFromOp, int logVersion,
257          long lastInodeId) throws IOException {
258        long inodeId = inodeIdFromOp;
259    
260        if (inodeId == INodeId.GRANDFATHER_INODE_ID) {
261          if (LayoutVersion.supports(Feature.ADD_INODE_ID, logVersion)) {
262            throw new IOException("The layout version " + logVersion
263                + " supports inodeId but gave bogus inodeId");
264          }
265          inodeId = fsNamesys.allocateNewInodeId();
266        } else {
267          // need to reset lastInodeId. fsnamesys gets lastInodeId firstly from
268          // fsimage but editlog captures more recent inodeId allocations
269          if (inodeId > lastInodeId) {
270            fsNamesys.resetLastInodeId(inodeId);
271          }
272        }
273        return inodeId;
274      }
275    
  /**
   * Apply a single edit log operation to the in-memory namespace, dispatching
   * on the operation's opcode.
   *
   * For operations that carry RPC ids, an entry is also added to the
   * namesystem's retry cache when the namesystem has one.
   *
   * @param op operation to apply
   * @param fsDir directory the operation is applied to
   * @param logVersion layout version of the edit log the operation came from
   * @param lastInodeId highest inode id known to the caller so far
   * @return the inode id used when the operation created a new file,
   *         directory or symlink; INodeId.GRANDFATHER_INODE_ID otherwise
   * @throws IOException if the opcode is not handled here or the operation
   *         cannot be applied
   */
  @SuppressWarnings("deprecation")
  private long applyEditLogOp(FSEditLogOp op, FSDirectory fsDir,
      int logVersion, long lastInodeId) throws IOException {
    long inodeId = INodeId.GRANDFATHER_INODE_ID;
    if (LOG.isTraceEnabled()) {
      LOG.trace("replaying edit log: " + op);
    }
    // Only record retry cache entries when the namesystem keeps a retry
    // cache and the operation carries RPC ids.
    final boolean toAddRetryCache = fsNamesys.hasRetryCache() && op.hasRpcIds();
    
    switch (op.opCode) {
    case OP_ADD: {
      AddCloseOp addCloseOp = (AddCloseOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }
      // There are three cases here:
      // 1. OP_ADD to create a new file
      // 2. OP_ADD to update file blocks
      // 3. OP_ADD to open file for append

      // See if the file already exists (persistBlocks call)
      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(
          iip.getINode(0), addCloseOp.path, true);
      INodeFile newFile = oldFile;
      if (oldFile == null) { // this is OP_ADD on a new file (case 1)
        // versions > 0 support per file replication
        // get name and replication
        final short replication = fsNamesys.getBlockManager()
            .adjustReplication(addCloseOp.replication);
        assert addCloseOp.blocks.length == 0;

        // add to the file tree
        inodeId = getAndUpdateLastInodeId(addCloseOp.inodeId, logVersion,
            lastInodeId);
        newFile = fsDir.unprotectedAddFile(inodeId,
            addCloseOp.path, addCloseOp.permissions, replication,
            addCloseOp.mtime, addCloseOp.atime, addCloseOp.blockSize, true,
            addCloseOp.clientName, addCloseOp.clientMachine);
        fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);

        // add the op into retry cache if necessary
        if (toAddRetryCache) {
          HdfsFileStatus stat = fsNamesys.dir.createFileStatus(
              HdfsFileStatus.EMPTY_NAME, newFile, null);
          fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
              addCloseOp.rpcCallId, stat);
        }
      } else { // This is OP_ADD on an existing file
        if (!oldFile.isUnderConstruction()) {
          // This is case 3: a call to append() on an already-closed file.
          if (FSNamesystem.LOG.isDebugEnabled()) {
            FSNamesystem.LOG.debug("Reopening an already-closed file " +
                "for append");
          }
          LocatedBlock lb = fsNamesys.prepareFileForWrite(addCloseOp.path,
              oldFile, addCloseOp.clientName, addCloseOp.clientMachine, null,
              false, iip.getLatestSnapshot(), false);
          // Look the inode up again after prepareFileForWrite.
          newFile = INodeFile.valueOf(fsDir.getINode(addCloseOp.path),
              addCloseOp.path, true);
          
          // add the op into retry cache if necessary
          if (toAddRetryCache) {
            fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
                addCloseOp.rpcCallId, lb);
          }
        }
      }
      // Fall-through for case 2.
      // Regardless of whether it's a new file or an updated file,
      // update the block list.
      
      // Update the salient file attributes.
      newFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      newFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, newFile);
      break;
    }
    case OP_CLOSE: {
      AddCloseOp addCloseOp = (AddCloseOp)op;
      
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }

      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(iip.getINode(0), addCloseOp.path);

      // Update the salient file attributes.
      oldFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      oldFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, oldFile);

      // Now close the file
      if (!oldFile.isUnderConstruction() &&
          logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
        // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
        // could show up twice in a row. But after that version, this
        // should be fixed, so we should treat it as an error.
        throw new IOException(
            "File is not under construction: " + addCloseOp.path);
      }
      // One might expect that you could use removeLease(holder, path) here,
      // but OP_CLOSE doesn't serialize the holder. So, remove by path.
      if (oldFile.isUnderConstruction()) {
        INodeFileUnderConstruction ucFile = (INodeFileUnderConstruction) oldFile;
        fsNamesys.leaseManager.removeLeaseWithPrefixPath(addCloseOp.path);
        // Swap the under-construction inode for a regular file inode.
        INodeFile newFile = ucFile.toINodeFile(ucFile.getModificationTime());
        fsDir.unprotectedReplaceINodeFile(addCloseOp.path, ucFile, newFile);
      }
      break;
    }
    case OP_UPDATE_BLOCKS: {
      UpdateBlocksOp updateOp = (UpdateBlocksOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + updateOp.path +
            " numblocks : " + updateOp.blocks.length);
      }
      INodeFile oldFile = INodeFile.valueOf(fsDir.getINode(updateOp.path),
          updateOp.path);
      // Update in-memory data structures
      updateBlocks(fsDir, updateOp, oldFile);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(updateOp.rpcClientId, updateOp.rpcCallId);
      }
      break;
    }
      
    case OP_SET_REPLICATION: {
      SetReplicationOp setReplicationOp = (SetReplicationOp)op;
      short replication = fsNamesys.getBlockManager().adjustReplication(
          setReplicationOp.replication);
      fsDir.unprotectedSetReplication(setReplicationOp.path,
                                      replication, null);
      break;
    }
    case OP_CONCAT_DELETE: {
      ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
      fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
          concatDeleteOp.timestamp);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(concatDeleteOp.rpcClientId,
            concatDeleteOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_OLD: {
      RenameOldOp renameOp = (RenameOldOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_DELETE: {
      DeleteOp deleteOp = (DeleteOp)op;
      fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteOp.rpcClientId, deleteOp.rpcCallId);
      }
      break;
    }
    case OP_MKDIR: {
      MkdirOp mkdirOp = (MkdirOp)op;
      inodeId = getAndUpdateLastInodeId(mkdirOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedMkdir(inodeId, mkdirOp.path, mkdirOp.permissions,
                             mkdirOp.timestamp);
      break;
    }
    case OP_SET_GENSTAMP_V1: {
      SetGenstampV1Op setGenstampV1Op = (SetGenstampV1Op)op;
      fsNamesys.setGenerationStampV1(setGenstampV1Op.genStampV1);
      break;
    }
    case OP_SET_PERMISSIONS: {
      SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
      fsDir.unprotectedSetPermission(setPermissionsOp.src,
                                     setPermissionsOp.permissions);
      break;
    }
    case OP_SET_OWNER: {
      SetOwnerOp setOwnerOp = (SetOwnerOp)op;
      fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
                                setOwnerOp.groupname);
      break;
    }
    case OP_SET_NS_QUOTA: {
      SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
      // Only the namespace quota is taken from the op.
      fsDir.unprotectedSetQuota(setNSQuotaOp.src,
                                setNSQuotaOp.nsQuota,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }
    case OP_CLEAR_NS_QUOTA: {
      ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
      fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
                                HdfsConstants.QUOTA_RESET,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }

    // NOTE(review): unlike the other cases this one has no braces, so
    // setQuotaOp is scoped to the whole switch body.
    case OP_SET_QUOTA:
      SetQuotaOp setQuotaOp = (SetQuotaOp)op;
      fsDir.unprotectedSetQuota(setQuotaOp.src,
                                setQuotaOp.nsQuota,
                                setQuotaOp.dsQuota);
      break;

    case OP_TIMES: {
      TimesOp timesOp = (TimesOp)op;

      fsDir.unprotectedSetTimes(timesOp.path,
                                timesOp.mtime,
                                timesOp.atime, true);
      break;
    }
    case OP_SYMLINK: {
      // Refuse to replay symlink operations when symlinks are disabled.
      if (!FileSystem.isSymlinksEnabled()) {
        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
      }
      SymlinkOp symlinkOp = (SymlinkOp)op;
      inodeId = getAndUpdateLastInodeId(symlinkOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedAddSymlink(inodeId, symlinkOp.path,
                                  symlinkOp.value, symlinkOp.mtime, 
                                  symlinkOp.atime, symlinkOp.permissionStatus);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(symlinkOp.rpcClientId, symlinkOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME: {
      RenameOp renameOp = (RenameOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp, renameOp.options);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_GET_DELEGATION_TOKEN: {
      GetDelegationTokenOp getDelegationTokenOp
        = (GetDelegationTokenOp)op;

      fsNamesys.getDelegationTokenSecretManager()
        .addPersistedDelegationToken(getDelegationTokenOp.token,
                                     getDelegationTokenOp.expiryTime);
      break;
    }
    case OP_RENEW_DELEGATION_TOKEN: {
      RenewDelegationTokenOp renewDelegationTokenOp
        = (RenewDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
                                     renewDelegationTokenOp.expiryTime);
      break;
    }
    case OP_CANCEL_DELEGATION_TOKEN: {
      CancelDelegationTokenOp cancelDelegationTokenOp
        = (CancelDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
          .updatePersistedTokenCancellation(
              cancelDelegationTokenOp.token);
      break;
    }
    case OP_UPDATE_MASTER_KEY: {
      UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedMasterKey(updateMasterKeyOp.key);
      break;
    }
    case OP_REASSIGN_LEASE: {
      ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;

      // Look up the lease by its current holder, then hand it to the new
      // holder for the file at the op's path.
      Lease lease = fsNamesys.leaseManager.getLease(
          reassignLeaseOp.leaseHolder);
      INodeFileUnderConstruction pendingFile =
          INodeFileUnderConstruction.valueOf( 
              fsDir.getINode(reassignLeaseOp.path), reassignLeaseOp.path);
      fsNamesys.reassignLeaseInternal(lease,
          reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
      break;
    }
    case OP_START_LOG_SEGMENT:
    case OP_END_LOG_SEGMENT: {
      // no data in here currently.
      break;
    }
    case OP_CREATE_SNAPSHOT: {
      CreateSnapshotOp createSnapshotOp = (CreateSnapshotOp) op;
      String path = fsNamesys.getSnapshotManager().createSnapshot(
          createSnapshotOp.snapshotRoot, createSnapshotOp.snapshotName);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntryWithPayload(createSnapshotOp.rpcClientId,
            createSnapshotOp.rpcCallId, path);
      }
      break;
    }
    case OP_DELETE_SNAPSHOT: {
      DeleteSnapshotOp deleteSnapshotOp = (DeleteSnapshotOp) op;
      // deleteSnapshot fills these two collections; the blocks and inodes
      // they name are then removed from the namesystem below.
      BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
      List<INode> removedINodes = new ArrayList<INode>();
      fsNamesys.getSnapshotManager().deleteSnapshot(
          deleteSnapshotOp.snapshotRoot, deleteSnapshotOp.snapshotName,
          collectedBlocks, removedINodes);
      fsNamesys.removeBlocks(collectedBlocks);
      collectedBlocks.clear();
      fsNamesys.dir.removeFromInodeMap(removedINodes);
      removedINodes.clear();
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteSnapshotOp.rpcClientId,
            deleteSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_SNAPSHOT: {
      RenameSnapshotOp renameSnapshotOp = (RenameSnapshotOp) op;
      fsNamesys.getSnapshotManager().renameSnapshot(
          renameSnapshotOp.snapshotRoot, renameSnapshotOp.snapshotOldName,
          renameSnapshotOp.snapshotNewName);
      
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameSnapshotOp.rpcClientId,
            renameSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_ALLOW_SNAPSHOT: {
      AllowSnapshotOp allowSnapshotOp = (AllowSnapshotOp) op;
      // NOTE(review): the meaning of the literal false is defined by
      // SnapshotManager.setSnapshottable - confirm against that method.
      fsNamesys.getSnapshotManager().setSnapshottable(
          allowSnapshotOp.snapshotRoot, false);
      break;
    }
    case OP_DISALLOW_SNAPSHOT: {
      DisallowSnapshotOp disallowSnapshotOp = (DisallowSnapshotOp) op;
      fsNamesys.getSnapshotManager().resetSnapshottable(
          disallowSnapshotOp.snapshotRoot);
      break;
    }
    case OP_SET_GENSTAMP_V2: {
      SetGenstampV2Op setGenstampV2Op = (SetGenstampV2Op) op;
      fsNamesys.setGenerationStampV2(setGenstampV2Op.genStampV2);
      break;
    }
    case OP_ALLOCATE_BLOCK_ID: {
      AllocateBlockIdOp allocateBlockIdOp = (AllocateBlockIdOp) op;
      fsNamesys.setLastAllocatedBlockId(allocateBlockIdOp.blockId);
      break;
    }
    default:
      // Any opcode without a case above is rejected.
      throw new IOException("Invalid operation read " + op.opCode);
    }
    return inodeId;
  }
645      
646      private static String formatEditLogReplayError(EditLogInputStream in,
647          long recentOpcodeOffsets[], long txid) {
648        StringBuilder sb = new StringBuilder();
649        sb.append("Error replaying edit log at offset " + in.getPosition());
650        sb.append(".  Expected transaction ID was ").append(txid);
651        if (recentOpcodeOffsets[0] != -1) {
652          Arrays.sort(recentOpcodeOffsets);
653          sb.append("\nRecent opcode offsets:");
654          for (long offset : recentOpcodeOffsets) {
655            if (offset != -1) {
656              sb.append(' ').append(offset);
657            }
658          }
659        }
660        return sb.toString();
661      }
662    
663      /**
664       * Update in-memory data structures with new block information.
665       * @throws IOException
666       */
667      private void updateBlocks(FSDirectory fsDir, BlockListUpdatingOp op,
668          INodeFile file) throws IOException {
669        // Update its block list
670        BlockInfo[] oldBlocks = file.getBlocks();
671        Block[] newBlocks = op.getBlocks();
672        String path = op.getPath();
673        
674        // Are we only updating the last block's gen stamp.
675        boolean isGenStampUpdate = oldBlocks.length == newBlocks.length;
676        
677        // First, update blocks in common
678        for (int i = 0; i < oldBlocks.length && i < newBlocks.length; i++) {
679          BlockInfo oldBlock = oldBlocks[i];
680          Block newBlock = newBlocks[i];
681          
682          boolean isLastBlock = i == newBlocks.length - 1;
683          if (oldBlock.getBlockId() != newBlock.getBlockId() ||
684              (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() && 
685                  !(isGenStampUpdate && isLastBlock))) {
686            throw new IOException("Mismatched block IDs or generation stamps, " +
687                "attempting to replace block " + oldBlock + " with " + newBlock +
688                " as block # " + i + "/" + newBlocks.length + " of " +
689                path);
690          }
691          
692          oldBlock.setNumBytes(newBlock.getNumBytes());
693          boolean changeMade =
694            oldBlock.getGenerationStamp() != newBlock.getGenerationStamp();
695          oldBlock.setGenerationStamp(newBlock.getGenerationStamp());
696          
697          if (oldBlock instanceof BlockInfoUnderConstruction &&
698              (!isLastBlock || op.shouldCompleteLastBlock())) {
699            changeMade = true;
700            fsNamesys.getBlockManager().forceCompleteBlock(
701                (INodeFileUnderConstruction)file,
702                (BlockInfoUnderConstruction)oldBlock);
703          }
704          if (changeMade) {
705            // The state or gen-stamp of the block has changed. So, we may be
706            // able to process some messages from datanodes that we previously
707            // were unable to process.
708            fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
709          }
710        }
711        
712        if (newBlocks.length < oldBlocks.length) {
713          // We're removing a block from the file, e.g. abandonBlock(...)
714          if (!file.isUnderConstruction()) {
715            throw new IOException("Trying to remove a block from file " +
716                path + " which is not under construction.");
717          }
718          if (newBlocks.length != oldBlocks.length - 1) {
719            throw new IOException("Trying to remove more than one block from file "
720                + path);
721          }
722          Block oldBlock = oldBlocks[oldBlocks.length - 1];
723          boolean removed = fsDir.unprotectedRemoveBlock(path,
724              (INodeFileUnderConstruction) file, oldBlock);
725          if (!removed && !(op instanceof UpdateBlocksOp)) {
726            throw new IOException("Trying to delete non-existant block " + oldBlock);
727          }
728        } else if (newBlocks.length > oldBlocks.length) {
729          // We're adding blocks
730          for (int i = oldBlocks.length; i < newBlocks.length; i++) {
731            Block newBlock = newBlocks[i];
732            BlockInfo newBI;
733            if (!op.shouldCompleteLastBlock()) {
734              // TODO: shouldn't this only be true for the last block?
735              // what about an old-version fsync() where fsync isn't called
736              // until several blocks in?
737              newBI = new BlockInfoUnderConstruction(
738                  newBlock, file.getBlockReplication());
739            } else {
740              // OP_CLOSE should add finalized blocks. This code path
741              // is only executed when loading edits written by prior
742              // versions of Hadoop. Current versions always log
743              // OP_ADD operations as each block is allocated.
744              newBI = new BlockInfo(newBlock, file.getBlockReplication());
745            }
746            fsNamesys.getBlockManager().addBlockCollection(newBI, file);
747            file.addBlock(newBI);
748            fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
749          }
750        }
751      }
752    
753      private static void dumpOpCounts(
754          EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
755        StringBuilder sb = new StringBuilder();
756        sb.append("Summary of operations loaded from edit log:\n  ");
757        Joiner.on("\n  ").withKeyValueSeparator("=").appendTo(sb, opCounts);
758        FSImage.LOG.debug(sb.toString());
759      }
760    
761      private void incrOpCount(FSEditLogOpCodes opCode,
762          EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts, Step step,
763          Counter counter) {
764        Holder<Integer> holder = opCounts.get(opCode);
765        if (holder == null) {
766          holder = new Holder<Integer>(1);
767          opCounts.put(opCode, holder);
768        } else {
769          holder.held++;
770        }
771        counter.increment();
772      }
773    
774      /**
775       * Throw appropriate exception during upgrade from 203, when editlog loading
776       * could fail due to opcode conflicts.
777       */
778      private void check203UpgradeFailure(int logVersion, Throwable e)
779          throws IOException {
780        // 0.20.203 version version has conflicting opcodes with the later releases.
781        // The editlog must be emptied by restarting the namenode, before proceeding
782        // with the upgrade.
783        if (Storage.is203LayoutVersion(logVersion)
784            && logVersion != HdfsConstants.LAYOUT_VERSION) {
785          String msg = "During upgrade failed to load the editlog version "
786              + logVersion + " from release 0.20.203. Please go back to the old "
787              + " release and restart the namenode. This empties the editlog "
788              + " and saves the namespace. Resume the upgrade after this step.";
789          throw new IOException(msg, e);
790        }
791      }
792      
793      /**
794       * Find the last valid transaction ID in the stream.
795       * If there are invalid or corrupt transactions in the middle of the stream,
796       * validateEditLog will skip over them.
797       * This reads through the stream but does not close it.
798       *
799       * @throws IOException if the stream cannot be read due to an IO error (eg
800       *                     if the log does not exist)
801       */
802      static EditLogValidation validateEditLog(EditLogInputStream in) {
803        long lastPos = 0;
804        long lastTxId = HdfsConstants.INVALID_TXID;
805        long numValid = 0;
806        FSEditLogOp op = null;
807        while (true) {
808          lastPos = in.getPosition();
809          try {
810            if ((op = in.readOp()) == null) {
811              break;
812            }
813          } catch (Throwable t) {
814            FSImage.LOG.warn("Caught exception after reading " + numValid +
815                " ops from " + in + " while determining its valid length." +
816                "Position was " + lastPos, t);
817            in.resync();
818            FSImage.LOG.warn("After resync, position is " + in.getPosition());
819            continue;
820          }
821          if (lastTxId == HdfsConstants.INVALID_TXID
822              || op.getTransactionId() > lastTxId) {
823            lastTxId = op.getTransactionId();
824          }
825          numValid++;
826        }
827        return new EditLogValidation(lastPos, lastTxId, false);
828      }
829    
830      static class EditLogValidation {
831        private final long validLength;
832        private final long endTxId;
833        private final boolean hasCorruptHeader;
834    
835        EditLogValidation(long validLength, long endTxId,
836            boolean hasCorruptHeader) {
837          this.validLength = validLength;
838          this.endTxId = endTxId;
839          this.hasCorruptHeader = hasCorruptHeader;
840        }
841    
842        long getValidLength() { return validLength; }
843    
844        long getEndTxId() { return endTxId; }
845    
846        boolean hasCorruptHeader() { return hasCorruptHeader; }
847      }
848    
849      /**
850       * Stream wrapper that keeps track of the current stream position.
851       * 
852       * This stream also allows us to set a limit on how many bytes we can read
853       * without getting an exception.
854       */
855      public static class PositionTrackingInputStream extends FilterInputStream
856          implements StreamLimiter {
857        private long curPos = 0;
858        private long markPos = -1;
859        private long limitPos = Long.MAX_VALUE;
860    
861        public PositionTrackingInputStream(InputStream is) {
862          super(is);
863        }
864    
865        private void checkLimit(long amt) throws IOException {
866          long extra = (curPos + amt) - limitPos;
867          if (extra > 0) {
868            throw new IOException("Tried to read " + amt + " byte(s) past " +
869                "the limit at offset " + limitPos);
870          }
871        }
872        
873        @Override
874        public int read() throws IOException {
875          checkLimit(1);
876          int ret = super.read();
877          if (ret != -1) curPos++;
878          return ret;
879        }
880    
881        @Override
882        public int read(byte[] data) throws IOException {
883          checkLimit(data.length);
884          int ret = super.read(data);
885          if (ret > 0) curPos += ret;
886          return ret;
887        }
888    
889        @Override
890        public int read(byte[] data, int offset, int length) throws IOException {
891          checkLimit(length);
892          int ret = super.read(data, offset, length);
893          if (ret > 0) curPos += ret;
894          return ret;
895        }
896    
897        @Override
898        public void setLimit(long limit) {
899          limitPos = curPos + limit;
900        }
901    
902        @Override
903        public void clearLimit() {
904          limitPos = Long.MAX_VALUE;
905        }
906    
907        @Override
908        public void mark(int limit) {
909          super.mark(limit);
910          markPos = curPos;
911        }
912    
913        @Override
914        public void reset() throws IOException {
915          if (markPos == -1) {
916            throw new IOException("Not marked!");
917          }
918          super.reset();
919          curPos = markPos;
920          markPos = -1;
921        }
922    
923        public long getPos() {
924          return curPos;
925        }
926        
927        @Override
928        public long skip(long amt) throws IOException {
929          long extra = (curPos + amt) - limitPos;
930          if (extra > 0) {
931            throw new IOException("Tried to skip " + extra + " bytes past " +
932                "the limit at offset " + limitPos);
933          }
934          long ret = super.skip(amt);
935          curPos += ret;
936          return ret;
937        }
938      }
939    
940      public long getLastAppliedTxId() {
941        return lastAppliedTxId;
942      }
943    
944      /**
945       * Creates a Step used for updating startup progress, populated with
946       * information from the given edits.  The step always includes the log's name.
947       * If the log has a known length, then the length is included in the step too.
948       * 
949       * @param edits EditLogInputStream to use for populating step
950       * @return Step populated with information from edits
951       * @throws IOException thrown if there is an I/O error
952       */
953      private static Step createStartupProgressStep(EditLogInputStream edits)
954          throws IOException {
955        long length = edits.length();
956        String name = edits.getCurrentStreamName();
957        return length != -1 ? new Step(name, length) : new Step(name);
958      }
959    }