/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.util.Time.now;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllocateBlockIdOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllowSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.BlockListUpdatingOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CreateSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DisallowSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.GetDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ReassignLeaseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOldOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenewDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV1Op;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV2Op;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateBlocksOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp;
import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
import org.apache.hadoop.hdfs.util.Holder;

import com.google.common.base.Joiner;

@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSEditLogLoader {
  static final Log LOG = LogFactory.getLog(FSEditLogLoader.class.getName());
  static long REPLAY_TRANSACTION_LOG_INTERVAL = 1000; // 1sec
  private final FSNamesystem fsNamesys;
  private long lastAppliedTxId;

  public FSEditLogLoader(FSNamesystem fsNamesys, long lastAppliedTxId) {
    this.fsNamesys = fsNamesys;
    this.lastAppliedTxId = lastAppliedTxId;
  }

  /**
   * Load an edit log and apply the changes to the in-memory structure.
   * This is where we apply edits that we've been writing to disk all
   * along.
   */
  long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId,
      MetaRecoveryContext recovery) throws IOException {
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(edits);
    prog.beginStep(Phase.LOADING_EDITS, step);
    fsNamesys.writeLock();
    try {
      long startTime = now();
      FSImage.LOG.info("Start loading edits file " + edits.getName());
      long numEdits = loadEditRecords(edits, false,
          expectedStartingTxId, recovery);
      FSImage.LOG.info("Edits file " + edits.getName()
          + " of size " + edits.length() + " edits # " + numEdits
          + " loaded in " + (now() - startTime) / 1000 + " seconds");
      return numEdits;
    } finally {
      edits.close();
      fsNamesys.writeUnlock();
      prog.endStep(Phase.LOADING_EDITS, step);
    }
  }

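  /**
   * Replay edit records from the stream one at a time, applying each to the
   * in-memory namespace, and return the number of edits applied. In recovery
   * mode, unreadable or out-of-order sections of the log may be skipped after
   * prompting the operator; otherwise such problems are fatal.
   */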
  long loadEditRecords(EditLogInputStream in, boolean closeOnExit,
                      long expectedStartingTxId, MetaRecoveryContext recovery)
      throws IOException {
    FSDirectory fsDir = fsNamesys.dir;

    EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
      new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Acquiring write lock to replay edit log");
    }

    fsNamesys.writeLock();
    fsDir.writeLock();

    long[] recentOpcodeOffsets = new long[4];
    Arrays.fill(recentOpcodeOffsets, -1);

    long expectedTxId = expectedStartingTxId;
    long numEdits = 0;
    long lastTxId = in.getLastTxId();
    long numTxns = (lastTxId - expectedStartingTxId) + 1;
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(in);
    prog.setTotal(Phase.LOADING_EDITS, step, numTxns);
    Counter counter = prog.getCounter(Phase.LOADING_EDITS, step);
    long lastLogTime = now();
    long lastInodeId = fsNamesys.getLastInodeId();

    try {
      while (true) {
        try {
          FSEditLogOp op;
          try {
            op = in.readOp();
            if (op == null) {
              break;
            }
          } catch (Throwable e) {
            // Handle a problem with our input
            check203UpgradeFailure(in.getVersion(), e);
            String errorMessage =
              formatEditLogReplayError(in, recentOpcodeOffsets, expectedTxId);
            FSImage.LOG.error(errorMessage, e);
            if (recovery == null) {
              // We will only try to skip over problematic opcodes when in
              // recovery mode.
              throw new EditLogInputException(errorMessage, e, numEdits);
            }
            MetaRecoveryContext.editLogLoaderPrompt(
                "We failed to read txId " + expectedTxId,
                recovery, "skipping the bad section in the log");
            in.resync();
            continue;
          }
          recentOpcodeOffsets[(int) (numEdits % recentOpcodeOffsets.length)] =
            in.getPosition();
          if (op.hasTransactionId()) {
            if (op.getTransactionId() > expectedTxId) {
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be a gap in the edit log.  We expected txid " +
                  expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery, "ignoring missing " +
                  "transaction IDs");
            } else if (op.getTransactionId() < expectedTxId) {
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be an out-of-order edit in the edit log.  We " +
                  "expected txid " + expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery,
                  "skipping the out-of-order edit");
              continue;
            }
          }
          try {
            long inodeId = applyEditLogOp(op, fsDir, in.getVersion(), lastInodeId);
            if (lastInodeId < inodeId) {
              lastInodeId = inodeId;
            }
          } catch (Throwable e) {
            LOG.error("Encountered exception on operation " + op, e);
            MetaRecoveryContext.editLogLoaderPrompt("Failed to " +
                "apply edit log operation " + op + ": error " +
                e.getMessage(), recovery, "applying edits");
          }
          // Now that the operation has been successfully decoded and
          // applied, update our bookkeeping.
          incrOpCount(op.opCode, opCounts, step, counter);
          if (op.hasTransactionId()) {
            lastAppliedTxId = op.getTransactionId();
            expectedTxId = lastAppliedTxId + 1;
          } else {
            expectedTxId = lastAppliedTxId = expectedStartingTxId;
          }
          // log progress
          if (op.hasTransactionId()) {
            long now = now();
            if (now - lastLogTime > REPLAY_TRANSACTION_LOG_INTERVAL) {
              long deltaTxId = lastAppliedTxId - expectedStartingTxId + 1;
              int percent = Math.round((float) deltaTxId / numTxns * 100);
              LOG.info("replaying edit log: " + deltaTxId + "/" + numTxns
                  + " transactions completed. (" + percent + "%)");
              lastLogTime = now;
            }
          }
          numEdits++;
        } catch (MetaRecoveryContext.RequestStopException e) {
          MetaRecoveryContext.LOG.warn("Stopped reading edit log at " +
              in.getPosition() + "/" + in.length());
          break;
        }
      }
    } finally {
      fsNamesys.resetLastInodeId(lastInodeId);
      if (closeOnExit) {
        in.close();
      }
      fsDir.writeUnlock();
      fsNamesys.writeUnlock();

      if (LOG.isTraceEnabled()) {
        LOG.trace("replaying edit log finished");
      }

      if (FSImage.LOG.isDebugEnabled()) {
        dumpOpCounts(opCounts);
      }
    }
    return numEdits;
  }

  // allocate and update the last allocated inode id
  private long getAndUpdateLastInodeId(long inodeIdFromOp, int logVersion,
      long lastInodeId) throws IOException {
    long inodeId = inodeIdFromOp;

    if (inodeId == INodeId.GRANDFATHER_INODE_ID) {
      if (LayoutVersion.supports(Feature.ADD_INODE_ID, logVersion)) {
        throw new IOException("The layout version " + logVersion
            + " supports inodeId but gave bogus inodeId");
      }
      inodeId = fsNamesys.allocateNewInodeId();
    } else {
      // need to reset lastInodeId; fsnamesys initially gets lastInodeId from
      // the fsimage, but the editlog captures more recent inodeId allocations
      if (inodeId > lastInodeId) {
        fsNamesys.resetLastInodeId(inodeId);
      }
    }
    return inodeId;
  }

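  /**
   * Apply a single edit log operation to the namespace. Returns the inode id
   * consumed by the operation (for OP_ADD, OP_MKDIR, and OP_SYMLINK), or
   * INodeId.GRANDFATHER_INODE_ID when the operation does not allocate one.
   */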
  @SuppressWarnings("deprecation")
  private long applyEditLogOp(FSEditLogOp op, FSDirectory fsDir,
      int logVersion, long lastInodeId) throws IOException {
    long inodeId = INodeId.GRANDFATHER_INODE_ID;
    if (LOG.isTraceEnabled()) {
      LOG.trace("replaying edit log: " + op);
    }
    final boolean toAddRetryCache = fsNamesys.hasRetryCache() && op.hasRpcIds();

    switch (op.opCode) {
    case OP_ADD: {
      AddCloseOp addCloseOp = (AddCloseOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }
      // There are three cases here:
      // 1. OP_ADD to create a new file
      // 2. OP_ADD to update file blocks
      // 3. OP_ADD to open file for append

      // See if the file already exists (persistBlocks call)
      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(
          iip.getINode(0), addCloseOp.path, true);
      INodeFile newFile = oldFile;
      if (oldFile == null) { // this is OP_ADD on a new file (case 1)
        // versions > 0 support per file replication
        // get name and replication
        final short replication = fsNamesys.getBlockManager()
            .adjustReplication(addCloseOp.replication);
        assert addCloseOp.blocks.length == 0;

        // add to the file tree
        inodeId = getAndUpdateLastInodeId(addCloseOp.inodeId, logVersion,
            lastInodeId);
        newFile = fsDir.unprotectedAddFile(inodeId,
            addCloseOp.path, addCloseOp.permissions, replication,
            addCloseOp.mtime, addCloseOp.atime, addCloseOp.blockSize, true,
            addCloseOp.clientName, addCloseOp.clientMachine);
        fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);

        // add the op into retry cache if necessary
        if (toAddRetryCache) {
          HdfsFileStatus stat = fsNamesys.dir.createFileStatus(
              HdfsFileStatus.EMPTY_NAME, newFile, null);
          fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
              addCloseOp.rpcCallId, stat);
        }
      } else { // This is OP_ADD on an existing file
        if (!oldFile.isUnderConstruction()) {
          // This is case 3: a call to append() on an already-closed file.
          if (FSNamesystem.LOG.isDebugEnabled()) {
            FSNamesystem.LOG.debug("Reopening an already-closed file " +
                "for append");
          }
          LocatedBlock lb = fsNamesys.prepareFileForWrite(addCloseOp.path,
              oldFile, addCloseOp.clientName, addCloseOp.clientMachine, null,
              false, iip.getLatestSnapshot(), false);
          newFile = INodeFile.valueOf(fsDir.getINode(addCloseOp.path),
              addCloseOp.path, true);

          // add the op into retry cache if necessary
          if (toAddRetryCache) {
            fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
                addCloseOp.rpcCallId, lb);
          }
        }
      }
      // Fall-through for case 2.
      // Regardless of whether it's a new file or an updated file,
      // update the block list.

      // Update the salient file attributes.
      newFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      newFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, newFile);
      break;
    }
    case OP_CLOSE: {
      AddCloseOp addCloseOp = (AddCloseOp)op;

      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }

      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(iip.getINode(0), addCloseOp.path);

      // Update the salient file attributes.
      oldFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      oldFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, oldFile);

      // Now close the file
      if (!oldFile.isUnderConstruction() &&
          logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
        // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
        // could show up twice in a row. But after that version, this
        // should be fixed, so we should treat it as an error.
        throw new IOException(
            "File is not under construction: " + addCloseOp.path);
      }
      // One might expect that you could use removeLease(holder, path) here,
      // but OP_CLOSE doesn't serialize the holder. So, remove by path.
      if (oldFile.isUnderConstruction()) {
        INodeFileUnderConstruction ucFile = (INodeFileUnderConstruction) oldFile;
        fsNamesys.leaseManager.removeLeaseWithPrefixPath(addCloseOp.path);
        INodeFile newFile = ucFile.toINodeFile(ucFile.getModificationTime());
        fsDir.unprotectedReplaceINodeFile(addCloseOp.path, ucFile, newFile);
      }
      break;
    }
    case OP_UPDATE_BLOCKS: {
      UpdateBlocksOp updateOp = (UpdateBlocksOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + updateOp.path +
            " numblocks : " + updateOp.blocks.length);
      }
      INodeFile oldFile = INodeFile.valueOf(fsDir.getINode(updateOp.path),
          updateOp.path);
      // Update in-memory data structures
      updateBlocks(fsDir, updateOp, oldFile);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(updateOp.rpcClientId, updateOp.rpcCallId);
      }
      break;
    }
    case OP_SET_REPLICATION: {
      SetReplicationOp setReplicationOp = (SetReplicationOp)op;
      short replication = fsNamesys.getBlockManager().adjustReplication(
          setReplicationOp.replication);
      fsDir.unprotectedSetReplication(setReplicationOp.path,
                                      replication, null);
      break;
    }
    case OP_CONCAT_DELETE: {
      ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
      fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
          concatDeleteOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(concatDeleteOp.rpcClientId,
            concatDeleteOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_OLD: {
      RenameOldOp renameOp = (RenameOldOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_DELETE: {
      DeleteOp deleteOp = (DeleteOp)op;
      fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteOp.rpcClientId, deleteOp.rpcCallId);
      }
      break;
    }
    case OP_MKDIR: {
      MkdirOp mkdirOp = (MkdirOp)op;
      inodeId = getAndUpdateLastInodeId(mkdirOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedMkdir(inodeId, mkdirOp.path, mkdirOp.permissions,
                             mkdirOp.timestamp);
      break;
    }
    case OP_SET_GENSTAMP_V1: {
      SetGenstampV1Op setGenstampV1Op = (SetGenstampV1Op)op;
      fsNamesys.setGenerationStampV1(setGenstampV1Op.genStampV1);
      break;
    }
    case OP_SET_PERMISSIONS: {
      SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
      fsDir.unprotectedSetPermission(setPermissionsOp.src,
                                     setPermissionsOp.permissions);
      break;
    }
    case OP_SET_OWNER: {
      SetOwnerOp setOwnerOp = (SetOwnerOp)op;
      fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
                                setOwnerOp.groupname);
      break;
    }
    case OP_SET_NS_QUOTA: {
      SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
      fsDir.unprotectedSetQuota(setNSQuotaOp.src,
                                setNSQuotaOp.nsQuota,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }
    case OP_CLEAR_NS_QUOTA: {
      ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
      fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
                                HdfsConstants.QUOTA_RESET,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }
    case OP_SET_QUOTA: {
      SetQuotaOp setQuotaOp = (SetQuotaOp)op;
      fsDir.unprotectedSetQuota(setQuotaOp.src,
                                setQuotaOp.nsQuota,
                                setQuotaOp.dsQuota);
      break;
    }
    case OP_TIMES: {
      TimesOp timesOp = (TimesOp)op;
      fsDir.unprotectedSetTimes(timesOp.path,
                                timesOp.mtime,
                                timesOp.atime, true);
      break;
    }
    case OP_SYMLINK: {
      SymlinkOp symlinkOp = (SymlinkOp)op;
      inodeId = getAndUpdateLastInodeId(symlinkOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedAddSymlink(inodeId, symlinkOp.path,
                                  symlinkOp.value, symlinkOp.mtime,
                                  symlinkOp.atime, symlinkOp.permissionStatus);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(symlinkOp.rpcClientId, symlinkOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME: {
      RenameOp renameOp = (RenameOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp, renameOp.options);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_GET_DELEGATION_TOKEN: {
      GetDelegationTokenOp getDelegationTokenOp
        = (GetDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .addPersistedDelegationToken(getDelegationTokenOp.token,
                                     getDelegationTokenOp.expiryTime);
      break;
    }
    case OP_RENEW_DELEGATION_TOKEN: {
      RenewDelegationTokenOp renewDelegationTokenOp
        = (RenewDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
                                     renewDelegationTokenOp.expiryTime);
      break;
    }
    case OP_CANCEL_DELEGATION_TOKEN: {
      CancelDelegationTokenOp cancelDelegationTokenOp
        = (CancelDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
          .updatePersistedTokenCancellation(
              cancelDelegationTokenOp.token);
      break;
    }
    case OP_UPDATE_MASTER_KEY: {
      UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedMasterKey(updateMasterKeyOp.key);
      break;
    }
    case OP_REASSIGN_LEASE: {
      ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;

      Lease lease = fsNamesys.leaseManager.getLease(
          reassignLeaseOp.leaseHolder);
      INodeFileUnderConstruction pendingFile =
          INodeFileUnderConstruction.valueOf(
              fsDir.getINode(reassignLeaseOp.path), reassignLeaseOp.path);
      fsNamesys.reassignLeaseInternal(lease,
          reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
      break;
    }
    case OP_START_LOG_SEGMENT:
    case OP_END_LOG_SEGMENT: {
      // no data in here currently.
      break;
    }
    case OP_CREATE_SNAPSHOT: {
      CreateSnapshotOp createSnapshotOp = (CreateSnapshotOp) op;
      String path = fsNamesys.getSnapshotManager().createSnapshot(
          createSnapshotOp.snapshotRoot, createSnapshotOp.snapshotName);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntryWithPayload(createSnapshotOp.rpcClientId,
            createSnapshotOp.rpcCallId, path);
      }
      break;
    }
    case OP_DELETE_SNAPSHOT: {
      DeleteSnapshotOp deleteSnapshotOp = (DeleteSnapshotOp) op;
      BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
      List<INode> removedINodes = new ArrayList<INode>();
      fsNamesys.getSnapshotManager().deleteSnapshot(
          deleteSnapshotOp.snapshotRoot, deleteSnapshotOp.snapshotName,
          collectedBlocks, removedINodes);
      fsNamesys.removeBlocks(collectedBlocks);
      collectedBlocks.clear();
      fsNamesys.dir.removeFromInodeMap(removedINodes);
      removedINodes.clear();

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteSnapshotOp.rpcClientId,
            deleteSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_SNAPSHOT: {
      RenameSnapshotOp renameSnapshotOp = (RenameSnapshotOp) op;
      fsNamesys.getSnapshotManager().renameSnapshot(
          renameSnapshotOp.snapshotRoot, renameSnapshotOp.snapshotOldName,
          renameSnapshotOp.snapshotNewName);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameSnapshotOp.rpcClientId,
            renameSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_ALLOW_SNAPSHOT: {
      AllowSnapshotOp allowSnapshotOp = (AllowSnapshotOp) op;
      fsNamesys.getSnapshotManager().setSnapshottable(
          allowSnapshotOp.snapshotRoot, false);
      break;
    }
    case OP_DISALLOW_SNAPSHOT: {
      DisallowSnapshotOp disallowSnapshotOp = (DisallowSnapshotOp) op;
      fsNamesys.getSnapshotManager().resetSnapshottable(
          disallowSnapshotOp.snapshotRoot);
      break;
    }
    case OP_SET_GENSTAMP_V2: {
      SetGenstampV2Op setGenstampV2Op = (SetGenstampV2Op) op;
      fsNamesys.setGenerationStampV2(setGenstampV2Op.genStampV2);
      break;
    }
    case OP_ALLOCATE_BLOCK_ID: {
      AllocateBlockIdOp allocateBlockIdOp = (AllocateBlockIdOp) op;
      fsNamesys.setLastAllocatedBlockId(allocateBlockIdOp.blockId);
      break;
    }
    default:
      throw new IOException("Invalid operation read " + op.opCode);
    }
    return inodeId;
  }

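  /**
   * Build a diagnostic message for a replay failure, including the current
   * stream position, the expected transaction ID, and the file offsets of the
   * most recently read opcodes.
   */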
  private static String formatEditLogReplayError(EditLogInputStream in,
      long[] recentOpcodeOffsets, long txid) {
    StringBuilder sb = new StringBuilder();
    sb.append("Error replaying edit log at offset ").append(in.getPosition());
    sb.append(".  Expected transaction ID was ").append(txid);
    if (recentOpcodeOffsets[0] != -1) {
      Arrays.sort(recentOpcodeOffsets);
      sb.append("\nRecent opcode offsets:");
      for (long offset : recentOpcodeOffsets) {
        if (offset != -1) {
          sb.append(' ').append(offset);
        }
      }
    }
    return sb.toString();
  }

  /**
   * Update in-memory data structures with new block information.
   * @throws IOException if the new block list is inconsistent with the
   *                     file's current block list
   */
  private void updateBlocks(FSDirectory fsDir, BlockListUpdatingOp op,
      INodeFile file) throws IOException {
    // Update its block list
    BlockInfo[] oldBlocks = file.getBlocks();
    Block[] newBlocks = op.getBlocks();
    String path = op.getPath();

    // Are we only updating the last block's gen stamp?
    boolean isGenStampUpdate = oldBlocks.length == newBlocks.length;

    // First, update blocks in common
    for (int i = 0; i < oldBlocks.length && i < newBlocks.length; i++) {
      BlockInfo oldBlock = oldBlocks[i];
      Block newBlock = newBlocks[i];

      boolean isLastBlock = i == newBlocks.length - 1;
      if (oldBlock.getBlockId() != newBlock.getBlockId() ||
          (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() &&
              !(isGenStampUpdate && isLastBlock))) {
        throw new IOException("Mismatched block IDs or generation stamps, " +
            "attempting to replace block " + oldBlock + " with " + newBlock +
            " as block # " + i + "/" + newBlocks.length + " of " +
            path);
      }

      oldBlock.setNumBytes(newBlock.getNumBytes());
      boolean changeMade =
        oldBlock.getGenerationStamp() != newBlock.getGenerationStamp();
      oldBlock.setGenerationStamp(newBlock.getGenerationStamp());

      if (oldBlock instanceof BlockInfoUnderConstruction &&
          (!isLastBlock || op.shouldCompleteLastBlock())) {
        changeMade = true;
        fsNamesys.getBlockManager().forceCompleteBlock(
            (INodeFileUnderConstruction)file,
            (BlockInfoUnderConstruction)oldBlock);
      }
      if (changeMade) {
        // The state or gen-stamp of the block has changed. So, we may be
        // able to process some messages from datanodes that we previously
        // were unable to process.
        fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
      }
    }

    if (newBlocks.length < oldBlocks.length) {
      // We're removing a block from the file, e.g. abandonBlock(...)
      if (!file.isUnderConstruction()) {
        throw new IOException("Trying to remove a block from file " +
            path + " which is not under construction.");
      }
      if (newBlocks.length != oldBlocks.length - 1) {
        throw new IOException("Trying to remove more than one block from file "
            + path);
      }
      Block oldBlock = oldBlocks[oldBlocks.length - 1];
      boolean removed = fsDir.unprotectedRemoveBlock(path,
          (INodeFileUnderConstruction) file, oldBlock);
      if (!removed && !(op instanceof UpdateBlocksOp)) {
        throw new IOException("Trying to delete non-existent block " + oldBlock);
      }
    } else if (newBlocks.length > oldBlocks.length) {
      // We're adding blocks
      for (int i = oldBlocks.length; i < newBlocks.length; i++) {
        Block newBlock = newBlocks[i];
        BlockInfo newBI;
        if (!op.shouldCompleteLastBlock()) {
          // TODO: shouldn't this only be true for the last block?
          // what about an old-version fsync() where fsync isn't called
          // until several blocks in?
          newBI = new BlockInfoUnderConstruction(
              newBlock, file.getBlockReplication());
        } else {
          // OP_CLOSE should add finalized blocks. This code path
          // is only executed when loading edits written by prior
          // versions of Hadoop. Current versions always log
          // OP_ADD operations as each block is allocated.
          newBI = new BlockInfo(newBlock, file.getBlockReplication());
        }
        fsNamesys.getBlockManager().addBlockCollection(newBI, file);
        file.addBlock(newBI);
        fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
      }
    }
  }

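  /** Log a per-opcode summary of the operations replayed from the edit log. */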
  private static void dumpOpCounts(
      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
    StringBuilder sb = new StringBuilder();
    sb.append("Summary of operations loaded from edit log:\n  ");
    Joiner.on("\n  ").withKeyValueSeparator("=").appendTo(sb, opCounts);
    FSImage.LOG.debug(sb.toString());
  }

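  /**
   * Bump the in-memory count for the given opcode and advance the startup
   * progress counter for this step.
   */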
  private void incrOpCount(FSEditLogOpCodes opCode,
      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts, Step step,
      Counter counter) {
    Holder<Integer> holder = opCounts.get(opCode);
    if (holder == null) {
      holder = new Holder<Integer>(1);
      opCounts.put(opCode, holder);
    } else {
      holder.held++;
    }
    counter.increment();
  }

  /**
   * Throw an appropriate exception during upgrade from 203, when editlog
   * loading could fail due to opcode conflicts.
   */
  private void check203UpgradeFailure(int logVersion, Throwable e)
      throws IOException {
    // The 0.20.203 release has conflicting opcodes with the later releases.
    // The editlog must be emptied by restarting the namenode, before proceeding
    // with the upgrade.
    if (Storage.is203LayoutVersion(logVersion)
        && logVersion != HdfsConstants.LAYOUT_VERSION) {
      String msg = "During upgrade failed to load the editlog version "
          + logVersion + " from release 0.20.203. Please go back to the old "
          + "release and restart the namenode. This empties the editlog "
          + "and saves the namespace. Resume the upgrade after this step.";
      throw new IOException(msg, e);
    }
  }

  /**
   * Find the last valid transaction ID in the stream.
   * If there are invalid or corrupt transactions in the middle of the stream,
   * validateEditLog will skip over them.
   * This reads through the stream but does not close it.
   */
  static EditLogValidation validateEditLog(EditLogInputStream in) {
    long lastPos = 0;
    long lastTxId = HdfsConstants.INVALID_TXID;
    long numValid = 0;
    FSEditLogOp op = null;
    while (true) {
      lastPos = in.getPosition();
      try {
        if ((op = in.readOp()) == null) {
          break;
        }
      } catch (Throwable t) {
        FSImage.LOG.warn("Caught exception after reading " + numValid +
            " ops from " + in + " while determining its valid length. " +
            "Position was " + lastPos, t);
        in.resync();
        FSImage.LOG.warn("After resync, position is " + in.getPosition());
        continue;
      }
      if (lastTxId == HdfsConstants.INVALID_TXID
          || op.getTransactionId() > lastTxId) {
        lastTxId = op.getTransactionId();
      }
      numValid++;
    }
    return new EditLogValidation(lastPos, lastTxId, false);
  }

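  /**
   * Result of scanning an edit log stream: the length of its valid prefix,
   * the highest transaction ID seen, and whether the log header was corrupt.
   */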
  static class EditLogValidation {
    private final long validLength;
    private final long endTxId;
    private final boolean hasCorruptHeader;

    EditLogValidation(long validLength, long endTxId,
        boolean hasCorruptHeader) {
      this.validLength = validLength;
      this.endTxId = endTxId;
      this.hasCorruptHeader = hasCorruptHeader;
    }

    long getValidLength() { return validLength; }

    long getEndTxId() { return endTxId; }

    boolean hasCorruptHeader() { return hasCorruptHeader; }
  }

  /**
   * Stream wrapper that keeps track of the current stream position.
   *
   * This stream also allows us to set a limit on how many bytes we can read
   * without getting an exception.
   */
  public static class PositionTrackingInputStream extends FilterInputStream
      implements StreamLimiter {
    private long curPos = 0;
    private long markPos = -1;
    private long limitPos = Long.MAX_VALUE;

    public PositionTrackingInputStream(InputStream is) {
      super(is);
    }

    private void checkLimit(long amt) throws IOException {
      long extra = (curPos + amt) - limitPos;
      if (extra > 0) {
        throw new IOException("Tried to read " + amt + " byte(s) past " +
            "the limit at offset " + limitPos);
      }
    }

    @Override
    public int read() throws IOException {
      checkLimit(1);
      int ret = super.read();
      if (ret != -1) curPos++;
      return ret;
    }

    @Override
    public int read(byte[] data) throws IOException {
      checkLimit(data.length);
      int ret = super.read(data);
      if (ret > 0) curPos += ret;
      return ret;
    }

    @Override
    public int read(byte[] data, int offset, int length) throws IOException {
      checkLimit(length);
      int ret = super.read(data, offset, length);
      if (ret > 0) curPos += ret;
      return ret;
    }

    @Override
    public void setLimit(long limit) {
      limitPos = curPos + limit;
    }

    @Override
    public void clearLimit() {
      limitPos = Long.MAX_VALUE;
    }

    @Override
    public void mark(int limit) {
      super.mark(limit);
      markPos = curPos;
    }

    @Override
    public void reset() throws IOException {
      if (markPos == -1) {
        throw new IOException("Not marked!");
      }
      super.reset();
      curPos = markPos;
      markPos = -1;
    }

    public long getPos() {
      return curPos;
    }

    @Override
    public long skip(long amt) throws IOException {
      long extra = (curPos + amt) - limitPos;
      if (extra > 0) {
        throw new IOException("Tried to skip " + extra + " bytes past " +
            "the limit at offset " + limitPos);
      }
      long ret = super.skip(amt);
      curPos += ret;
      return ret;
    }
  }

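  /** @return the transaction ID of the last edit applied by this loader. */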
  public long getLastAppliedTxId() {
    return lastAppliedTxId;
  }

  /**
   * Creates a Step used for updating startup progress, populated with
   * information from the given edits.  The step always includes the log's name.
   * If the log has a known length, then the length is included in the step too.
   *
   * @param edits EditLogInputStream to use for populating step
   * @return Step populated with information from edits
   * @throws IOException thrown if there is an I/O error
   */
  private static Step createStartupProgressStep(EditLogInputStream edits)
      throws IOException {
    long length = edits.length();
    String name = edits.getCurrentStreamName();
    return length != -1 ? new Step(name, length) : new Step(name);
  }
}