001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.EOFException;
021import java.io.IOException;
022import java.net.InetSocketAddress;
023import java.nio.ByteBuffer;
024import java.util.AbstractMap;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collection;
028import java.util.EnumSet;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.Iterator;
032import java.util.LinkedList;
033import java.util.List;
034import java.util.Map;
035import java.util.Map.Entry;
036import java.util.Set;
037import java.util.concurrent.Callable;
038import java.util.concurrent.CancellationException;
039import java.util.concurrent.CompletionService;
040import java.util.concurrent.ConcurrentHashMap;
041import java.util.concurrent.ExecutionException;
042import java.util.concurrent.ExecutorCompletionService;
043import java.util.concurrent.Future;
044import java.util.concurrent.TimeUnit;
045import java.util.concurrent.atomic.AtomicBoolean;
046
047import org.apache.commons.io.IOUtils;
048import org.apache.hadoop.classification.InterfaceAudience;
049import org.apache.hadoop.fs.ByteBufferReadable;
050import org.apache.hadoop.fs.ByteBufferUtil;
051import org.apache.hadoop.fs.CanSetDropBehind;
052import org.apache.hadoop.fs.CanSetReadahead;
053import org.apache.hadoop.fs.CanUnbuffer;
054import org.apache.hadoop.fs.ChecksumException;
055import org.apache.hadoop.fs.FSInputStream;
056import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
057import org.apache.hadoop.fs.ReadOption;
058import org.apache.hadoop.fs.StorageType;
059import org.apache.hadoop.fs.UnresolvedLinkException;
060import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
061import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
062import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
063import org.apache.hadoop.fs.FileEncryptionInfo;
064import org.apache.hadoop.hdfs.protocol.LocatedBlock;
065import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
066import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
067import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
068import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
069import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
070import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
071import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
072import org.apache.hadoop.io.ByteBufferPool;
073import org.apache.hadoop.ipc.RPC;
074import org.apache.hadoop.ipc.RemoteException;
075import org.apache.hadoop.ipc.RetriableException;
076import org.apache.hadoop.net.NetUtils;
077import org.apache.hadoop.security.token.SecretManager.InvalidToken;
078import org.apache.hadoop.security.token.Token;
079import org.apache.hadoop.util.IdentityHashStore;
080import org.apache.hadoop.util.StopWatch;
081import org.apache.htrace.Span;
082import org.apache.htrace.Trace;
083import org.apache.htrace.TraceScope;
084
085import com.google.common.annotations.VisibleForTesting;
086
087/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and with various datanodes as necessary.
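 *
 * <p>A minimal usage sketch; {@code conf}, the path and the buffer size below
 * are purely illustrative. Callers normally obtain this stream indirectly
 * through {@code FileSystem#open}, which wraps it in an FSDataInputStream:</p>
 * <pre>
 *   FileSystem fs = FileSystem.get(conf);
 *   try (FSDataInputStream in = fs.open(new Path("/illustrative/file"))) {
 *     byte[] buf = new byte[4096];
 *     int nread = in.read(buf, 0, buf.length);     // stateful read
 *     int pread = in.read(0L, buf, 0, buf.length); // positional read (pread)
 *   }
 * </pre>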
090 ****************************************************************/
091@InterfaceAudience.Private
092public class DFSInputStream extends FSInputStream
093implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
094    HasEnhancedByteBufferAccess, CanUnbuffer {
095  @VisibleForTesting
096  public static boolean tcpReadsDisabledForTesting = false;
097  private long hedgedReadOpsLoopNumForTesting = 0;
098  private final DFSClient dfsClient;
099  private AtomicBoolean closed = new AtomicBoolean(false);
100  private final String src;
101  private final boolean verifyChecksum;
102
  // state used by stateful read only:
  // (protected by lock on this)
  ////
106  private DatanodeInfo currentNode = null;
107  private LocatedBlock currentLocatedBlock = null;
108  private long pos = 0;
109  private long blockEnd = -1;
110  private BlockReader blockReader = null;
111  ////
112
113  // state shared by stateful and positional read:
114  // (protected by lock on infoLock)
115  ////
116  private LocatedBlocks locatedBlocks = null;
117  private long lastBlockBeingWrittenLength = 0;
118  private FileEncryptionInfo fileEncryptionInfo = null;
119  private CachingStrategy cachingStrategy;
120  ////
121
122  private final ReadStatistics readStatistics = new ReadStatistics();
123  // lock for state shared between read and pread
124  // Note: Never acquire a lock on <this> with this lock held to avoid deadlocks
125  //       (it's OK to acquire this lock when the lock on <this> is held)
126  private final Object infoLock = new Object();
127
128  /**
129   * Track the ByteBuffers that we have handed out to readers.
130   * 
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
133   */
134  private IdentityHashStore<ByteBuffer, Object> extendedReadBuffers;
135
136  private synchronized IdentityHashStore<ByteBuffer, Object>
137        getExtendedReadBuffers() {
138    if (extendedReadBuffers == null) {
139      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
140    }
141    return extendedReadBuffers;
142  }
143
144  public static class ReadStatistics {
145    public ReadStatistics() {
146      clear();
147    }
148
149    public ReadStatistics(ReadStatistics rhs) {
150      this.totalBytesRead = rhs.getTotalBytesRead();
151      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
152      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
153      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
154    }
155
156    /**
157     * @return The total bytes read.  This will always be at least as
158     * high as the other numbers, since it includes all of them.
159     */
160    public long getTotalBytesRead() {
161      return totalBytesRead;
162    }
163
164    /**
165     * @return The total local bytes read.  This will always be at least
166     * as high as totalShortCircuitBytesRead, since all short-circuit
167     * reads are also local.
168     */
169    public long getTotalLocalBytesRead() {
170      return totalLocalBytesRead;
171    }
172
173    /**
174     * @return The total short-circuit local bytes read.
175     */
176    public long getTotalShortCircuitBytesRead() {
177      return totalShortCircuitBytesRead;
178    }
179    
180    /**
181     * @return The total number of zero-copy bytes read.
182     */
183    public long getTotalZeroCopyBytesRead() {
184      return totalZeroCopyBytesRead;
185    }
186
187    /**
188     * @return The total number of bytes read which were not local.
189     */
190    public long getRemoteBytesRead() {
191      return totalBytesRead - totalLocalBytesRead;
192    }
193    
194    void addRemoteBytes(long amt) {
195      this.totalBytesRead += amt;
196    }
197
198    void addLocalBytes(long amt) {
199      this.totalBytesRead += amt;
200      this.totalLocalBytesRead += amt;
201    }
202
203    void addShortCircuitBytes(long amt) {
204      this.totalBytesRead += amt;
205      this.totalLocalBytesRead += amt;
206      this.totalShortCircuitBytesRead += amt;
207    }
208
209    void addZeroCopyBytes(long amt) {
210      this.totalBytesRead += amt;
211      this.totalLocalBytesRead += amt;
212      this.totalShortCircuitBytesRead += amt;
213      this.totalZeroCopyBytesRead += amt;
214    }
215
216    void clear() {
217      this.totalBytesRead = 0;
218      this.totalLocalBytesRead = 0;
219      this.totalShortCircuitBytesRead = 0;
220      this.totalZeroCopyBytesRead = 0;
221    }
222    
223    private long totalBytesRead;
224
225    private long totalLocalBytesRead;
226
227    private long totalShortCircuitBytesRead;
228
229    private long totalZeroCopyBytesRead;
230  }
231  
232  /**
233   * This variable tracks the number of failures since the start of the
234   * most recent user-facing operation. That is to say, it should be reset
235   * whenever the user makes a call on this stream, and if at any point
236   * during the retry logic, the failure count exceeds a threshold,
237   * the errors will be thrown back to the operation.
238   *
   * Specifically, this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
242   */
243  private int failures = 0;
244
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly. */
247  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
248             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
249
250  private byte[] oneByteBuf; // used for 'int read()'
251
252  void addToDeadNodes(DatanodeInfo dnInfo) {
253    deadNodes.put(dnInfo, dnInfo);
254  }
255  
256  DFSInputStream(DFSClient dfsClient, String src, boolean verifyChecksum
257                 ) throws IOException, UnresolvedLinkException {
258    this.dfsClient = dfsClient;
259    this.verifyChecksum = verifyChecksum;
260    this.src = src;
261    synchronized (infoLock) {
262      this.cachingStrategy = dfsClient.getDefaultReadCachingStrategy();
263    }
264    openInfo();
265  }
266
267  /**
268   * Grab the open-file info from namenode
269   */
270  void openInfo() throws IOException, UnresolvedLinkException {
271    synchronized(infoLock) {
272      lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
273      int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
274      while (retriesForLastBlockLength > 0) {
        // Getting last block length as -1 is a special case. When the cluster
        // restarts, DNs may not report immediately. At this time partial block
        // locations will not be available with the NN for getting the length.
        // Let's retry a few times to get the length.
279        if (lastBlockBeingWrittenLength == -1) {
280          DFSClient.LOG.warn("Last block locations not available. "
281              + "Datanodes might not have reported blocks completely."
282              + " Will retry for " + retriesForLastBlockLength + " times");
283          waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
284          lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
285        } else {
286          break;
287        }
288        retriesForLastBlockLength--;
289      }
290      if (retriesForLastBlockLength == 0) {
291        throw new IOException("Could not obtain the last block locations.");
292      }
293    }
294  }
295
296  private void waitFor(int waitTime) throws IOException {
297    try {
298      Thread.sleep(waitTime);
299    } catch (InterruptedException e) {
300      throw new IOException(
301          "Interrupted while getting the last block length.");
302    }
303  }
304
305  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
306    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
307    if (DFSClient.LOG.isDebugEnabled()) {
308      DFSClient.LOG.debug("newInfo = " + newInfo);
309    }
310    if (newInfo == null) {
311      throw new IOException("Cannot open filename " + src);
312    }
313
314    if (locatedBlocks != null) {
315      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
316      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
317      while (oldIter.hasNext() && newIter.hasNext()) {
318        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
319          throw new IOException("Blocklist for " + src + " has changed!");
320        }
321      }
322    }
323    locatedBlocks = newInfo;
324    long lastBlockBeingWrittenLength = 0;
325    if (!locatedBlocks.isLastBlockComplete()) {
326      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
327      if (last != null) {
328        if (last.getLocations().length == 0) {
329          if (last.getBlockSize() == 0) {
            // If the length is zero, then no data has been written to the
            // datanode, so there is no need to wait for the locations.
332            return 0;
333          }
334          return -1;
335        }
336        final long len = readBlockLength(last);
337        last.getBlock().setNumBytes(len);
338        lastBlockBeingWrittenLength = len; 
339      }
340    }
341
342    fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo();
343
344    return lastBlockBeingWrittenLength;
345  }
346
347  /** Read the block length from one of the datanodes. */
348  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
349    assert locatedblock != null : "LocatedBlock cannot be null";
350    int replicaNotFoundCount = locatedblock.getLocations().length;
351
352    final int timeout = dfsClient.getConf().socketTimeout;
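    // Datanodes that throw a RetriableException are queued in retryList and
    // retried after a short delay, until the socket timeout elapses; datanodes
    // that report ReplicaNotFoundException simply decrement replicaNotFoundCount.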
353    LinkedList<DatanodeInfo> nodeList = new LinkedList<DatanodeInfo>(
354        Arrays.asList(locatedblock.getLocations()));
355    LinkedList<DatanodeInfo> retryList = new LinkedList<DatanodeInfo>();
356    boolean isRetry = false;
357    StopWatch sw = new StopWatch();
358    while (nodeList.size() > 0) {
359      DatanodeInfo datanode = nodeList.pop();
360      ClientDatanodeProtocol cdp = null;
361      try {
362        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
363            dfsClient.getConfiguration(), timeout,
364            dfsClient.getConf().connectToDnViaHostname, locatedblock);
365        
366        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
367        
368        if (n >= 0) {
369          return n;
370        }
371      } catch (IOException ioe) {
372        if (ioe instanceof RemoteException) {
373          if (((RemoteException) ioe).unwrapRemoteException() instanceof
374              ReplicaNotFoundException) {
375            // replica is not on the DN. We will treat it as 0 length
376            // if no one actually has a replica.
377            replicaNotFoundCount--;
378          } else if (((RemoteException) ioe).unwrapRemoteException() instanceof
379              RetriableException) {
380            // add to the list to be retried if necessary.
381            retryList.add(datanode);
382          }
383        }
384        
385        if (DFSClient.LOG.isDebugEnabled()) {
386          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
387              + datanode + " for block " + locatedblock.getBlock(), ioe);
388        }
389      } finally {
390        if (cdp != null) {
391          RPC.stopProxy(cdp);
392        }
393      }
394
395      // Ran out of nodes, but there are retriable nodes.
396      if (nodeList.size() == 0 && retryList.size() > 0) {
397        nodeList.addAll(retryList);
398        retryList.clear();
399        isRetry = true;
400      }
401
402      if (isRetry) {
403        // start the stop watch if not already running.
404        if (!sw.isRunning()) {
405          sw.start();
406        }
407        try {
408          Thread.sleep(500); // delay between retries.
409        } catch (InterruptedException e) {
410          throw new IOException("Interrupted while getting the length.");
411        }
412      }
413
414      // see if we ran out of retry time
415      if (sw.isRunning() && sw.now(TimeUnit.MILLISECONDS) > timeout) {
416        break;
417      }
418    }
419
    // The namenode told us about these locations, but none of them knows about
    // the replica. This means we hit the race between pipeline creation start
    // and end. We require every location to have reported ReplicaNotFoundException,
    // because some other exception could have happened on a DN that has the
    // replica, and we want to report that error instead.
424    if (replicaNotFoundCount == 0) {
425      return 0;
426    }
427
428    throw new IOException("Cannot obtain block length for " + locatedblock);
429  }
430  
431  public long getFileLength() {
432    synchronized(infoLock) {
433      return locatedBlocks == null? 0:
434          locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
435    }
436  }
437
438  // Short circuit local reads are forbidden for files that are
439  // under construction.  See HDFS-2757.
440  boolean shortCircuitForbidden() {
441    synchronized(infoLock) {
442      return locatedBlocks.isUnderConstruction();
443    }
444  }
445
446  /**
447   * Returns the datanode from which the stream is currently reading.
448   */
449  public synchronized DatanodeInfo getCurrentDatanode() {
450    return currentNode;
451  }
452
453  /**
454   * Returns the block containing the target position. 
455   */
456  synchronized public ExtendedBlock getCurrentBlock() {
457    if (currentLocatedBlock == null){
458      return null;
459    }
460    return currentLocatedBlock.getBlock();
461  }
462
463  /**
   * Return the collection of blocks that have already been located.
465   */
466  public List<LocatedBlock> getAllBlocks() throws IOException {
467    return getBlockRange(0, getFileLength());
468  }
469
470  /**
471   * Get block at the specified position.
472   * Fetch it from the namenode if not cached.
473   * 
474   * @param offset block corresponding to this offset in file is returned
475   * @return located block
476   * @throws IOException
477   */
478  private LocatedBlock getBlockAt(long offset) throws IOException {
479    synchronized(infoLock) {
480      assert (locatedBlocks != null) : "locatedBlocks is null";
481
482      final LocatedBlock blk;
483
484      //check offset
485      if (offset < 0 || offset >= getFileLength()) {
486        throw new IOException("offset < 0 || offset >= getFileLength(), offset="
487            + offset
488            + ", locatedBlocks=" + locatedBlocks);
489      }
490      else if (offset >= locatedBlocks.getFileLength()) {
491        // offset to the portion of the last block,
492        // which is not known to the name-node yet;
493        // getting the last block
494        blk = locatedBlocks.getLastLocatedBlock();
495      }
496      else {
497        // search cached blocks first
498        int targetBlockIdx = locatedBlocks.findBlock(offset);
499        if (targetBlockIdx < 0) { // block is not cached
500          targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
501          // fetch more blocks
502          final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
503          assert (newBlocks != null) : "Could not find target position " + offset;
504          locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
505        }
506        blk = locatedBlocks.get(targetBlockIdx);
507      }
508      return blk;
509    }
510  }
511
512  /** Fetch a block from namenode and cache it */
513  private void fetchBlockAt(long offset) throws IOException {
514    synchronized(infoLock) {
515      int targetBlockIdx = locatedBlocks.findBlock(offset);
516      if (targetBlockIdx < 0) { // block is not cached
517        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
518      }
519      // fetch blocks
520      final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
521      if (newBlocks == null) {
522        throw new IOException("Could not find target position " + offset);
523      }
524      locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
525    }
526  }
527
528  /**
529   * Get blocks in the specified range.
530   * Fetch them from the namenode if not cached. This function
531   * will not get a read request beyond the EOF.
532   * @param offset starting offset in file
533   * @param length length of data
   * @return consecutive segment of located blocks
535   * @throws IOException
536   */
537  private List<LocatedBlock> getBlockRange(long offset,
538      long length)  throws IOException {
539    // getFileLength(): returns total file length
540    // locatedBlocks.getFileLength(): returns length of completed blocks
541    if (offset >= getFileLength()) {
542      throw new IOException("Offset: " + offset +
543        " exceeds file length: " + getFileLength());
544    }
545    synchronized(infoLock) {
546      final List<LocatedBlock> blocks;
547      final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
548      final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
549      final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
550
551      if (readOffsetWithinCompleteBlk) {
552        //get the blocks of finalized (completed) block range
553        blocks = getFinalizedBlockRange(offset,
554          Math.min(length, lengthOfCompleteBlk - offset));
555      } else {
556        blocks = new ArrayList<LocatedBlock>(1);
557      }
558
559      // get the blocks from incomplete block range
560      if (readLengthPastCompleteBlk) {
561         blocks.add(locatedBlocks.getLastLocatedBlock());
562      }
563
564      return blocks;
565    }
566  }
567
568  /**
569   * Get blocks in the specified range.
570   * Includes only the complete blocks.
571   * Fetch them from the namenode if not cached.
572   */
573  private List<LocatedBlock> getFinalizedBlockRange(
574      long offset, long length) throws IOException {
575    synchronized(infoLock) {
576      assert (locatedBlocks != null) : "locatedBlocks is null";
577      List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
578      // search cached blocks first
579      int blockIdx = locatedBlocks.findBlock(offset);
580      if (blockIdx < 0) { // block is not cached
581        blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
582      }
583      long remaining = length;
584      long curOff = offset;
585      while(remaining > 0) {
586        LocatedBlock blk = null;
587        if(blockIdx < locatedBlocks.locatedBlockCount())
588          blk = locatedBlocks.get(blockIdx);
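        // Cache miss, or a gap before the cached block: fetch more blocks from
        // the namenode starting at curOff, merge them into the cache, and
        // retry the same offset.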
589        if (blk == null || curOff < blk.getStartOffset()) {
590          LocatedBlocks newBlocks;
591          newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
592          locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
593          continue;
594        }
595        assert curOff >= blk.getStartOffset() : "Block not found";
596        blockRange.add(blk);
597        long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
598        remaining -= bytesRead;
599        curOff += bytesRead;
600        blockIdx++;
601      }
602      return blockRange;
603    }
604  }
605
606  /**
   * Open a BlockReader to a DataNode so that the block can be read.
   * We get the block ID and the IDs of the destinations from the namenode.
609   */
610  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
611    if (target >= getFileLength()) {
612      throw new IOException("Attempted to read past end of file");
613    }
614
615    // Will be getting a new BlockReader.
616    closeCurrentBlockReader();
617
618    //
619    // Connect to best DataNode for desired Block, with potential offset
620    //
621    DatanodeInfo chosenNode = null;
622    int refetchToken = 1; // only need to get a new access token once
623    int refetchEncryptionKey = 1; // only need to get a new encryption key once
624    
625    boolean connectFailedOnce = false;
626
627    while (true) {
628      //
629      // Compute desired block
630      //
631      LocatedBlock targetBlock = getBlockAt(target);
632
633      // update current position
634      this.pos = target;
635      this.blockEnd = targetBlock.getStartOffset() +
636            targetBlock.getBlockSize() - 1;
637      this.currentLocatedBlock = targetBlock;
638
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
640      long offsetIntoBlock = target - targetBlock.getStartOffset();
641
642      DNAddrPair retval = chooseDataNode(targetBlock, null);
643      chosenNode = retval.info;
644      InetSocketAddress targetAddr = retval.addr;
645      StorageType storageType = retval.storageType;
646
647      try {
648        ExtendedBlock blk = targetBlock.getBlock();
649        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
650        CachingStrategy curCachingStrategy;
651        boolean shortCircuitForbidden;
652        synchronized(infoLock) {
653          curCachingStrategy = cachingStrategy;
654          shortCircuitForbidden = shortCircuitForbidden();
655        }
656        blockReader = new BlockReaderFactory(dfsClient.getConf()).
657            setInetSocketAddress(targetAddr).
658            setRemotePeerFactory(dfsClient).
659            setDatanodeInfo(chosenNode).
660            setStorageType(storageType).
661            setFileName(src).
662            setBlock(blk).
663            setBlockToken(accessToken).
664            setStartOffset(offsetIntoBlock).
665            setVerifyChecksum(verifyChecksum).
666            setClientName(dfsClient.clientName).
667            setLength(blk.getNumBytes() - offsetIntoBlock).
668            setCachingStrategy(curCachingStrategy).
669            setAllowShortCircuitLocalReads(!shortCircuitForbidden).
670            setClientCacheContext(dfsClient.getClientContext()).
671            setUserGroupInformation(dfsClient.ugi).
672            setConfiguration(dfsClient.getConfiguration()).
673            build();
674        if(connectFailedOnce) {
675          DFSClient.LOG.info("Successfully connected to " + targetAddr +
676                             " for " + blk);
677        }
678        return chosenNode;
679      } catch (IOException ex) {
680        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
681          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
682              + "encryption key was invalid when connecting to " + targetAddr
683              + " : " + ex);
684          // The encryption key used is invalid.
685          refetchEncryptionKey--;
686          dfsClient.clearDataEncryptionKey();
687        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
688          refetchToken--;
689          fetchBlockAt(target);
690        } else {
691          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block "
            + targetBlock.getBlock() + ", add to deadNodes and continue. " + ex, ex);
694          // Put chosen node into dead list, continue
695          addToDeadNodes(chosenNode);
696        }
697      }
698    }
699  }
700
701  /**
702   * Close it down!
703   */
704  @Override
705  public synchronized void close() throws IOException {
706    if (!closed.compareAndSet(false, true)) {
707      DFSClient.LOG.debug("DFSInputStream has been closed already");
708      return;
709    }
710    dfsClient.checkOpen();
711
712    if ((extendedReadBuffers != null) && (!extendedReadBuffers.isEmpty())) {
713      final StringBuilder builder = new StringBuilder();
714      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
715        private String prefix = "";
716        @Override
717        public void accept(ByteBuffer k, Object v) {
718          builder.append(prefix).append(k);
719          prefix = ", ";
720        }
721      });
722      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
723          "unreleased ByteBuffers allocated by read().  " +
724          "Please release " + builder.toString() + ".");
725    }
726    closeCurrentBlockReader();
727    super.close();
728  }
729
730  @Override
731  public synchronized int read() throws IOException {
732    if (oneByteBuf == null) {
733      oneByteBuf = new byte[1];
734    }
735    int ret = read( oneByteBuf, 0, 1 );
736    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
737  }
738
739  /**
740   * Wraps different possible read implementations so that readBuffer can be
741   * strategy-agnostic.
742   */
743  private interface ReaderStrategy {
744    public int doRead(BlockReader blockReader, int off, int len)
745        throws ChecksumException, IOException;
746  }
747
748  private void updateReadStatistics(ReadStatistics readStatistics, 
749        int nRead, BlockReader blockReader) {
750    if (nRead <= 0) return;
751    synchronized(infoLock) {
752      if (blockReader.isShortCircuit()) {
753        readStatistics.addShortCircuitBytes(nRead);
754      } else if (blockReader.isLocal()) {
755        readStatistics.addLocalBytes(nRead);
756      } else {
757        readStatistics.addRemoteBytes(nRead);
758      }
759    }
760  }
761  
762  /**
763   * Used to read bytes into a byte[]
764   */
765  private class ByteArrayStrategy implements ReaderStrategy {
766    final byte[] buf;
767
768    public ByteArrayStrategy(byte[] buf) {
769      this.buf = buf;
770    }
771
772    @Override
773    public int doRead(BlockReader blockReader, int off, int len)
774          throws ChecksumException, IOException {
775      int nRead = blockReader.read(buf, off, len);
776      updateReadStatistics(readStatistics, nRead, blockReader);
777      return nRead;
778    }
779  }
780
781  /**
782   * Used to read bytes into a user-supplied ByteBuffer
783   */
784  private class ByteBufferStrategy implements ReaderStrategy {
785    final ByteBuffer buf;
786    ByteBufferStrategy(ByteBuffer buf) {
787      this.buf = buf;
788    }
789
790    @Override
791    public int doRead(BlockReader blockReader, int off, int len)
792        throws ChecksumException, IOException {
793      int oldpos = buf.position();
794      int oldlimit = buf.limit();
795      boolean success = false;
796      try {
797        int ret = blockReader.read(buf);
798        success = true;
799        updateReadStatistics(readStatistics, ret, blockReader);
800        return ret;
801      } finally {
802        if (!success) {
803          // Reset to original state so that retries work correctly.
804          buf.position(oldpos);
805          buf.limit(oldlimit);
806        }
807      } 
808    }
809  }
810
  /* This is used by regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
815  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
816      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
817      throws IOException {
818    IOException ioe;
819    
    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or client: when the DataNode closes the
     * connection because the client has been idle. If there are other cases of
     * "non-errors" then a datanode might be retried by setting this to true
     * again.
     */
826    boolean retryCurrentNode = true;
827
828    while (true) {
829      // retry as many times as seekToNewSource allows.
830      try {
831        return reader.doRead(blockReader, off, len);
832      } catch ( ChecksumException ce ) {
833        DFSClient.LOG.warn("Found Checksum error for "
834            + getCurrentBlock() + " from " + currentNode
835            + " at " + ce.getPos());        
836        ioe = ce;
837        retryCurrentNode = false;
838        // we want to remember which block replicas we have tried
839        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
840            corruptedBlockMap);
841      } catch ( IOException e ) {
842        if (!retryCurrentNode) {
843          DFSClient.LOG.warn("Exception while reading from "
844              + getCurrentBlock() + " of " + src + " from "
845              + currentNode, e);
846        }
847        ioe = e;
848      }
849      boolean sourceFound = false;
850      if (retryCurrentNode) {
851        /* possibly retry the same node so that transient errors don't
852         * result in application level failures (e.g. Datanode could have
853         * closed the connection because the client is idle for too long).
854         */ 
855        sourceFound = seekToBlockSource(pos);
856      } else {
857        addToDeadNodes(currentNode);
858        sourceFound = seekToNewSource(pos);
859      }
860      if (!sourceFound) {
861        throw ioe;
862      }
863      retryCurrentNode = false;
864    }
865  }
866
867  private synchronized int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
868    dfsClient.checkOpen();
869    if (closed.get()) {
870      throw new IOException("Stream closed");
871    }
872    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
873      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
874    failures = 0;
875    if (pos < getFileLength()) {
876      int retries = 2;
877      while (retries > 0) {
878        try {
879          // currentNode can be left as null if previous read had a checksum
880          // error on the same block. See HDFS-3067
881          if (pos > blockEnd || currentNode == null) {
882            currentNode = blockSeekTo(pos);
883          }
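          // Clamp the read to the end of the current block and, when the last
          // block is complete, to the known length of the completed blocks.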
884          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
885          synchronized(infoLock) {
886            if (locatedBlocks.isLastBlockComplete()) {
887              realLen = (int) Math.min(realLen,
888                  locatedBlocks.getFileLength() - pos);
889            }
890          }
891          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
892          
893          if (result >= 0) {
894            pos += result;
895          } else {
            // got an EOS from the reader though we expect more data from it.
897            throw new IOException("Unexpected EOS from the reader");
898          }
899          if (dfsClient.stats != null) {
900            dfsClient.stats.incrementBytesRead(result);
901          }
902          return result;
903        } catch (ChecksumException ce) {
904          throw ce;            
905        } catch (IOException e) {
906          if (retries == 1) {
907            DFSClient.LOG.warn("DFS Read", e);
908          }
909          blockEnd = -1;
910          if (currentNode != null) { addToDeadNodes(currentNode); }
911          if (--retries == 0) {
912            throw e;
913          }
914        } finally {
          // Check whether we need to report block replica corruption, whether
          // the read was successful or a ChecksumException occurred.
917          reportCheckSumFailure(corruptedBlockMap, 
918              currentLocatedBlock.getLocations().length);
919        }
920      }
921    }
922    return -1;
923  }
924
925  /**
   * Read up to {@code len} bytes into {@code buf}, starting at offset {@code off}.
927   */
928  @Override
929  public synchronized int read(final byte buf[], int off, int len) throws IOException {
930    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
931    TraceScope scope =
932        dfsClient.getPathTraceScope("DFSInputStream#byteArrayRead", src);
933    try {
934      return readWithStrategy(byteArrayReader, off, len);
935    } finally {
936      scope.close();
937    }
938  }
939
940  @Override
941  public synchronized int read(final ByteBuffer buf) throws IOException {
942    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
943    TraceScope scope =
944        dfsClient.getPathTraceScope("DFSInputStream#byteBufferRead", src);
945    try {
946      return readWithStrategy(byteBufferReader, 0, buf.remaining());
947    } finally {
948      scope.close();
949    }
950  }
951
952
953  /**
954   * Add corrupted block replica into map.
955   */
956  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
957      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
958    Set<DatanodeInfo> dnSet = null;
    if (corruptedBlockMap.containsKey(blk)) {
      dnSet = corruptedBlockMap.get(blk);
    } else {
962      dnSet = new HashSet<DatanodeInfo>();
963    }
964    if (!dnSet.contains(node)) {
965      dnSet.add(node);
966      corruptedBlockMap.put(blk, dnSet);
967    }
968  }
969
970  private DNAddrPair chooseDataNode(LocatedBlock block,
971      Collection<DatanodeInfo> ignoredNodes) throws IOException {
972    while (true) {
973      try {
974        return getBestNodeDNAddrPair(block, ignoredNodes);
975      } catch (IOException ie) {
976        String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
977          deadNodes, ignoredNodes);
978        String blockInfo = block.getBlock() + " file=" + src;
979        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
980          String description = "Could not obtain block: " + blockInfo;
981          DFSClient.LOG.warn(description + errMsg
982              + ". Throwing a BlockMissingException");
983          throw new BlockMissingException(src, description,
984              block.getStartOffset());
985        }
986
987        DatanodeInfo[] nodes = block.getLocations();
988        if (nodes == null || nodes.length == 0) {
989          DFSClient.LOG.info("No node available for " + blockInfo);
990        }
991        DFSClient.LOG.info("Could not obtain " + block.getBlock()
992            + " from any node: " + ie + errMsg
993            + ". Will get new block locations from namenode and retry...");
994        try {
995          // Introducing a random factor to the wait time before another retry.
996          // The wait time is dependent on # of failures and a random factor.
997          // At the first time of getting a BlockMissingException, the wait time
998          // is a random number between 0..3000 ms. If the first retry
999          // still fails, we will wait 3000 ms grace period before the 2nd retry.
1000          // Also at the second retry, the waiting window is expanded to 6000 ms
1001          // alleviating the request rate from the server. Similarly the 3rd retry
1002          // will wait 6000ms grace period before retry and the waiting window is
1003          // expanded to 9000ms. 
1004          final int timeWindow = dfsClient.getConf().timeWindow;
1005          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
1006            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
1007          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
1008          Thread.sleep((long)waitTime);
1009        } catch (InterruptedException iex) {
1010        }
1011        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
1012        openInfo();
1013        block = getBlockAt(block.getStartOffset());
1014        failures++;
1015        continue;
1016      }
1017    }
1018  }
1019
1020  /**
1021   * Get the best node from which to stream the data.
1022   * @param block LocatedBlock, containing nodes in priority order.
1023   * @param ignoredNodes Do not choose nodes in this array (may be null)
1024   * @return The DNAddrPair of the best node.
1025   * @throws IOException
1026   */
1027  private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
1028      Collection<DatanodeInfo> ignoredNodes) throws IOException {
1029    DatanodeInfo[] nodes = block.getLocations();
1030    StorageType[] storageTypes = block.getStorageTypes();
1031    DatanodeInfo chosenNode = null;
1032    StorageType storageType = null;
1033    if (nodes != null) {
1034      for (int i = 0; i < nodes.length; i++) {
1035        if (!deadNodes.containsKey(nodes[i])
1036            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
1037          chosenNode = nodes[i];
1038          // Storage types are ordered to correspond with nodes, so use the same
1039          // index to get storage type.
1040          if (storageTypes != null && i < storageTypes.length) {
1041            storageType = storageTypes[i];
1042          }
1043          break;
1044        }
1045      }
1046    }
1047    if (chosenNode == null) {
1048      throw new IOException("No live nodes contain block " + block.getBlock() +
1049          " after checking nodes = " + Arrays.toString(nodes) +
1050          ", ignoredNodes = " + ignoredNodes);
1051    }
1052    final String dnAddr =
1053        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
1054    if (DFSClient.LOG.isDebugEnabled()) {
1055      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
1056    }
1057    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
1058    return new DNAddrPair(chosenNode, targetAddr, storageType);
1059  }
1060
1061  private static String getBestNodeDNAddrPairErrorString(
1062      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
1063      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
1064    StringBuilder errMsgr = new StringBuilder(
1065        " No live nodes contain current block ");
1066    errMsgr.append("Block locations:");
1067    for (DatanodeInfo datanode : nodes) {
1068      errMsgr.append(" ");
1069      errMsgr.append(datanode.toString());
1070    }
1071    errMsgr.append(" Dead nodes: ");
1072    for (DatanodeInfo datanode : deadNodes.keySet()) {
1073      errMsgr.append(" ");
1074      errMsgr.append(datanode.toString());
1075    }
1076    if (ignoredNodes != null) {
1077      errMsgr.append(" Ignored nodes: ");
1078      for (DatanodeInfo datanode : ignoredNodes) {
1079        errMsgr.append(" ");
1080        errMsgr.append(datanode.toString());
1081      }
1082    }
1083    return errMsgr.toString();
1084  }
1085
1086  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
1087      byte[] buf, int offset,
1088      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1089      throws IOException {
1090    block = getBlockAt(block.getStartOffset());
1091    while (true) {
1092      DNAddrPair addressPair = chooseDataNode(block, null);
1093      try {
1094        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
1095            corruptedBlockMap);
1096        return;
1097      } catch (IOException e) {
1098        // Ignore. Already processed inside the function.
1099        // Loop through to try the next node.
1100      }
1101    }
1102  }
1103
1104  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
1105      final LocatedBlock block, final long start, final long end,
1106      final ByteBuffer bb,
1107      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
1108      final int hedgedReadId) {
1109    final Span parentSpan = Trace.currentSpan();
1110    return new Callable<ByteBuffer>() {
1111      @Override
1112      public ByteBuffer call() throws Exception {
1113        byte[] buf = bb.array();
1114        int offset = bb.position();
1115        TraceScope scope =
1116            Trace.startSpan("hedgedRead" + hedgedReadId, parentSpan);
1117        try {
1118          actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
1119              corruptedBlockMap);
1120          return bb;
1121        } finally {
1122          scope.close();
1123        }
1124      }
1125    };
1126  }
1127
1128  private void actualGetFromOneDataNode(final DNAddrPair datanode,
1129      LocatedBlock block, final long start, final long end, byte[] buf,
1130      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1131      throws IOException {
1132    DFSClientFaultInjector.get().startFetchFromDatanode();
1133    int refetchToken = 1; // only need to get a new access token once
1134    int refetchEncryptionKey = 1; // only need to get a new encryption key once
1135
1136    while (true) {
1137      // cached block locations may have been updated by chooseDataNode()
1138      // or fetchBlockAt(). Always get the latest list of locations at the
1139      // start of the loop.
1140      CachingStrategy curCachingStrategy;
1141      boolean allowShortCircuitLocalReads;
1142      block = getBlockAt(block.getStartOffset());
1143      synchronized(infoLock) {
1144        curCachingStrategy = cachingStrategy;
1145        allowShortCircuitLocalReads = !shortCircuitForbidden();
1146      }
1147      DatanodeInfo chosenNode = datanode.info;
1148      InetSocketAddress targetAddr = datanode.addr;
1149      StorageType storageType = datanode.storageType;
1150      BlockReader reader = null;
1151
1152      try {
1153        DFSClientFaultInjector.get().fetchFromDatanodeException();
1154        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
1155        int len = (int) (end - start + 1);
1156        reader = new BlockReaderFactory(dfsClient.getConf()).
1157            setInetSocketAddress(targetAddr).
1158            setRemotePeerFactory(dfsClient).
1159            setDatanodeInfo(chosenNode).
1160            setStorageType(storageType).
1161            setFileName(src).
1162            setBlock(block.getBlock()).
1163            setBlockToken(blockToken).
1164            setStartOffset(start).
1165            setVerifyChecksum(verifyChecksum).
1166            setClientName(dfsClient.clientName).
1167            setLength(len).
1168            setCachingStrategy(curCachingStrategy).
1169            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
1170            setClientCacheContext(dfsClient.getClientContext()).
1171            setUserGroupInformation(dfsClient.ugi).
1172            setConfiguration(dfsClient.getConfiguration()).
1173            build();
1174        int nread = reader.readAll(buf, offset, len);
1175        updateReadStatistics(readStatistics, nread, reader);
1176
1177        if (nread != len) {
1178          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
1180        }
1181        DFSClientFaultInjector.get().readFromDatanodeDelay();
1182        return;
1183      } catch (ChecksumException e) {
1184        String msg = "fetchBlockByteRange(). Got a checksum exception for "
1185            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
1186            + chosenNode;
1187        DFSClient.LOG.warn(msg);
1188        // we want to remember what we have tried
1189        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
1190        addToDeadNodes(chosenNode);
1191        throw new IOException(msg);
1192      } catch (IOException e) {
1193        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
1194          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
1195              + "encryption key was invalid when connecting to " + targetAddr
1196              + " : " + e);
1197          // The encryption key used is invalid.
1198          refetchEncryptionKey--;
1199          dfsClient.clearDataEncryptionKey();
1200          continue;
1201        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
1202          refetchToken--;
1203          try {
1204            fetchBlockAt(block.getStartOffset());
1205          } catch (IOException fbae) {
1206            // ignore IOE, since we can retry it later in a loop
1207          }
1208          continue;
1209        } else {
1210          String msg = "Failed to connect to " + targetAddr + " for file "
1211              + src + " for block " + block.getBlock() + ":" + e;
1212          DFSClient.LOG.warn("Connection failure: " + msg, e);
1213          addToDeadNodes(chosenNode);
1214          throw new IOException(msg);
1215        }
1216      } finally {
1217        if (reader != null) {
1218          reader.close();
1219        }
1220      }
1221    }
1222  }
1223
1224  /**
1225   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
1226   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time.  We then wait on whichever read returns first.
1229   */
1230  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
1231      long end, byte[] buf, int offset,
1232      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1233      throws IOException {
1234    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
1235    CompletionService<ByteBuffer> hedgedService =
1236        new ExecutorCompletionService<ByteBuffer>(
1237        dfsClient.getHedgedReadsThreadPool());
1238    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
1239    ByteBuffer bb = null;
1240    int len = (int) (end - start + 1);
1241    int hedgedReadId = 0;
1242    block = getBlockAt(block.getStartOffset());
1243    while (true) {
1244      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
1245      hedgedReadOpsLoopNumForTesting++;
1246      DNAddrPair chosenNode = null;
1247      // there is no request already executing.
1248      if (futures.isEmpty()) {
1249        // chooseDataNode is a commitment. If no node, we go to
        // the NN to re-fetch block locations. Only go here on first read.
1251        chosenNode = chooseDataNode(block, ignored);
1252        bb = ByteBuffer.allocate(len);
1253        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1254            chosenNode, block, start, end, bb, corruptedBlockMap,
1255            hedgedReadId++);
1256        Future<ByteBuffer> firstRequest = hedgedService
1257            .submit(getFromDataNodeCallable);
1258        futures.add(firstRequest);
1259        try {
1260          Future<ByteBuffer> future = hedgedService.poll(
1261              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
1262          if (future != null) {
1263            ByteBuffer result = future.get();
1264            System.arraycopy(result.array(), result.position(), buf, offset,
1265                len);
1266            return;
1267          }
1268          if (DFSClient.LOG.isDebugEnabled()) {
1269            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
1270                + "ms to read from " + chosenNode.info
1271                + "; spawning hedged read");
1272          }
1273          // Ignore this node on next go around.
1274          ignored.add(chosenNode.info);
1275          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1276          continue; // no need to refresh block locations
1277        } catch (InterruptedException e) {
1278          // Ignore
1279        } catch (ExecutionException e) {
1280          // Ignore already logged in the call.
1281        }
1282      } else {
1283        // We are starting up a 'hedged' read. We have a read already
1284        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
1285        // If no nodes to do hedged reads against, pass.
1286        try {
1287          try {
1288            chosenNode = getBestNodeDNAddrPair(block, ignored);
1289          } catch (IOException ioe) {
1290            chosenNode = chooseDataNode(block, ignored);
1291          }
1292          bb = ByteBuffer.allocate(len);
1293          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1294              chosenNode, block, start, end, bb, corruptedBlockMap,
1295              hedgedReadId++);
1296          Future<ByteBuffer> oneMoreRequest = hedgedService
1297              .submit(getFromDataNodeCallable);
1298          futures.add(oneMoreRequest);
1299        } catch (IOException ioe) {
1300          if (DFSClient.LOG.isDebugEnabled()) {
1301            DFSClient.LOG.debug("Failed getting node for hedged read: "
1302                + ioe.getMessage());
1303          }
1304        }
        // Whether or not a new hedged request was submitted above, wait for
        // the fastest of the outstanding reads to complete.
1307        try {
1308          ByteBuffer result = getFirstToComplete(hedgedService, futures);
1309          // cancel the rest.
1310          cancelAll(futures);
1311          dfsClient.getHedgedReadMetrics().incHedgedReadWins();
1312          System.arraycopy(result.array(), result.position(), buf, offset,
1313              len);
1314          return;
1315        } catch (InterruptedException ie) {
1316          // Ignore and retry
1317        }
1318        // We got here if exception. Ignore this node on next go around IFF
1319        // we found a chosenNode to hedge read against.
1320        if (chosenNode != null && chosenNode.info != null) {
1321          ignored.add(chosenNode.info);
1322        }
1323      }
1324    }
1325  }
1326
1327  @VisibleForTesting
1328  public long getHedgedReadOpsLoopNumForTesting() {
1329    return hedgedReadOpsLoopNumForTesting;
1330  }
1331
1332  private ByteBuffer getFirstToComplete(
1333      CompletionService<ByteBuffer> hedgedService,
1334      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
1335    if (futures.isEmpty()) {
1336      throw new InterruptedException("let's retry");
1337    }
1338    Future<ByteBuffer> future = null;
1339    try {
1340      future = hedgedService.take();
1341      ByteBuffer bb = future.get();
1342      futures.remove(future);
1343      return bb;
1344    } catch (ExecutionException e) {
1345      // already logged in the Callable
1346      futures.remove(future);
1347    } catch (CancellationException ce) {
1348      // already logged in the Callable
1349      futures.remove(future);
1350    }
1351
1352    throw new InterruptedException("let's retry");
1353  }
1354
1355  private void cancelAll(List<Future<ByteBuffer>> futures) {
1356    for (Future<ByteBuffer> future : futures) {
      // Unfortunately, hdfs reads do not take kindly to interruption.
      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid pbs -- likely because the read
      // is interrupted before it gets the whole protobuf.  Also verbose
      // WARN logging.  So, for now, do not interrupt a running read.
1362      future.cancel(false);
1363    }
1364  }
1365
1366  /**
1367   * Should the block access token be refetched on an exception
1368   * 
1369   * @param ex Exception received
1370   * @param targetAddr Target datanode address from where exception was received
   * @return true if the block access token has expired or is invalid and
   *         should be refetched
1373   */
1374  private static boolean tokenRefetchNeeded(IOException ex,
1375      InetSocketAddress targetAddr) {
1376    /*
1377     * Get a new access token and retry. Retry is needed in 2 cases. 1)
1378     * When both NN and DN re-started while DFSClient holding a cached
1379     * access token. 2) In the case that NN fails to update its
1380     * access key at pre-set interval (by a wide margin) and
1381     * subsequently restarts. In this case, DN re-registers itself with
1382     * NN and receives a new access key, but DN will delete the old
1383     * access key from its memory since it's considered expired based on
1384     * the estimated expiration date.
1385     */
1386    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1387      DFSClient.LOG.info("Access token was invalid when connecting to "
1388          + targetAddr + " : " + ex);
1389      return true;
1390    }
1391    return false;
1392  }
1393
1394  /**
1395   * Read bytes starting from the specified position.
1396   * 
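   * <p>A small, purely illustrative sketch ({@code in}, the offset and the
   * buffer size are hypothetical); a positional read does not move the
   * stream's current offset:</p>
   * <pre>
   *   byte[] buf = new byte[1024];
   *   int nread = in.read(4096L, buf, 0, buf.length);
   * </pre>
   *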
1397   * @param position start read from this position
1398   * @param buffer read buffer
1399   * @param offset offset into buffer
1400   * @param length number of bytes to read
1401   * 
1402   * @return actual number of bytes read
1403   */
1404  @Override
1405  public int read(long position, byte[] buffer, int offset, int length)
1406      throws IOException {
1407    TraceScope scope =
1408        dfsClient.getPathTraceScope("DFSInputStream#byteArrayPread", src);
1409    try {
1410      return pread(position, buffer, offset, length);
1411    } finally {
1412      scope.close();
1413    }
1414  }
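
  /*
   * Usage sketch (hedged): positional reads do not move the stream's current
   * offset, so they can be interleaved with sequential reads. "fs" and the
   * path below are hypothetical.
   *
   *   FSDataInputStream in = fs.open(new Path("/tmp/data.bin"));
   *   byte[] buf = new byte[4096];
   *   int nRead = in.read(1024L, buf, 0, buf.length);  // pread from offset 1024
   *   long pos = in.getPos();                          // unchanged by the pread
   */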

  private int pread(long position, byte[] buffer, int offset, int length)
      throws IOException {
    // sanity checks
    dfsClient.checkOpen();
    if (closed.get()) {
      throw new IOException("Stream closed");
    }
    failures = 0;
    long filelen = getFileLength();
    if ((position < 0) || (position >= filelen)) {
      return -1;
    }
    int realLen = length;
    if ((position + length) > filelen) {
      realLen = (int)(filelen - position);
    }

    // determine the block and byte range within the block
    // corresponding to position and realLen
    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
    int remaining = realLen;
    Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap =
        new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    for (LocatedBlock blk : blockRange) {
      long targetStart = position - blk.getStartOffset();
      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
      try {
        if (dfsClient.isHedgedReadsEnabled()) {
          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
              - 1, buffer, offset, corruptedBlockMap);
        } else {
          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
              buffer, offset, corruptedBlockMap);
        }
      } finally {
        // Check and report if any block replicas are corrupted.
        // BlockMissingException may be caught if all block replicas are
        // corrupted.
        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
      }

      remaining -= bytesToRead;
      position += bytesToRead;
      offset += bytesToRead;
    }
    assert remaining == 0 : "Wrong number of bytes read.";
    if (dfsClient.stats != null) {
      dfsClient.stats.incrementBytesRead(realLen);
    }
    return realLen;
  }
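
  /*
   * Worked example of the block-range arithmetic above (illustrative numbers,
   * assuming a 128 MB = 134217728-byte block size): a pread at position
   * 134217000 for length 2000 spans two blocks. For the first block,
   * targetStart = 134217000 and bytesToRead = min(2000, 134217728 - 134217000)
   * = 728; the remaining 1272 bytes are then fetched from the start of the
   * second block.
   */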

  /**
   * DFSInputStream reports checksum failure.
   * Case I : the client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted blocks
   * to the namenode.
   * Case II: the client has tried all data nodes and every attempt failed. We
   * only report if the total number of replicas is 1, since otherwise the
   * failures may be caused by the client itself being unable to read rather
   * than by corrupt replicas.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that hold the block replicas
   */
  private void reportCheckSumFailure(
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      int dataNodeCount) {
    if (corruptedBlockMap.isEmpty()) {
      return;
    }
    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
        .entrySet().iterator();
    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
    ExtendedBlock blk = entry.getKey();
    Set<DatanodeInfo> dnSet = entry.getValue();
    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
      int i = 0;
      for (DatanodeInfo dn : dnSet) {
        locs[i++] = dn;
      }
      LocatedBlock[] lblocks = { new LocatedBlock(blk, locs) };
      dfsClient.reportChecksumFailure(src, lblocks);
    }
    corruptedBlockMap.clear();
  }
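
  /*
   * Example of the reporting rule above: with 3 replicas, checksum failures on
   * 1 or 2 of them are reported (some replica was readable, so the failing
   * ones are likely corrupt); failures on all 3 are not reported, since the
   * client itself may be the problem. With a single replica, a failure on it
   * is always reported.
   */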

  @Override
  public long skip(long n) throws IOException {
    if (n > 0) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if (n + curPos > fileLen) {
        n = fileLen - curPos;
      }
      seek(curPos + n);
      return n;
    }
    return n < 0 ? -1 : 0;
  }
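
  /*
   * Example of the semantics above (illustrative numbers): with a 100-byte
   * file positioned at offset 90, skip(25) advances only to EOF and returns
   * 10; skip(0) returns 0 and skip(-5) returns -1, neither moving the stream.
   */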

  /**
   * Seek to a new arbitrary location.
   */
  @Override
  public synchronized void seek(long targetPos) throws IOException {
    if (targetPos > getFileLength()) {
      throw new EOFException("Cannot seek after EOF");
    }
    if (targetPos < 0) {
      throw new EOFException("Cannot seek to negative offset");
    }
    if (closed.get()) {
      throw new IOException("Stream is closed!");
    }
    boolean done = false;
    if (pos <= targetPos && targetPos <= blockEnd) {
      //
      // If this seek is to a positive position in the current
      // block, and this piece of data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
      //
      int diff = (int)(targetPos - pos);
      if (diff <= blockReader.available()) {
        try {
          pos += blockReader.skip(diff);
          if (pos == targetPos) {
            done = true;
          } else {
            // The range was already checked. If the block reader returns
            // something unexpected instead of throwing an exception, it is
            // most likely a bug.
            String errMsg = "BlockReader failed to seek to " +
                targetPos + ". Instead, it ended up at " + pos + ".";
            DFSClient.LOG.warn(errMsg);
            throw new IOException(errMsg);
          }
        } catch (IOException e) { // make the following read retry
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Exception while seeking to " + targetPos
                + " from " + getCurrentBlock() + " of " + src + " from "
                + currentNode, e);
          }
        }
      }
    }
    if (!done) {
      pos = targetPos;
      blockEnd = -1;
    }
  }
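
  /*
   * Example of the in-block fast path above (illustrative numbers): with
   * pos = 1000, blockEnd = 65535 and 4096 bytes already buffered by the
   * BlockReader, seek(3000) simply skips 2000 buffered bytes. A seek beyond
   * blockEnd (or backwards) instead just records the new position and sets
   * blockEnd = -1, so the next read opens a new block reader.
   */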

  /**
   * Same as {@link #seekToNewSource(long)} except that it does not exclude
   * the current datanode and might connect to the same node.
   */
  private boolean seekToBlockSource(long targetPos) throws IOException {
    currentNode = blockSeekTo(targetPos);
    return true;
  }

  /**
   * Seek to the given position on a node other than the current node. Returns
   * true if a node other than the current node was found; returns false if no
   * other node could be found.
   */
  @Override
  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
    boolean markedDead = deadNodes.containsKey(currentNode);
    addToDeadNodes(currentNode);
    DatanodeInfo oldNode = currentNode;
    DatanodeInfo newNode = blockSeekTo(targetPos);
    if (!markedDead) {
      /* remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's ok. */
      deadNodes.remove(oldNode);
    }
    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
      currentNode = newNode;
      return true;
    } else {
      return false;
    }
  }

  /**
   * Return the current offset from the start of the file.
   */
  @Override
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /** Return the number of remaining available bytes
   * if that number is less than or equal to {@link Integer#MAX_VALUE};
   * otherwise, return {@link Integer#MAX_VALUE}.
   */
  @Override
  public synchronized int available() throws IOException {
    if (closed.get()) {
      throw new IOException("Stream closed");
    }

    final long remaining = getFileLength() - pos;
    return remaining <= Integer.MAX_VALUE ? (int) remaining : Integer.MAX_VALUE;
  }

  /**
   * We definitely don't support marks.
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  @Override
  public void mark(int readLimit) {
  }

  @Override
  public void reset() throws IOException {
    throw new IOException("Mark/reset not supported");
  }

  /** Utility class to encapsulate data node info and its address. */
  private static final class DNAddrPair {
    final DatanodeInfo info;
    final InetSocketAddress addr;
    final StorageType storageType;

    DNAddrPair(DatanodeInfo info, InetSocketAddress addr,
        StorageType storageType) {
      this.info = info;
      this.addr = addr;
      this.storageType = storageType;
    }
  }

  /**
   * Get statistics about the reads which this DFSInputStream has done.
   */
  public ReadStatistics getReadStatistics() {
    synchronized (infoLock) {
      return new ReadStatistics(readStatistics);
    }
  }

  /**
   * Clear statistics about the reads which this DFSInputStream has done.
   */
  public void clearReadStatistics() {
    synchronized (infoLock) {
      readStatistics.clear();
    }
  }

  public FileEncryptionInfo getFileEncryptionInfo() {
    synchronized (infoLock) {
      return fileEncryptionInfo;
    }
  }

  private void closeCurrentBlockReader() {
    if (blockReader == null) return;
    // Close the current block reader so that the new caching settings can
    // take effect immediately.
    try {
      blockReader.close();
    } catch (IOException e) {
      DFSClient.LOG.error("error closing blockReader", e);
    }
    blockReader = null;
    blockEnd = -1;
  }

  @Override
  public synchronized void setReadahead(Long readahead)
      throws IOException {
    synchronized (infoLock) {
      this.cachingStrategy =
          new CachingStrategy.Builder(this.cachingStrategy).setReadahead(readahead).build();
    }
    closeCurrentBlockReader();
  }

  @Override
  public synchronized void setDropBehind(Boolean dropBehind)
      throws IOException {
    synchronized (infoLock) {
      this.cachingStrategy =
          new CachingStrategy.Builder(this.cachingStrategy).setDropBehind(dropBehind).build();
    }
    closeCurrentBlockReader();
  }
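
  /*
   * Usage sketch (hedged): these setters are normally reached through
   * FSDataInputStream, which implements CanSetReadahead and CanSetDropBehind.
   * "in" below is a hypothetical stream opened against HDFS, and passing null
   * is expected to fall back to the configured defaults.
   *
   *   in.setReadahead(4L * 1024 * 1024);  // request ~4 MB of readahead
   *   in.setDropBehind(true);             // drop cached data behind the reader
   *   in.setReadahead(null);              // revert to the configured default
   */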

  /**
   * The immutable empty buffer we return when we reach EOF during a
   * zero-copy read.
   */
  private static final ByteBuffer EMPTY_BUFFER =
    ByteBuffer.allocateDirect(0).asReadOnlyBuffer();

  @Override
  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
      int maxLength, EnumSet<ReadOption> opts)
          throws IOException, UnsupportedOperationException {
    if (maxLength == 0) {
      return EMPTY_BUFFER;
    } else if (maxLength < 0) {
      throw new IllegalArgumentException("can't read a negative " +
          "number of bytes.");
    }
    if ((blockReader == null) || (blockEnd == -1)) {
      if (pos >= getFileLength()) {
        return null;
      }
      /*
       * If we don't have a blockReader, or the one we have has no more bytes
       * left to read, we call seekToBlockSource to get a new blockReader and
       * recalculate blockEnd.  Note that we assume we're not at EOF here
       * (we check this above).
       */
      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
        throw new IOException("failed to allocate new BlockReader " +
            "at position " + pos);
      }
    }
    ByteBuffer buffer = null;
    if (dfsClient.getConf().shortCircuitMmapEnabled) {
      buffer = tryReadZeroCopy(maxLength, opts);
    }
    if (buffer != null) {
      return buffer;
    }
    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
    if (buffer != null) {
      getExtendedReadBuffers().put(buffer, bufferPool);
    }
    return buffer;
  }
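
  /*
   * Usage sketch (hedged): the enhanced read above is normally invoked through
   * FSDataInputStream#read(ByteBufferPool, int, EnumSet). The pool supplies
   * fallback buffers when a memory-mapped read is not possible, and every
   * non-null buffer must be handed back with releaseBuffer(). "in" is a
   * hypothetical stream opened against HDFS.
   *
   *   ByteBufferPool pool = new ElasticByteBufferPool();
   *   ByteBuffer bb = in.read(pool, 1024 * 1024,
   *       EnumSet.of(ReadOption.SKIP_CHECKSUMS));
   *   try {
   *     // consume bb (null means EOF) ...
   *   } finally {
   *     if (bb != null) {
   *       in.releaseBuffer(bb);
   *     }
   *   }
   */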

  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
      EnumSet<ReadOption> opts) throws IOException {
    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
    // JVM to optimize this function.
    final long curPos = pos;
    final long curEnd = blockEnd;
    final long blockStartInFile = currentLocatedBlock.getStartOffset();
    final long blockPos = curPos - blockStartInFile;

    // Shorten this read if the end of the block is nearby.
    long length63;
    if ((curPos + maxLength) <= (curEnd + 1)) {
      length63 = maxLength;
    } else {
      length63 = 1 + curEnd - curPos;
      if (length63 <= 0) {
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
            curPos + " of " + src + "; " + length63 + " bytes left in block.  " +
            "blockPos=" + blockPos + "; curPos=" + curPos +
            "; curEnd=" + curEnd);
        }
        return null;
      }
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length63 + " to avoid going more than one byte " +
            "past the end of the block.  blockPos=" + blockPos +
            "; curPos=" + curPos + "; curEnd=" + curEnd);
      }
    }
    // Make sure that we don't go beyond 31-bit offsets in the MappedByteBuffer.
    int length;
    if (blockPos + length63 <= Integer.MAX_VALUE) {
      length = (int)length63;
    } else {
      long length31 = Integer.MAX_VALUE - blockPos;
      if (length31 <= 0) {
        // Java ByteBuffers can't be longer than 2 GB, because they use
        // 4-byte signed integers to represent capacity, etc.
        // So we can't mmap the parts of the block higher than the 2 GB offset.
        // FIXME: we could work around this with multiple memory maps.
        // See HDFS-5101.
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
            curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
            "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
        }
        return null;
      }
      length = (int)length31;
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length + " to avoid 31-bit limit.  " +
            "blockPos=" + blockPos + "; curPos=" + curPos +
            "; curEnd=" + curEnd);
      }
    }
    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
    if (clientMmap == null) {
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
          "null.");
      }
      return null;
    }
    boolean success = false;
    ByteBuffer buffer;
    try {
      seek(curPos + length);
      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
      buffer.position((int)blockPos);
      buffer.limit((int)(blockPos + length));
      getExtendedReadBuffers().put(buffer, clientMmap);
      synchronized (infoLock) {
        readStatistics.addZeroCopyBytes(length);
      }
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("readZeroCopy read " + length +
            " bytes from offset " + curPos + " via the zero-copy read " +
            "path.  blockEnd = " + blockEnd);
      }
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeQuietly(clientMmap);
      }
    }
    return buffer;
  }
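
  /*
   * Worked example of the 31-bit clamp above (illustrative numbers): for
   * blockPos = 2147480000 and length63 = 8192, blockPos + length63 exceeds
   * Integer.MAX_VALUE (2147483647), so length31 = 2147483647 - 2147480000
   * = 3647 and the zero-copy read is shortened to 3647 bytes.
   */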

  @Override
  public synchronized void releaseBuffer(ByteBuffer buffer) {
    if (buffer == EMPTY_BUFFER) return;
    Object val = getExtendedReadBuffers().remove(buffer);
    if (val == null) {
      throw new IllegalArgumentException("tried to release a buffer " +
          "that was not created by this stream, " + buffer);
    }
    if (val instanceof ClientMmap) {
      IOUtils.closeQuietly((ClientMmap)val);
    } else if (val instanceof ByteBufferPool) {
      ((ByteBufferPool)val).putBuffer(buffer);
    }
  }

  @Override
  public synchronized void unbuffer() {
    closeCurrentBlockReader();
  }
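
  /*
   * Usage sketch (hedged): unbuffer() is typically reached through
   * FSDataInputStream#unbuffer() by long-lived clients that keep many files
   * open but read them only occasionally; it drops the current block reader
   * (and the resources it holds) while keeping the stream open. "in" below is
   * a hypothetical open FSDataInputStream.
   *
   *   in.unbuffer();   // release the block reader and its connections
   *   // ... much later ...
   *   in.read(buf);    // transparently opens a new block reader
   */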
}