001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.EOFException;
021import java.io.IOException;
022import java.net.InetSocketAddress;
023import java.nio.ByteBuffer;
024import java.util.AbstractMap;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collection;
028import java.util.EnumSet;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.Iterator;
032import java.util.List;
033import java.util.Map;
034import java.util.Map.Entry;
035import java.util.Set;
036import java.util.concurrent.Callable;
037import java.util.concurrent.CancellationException;
038import java.util.concurrent.CompletionService;
039import java.util.concurrent.ConcurrentHashMap;
040import java.util.concurrent.ExecutionException;
041import java.util.concurrent.ExecutorCompletionService;
042import java.util.concurrent.Future;
043import java.util.concurrent.TimeUnit;
044import java.util.concurrent.atomic.AtomicLong;
045
046import org.apache.commons.io.IOUtils;
047import org.apache.hadoop.classification.InterfaceAudience;
048import org.apache.hadoop.fs.ByteBufferReadable;
049import org.apache.hadoop.fs.ByteBufferUtil;
050import org.apache.hadoop.fs.CanSetDropBehind;
051import org.apache.hadoop.fs.CanSetReadahead;
052import org.apache.hadoop.fs.ChecksumException;
053import org.apache.hadoop.fs.FSInputStream;
054import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
055import org.apache.hadoop.fs.ReadOption;
056import org.apache.hadoop.fs.UnresolvedLinkException;
057import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
058import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
059import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
060import org.apache.hadoop.fs.FileEncryptionInfo;
061import org.apache.hadoop.hdfs.protocol.LocatedBlock;
062import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
063import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
064import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
065import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
066import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
067import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
068import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
069import org.apache.hadoop.io.ByteBufferPool;
070import org.apache.hadoop.ipc.RPC;
071import org.apache.hadoop.ipc.RemoteException;
072import org.apache.hadoop.net.NetUtils;
073import org.apache.hadoop.security.token.SecretManager.InvalidToken;
074import org.apache.hadoop.security.token.Token;
075import org.apache.hadoop.util.IdentityHashStore;
076
077import com.google.common.annotations.VisibleForTesting;
078
079/****************************************************************
080 * DFSInputStream provides bytes from a named file.  It handles 
081 * negotiation of the namenode and various datanodes as necessary.
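 *
 * A minimal usage sketch (illustrative only; applications normally obtain a
 * stream through the FileSystem API rather than constructing this class
 * directly, and the path below is hypothetical):
 * <pre>
 *   Configuration conf = new Configuration();
 *   FileSystem fs = FileSystem.get(conf);        // assumes fs.defaultFS points at HDFS
 *   try (FSDataInputStream in = fs.open(new Path("/user/foo/data"))) {
 *     in.seek(1024);
 *     int b = in.read();                         // served by a DFSInputStream underneath
 *   }
 * </pre>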
082 ****************************************************************/
083@InterfaceAudience.Private
084public class DFSInputStream extends FSInputStream
085implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
086    HasEnhancedByteBufferAccess {
087  @VisibleForTesting
088  public static boolean tcpReadsDisabledForTesting = false;
089  private long hedgedReadOpsLoopNumForTesting = 0;
090  private final DFSClient dfsClient;
091  private boolean closed = false;
092  private final String src;
093  private BlockReader blockReader = null;
094  private final boolean verifyChecksum;
095  private LocatedBlocks locatedBlocks = null;
096  private long lastBlockBeingWrittenLength = 0;
097  private FileEncryptionInfo fileEncryptionInfo = null;
098  private DatanodeInfo currentNode = null;
099  private LocatedBlock currentLocatedBlock = null;
100  private long pos = 0;
101  private long blockEnd = -1;
102  private CachingStrategy cachingStrategy;
103  private final ReadStatistics readStatistics = new ReadStatistics();
104
105  /**
106   * Track the ByteBuffers that we have handed out to readers.
107   * 
108   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
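   *
   * Buffers handed out through the enhanced read path are expected to be
   * returned by the caller via releaseBuffer(ByteBuffer) (declared on
   * HasEnhancedByteBufferAccess); at that point the entry can be removed from
   * this store and the buffer either returned to its ByteBufferPool or have
   * its ClientMmap reference dropped.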
110   */
111  private final IdentityHashStore<ByteBuffer, Object>
112      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
113
114  public static class ReadStatistics {
115    public ReadStatistics() {
116      this.totalBytesRead = 0;
117      this.totalLocalBytesRead = 0;
118      this.totalShortCircuitBytesRead = 0;
119      this.totalZeroCopyBytesRead = 0;
120    }
121
122    public ReadStatistics(ReadStatistics rhs) {
123      this.totalBytesRead = rhs.getTotalBytesRead();
124      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
125      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
126      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
127    }
128
129    /**
130     * @return The total bytes read.  This will always be at least as
131     * high as the other numbers, since it includes all of them.
132     */
133    public long getTotalBytesRead() {
134      return totalBytesRead;
135    }
136
137    /**
138     * @return The total local bytes read.  This will always be at least
139     * as high as totalShortCircuitBytesRead, since all short-circuit
140     * reads are also local.
141     */
142    public long getTotalLocalBytesRead() {
143      return totalLocalBytesRead;
144    }
145
146    /**
147     * @return The total short-circuit local bytes read.
148     */
149    public long getTotalShortCircuitBytesRead() {
150      return totalShortCircuitBytesRead;
151    }
152    
153    /**
154     * @return The total number of zero-copy bytes read.
155     */
156    public long getTotalZeroCopyBytesRead() {
157      return totalZeroCopyBytesRead;
158    }
159
160    /**
161     * @return The total number of bytes read which were not local.
162     */
163    public long getRemoteBytesRead() {
164      return totalBytesRead - totalLocalBytesRead;
165    }
166    
167    void addRemoteBytes(long amt) {
168      this.totalBytesRead += amt;
169    }
170
171    void addLocalBytes(long amt) {
172      this.totalBytesRead += amt;
173      this.totalLocalBytesRead += amt;
174    }
175
176    void addShortCircuitBytes(long amt) {
177      this.totalBytesRead += amt;
178      this.totalLocalBytesRead += amt;
179      this.totalShortCircuitBytesRead += amt;
180    }
181
182    void addZeroCopyBytes(long amt) {
183      this.totalBytesRead += amt;
184      this.totalLocalBytesRead += amt;
185      this.totalShortCircuitBytesRead += amt;
186      this.totalZeroCopyBytesRead += amt;
187    }
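
    // Note how the add*Bytes methods above nest: every zero-copy read is also
    // a short-circuit read, every short-circuit read is also a local read, and
    // every local read counts toward the total. Hence
    //   totalBytesRead >= totalLocalBytesRead
    //     >= totalShortCircuitBytesRead >= totalZeroCopyBytesRead.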
188    
189    private long totalBytesRead;
190
191    private long totalLocalBytesRead;
192
193    private long totalShortCircuitBytesRead;
194
195    private long totalZeroCopyBytesRead;
196  }
197  
198  /**
199   * This variable tracks the number of failures since the start of the
200   * most recent user-facing operation. That is to say, it should be reset
201   * whenever the user makes a call on this stream, and if at any point
202   * during the retry logic, the failure count exceeds a threshold,
203   * the errors will be thrown back to the operation.
204   *
205   * Specifically this counts the number of times the client has gone
206   * back to the namenode to get a new list of block locations, and is
207   * capped at maxBlockAcquireFailures
208   */
209  private int failures = 0;
210
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to properly fix
   * parallel accesses to DFSInputStream (through preads). */
213  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
214             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
215  private int buffersize = 1;
216  
217  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'
218
219  void addToDeadNodes(DatanodeInfo dnInfo) {
220    deadNodes.put(dnInfo, dnInfo);
221  }
222  
223  DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
224                 ) throws IOException, UnresolvedLinkException {
225    this.dfsClient = dfsClient;
226    this.verifyChecksum = verifyChecksum;
227    this.buffersize = buffersize;
228    this.src = src;
229    this.cachingStrategy =
230        dfsClient.getDefaultReadCachingStrategy();
231    openInfo();
232  }
233
234  /**
235   * Grab the open-file info from namenode
236   */
237  synchronized void openInfo() throws IOException, UnresolvedLinkException {
238    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
239    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
240    while (retriesForLastBlockLength > 0) {
241      // Getting last block length as -1 is a special case. When cluster
242      // restarts, DNs may not report immediately. At this time partial block
      // locations will not be available with the NN for getting the length.
      // Let's retry a few times to get the length.
245      if (lastBlockBeingWrittenLength == -1) {
246        DFSClient.LOG.warn("Last block locations not available. "
247            + "Datanodes might not have reported blocks completely."
248            + " Will retry for " + retriesForLastBlockLength + " times");
249        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
250        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
251      } else {
252        break;
253      }
254      retriesForLastBlockLength--;
255    }
256    if (retriesForLastBlockLength == 0) {
257      throw new IOException("Could not obtain the last block locations.");
258    }
259  }
260
261  private void waitFor(int waitTime) throws IOException {
262    try {
263      Thread.sleep(waitTime);
264    } catch (InterruptedException e) {
265      throw new IOException(
266          "Interrupted while getting the last block length.");
267    }
268  }
269
270  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
271    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
272    if (DFSClient.LOG.isDebugEnabled()) {
273      DFSClient.LOG.debug("newInfo = " + newInfo);
274    }
275    if (newInfo == null) {
276      throw new IOException("Cannot open filename " + src);
277    }
278
279    if (locatedBlocks != null) {
280      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
281      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
282      while (oldIter.hasNext() && newIter.hasNext()) {
283        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
284          throw new IOException("Blocklist for " + src + " has changed!");
285        }
286      }
287    }
288    locatedBlocks = newInfo;
289    long lastBlockBeingWrittenLength = 0;
290    if (!locatedBlocks.isLastBlockComplete()) {
291      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
292      if (last != null) {
293        if (last.getLocations().length == 0) {
294          if (last.getBlockSize() == 0) {
            // if the length is zero, then no data has been written to the
            // datanode, so there is no need to wait for the locations.
297            return 0;
298          }
299          return -1;
300        }
301        final long len = readBlockLength(last);
302        last.getBlock().setNumBytes(len);
303        lastBlockBeingWrittenLength = len; 
304      }
305    }
306
307    fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo();
308
309    currentNode = null;
310    return lastBlockBeingWrittenLength;
311  }
312
313  /** Read the block length from one of the datanodes. */
314  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
315    assert locatedblock != null : "LocatedBlock cannot be null";
316    int replicaNotFoundCount = locatedblock.getLocations().length;
317    
318    for(DatanodeInfo datanode : locatedblock.getLocations()) {
319      ClientDatanodeProtocol cdp = null;
320      
321      try {
322        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
323            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
324            dfsClient.getConf().connectToDnViaHostname, locatedblock);
325        
326        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
327        
328        if (n >= 0) {
329          return n;
330        }
331      }
332      catch(IOException ioe) {
333        if (ioe instanceof RemoteException &&
334          (((RemoteException) ioe).unwrapRemoteException() instanceof
335            ReplicaNotFoundException)) {
336          // special case : replica might not be on the DN, treat as 0 length
337          replicaNotFoundCount--;
338        }
339        
340        if (DFSClient.LOG.isDebugEnabled()) {
341          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
342              + datanode + " for block " + locatedblock.getBlock(), ioe);
343        }
344      } finally {
345        if (cdp != null) {
346          RPC.stopProxy(cdp);
347        }
348      }
349    }
350
    // The namenode told us about these locations, but none of them knows
    // about the replica. This means we hit the race between pipeline creation
    // start and end. We require every location to have reported
    // ReplicaNotFoundException because some other exception could have
    // happened on a DN that has the replica, and we want to report that error.
355    if (replicaNotFoundCount == 0) {
356      return 0;
357    }
358
359    throw new IOException("Cannot obtain block length for " + locatedblock);
360  }
361  
362  public synchronized long getFileLength() {
363    return locatedBlocks == null? 0:
364        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
365  }
366
367  // Short circuit local reads are forbidden for files that are
368  // under construction.  See HDFS-2757.
369  synchronized boolean shortCircuitForbidden() {
370    return locatedBlocks.isUnderConstruction();
371  }
372
373  /**
374   * Returns the datanode from which the stream is currently reading.
375   */
376  public DatanodeInfo getCurrentDatanode() {
377    return currentNode;
378  }
379
380  /**
381   * Returns the block containing the target position. 
382   */
383  synchronized public ExtendedBlock getCurrentBlock() {
384    if (currentLocatedBlock == null){
385      return null;
386    }
387    return currentLocatedBlock.getBlock();
388  }
389
390  /**
   * Return the collection of blocks that have already been located.
392   */
393  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
394    return getBlockRange(0, getFileLength());
395  }
396
397  /**
398   * Get block at the specified position.
399   * Fetch it from the namenode if not cached.
400   * 
401   * @param offset block corresponding to this offset in file is returned
402   * @param updatePosition whether to update current position
403   * @return located block
404   * @throws IOException
405   */
406  private synchronized LocatedBlock getBlockAt(long offset,
407      boolean updatePosition) throws IOException {
408    assert (locatedBlocks != null) : "locatedBlocks is null";
409
410    final LocatedBlock blk;
411
412    //check offset
413    if (offset < 0 || offset >= getFileLength()) {
414      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
415          + offset
416          + ", updatePosition=" + updatePosition
417          + ", locatedBlocks=" + locatedBlocks);
418    }
419    else if (offset >= locatedBlocks.getFileLength()) {
420      // offset to the portion of the last block,
421      // which is not known to the name-node yet;
422      // getting the last block 
423      blk = locatedBlocks.getLastLocatedBlock();
424    }
425    else {
426      // search cached blocks first
427      int targetBlockIdx = locatedBlocks.findBlock(offset);
428      if (targetBlockIdx < 0) { // block is not cached
429        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
430        // fetch more blocks
431        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
432        assert (newBlocks != null) : "Could not find target position " + offset;
433        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
434      }
435      blk = locatedBlocks.get(targetBlockIdx);
436    }
437
438    // update current position
439    if (updatePosition) {
440      pos = offset;
441      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
442      currentLocatedBlock = blk;
443    }
444    return blk;
445  }
446
447  /** Fetch a block from namenode and cache it */
448  private synchronized void fetchBlockAt(long offset) throws IOException {
449    int targetBlockIdx = locatedBlocks.findBlock(offset);
450    if (targetBlockIdx < 0) { // block is not cached
451      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
452    }
453    // fetch blocks
454    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
455    if (newBlocks == null) {
456      throw new IOException("Could not find target position " + offset);
457    }
458    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
459  }
460
461  /**
462   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * does not serve read requests beyond the EOF.
   * @param offset starting offset in file
   * @param length length of data
   * @return consecutive segment of located blocks
468   * @throws IOException
469   */
470  private synchronized List<LocatedBlock> getBlockRange(long offset,
471      long length)  throws IOException {
472    // getFileLength(): returns total file length
473    // locatedBlocks.getFileLength(): returns length of completed blocks
474    if (offset >= getFileLength()) {
475      throw new IOException("Offset: " + offset +
476        " exceeds file length: " + getFileLength());
477    }
478
479    final List<LocatedBlock> blocks;
480    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
481    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
482    final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
483
484    if (readOffsetWithinCompleteBlk) {
485      //get the blocks of finalized (completed) block range
486      blocks = getFinalizedBlockRange(offset, 
487        Math.min(length, lengthOfCompleteBlk - offset));
488    } else {
489      blocks = new ArrayList<LocatedBlock>(1);
490    }
491
492    // get the blocks from incomplete block range
493    if (readLengthPastCompleteBlk) {
494       blocks.add(locatedBlocks.getLastLocatedBlock());
495    }
496
497    return blocks;
498  }
499
500  /**
501   * Get blocks in the specified range.
502   * Includes only the complete blocks.
503   * Fetch them from the namenode if not cached.
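   *
   * For example, with a 128 MB block size, a request for offset = 130 MB and
   * length = 10 MB falls entirely within the file's second block, so the
   * returned list would contain just that one LocatedBlock.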
504   */
505  private synchronized List<LocatedBlock> getFinalizedBlockRange(
506      long offset, long length) throws IOException {
507    assert (locatedBlocks != null) : "locatedBlocks is null";
508    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
509    // search cached blocks first
510    int blockIdx = locatedBlocks.findBlock(offset);
511    if (blockIdx < 0) { // block is not cached
512      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
513    }
514    long remaining = length;
515    long curOff = offset;
516    while(remaining > 0) {
517      LocatedBlock blk = null;
518      if(blockIdx < locatedBlocks.locatedBlockCount())
519        blk = locatedBlocks.get(blockIdx);
520      if (blk == null || curOff < blk.getStartOffset()) {
521        LocatedBlocks newBlocks;
522        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
523        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
524        continue;
525      }
526      assert curOff >= blk.getStartOffset() : "Block not found";
527      blockRange.add(blk);
528      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
529      remaining -= bytesRead;
530      curOff += bytesRead;
531      blockIdx++;
532    }
533    return blockRange;
534  }
535
536  /**
537   * Open a DataInputStream to a DataNode so that it can be read from.
538   * We get block ID and the IDs of the destinations at startup, from the namenode.
539   */
540  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
541    if (target >= getFileLength()) {
542      throw new IOException("Attempted to read past end of file");
543    }
544
545    // Will be getting a new BlockReader.
546    if (blockReader != null) {
547      blockReader.close();
548      blockReader = null;
549    }
550
551    //
552    // Connect to best DataNode for desired Block, with potential offset
553    //
554    DatanodeInfo chosenNode = null;
555    int refetchToken = 1; // only need to get a new access token once
556    int refetchEncryptionKey = 1; // only need to get a new encryption key once
557    
558    boolean connectFailedOnce = false;
559
560    while (true) {
561      //
562      // Compute desired block
563      //
564      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
566      long offsetIntoBlock = target - targetBlock.getStartOffset();
567
568      DNAddrPair retval = chooseDataNode(targetBlock, null);
569      chosenNode = retval.info;
570      InetSocketAddress targetAddr = retval.addr;
571      StorageType storageType = retval.storageType;
572
573      try {
574        ExtendedBlock blk = targetBlock.getBlock();
575        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
576        blockReader = new BlockReaderFactory(dfsClient.getConf()).
577            setInetSocketAddress(targetAddr).
578            setRemotePeerFactory(dfsClient).
579            setDatanodeInfo(chosenNode).
580            setStorageType(storageType).
581            setFileName(src).
582            setBlock(blk).
583            setBlockToken(accessToken).
584            setStartOffset(offsetIntoBlock).
585            setVerifyChecksum(verifyChecksum).
586            setClientName(dfsClient.clientName).
587            setLength(blk.getNumBytes() - offsetIntoBlock).
588            setCachingStrategy(cachingStrategy).
589            setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
590            setClientCacheContext(dfsClient.getClientContext()).
591            setUserGroupInformation(dfsClient.ugi).
592            setConfiguration(dfsClient.getConfiguration()).
593            build();
594        if(connectFailedOnce) {
595          DFSClient.LOG.info("Successfully connected to " + targetAddr +
596                             " for " + blk);
597        }
598        return chosenNode;
599      } catch (IOException ex) {
600        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
601          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
602              + "encryption key was invalid when connecting to " + targetAddr
603              + " : " + ex);
604          // The encryption key used is invalid.
605          refetchEncryptionKey--;
606          dfsClient.clearDataEncryptionKey();
607        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
608          refetchToken--;
609          fetchBlockAt(target);
610        } else {
611          connectFailedOnce = true;
612          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
613            + ", add to deadNodes and continue. " + ex, ex);
614          // Put chosen node into dead list, continue
615          addToDeadNodes(chosenNode);
616        }
617      }
618    }
619  }
620
621  /**
622   * Close it down!
623   */
624  @Override
625  public synchronized void close() throws IOException {
626    if (closed) {
627      return;
628    }
629    dfsClient.checkOpen();
630
631    if (!extendedReadBuffers.isEmpty()) {
632      final StringBuilder builder = new StringBuilder();
633      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
634        private String prefix = "";
635        @Override
636        public void accept(ByteBuffer k, Object v) {
637          builder.append(prefix).append(k);
638          prefix = ", ";
639        }
640      });
641      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
642          "unreleased ByteBuffers allocated by read().  " +
643          "Please release " + builder.toString() + ".");
644    }
645    if (blockReader != null) {
646      blockReader.close();
647      blockReader = null;
648    }
649    super.close();
650    closed = true;
651  }
652
653  @Override
654  public synchronized int read() throws IOException {
655    int ret = read( oneByteBuf, 0, 1 );
656    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
657  }
658
659  /**
660   * Wraps different possible read implementations so that readBuffer can be
661   * strategy-agnostic.
662   */
663  private interface ReaderStrategy {
664    public int doRead(BlockReader blockReader, int off, int len,
665        ReadStatistics readStatistics) throws ChecksumException, IOException;
666  }
667
668  private static void updateReadStatistics(ReadStatistics readStatistics, 
669        int nRead, BlockReader blockReader) {
670    if (nRead <= 0) return;
671    if (blockReader.isShortCircuit()) {
672      readStatistics.addShortCircuitBytes(nRead);
673    } else if (blockReader.isLocal()) {
674      readStatistics.addLocalBytes(nRead);
675    } else {
676      readStatistics.addRemoteBytes(nRead);
677    }
678  }
679  
680  /**
681   * Used to read bytes into a byte[]
682   */
683  private static class ByteArrayStrategy implements ReaderStrategy {
684    final byte[] buf;
685
686    public ByteArrayStrategy(byte[] buf) {
687      this.buf = buf;
688    }
689
690    @Override
691    public int doRead(BlockReader blockReader, int off, int len,
692            ReadStatistics readStatistics) throws ChecksumException, IOException {
693        int nRead = blockReader.read(buf, off, len);
694        updateReadStatistics(readStatistics, nRead, blockReader);
695        return nRead;
696    }
697  }
698
699  /**
700   * Used to read bytes into a user-supplied ByteBuffer
701   */
702  private static class ByteBufferStrategy implements ReaderStrategy {
703    final ByteBuffer buf;
704    ByteBufferStrategy(ByteBuffer buf) {
705      this.buf = buf;
706    }
707
708    @Override
709    public int doRead(BlockReader blockReader, int off, int len,
710        ReadStatistics readStatistics) throws ChecksumException, IOException {
711      int oldpos = buf.position();
712      int oldlimit = buf.limit();
713      boolean success = false;
714      try {
715        int ret = blockReader.read(buf);
716        success = true;
717        updateReadStatistics(readStatistics, ret, blockReader);
718        return ret;
719      } finally {
720        if (!success) {
721          // Reset to original state so that retries work correctly.
722          buf.position(oldpos);
723          buf.limit(oldlimit);
724        }
725      } 
726    }
727  }
728
  /* This is used by the regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
733  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
734      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
735      throws IOException {
736    IOException ioe;
737    
    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or client: when the DataNode closes the
     * connection because the client is idle. If there are other cases of
     * "non-errors", then a datanode might be retried by setting this to true
     * again.
     */
744    boolean retryCurrentNode = true;
745
746    while (true) {
747      // retry as many times as seekToNewSource allows.
748      try {
749        return reader.doRead(blockReader, off, len, readStatistics);
750      } catch ( ChecksumException ce ) {
751        DFSClient.LOG.warn("Found Checksum error for "
752            + getCurrentBlock() + " from " + currentNode
753            + " at " + ce.getPos());        
754        ioe = ce;
755        retryCurrentNode = false;
756        // we want to remember which block replicas we have tried
757        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
758            corruptedBlockMap);
759      } catch ( IOException e ) {
760        if (!retryCurrentNode) {
761          DFSClient.LOG.warn("Exception while reading from "
762              + getCurrentBlock() + " of " + src + " from "
763              + currentNode, e);
764        }
765        ioe = e;
766      }
767      boolean sourceFound = false;
768      if (retryCurrentNode) {
769        /* possibly retry the same node so that transient errors don't
770         * result in application level failures (e.g. Datanode could have
771         * closed the connection because the client is idle for too long).
772         */ 
773        sourceFound = seekToBlockSource(pos);
774      } else {
775        addToDeadNodes(currentNode);
776        sourceFound = seekToNewSource(pos);
777      }
778      if (!sourceFound) {
779        throw ioe;
780      }
781      retryCurrentNode = false;
782    }
783  }
784
785  private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
786    dfsClient.checkOpen();
787    if (closed) {
788      throw new IOException("Stream closed");
789    }
790    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
791      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
792    failures = 0;
793    if (pos < getFileLength()) {
794      int retries = 2;
795      while (retries > 0) {
796        try {
797          // currentNode can be left as null if previous read had a checksum
798          // error on the same block. See HDFS-3067
799          if (pos > blockEnd || currentNode == null) {
800            currentNode = blockSeekTo(pos);
801          }
802          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
803          if (locatedBlocks.isLastBlockComplete()) {
804            realLen = (int) Math.min(realLen,
805                locatedBlocks.getFileLength() - pos);
806          }
807          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
808          
809          if (result >= 0) {
810            pos += result;
811          } else {
            // got an EOS from the reader even though we expect more data.
813            throw new IOException("Unexpected EOS from the reader");
814          }
815          if (dfsClient.stats != null) {
816            dfsClient.stats.incrementBytesRead(result);
817          }
818          return result;
819        } catch (ChecksumException ce) {
820          throw ce;            
821        } catch (IOException e) {
822          if (retries == 1) {
823            DFSClient.LOG.warn("DFS Read", e);
824          }
825          blockEnd = -1;
826          if (currentNode != null) { addToDeadNodes(currentNode); }
827          if (--retries == 0) {
828            throw e;
829          }
830        } finally {
          // Check if we need to report corrupt block replicas, whether the
          // read was successful or a ChecksumException occurred.
833          reportCheckSumFailure(corruptedBlockMap, 
834              currentLocatedBlock.getLocations().length);
835        }
836      }
837    }
838    return -1;
839  }
840
  /**
   * Read up to len bytes into the given buffer, starting at offset off.
   */
844  @Override
845  public synchronized int read(final byte buf[], int off, int len) throws IOException {
846    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
847
848    return readWithStrategy(byteArrayReader, off, len);
849  }
850
851  @Override
852  public synchronized int read(final ByteBuffer buf) throws IOException {
853    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
854
855    return readWithStrategy(byteBufferReader, 0, buf.remaining());
856  }
857
858
859  /**
860   * Add corrupted block replica into map.
861   */
862  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
863      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
864    Set<DatanodeInfo> dnSet = null;
865    if((corruptedBlockMap.containsKey(blk))) {
866      dnSet = corruptedBlockMap.get(blk);
867    }else {
868      dnSet = new HashSet<DatanodeInfo>();
869    }
870    if (!dnSet.contains(node)) {
871      dnSet.add(node);
872      corruptedBlockMap.put(blk, dnSet);
873    }
874  }
875
876  private DNAddrPair chooseDataNode(LocatedBlock block,
877      Collection<DatanodeInfo> ignoredNodes) throws IOException {
878    while (true) {
879      try {
880        return getBestNodeDNAddrPair(block, ignoredNodes);
881      } catch (IOException ie) {
882        String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
883          deadNodes, ignoredNodes);
884        String blockInfo = block.getBlock() + " file=" + src;
885        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
886          String description = "Could not obtain block: " + blockInfo;
887          DFSClient.LOG.warn(description + errMsg
888              + ". Throwing a BlockMissingException");
889          throw new BlockMissingException(src, description,
890              block.getStartOffset());
891        }
892
893        DatanodeInfo[] nodes = block.getLocations();
894        if (nodes == null || nodes.length == 0) {
895          DFSClient.LOG.info("No node available for " + blockInfo);
896        }
897        DFSClient.LOG.info("Could not obtain " + block.getBlock()
898            + " from any node: " + ie + errMsg
899            + ". Will get new block locations from namenode and retry...");
900        try {
901          // Introducing a random factor to the wait time before another retry.
902          // The wait time is dependent on # of failures and a random factor.
903          // At the first time of getting a BlockMissingException, the wait time
904          // is a random number between 0..3000 ms. If the first retry
905          // still fails, we will wait 3000 ms grace period before the 2nd retry.
906          // Also at the second retry, the waiting window is expanded to 6000 ms
907          // alleviating the request rate from the server. Similarly the 3rd retry
908          // will wait 6000ms grace period before retry and the waiting window is
909          // expanded to 9000ms. 
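          // A worked example of the formula below, assuming the base window
          // (dfsClient.getConf().timeWindow) is the usual 3000 ms:
          //   failures = 0 -> wait somewhere in [0, 3000) ms
          //   failures = 1 -> wait somewhere in [3000, 9000) ms
          //   failures = 2 -> wait somewhere in [6000, 15000) ms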
910          final int timeWindow = dfsClient.getConf().timeWindow;
911          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
912            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
913          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
914          Thread.sleep((long)waitTime);
915        } catch (InterruptedException iex) {
916        }
917        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
918        openInfo();
919        block = getBlockAt(block.getStartOffset(), false);
920        failures++;
921        continue;
922      }
923    }
924  }
925
926  /**
927   * Get the best node from which to stream the data.
928   * @param block LocatedBlock, containing nodes in priority order.
929   * @param ignoredNodes Do not choose nodes in this array (may be null)
930   * @return The DNAddrPair of the best node.
931   * @throws IOException
932   */
933  private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
934      Collection<DatanodeInfo> ignoredNodes) throws IOException {
935    DatanodeInfo[] nodes = block.getLocations();
936    StorageType[] storageTypes = block.getStorageTypes();
937    DatanodeInfo chosenNode = null;
938    StorageType storageType = null;
939    if (nodes != null) {
940      for (int i = 0; i < nodes.length; i++) {
941        if (!deadNodes.containsKey(nodes[i])
942            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
943          chosenNode = nodes[i];
944          // Storage types are ordered to correspond with nodes, so use the same
945          // index to get storage type.
946          if (storageTypes != null && i < storageTypes.length) {
947            storageType = storageTypes[i];
948          }
949          break;
950        }
951      }
952    }
953    if (chosenNode == null) {
954      throw new IOException("No live nodes contain block " + block.getBlock() +
955          " after checking nodes = " + Arrays.toString(nodes) +
956          ", ignoredNodes = " + ignoredNodes);
957    }
958    final String dnAddr =
959        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
960    if (DFSClient.LOG.isDebugEnabled()) {
961      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
962    }
963    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
964    return new DNAddrPair(chosenNode, targetAddr, storageType);
965  }
966
967  private static String getBestNodeDNAddrPairErrorString(
968      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
969      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
970    StringBuilder errMsgr = new StringBuilder(
971        " No live nodes contain current block ");
972    errMsgr.append("Block locations:");
973    for (DatanodeInfo datanode : nodes) {
974      errMsgr.append(" ");
975      errMsgr.append(datanode.toString());
976    }
977    errMsgr.append(" Dead nodes: ");
978    for (DatanodeInfo datanode : deadNodes.keySet()) {
979      errMsgr.append(" ");
980      errMsgr.append(datanode.toString());
981    }
982    if (ignoredNodes != null) {
983      errMsgr.append(" Ignored nodes: ");
984      for (DatanodeInfo datanode : ignoredNodes) {
985        errMsgr.append(" ");
986        errMsgr.append(datanode.toString());
987      }
988    }
989    return errMsgr.toString();
990  }
991
992  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
993      byte[] buf, int offset,
994      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
995      throws IOException {
996    block = getBlockAt(block.getStartOffset(), false);
997    while (true) {
998      DNAddrPair addressPair = chooseDataNode(block, null);
999      try {
1000        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
1001            corruptedBlockMap);
1002        return;
1003      } catch (IOException e) {
1004        // Ignore. Already processed inside the function.
1005        // Loop through to try the next node.
1006      }
1007    }
1008  }
1009
1010  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
1011      final LocatedBlock block, final long start, final long end,
1012      final ByteBuffer bb,
1013      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
1014    return new Callable<ByteBuffer>() {
1015      @Override
1016      public ByteBuffer call() throws Exception {
1017        byte[] buf = bb.array();
1018        int offset = bb.position();
1019        actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
1020            corruptedBlockMap);
1021        return bb;
1022      }
1023    };
1024  }
1025
1026  private void actualGetFromOneDataNode(final DNAddrPair datanode,
1027      LocatedBlock block, final long start, final long end, byte[] buf,
1028      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1029      throws IOException {
1030    DFSClientFaultInjector.get().startFetchFromDatanode();
1031    int refetchToken = 1; // only need to get a new access token once
1032    int refetchEncryptionKey = 1; // only need to get a new encryption key once
1033
1034    while (true) {
1035      // cached block locations may have been updated by chooseDataNode()
1036      // or fetchBlockAt(). Always get the latest list of locations at the
1037      // start of the loop.
1038      CachingStrategy curCachingStrategy;
1039      boolean allowShortCircuitLocalReads;
1040      synchronized (this) {
1041        block = getBlockAt(block.getStartOffset(), false);
1042        curCachingStrategy = cachingStrategy;
1043        allowShortCircuitLocalReads = !shortCircuitForbidden();
1044      }
1045      DatanodeInfo chosenNode = datanode.info;
1046      InetSocketAddress targetAddr = datanode.addr;
1047      StorageType storageType = datanode.storageType;
1048      BlockReader reader = null;
1049
1050      try {
1051        DFSClientFaultInjector.get().fetchFromDatanodeException();
1052        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
1053        int len = (int) (end - start + 1);
1054        reader = new BlockReaderFactory(dfsClient.getConf()).
1055            setInetSocketAddress(targetAddr).
1056            setRemotePeerFactory(dfsClient).
1057            setDatanodeInfo(chosenNode).
1058            setStorageType(storageType).
1059            setFileName(src).
1060            setBlock(block.getBlock()).
1061            setBlockToken(blockToken).
1062            setStartOffset(start).
1063            setVerifyChecksum(verifyChecksum).
1064            setClientName(dfsClient.clientName).
1065            setLength(len).
1066            setCachingStrategy(curCachingStrategy).
1067            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
1068            setClientCacheContext(dfsClient.getClientContext()).
1069            setUserGroupInformation(dfsClient.ugi).
1070            setConfiguration(dfsClient.getConfiguration()).
1071            build();
1072        int nread = reader.readAll(buf, offset, len);
1073        updateReadStatistics(readStatistics, nread, reader);
1074
1075        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
1078        }
1079        DFSClientFaultInjector.get().readFromDatanodeDelay();
1080        return;
1081      } catch (ChecksumException e) {
1082        String msg = "fetchBlockByteRange(). Got a checksum exception for "
1083            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
1084            + chosenNode;
1085        DFSClient.LOG.warn(msg);
1086        // we want to remember what we have tried
1087        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
1088        addToDeadNodes(chosenNode);
1089        throw new IOException(msg);
1090      } catch (IOException e) {
1091        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
1092          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
1093              + "encryption key was invalid when connecting to " + targetAddr
1094              + " : " + e);
1095          // The encryption key used is invalid.
1096          refetchEncryptionKey--;
1097          dfsClient.clearDataEncryptionKey();
1098          continue;
1099        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
1100          refetchToken--;
1101          try {
1102            fetchBlockAt(block.getStartOffset());
1103          } catch (IOException fbae) {
1104            // ignore IOE, since we can retry it later in a loop
1105          }
1106          continue;
1107        } else {
1108          String msg = "Failed to connect to " + targetAddr + " for file "
1109              + src + " for block " + block.getBlock() + ":" + e;
1110          DFSClient.LOG.warn("Connection failure: " + msg, e);
1111          addToDeadNodes(chosenNode);
1112          throw new IOException(msg);
1113        }
1114      } finally {
1115        if (reader != null) {
1116          reader.close();
1117        }
1118      }
1119    }
1120  }
1121
1122  /**
1123   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time. We then wait for whichever read returns first.
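   *
   * Hedged reads only run when dfsClient.isHedgedReadsEnabled() returns true,
   * which (assuming the usual DFSConfigKeys names for this branch) is driven
   * by dfs.client.hedged.read.threadpool.size, with the trigger delay coming
   * from dfs.client.hedged.read.threshold.millis via getHedgedReadTimeout().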
1127   */
1128  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
1129      long end, byte[] buf, int offset,
1130      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1131      throws IOException {
1132    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
1133    CompletionService<ByteBuffer> hedgedService =
1134        new ExecutorCompletionService<ByteBuffer>(
1135        dfsClient.getHedgedReadsThreadPool());
1136    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
1137    ByteBuffer bb = null;
1138    int len = (int) (end - start + 1);
1139    block = getBlockAt(block.getStartOffset(), false);
1140    while (true) {
1141      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
1142      hedgedReadOpsLoopNumForTesting++;
1143      DNAddrPair chosenNode = null;
1144      // there is no request already executing.
1145      if (futures.isEmpty()) {
        // chooseDataNode is a commitment. If no node is available, we go to
        // the NN to re-fetch block locations. We only take this path on the
        // first read.
1148        chosenNode = chooseDataNode(block, ignored);
1149        bb = ByteBuffer.wrap(buf, offset, len);
1150        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1151            chosenNode, block, start, end, bb, corruptedBlockMap);
1152        Future<ByteBuffer> firstRequest = hedgedService
1153            .submit(getFromDataNodeCallable);
1154        futures.add(firstRequest);
1155        try {
1156          Future<ByteBuffer> future = hedgedService.poll(
1157              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
1158          if (future != null) {
1159            future.get();
1160            return;
1161          }
1162          if (DFSClient.LOG.isDebugEnabled()) {
1163            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
1164                + "ms to read from " + chosenNode.info
1165                + "; spawning hedged read");
1166          }
1167          // Ignore this node on next go around.
1168          ignored.add(chosenNode.info);
1169          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1170          continue; // no need to refresh block locations
1171        } catch (InterruptedException e) {
1172          // Ignore
1173        } catch (ExecutionException e) {
1174          // Ignore already logged in the call.
1175        }
1176      } else {
1177        // We are starting up a 'hedged' read. We have a read already
1178        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
1179        // If no nodes to do hedged reads against, pass.
1180        try {
1181          try {
1182            chosenNode = getBestNodeDNAddrPair(block, ignored);
1183          } catch (IOException ioe) {
1184            chosenNode = chooseDataNode(block, ignored);
1185          }
1186          bb = ByteBuffer.allocate(len);
1187          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1188              chosenNode, block, start, end, bb, corruptedBlockMap);
1189          Future<ByteBuffer> oneMoreRequest = hedgedService
1190              .submit(getFromDataNodeCallable);
1191          futures.add(oneMoreRequest);
1192        } catch (IOException ioe) {
1193          if (DFSClient.LOG.isDebugEnabled()) {
1194            DFSClient.LOG.debug("Failed getting node for hedged read: "
1195                + ioe.getMessage());
1196          }
1197        }
        // Whether or not the hedged submission succeeded, wait for one of the
        // outstanding reads to complete and take the result from the fastest one.
1200        try {
1201          ByteBuffer result = getFirstToComplete(hedgedService, futures);
1202          // cancel the rest.
1203          cancelAll(futures);
1204          if (result.array() != buf) { // compare the array pointers
1205            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
1206            System.arraycopy(result.array(), result.position(), buf, offset,
1207                len);
1208          } else {
1209            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1210          }
1211          return;
1212        } catch (InterruptedException ie) {
1213          // Ignore and retry
1214        }
1215        // We got here if exception. Ignore this node on next go around IFF
1216        // we found a chosenNode to hedge read against.
1217        if (chosenNode != null && chosenNode.info != null) {
1218          ignored.add(chosenNode.info);
1219        }
1220      }
1221    }
1222  }
1223
1224  @VisibleForTesting
1225  public long getHedgedReadOpsLoopNumForTesting() {
1226    return hedgedReadOpsLoopNumForTesting;
1227  }
1228
1229  private ByteBuffer getFirstToComplete(
1230      CompletionService<ByteBuffer> hedgedService,
1231      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
1232    if (futures.isEmpty()) {
1233      throw new InterruptedException("let's retry");
1234    }
1235    Future<ByteBuffer> future = null;
1236    try {
1237      future = hedgedService.take();
1238      ByteBuffer bb = future.get();
1239      futures.remove(future);
1240      return bb;
1241    } catch (ExecutionException e) {
1242      // already logged in the Callable
1243      futures.remove(future);
1244    } catch (CancellationException ce) {
1245      // already logged in the Callable
1246      futures.remove(future);
1247    }
1248
1249    throw new InterruptedException("let's retry");
1250  }
1251
1252  private void cancelAll(List<Future<ByteBuffer>> futures) {
1253    for (Future<ByteBuffer> future : futures) {
1254      // Unfortunately, hdfs reads do not take kindly to interruption.
1255      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid protobufs -- likely because the read
      // is interrupted before it gets the whole protobuf. There is also
      // verbose WARN logging. So, for now, do not interrupt a running read.
1259      future.cancel(false);
1260    }
1261  }
1262
1263  /**
1264   * Should the block access token be refetched on an exception
1265   * 
1266   * @param ex Exception received
1267   * @param targetAddr Target datanode address from where exception was received
1268   * @return true if block access token has expired or invalid and it should be
1269   *         refetched
1270   */
1271  private static boolean tokenRefetchNeeded(IOException ex,
1272      InetSocketAddress targetAddr) {
1273    /*
1274     * Get a new access token and retry. Retry is needed in 2 cases. 1)
1275     * When both NN and DN re-started while DFSClient holding a cached
1276     * access token. 2) In the case that NN fails to update its
1277     * access key at pre-set interval (by a wide margin) and
1278     * subsequently restarts. In this case, DN re-registers itself with
1279     * NN and receives a new access key, but DN will delete the old
1280     * access key from its memory since it's considered expired based on
1281     * the estimated expiration date.
1282     */
1283    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1284      DFSClient.LOG.info("Access token was invalid when connecting to "
1285          + targetAddr + " : " + ex);
1286      return true;
1287    }
1288    return false;
1289  }
1290
1291  /**
1292   * Read bytes starting from the specified position.
1293   * 
1294   * @param position start read from this position
1295   * @param buffer read buffer
1296   * @param offset offset into buffer
1297   * @param length number of bytes to read
1298   * 
1299   * @return actual number of bytes read
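   *
   * Unlike seek() followed by read(), a positional read does not move the
   * stream's current offset, so callers may issue these concurrently from
   * multiple threads. An illustrative (hypothetical) use:
   * <pre>
   *   byte[] header = new byte[128];
   *   int n = in.read(0L, header, 0, header.length);
   * </pre>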
1300   */
1301  @Override
1302  public int read(long position, byte[] buffer, int offset, int length)
1303    throws IOException {
1304    // sanity checks
1305    dfsClient.checkOpen();
1306    if (closed) {
1307      throw new IOException("Stream closed");
1308    }
1309    failures = 0;
1310    long filelen = getFileLength();
1311    if ((position < 0) || (position >= filelen)) {
1312      return -1;
1313    }
1314    int realLen = length;
1315    if ((position + length) > filelen) {
1316      realLen = (int)(filelen - position);
1317    }
1318    
1319    // determine the block and byte range within the block
1320    // corresponding to position and realLen
1321    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
1322    int remaining = realLen;
1323    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
1324      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
1325    for (LocatedBlock blk : blockRange) {
1326      long targetStart = position - blk.getStartOffset();
1327      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
1328      try {
1329        if (dfsClient.isHedgedReadsEnabled()) {
1330          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
1331              - 1, buffer, offset, corruptedBlockMap);
1332        } else {
1333          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
1334              buffer, offset, corruptedBlockMap);
1335        }
1336      } finally {
1337        // Check and report if any block replicas are corrupted.
1338        // BlockMissingException may be caught if all block replicas are
1339        // corrupted.
1340        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
1341      }
1342
1343      remaining -= bytesToRead;
1344      position += bytesToRead;
1345      offset += bytesToRead;
1346    }
1347    assert remaining == 0 : "Wrong number of bytes read.";
1348    if (dfsClient.stats != null) {
1349      dfsClient.stats.incrementBytesRead(realLen);
1350    }
1351    return realLen;
1352  }
1353  
1354  /**
1355   * DFSInputStream reports checksum failure.
   * Case I: the client has tried multiple datanodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted blocks
   * to the namenode.
   * Case II: the client has tried out all datanodes, but all failed. We
   * only report if the total number of replicas is 1. We do not report
   * otherwise, since the failures may be due to the client itself being
   * unable to read properly.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of datanodes that contain the block replicas
1365   */
1366  private void reportCheckSumFailure(
1367      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
1368      int dataNodeCount) {
1369    if (corruptedBlockMap.isEmpty()) {
1370      return;
1371    }
1372    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
1373        .entrySet().iterator();
1374    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
1375    ExtendedBlock blk = entry.getKey();
1376    Set<DatanodeInfo> dnSet = entry.getValue();
1377    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
1378        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
1379      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
1380      int i = 0;
1381      for (DatanodeInfo dn:dnSet) {
1382        locs[i++] = dn;
1383      }
1384      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
1385      dfsClient.reportChecksumFailure(src, lblocks);
1386    }
1387    corruptedBlockMap.clear();
1388  }
1389
1390  @Override
1391  public long skip(long n) throws IOException {
1392    if ( n > 0 ) {
1393      long curPos = getPos();
1394      long fileLen = getFileLength();
1395      if( n+curPos > fileLen ) {
1396        n = fileLen - curPos;
1397      }
1398      seek(curPos+n);
1399      return n;
1400    }
1401    return n < 0 ? -1 : 0;
1402  }
1403
1404  /**
1405   * Seek to a new arbitrary location
1406   */
1407  @Override
1408  public synchronized void seek(long targetPos) throws IOException {
1409    if (targetPos > getFileLength()) {
1410      throw new EOFException("Cannot seek after EOF");
1411    }
1412    if (targetPos < 0) {
1413      throw new EOFException("Cannot seek to negative offset");
1414    }
1415    if (closed) {
1416      throw new IOException("Stream is closed!");
1417    }
1418    boolean done = false;
1419    if (pos <= targetPos && targetPos <= blockEnd) {
1420      //
1421      // If this seek is to a positive position in the current
1422      // block, and this piece of data might already be lying in
1423      // the TCP buffer, then just eat up the intervening data.
1424      //
1425      int diff = (int)(targetPos - pos);
1426      if (diff <= blockReader.available()) {
1427        try {
1428          pos += blockReader.skip(diff);
1429          if (pos == targetPos) {
1430            done = true;
1431          } else {
1432            // The range was already checked. If the block reader returns
1433            // something unexpected instead of throwing an exception, it is
1434            // most likely a bug. 
1435            String errMsg = "BlockReader failed to seek to " + 
1436                targetPos + ". Instead, it seeked to " + pos + ".";
1437            DFSClient.LOG.warn(errMsg);
1438            throw new IOException(errMsg);
1439          }
        } catch (IOException e) { // make the following read retry
1441          if(DFSClient.LOG.isDebugEnabled()) {
1442            DFSClient.LOG.debug("Exception while seek to " + targetPos
1443                + " from " + getCurrentBlock() + " of " + src + " from "
1444                + currentNode, e);
1445          }
1446        }
1447      }
1448    }
1449    if (!done) {
1450      pos = targetPos;
1451      blockEnd = -1;
1452    }
1453  }
1454
1455  /**
1456   * Same as {@link #seekToNewSource(long)} except that it does not exclude
1457   * the current datanode and might connect to the same node.
1458   */
1459  private synchronized boolean seekToBlockSource(long targetPos)
1460                                                 throws IOException {
1461    currentNode = blockSeekTo(targetPos);
1462    return true;
1463  }
1464  
1465  /**
1466   * Seek to given position on a node other than the current node.  If
1467   * a node other than the current node is found, then returns true. 
1468   * If another node could not be found, then returns false.
1469   */
1470  @Override
1471  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
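    // Temporarily mark the current node dead so blockSeekTo will prefer a
    // different replica if one is available.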
1472    boolean markedDead = deadNodes.containsKey(currentNode);
1473    addToDeadNodes(currentNode);
1474    DatanodeInfo oldNode = currentNode;
1475    DatanodeInfo newNode = blockSeekTo(targetPos);
1476    if (!markedDead) {
      /* Remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's ok. */
1479      deadNodes.remove(oldNode);
1480    }
1481    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
1482      currentNode = newNode;
1483      return true;
1484    } else {
1485      return false;
1486    }
1487  }
1488      
1489  /**
1490   */
1491  @Override
1492  public synchronized long getPos() throws IOException {
1493    return pos;
1494  }
1495
1496  /** Return the size of the remaining available bytes
1497   * if the size is less than or equal to {@link Integer#MAX_VALUE},
1498   * otherwise, return {@link Integer#MAX_VALUE}.
1499   */
1500  @Override
1501  public synchronized int available() throws IOException {
1502    if (closed) {
1503      throw new IOException("Stream closed");
1504    }
1505
1506    final long remaining = getFileLength() - pos;
1507    return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
1508  }
1509
1510  /**
1511   * We definitely don't support marks
1512   */
1513  @Override
1514  public boolean markSupported() {
1515    return false;
1516  }
1517  @Override
1518  public void mark(int readLimit) {
1519  }
1520  @Override
1521  public void reset() throws IOException {
1522    throw new IOException("Mark/reset not supported");
1523  }
1524
1525  /** Utility class to encapsulate data node info and its address. */
1526  private static final class DNAddrPair {
1527    final DatanodeInfo info;
1528    final InetSocketAddress addr;
1529    final StorageType storageType;
1530
1531    DNAddrPair(DatanodeInfo info, InetSocketAddress addr,
1532        StorageType storageType) {
1533      this.info = info;
1534      this.addr = addr;
1535      this.storageType = storageType;
1536    }
1537  }
1538
1539  /**
1540   * Get statistics about the reads which this DFSInputStream has done.
1541   */
1542  public synchronized ReadStatistics getReadStatistics() {
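    // Return a copy so callers cannot mutate the stream's internal counters.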
1543    return new ReadStatistics(readStatistics);
1544  }
1545
1546  public synchronized FileEncryptionInfo getFileEncryptionInfo() {
1547    return fileEncryptionInfo;
1548  }
1549
1550  private synchronized void closeCurrentBlockReader() {
1551    if (blockReader == null) return;
1552    // Close the current block reader so that the new caching settings can 
1553    // take effect immediately.
1554    try {
1555      blockReader.close();
1556    } catch (IOException e) {
1557      DFSClient.LOG.error("error closing blockReader", e);
1558    }
1559    blockReader = null;
1560  }
1561
1562  @Override
1563  public synchronized void setReadahead(Long readahead)
1564      throws IOException {
1565    this.cachingStrategy =
1566        new CachingStrategy.Builder(this.cachingStrategy).
1567            setReadahead(readahead).build();
1568    closeCurrentBlockReader();
1569  }
1570
1571  @Override
1572  public synchronized void setDropBehind(Boolean dropBehind)
1573      throws IOException {
1574    this.cachingStrategy =
1575        new CachingStrategy.Builder(this.cachingStrategy).
1576            setDropBehind(dropBehind).build();
1577    closeCurrentBlockReader();
1578  }
1579
1580  /**
1581   * The immutable empty buffer we return when we reach EOF when doing a
1582   * zero-copy read.
1583   */
1584  private static final ByteBuffer EMPTY_BUFFER =
1585    ByteBuffer.allocateDirect(0).asReadOnlyBuffer();
1586
1587  @Override
1588  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
1589      int maxLength, EnumSet<ReadOption> opts) 
1590          throws IOException, UnsupportedOperationException {
1591    if (maxLength == 0) {
1592      return EMPTY_BUFFER;
1593    } else if (maxLength < 0) {
1594      throw new IllegalArgumentException("can't read a negative " +
1595          "number of bytes.");
1596    }
1597    if ((blockReader == null) || (blockEnd == -1)) {
1598      if (pos >= getFileLength()) {
1599        return null;
1600      }
1601      /*
1602       * If we don't have a blockReader, or the one we have has no more bytes
1603       * left to read, we call seekToBlockSource to get a new blockReader and
1604       * recalculate blockEnd.  Note that we assume we're not at EOF here
1605       * (we check this above).
1606       */
1607      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
1608        throw new IOException("failed to allocate new BlockReader " +
1609            "at position " + pos);
1610      }
1611    }
1612    ByteBuffer buffer = null;
1613    if (dfsClient.getConf().shortCircuitMmapEnabled) {
1614      buffer = tryReadZeroCopy(maxLength, opts);
1615    }
1616    if (buffer != null) {
1617      return buffer;
1618    }
1619    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
1620    if (buffer != null) {
1621      extendedReadBuffers.put(buffer, bufferPool);
1622    }
1623    return buffer;
1624  }
1625
1626  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
1627      EnumSet<ReadOption> opts) throws IOException {
1628    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
1629    // JVM to optimize this function.
1630    final long curPos = pos;
1631    final long curEnd = blockEnd;
1632    final long blockStartInFile = currentLocatedBlock.getStartOffset();
1633    final long blockPos = curPos - blockStartInFile;
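    // blockPos is the read position relative to the start of the current
    // block; it becomes the offset into the memory-mapped region below.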
1634
1635    // Shorten this read if the end of the block is nearby.
1636    long length63;
1637    if ((curPos + maxLength) <= (curEnd + 1)) {
1638      length63 = maxLength;
1639    } else {
1640      length63 = 1 + curEnd - curPos;
1641      if (length63 <= 0) {
1642        if (DFSClient.LOG.isDebugEnabled()) {
1643          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1644            curPos + " of " + src + "; " + length63 + " bytes left in block.  " +
1645            "blockPos=" + blockPos + "; curPos=" + curPos +
1646            "; curEnd=" + curEnd);
1647        }
1648        return null;
1649      }
1650      if (DFSClient.LOG.isDebugEnabled()) {
1651        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1652            " to " + length63 + " to avoid going more than one byte " +
1653            "past the end of the block.  blockPos=" + blockPos +
1654            "; curPos=" + curPos + "; curEnd=" + curEnd);
1655      }
1656    }
    // Make sure we don't go beyond 31-bit offsets in the MappedByteBuffer.
1658    int length;
1659    if (blockPos + length63 <= Integer.MAX_VALUE) {
1660      length = (int)length63;
1661    } else {
1662      long length31 = Integer.MAX_VALUE - blockPos;
1663      if (length31 <= 0) {
1664        // Java ByteBuffers can't be longer than 2 GB, because they use
1665        // 4-byte signed integers to represent capacity, etc.
1666        // So we can't mmap the parts of the block higher than the 2 GB offset.
1667        // FIXME: we could work around this with multiple memory maps.
1668        // See HDFS-5101.
1669        if (DFSClient.LOG.isDebugEnabled()) {
1670          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1671            curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
1672            "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
1673        }
1674        return null;
1675      }
1676      length = (int)length31;
1677      if (DFSClient.LOG.isDebugEnabled()) {
1678        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1679            " to " + length + " to avoid 31-bit limit.  " +
1680            "blockPos=" + blockPos + "; curPos=" + curPos +
1681            "; curEnd=" + curEnd);
1682      }
1683    }
1684    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
1685    if (clientMmap == null) {
1686      if (DFSClient.LOG.isDebugEnabled()) {
1687        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1688          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
1689          "null.");
1690      }
1691      return null;
1692    }
1693    boolean success = false;
1694    ByteBuffer buffer;
1695    try {
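      // Advance the stream position past the mapped region, then hand out a
      // read-only slice of the mmap covering [blockPos, blockPos + length).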
1696      seek(curPos + length);
1697      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
1698      buffer.position((int)blockPos);
1699      buffer.limit((int)(blockPos + length));
1700      extendedReadBuffers.put(buffer, clientMmap);
1701      readStatistics.addZeroCopyBytes(length);
1702      if (DFSClient.LOG.isDebugEnabled()) {
1703        DFSClient.LOG.debug("readZeroCopy read " + length + 
1704            " bytes from offset " + curPos + " via the zero-copy read " +
1705            "path.  blockEnd = " + blockEnd);
1706      }
1707      success = true;
1708    } finally {
1709      if (!success) {
1710        IOUtils.closeQuietly(clientMmap);
1711      }
1712    }
1713    return buffer;
1714  }
1715
1716  @Override
1717  public synchronized void releaseBuffer(ByteBuffer buffer) {
1718    if (buffer == EMPTY_BUFFER) return;
1719    Object val = extendedReadBuffers.remove(buffer);
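    // The stored value records how this buffer was obtained: a ClientMmap for
    // zero-copy reads, or the ByteBufferPool it was drawn from for fallback
    // reads.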
1720    if (val == null) {
1721      throw new IllegalArgumentException("tried to release a buffer " +
1722          "that was not created by this stream, " + buffer);
1723    }
1724    if (val instanceof ClientMmap) {
1725      IOUtils.closeQuietly((ClientMmap)val);
1726    } else if (val instanceof ByteBufferPool) {
1727      ((ByteBufferPool)val).putBuffer(buffer);
1728    }
1729  }
1730}