001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.EOFException;
021import java.io.IOException;
022import java.net.InetSocketAddress;
023import java.nio.ByteBuffer;
024import java.util.AbstractMap;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collection;
028import java.util.EnumSet;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.Iterator;
032import java.util.List;
033import java.util.Map;
034import java.util.Map.Entry;
035import java.util.Set;
036import java.util.concurrent.Callable;
037import java.util.concurrent.CancellationException;
038import java.util.concurrent.CompletionService;
039import java.util.concurrent.ConcurrentHashMap;
040import java.util.concurrent.ExecutionException;
041import java.util.concurrent.ExecutorCompletionService;
042import java.util.concurrent.Future;
043import java.util.concurrent.TimeUnit;
044import java.util.concurrent.atomic.AtomicBoolean;
045
046import org.apache.commons.io.IOUtils;
047import org.apache.hadoop.classification.InterfaceAudience;
048import org.apache.hadoop.fs.ByteBufferReadable;
049import org.apache.hadoop.fs.ByteBufferUtil;
050import org.apache.hadoop.fs.CanSetDropBehind;
051import org.apache.hadoop.fs.CanSetReadahead;
052import org.apache.hadoop.fs.CanUnbuffer;
053import org.apache.hadoop.fs.ChecksumException;
054import org.apache.hadoop.fs.FSInputStream;
055import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
056import org.apache.hadoop.fs.ReadOption;
057import org.apache.hadoop.fs.StorageType;
058import org.apache.hadoop.fs.UnresolvedLinkException;
059import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
060import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
061import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
062import org.apache.hadoop.fs.FileEncryptionInfo;
063import org.apache.hadoop.hdfs.protocol.LocatedBlock;
064import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
065import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
066import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
067import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
068import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
069import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
070import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
071import org.apache.hadoop.io.ByteBufferPool;
072import org.apache.hadoop.ipc.RPC;
073import org.apache.hadoop.ipc.RemoteException;
074import org.apache.hadoop.net.NetUtils;
075import org.apache.hadoop.security.token.SecretManager.InvalidToken;
076import org.apache.hadoop.security.token.Token;
077import org.apache.hadoop.util.IdentityHashStore;
078import org.apache.htrace.Span;
079import org.apache.htrace.Trace;
080import org.apache.htrace.TraceScope;
081
082import com.google.common.annotations.VisibleForTesting;
083
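/*
 * Typical usage (a sketch; callers normally obtain an instance through
 * DFSClient#open(String) or FileSystem#open(Path) rather than constructing
 * one directly):
 *
 *   DFSInputStream in = dfsClient.open("/path/in/hdfs");
 *   byte[] buf = new byte[4096];
 *   int nRead = in.read(buf, 0, buf.length);        // stateful read
 *   int nPread = in.read(0L, buf, 0, buf.length);   // positional read (pread)
 *   in.close();
 */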
084/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and various datanodes as necessary.
087 ****************************************************************/
088@InterfaceAudience.Private
089public class DFSInputStream extends FSInputStream
090implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
091    HasEnhancedByteBufferAccess, CanUnbuffer {
092  @VisibleForTesting
093  public static boolean tcpReadsDisabledForTesting = false;
094  private long hedgedReadOpsLoopNumForTesting = 0;
095  private final DFSClient dfsClient;
096  private AtomicBoolean closed = new AtomicBoolean(false);
097  private final String src;
098  private final boolean verifyChecksum;
099
  // state used by stateful read only:
  // (protected by lock on this)
  ////
103  private DatanodeInfo currentNode = null;
104  private LocatedBlock currentLocatedBlock = null;
105  private long pos = 0;
106  private long blockEnd = -1;
107  private BlockReader blockReader = null;
108  ////
109
110  // state shared by stateful and positional read:
111  // (protected by lock on infoLock)
112  ////
113  private LocatedBlocks locatedBlocks = null;
114  private long lastBlockBeingWrittenLength = 0;
115  private FileEncryptionInfo fileEncryptionInfo = null;
116  private CachingStrategy cachingStrategy;
117  ////
118
119  private final ReadStatistics readStatistics = new ReadStatistics();
120  // lock for state shared between read and pread
121  // Note: Never acquire a lock on <this> with this lock held to avoid deadlocks
122  //       (it's OK to acquire this lock when the lock on <this> is held)
123  private final Object infoLock = new Object();
124
125  /**
126   * Track the ByteBuffers that we have handed out to readers.
127   * 
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
130   */
131  private IdentityHashStore<ByteBuffer, Object> extendedReadBuffers;
132
133  private synchronized IdentityHashStore<ByteBuffer, Object>
134        getExtendedReadBuffers() {
135    if (extendedReadBuffers == null) {
136      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
137    }
138    return extendedReadBuffers;
139  }
140
141  public static class ReadStatistics {
142    public ReadStatistics() {
143      clear();
144    }
145
146    public ReadStatistics(ReadStatistics rhs) {
147      this.totalBytesRead = rhs.getTotalBytesRead();
148      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
149      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
150      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
151    }
152
153    /**
154     * @return The total bytes read.  This will always be at least as
155     * high as the other numbers, since it includes all of them.
156     */
157    public long getTotalBytesRead() {
158      return totalBytesRead;
159    }
160
161    /**
162     * @return The total local bytes read.  This will always be at least
163     * as high as totalShortCircuitBytesRead, since all short-circuit
164     * reads are also local.
165     */
166    public long getTotalLocalBytesRead() {
167      return totalLocalBytesRead;
168    }
169
170    /**
171     * @return The total short-circuit local bytes read.
172     */
173    public long getTotalShortCircuitBytesRead() {
174      return totalShortCircuitBytesRead;
175    }
176    
177    /**
178     * @return The total number of zero-copy bytes read.
179     */
180    public long getTotalZeroCopyBytesRead() {
181      return totalZeroCopyBytesRead;
182    }
183
184    /**
185     * @return The total number of bytes read which were not local.
186     */
187    public long getRemoteBytesRead() {
188      return totalBytesRead - totalLocalBytesRead;
189    }
190    
191    void addRemoteBytes(long amt) {
192      this.totalBytesRead += amt;
193    }
194
195    void addLocalBytes(long amt) {
196      this.totalBytesRead += amt;
197      this.totalLocalBytesRead += amt;
198    }
199
200    void addShortCircuitBytes(long amt) {
201      this.totalBytesRead += amt;
202      this.totalLocalBytesRead += amt;
203      this.totalShortCircuitBytesRead += amt;
204    }
205
206    void addZeroCopyBytes(long amt) {
207      this.totalBytesRead += amt;
208      this.totalLocalBytesRead += amt;
209      this.totalShortCircuitBytesRead += amt;
210      this.totalZeroCopyBytesRead += amt;
211    }
212
213    void clear() {
214      this.totalBytesRead = 0;
215      this.totalLocalBytesRead = 0;
216      this.totalShortCircuitBytesRead = 0;
217      this.totalZeroCopyBytesRead = 0;
218    }
219    
220    private long totalBytesRead;
221
222    private long totalLocalBytesRead;
223
224    private long totalShortCircuitBytesRead;
225
226    private long totalZeroCopyBytesRead;
227  }
228  
229  /**
230   * This variable tracks the number of failures since the start of the
231   * most recent user-facing operation. That is to say, it should be reset
232   * whenever the user makes a call on this stream, and if at any point
233   * during the retry logic, the failure count exceeds a threshold,
234   * the errors will be thrown back to the operation.
235   *
   * Specifically, this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
239   */
240  private int failures = 0;
241
  /* XXX Use of ConcurrentHashMap is a temporary fix. Parallel accesses to
   * DFSInputStream (through preads) need to be handled properly. */
244  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
245             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
246
247  private byte[] oneByteBuf; // used for 'int read()'
248
249  void addToDeadNodes(DatanodeInfo dnInfo) {
250    deadNodes.put(dnInfo, dnInfo);
251  }
252  
253  DFSInputStream(DFSClient dfsClient, String src, boolean verifyChecksum
254                 ) throws IOException, UnresolvedLinkException {
255    this.dfsClient = dfsClient;
256    this.verifyChecksum = verifyChecksum;
257    this.src = src;
258    synchronized (infoLock) {
259      this.cachingStrategy = dfsClient.getDefaultReadCachingStrategy();
260    }
261    openInfo();
262  }
263
264  /**
265   * Grab the open-file info from namenode
266   */
267  void openInfo() throws IOException, UnresolvedLinkException {
268    synchronized(infoLock) {
269      lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
270      int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
271      while (retriesForLastBlockLength > 0) {
        // Getting the last block length as -1 is a special case. When the
        // cluster restarts, DNs may not report blocks immediately, so the
        // locations of the partial last block may not yet be available from
        // the NN for computing its length. Retry a limited number of times
        // (retryTimesForGetLastBlockLength) to get the length.
276        if (lastBlockBeingWrittenLength == -1) {
277          DFSClient.LOG.warn("Last block locations not available. "
278              + "Datanodes might not have reported blocks completely."
279              + " Will retry for " + retriesForLastBlockLength + " times");
280          waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
281          lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
282        } else {
283          break;
284        }
285        retriesForLastBlockLength--;
286      }
287      if (retriesForLastBlockLength == 0) {
288        throw new IOException("Could not obtain the last block locations.");
289      }
290    }
291  }
292
293  private void waitFor(int waitTime) throws IOException {
294    try {
295      Thread.sleep(waitTime);
296    } catch (InterruptedException e) {
297      throw new IOException(
298          "Interrupted while getting the last block length.");
299    }
300  }
301
302  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
303    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
304    if (DFSClient.LOG.isDebugEnabled()) {
305      DFSClient.LOG.debug("newInfo = " + newInfo);
306    }
307    if (newInfo == null) {
308      throw new IOException("Cannot open filename " + src);
309    }
310
311    if (locatedBlocks != null) {
312      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
313      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
314      while (oldIter.hasNext() && newIter.hasNext()) {
315        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
316          throw new IOException("Blocklist for " + src + " has changed!");
317        }
318      }
319    }
320    locatedBlocks = newInfo;
321    long lastBlockBeingWrittenLength = 0;
322    if (!locatedBlocks.isLastBlockComplete()) {
323      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
324      if (last != null) {
325        if (last.getLocations().length == 0) {
326          if (last.getBlockSize() == 0) {
327            // if the length is zero, then no data has been written to
328            // datanode. So no need to wait for the locations.
329            return 0;
330          }
331          return -1;
332        }
333        final long len = readBlockLength(last);
334        last.getBlock().setNumBytes(len);
335        lastBlockBeingWrittenLength = len; 
336      }
337    }
338
339    fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo();
340
341    return lastBlockBeingWrittenLength;
342  }
343
344  /** Read the block length from one of the datanodes. */
345  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
346    assert locatedblock != null : "LocatedBlock cannot be null";
347    int replicaNotFoundCount = locatedblock.getLocations().length;
348    
349    for(DatanodeInfo datanode : locatedblock.getLocations()) {
350      ClientDatanodeProtocol cdp = null;
351      
352      try {
353        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
354            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
355            dfsClient.getConf().connectToDnViaHostname, locatedblock);
356        
357        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
358        
359        if (n >= 0) {
360          return n;
361        }
362      }
363      catch(IOException ioe) {
364        if (ioe instanceof RemoteException &&
365          (((RemoteException) ioe).unwrapRemoteException() instanceof
366            ReplicaNotFoundException)) {
367          // special case : replica might not be on the DN, treat as 0 length
368          replicaNotFoundCount--;
369        }
370        
371        if (DFSClient.LOG.isDebugEnabled()) {
372          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
373              + datanode + " for block " + locatedblock.getBlock(), ioe);
374        }
375      } finally {
376        if (cdp != null) {
377          RPC.stopProxy(cdp);
378        }
379      }
380    }
381
    // The namenode told us about these locations, but none of them know about
    // the replica. That means we hit the race between pipeline creation start
    // and end. We require every location to report ReplicaNotFoundException,
    // because some other exception could have happened on a DN that does have
    // the replica, and we want to report that error instead.
386    if (replicaNotFoundCount == 0) {
387      return 0;
388    }
389
390    throw new IOException("Cannot obtain block length for " + locatedblock);
391  }
392  
393  public long getFileLength() {
394    synchronized(infoLock) {
395      return locatedBlocks == null? 0:
396          locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
397    }
398  }
399
400  // Short circuit local reads are forbidden for files that are
401  // under construction.  See HDFS-2757.
402  boolean shortCircuitForbidden() {
403    synchronized(infoLock) {
404      return locatedBlocks.isUnderConstruction();
405    }
406  }
407
408  /**
409   * Returns the datanode from which the stream is currently reading.
410   */
411  public synchronized DatanodeInfo getCurrentDatanode() {
412    return currentNode;
413  }
414
415  /**
416   * Returns the block containing the target position. 
417   */
418  synchronized public ExtendedBlock getCurrentBlock() {
419    if (currentLocatedBlock == null){
420      return null;
421    }
422    return currentLocatedBlock.getBlock();
423  }
424
425  /**
   * Return the collection of blocks that have already been located.
427   */
428  public List<LocatedBlock> getAllBlocks() throws IOException {
429    return getBlockRange(0, getFileLength());
430  }
431
432  /**
433   * Get block at the specified position.
434   * Fetch it from the namenode if not cached.
435   * 
436   * @param offset block corresponding to this offset in file is returned
437   * @return located block
438   * @throws IOException
439   */
440  private LocatedBlock getBlockAt(long offset) throws IOException {
441    synchronized(infoLock) {
442      assert (locatedBlocks != null) : "locatedBlocks is null";
443
444      final LocatedBlock blk;
445
446      //check offset
447      if (offset < 0 || offset >= getFileLength()) {
448        throw new IOException("offset < 0 || offset >= getFileLength(), offset="
449            + offset
450            + ", locatedBlocks=" + locatedBlocks);
451      }
452      else if (offset >= locatedBlocks.getFileLength()) {
453        // offset to the portion of the last block,
454        // which is not known to the name-node yet;
455        // getting the last block
456        blk = locatedBlocks.getLastLocatedBlock();
457      }
458      else {
459        // search cached blocks first
460        int targetBlockIdx = locatedBlocks.findBlock(offset);
461        if (targetBlockIdx < 0) { // block is not cached
462          targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
463          // fetch more blocks
464          final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
465          assert (newBlocks != null) : "Could not find target position " + offset;
466          locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
467        }
468        blk = locatedBlocks.get(targetBlockIdx);
469      }
470      return blk;
471    }
472  }
473
474  /** Fetch a block from namenode and cache it */
475  private void fetchBlockAt(long offset) throws IOException {
476    synchronized(infoLock) {
477      int targetBlockIdx = locatedBlocks.findBlock(offset);
478      if (targetBlockIdx < 0) { // block is not cached
479        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
480      }
481      // fetch blocks
482      final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
483      if (newBlocks == null) {
484        throw new IOException("Could not find target position " + offset);
485      }
486      locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
487    }
488  }
489
490  /**
491   * Get blocks in the specified range.
492   * Fetch them from the namenode if not cached. This function
   * will not serve a read request beyond the EOF.
494   * @param offset starting offset in file
495   * @param length length of data
   * @return consecutive segment of located blocks
497   * @throws IOException
498   */
499  private List<LocatedBlock> getBlockRange(long offset,
500      long length)  throws IOException {
501    // getFileLength(): returns total file length
502    // locatedBlocks.getFileLength(): returns length of completed blocks
503    if (offset >= getFileLength()) {
504      throw new IOException("Offset: " + offset +
505        " exceeds file length: " + getFileLength());
506    }
507    synchronized(infoLock) {
508      final List<LocatedBlock> blocks;
509      final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
510      final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
511      final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
512
513      if (readOffsetWithinCompleteBlk) {
514        //get the blocks of finalized (completed) block range
515        blocks = getFinalizedBlockRange(offset,
516          Math.min(length, lengthOfCompleteBlk - offset));
517      } else {
518        blocks = new ArrayList<LocatedBlock>(1);
519      }
520
521      // get the blocks from incomplete block range
522      if (readLengthPastCompleteBlk) {
523         blocks.add(locatedBlocks.getLastLocatedBlock());
524      }
525
526      return blocks;
527    }
528  }
529
530  /**
531   * Get blocks in the specified range.
532   * Includes only the complete blocks.
533   * Fetch them from the namenode if not cached.
534   */
535  private List<LocatedBlock> getFinalizedBlockRange(
536      long offset, long length) throws IOException {
537    synchronized(infoLock) {
538      assert (locatedBlocks != null) : "locatedBlocks is null";
539      List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
540      // search cached blocks first
541      int blockIdx = locatedBlocks.findBlock(offset);
542      if (blockIdx < 0) { // block is not cached
543        blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
544      }
545      long remaining = length;
546      long curOff = offset;
547      while(remaining > 0) {
548        LocatedBlock blk = null;
549        if(blockIdx < locatedBlocks.locatedBlockCount())
550          blk = locatedBlocks.get(blockIdx);
551        if (blk == null || curOff < blk.getStartOffset()) {
552          LocatedBlocks newBlocks;
553          newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
554          locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
555          continue;
556        }
557        assert curOff >= blk.getStartOffset() : "Block not found";
558        blockRange.add(blk);
559        long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
560        remaining -= bytesRead;
561        curOff += bytesRead;
562        blockIdx++;
563      }
564      return blockRange;
565    }
566  }
567
568  /**
   * Open a BlockReader to a DataNode so that the target block can be read.
   * The block ID and the locations of its replicas are obtained from the
   * namenode.
571   */
572  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
573    if (target >= getFileLength()) {
574      throw new IOException("Attempted to read past end of file");
575    }
576
577    // Will be getting a new BlockReader.
578    closeCurrentBlockReader();
579
580    //
581    // Connect to best DataNode for desired Block, with potential offset
582    //
583    DatanodeInfo chosenNode = null;
584    int refetchToken = 1; // only need to get a new access token once
585    int refetchEncryptionKey = 1; // only need to get a new encryption key once
586    
587    boolean connectFailedOnce = false;
588
589    while (true) {
590      //
591      // Compute desired block
592      //
593      LocatedBlock targetBlock = getBlockAt(target);
594
595      // update current position
596      this.pos = target;
597      this.blockEnd = targetBlock.getStartOffset() +
598            targetBlock.getBlockSize() - 1;
599      this.currentLocatedBlock = targetBlock;
600
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
602      long offsetIntoBlock = target - targetBlock.getStartOffset();
603
604      DNAddrPair retval = chooseDataNode(targetBlock, null);
605      chosenNode = retval.info;
606      InetSocketAddress targetAddr = retval.addr;
607      StorageType storageType = retval.storageType;
608
609      try {
610        ExtendedBlock blk = targetBlock.getBlock();
611        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
612        CachingStrategy curCachingStrategy;
613        boolean shortCircuitForbidden;
614        synchronized(infoLock) {
615          curCachingStrategy = cachingStrategy;
616          shortCircuitForbidden = shortCircuitForbidden();
617        }
618        blockReader = new BlockReaderFactory(dfsClient.getConf()).
619            setInetSocketAddress(targetAddr).
620            setRemotePeerFactory(dfsClient).
621            setDatanodeInfo(chosenNode).
622            setStorageType(storageType).
623            setFileName(src).
624            setBlock(blk).
625            setBlockToken(accessToken).
626            setStartOffset(offsetIntoBlock).
627            setVerifyChecksum(verifyChecksum).
628            setClientName(dfsClient.clientName).
629            setLength(blk.getNumBytes() - offsetIntoBlock).
630            setCachingStrategy(curCachingStrategy).
631            setAllowShortCircuitLocalReads(!shortCircuitForbidden).
632            setClientCacheContext(dfsClient.getClientContext()).
633            setUserGroupInformation(dfsClient.ugi).
634            setConfiguration(dfsClient.getConfiguration()).
635            build();
636        if(connectFailedOnce) {
637          DFSClient.LOG.info("Successfully connected to " + targetAddr +
638                             " for " + blk);
639        }
640        return chosenNode;
641      } catch (IOException ex) {
642        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
643          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
644              + "encryption key was invalid when connecting to " + targetAddr
645              + " : " + ex);
646          // The encryption key used is invalid.
647          refetchEncryptionKey--;
648          dfsClient.clearDataEncryptionKey();
649        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
650          refetchToken--;
651          fetchBlockAt(target);
652        } else {
653          connectFailedOnce = true;
654          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
655            + ", add to deadNodes and continue. " + ex, ex);
656          // Put chosen node into dead list, continue
657          addToDeadNodes(chosenNode);
658        }
659      }
660    }
661  }
662
663  /**
664   * Close it down!
665   */
666  @Override
667  public synchronized void close() throws IOException {
668    if (!closed.compareAndSet(false, true)) {
669      DFSClient.LOG.warn("DFSInputStream has been closed already");
670      return;
671    }
672    dfsClient.checkOpen();
673
674    if ((extendedReadBuffers != null) && (!extendedReadBuffers.isEmpty())) {
675      final StringBuilder builder = new StringBuilder();
676      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
677        private String prefix = "";
678        @Override
679        public void accept(ByteBuffer k, Object v) {
680          builder.append(prefix).append(k);
681          prefix = ", ";
682        }
683      });
684      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
685          "unreleased ByteBuffers allocated by read().  " +
686          "Please release " + builder.toString() + ".");
687    }
688    closeCurrentBlockReader();
689    super.close();
690  }
691
692  @Override
693  public synchronized int read() throws IOException {
694    if (oneByteBuf == null) {
695      oneByteBuf = new byte[1];
696    }
697    int ret = read( oneByteBuf, 0, 1 );
698    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
699  }
700
701  /**
702   * Wraps different possible read implementations so that readBuffer can be
703   * strategy-agnostic.
704   */
705  private interface ReaderStrategy {
706    public int doRead(BlockReader blockReader, int off, int len)
707        throws ChecksumException, IOException;
708  }
709
710  private void updateReadStatistics(ReadStatistics readStatistics, 
711        int nRead, BlockReader blockReader) {
712    if (nRead <= 0) return;
713    synchronized(infoLock) {
714      if (blockReader.isShortCircuit()) {
715        readStatistics.addShortCircuitBytes(nRead);
716      } else if (blockReader.isLocal()) {
717        readStatistics.addLocalBytes(nRead);
718      } else {
719        readStatistics.addRemoteBytes(nRead);
720      }
721    }
722  }
723  
724  /**
725   * Used to read bytes into a byte[]
726   */
727  private class ByteArrayStrategy implements ReaderStrategy {
728    final byte[] buf;
729
730    public ByteArrayStrategy(byte[] buf) {
731      this.buf = buf;
732    }
733
734    @Override
735    public int doRead(BlockReader blockReader, int off, int len)
736          throws ChecksumException, IOException {
737      int nRead = blockReader.read(buf, off, len);
738      updateReadStatistics(readStatistics, nRead, blockReader);
739      return nRead;
740    }
741  }
742
743  /**
744   * Used to read bytes into a user-supplied ByteBuffer
745   */
746  private class ByteBufferStrategy implements ReaderStrategy {
747    final ByteBuffer buf;
748    ByteBufferStrategy(ByteBuffer buf) {
749      this.buf = buf;
750    }
751
752    @Override
753    public int doRead(BlockReader blockReader, int off, int len)
754        throws ChecksumException, IOException {
755      int oldpos = buf.position();
756      int oldlimit = buf.limit();
757      boolean success = false;
758      try {
759        int ret = blockReader.read(buf);
760        success = true;
761        updateReadStatistics(readStatistics, ret, blockReader);
762        return ret;
763      } finally {
764        if (!success) {
765          // Reset to original state so that retries work correctly.
766          buf.position(oldpos);
767          buf.limit(oldlimit);
768        }
769      } 
770    }
771  }
772
  /* This is used by regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
777  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
778      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
779      throws IOException {
780    IOException ioe;
781    
    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure of the datanode or the client: the DataNode closing the
     * connection because the client has been idle. If there are other such
     * cases of "non-errors", a datanode might be retried by setting this to
     * true again.
     */
788    boolean retryCurrentNode = true;
789
790    while (true) {
791      // retry as many times as seekToNewSource allows.
792      try {
793        return reader.doRead(blockReader, off, len);
794      } catch ( ChecksumException ce ) {
795        DFSClient.LOG.warn("Found Checksum error for "
796            + getCurrentBlock() + " from " + currentNode
797            + " at " + ce.getPos());        
798        ioe = ce;
799        retryCurrentNode = false;
800        // we want to remember which block replicas we have tried
801        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
802            corruptedBlockMap);
803      } catch ( IOException e ) {
804        if (!retryCurrentNode) {
805          DFSClient.LOG.warn("Exception while reading from "
806              + getCurrentBlock() + " of " + src + " from "
807              + currentNode, e);
808        }
809        ioe = e;
810      }
811      boolean sourceFound = false;
812      if (retryCurrentNode) {
813        /* possibly retry the same node so that transient errors don't
814         * result in application level failures (e.g. Datanode could have
815         * closed the connection because the client is idle for too long).
816         */ 
817        sourceFound = seekToBlockSource(pos);
818      } else {
819        addToDeadNodes(currentNode);
820        sourceFound = seekToNewSource(pos);
821      }
822      if (!sourceFound) {
823        throw ioe;
824      }
825      retryCurrentNode = false;
826    }
827  }
828
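  /**
   * Core of the stateful read path. Positions the stream on the block
   * containing pos (if necessary), reads up to len bytes using the supplied
   * strategy and advances pos. A ChecksumException is rethrown immediately;
   * other IOExceptions mark the current node dead and are retried once.
   */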
829  private synchronized int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
830    dfsClient.checkOpen();
831    if (closed.get()) {
832      throw new IOException("Stream closed");
833    }
834    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
835      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
836    failures = 0;
837    if (pos < getFileLength()) {
838      int retries = 2;
839      while (retries > 0) {
840        try {
841          // currentNode can be left as null if previous read had a checksum
842          // error on the same block. See HDFS-3067
843          if (pos > blockEnd || currentNode == null) {
844            currentNode = blockSeekTo(pos);
845          }
846          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
847          synchronized(infoLock) {
848            if (locatedBlocks.isLastBlockComplete()) {
849              realLen = (int) Math.min(realLen,
850                  locatedBlocks.getFileLength() - pos);
851            }
852          }
853          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
854          
855          if (result >= 0) {
856            pos += result;
857          } else {
            // got an EOS from the reader though we expect more data from it.
859            throw new IOException("Unexpected EOS from the reader");
860          }
861          if (dfsClient.stats != null) {
862            dfsClient.stats.incrementBytesRead(result);
863          }
864          return result;
865        } catch (ChecksumException ce) {
866          throw ce;            
867        } catch (IOException e) {
868          if (retries == 1) {
869            DFSClient.LOG.warn("DFS Read", e);
870          }
871          blockEnd = -1;
872          if (currentNode != null) { addToDeadNodes(currentNode); }
873          if (--retries == 0) {
874            throw e;
875          }
876        } finally {
          // Check if we need to report corrupt block replicas, whether the
          // read was successful or a ChecksumException occurred.
879          reportCheckSumFailure(corruptedBlockMap, 
880              currentLocatedBlock.getLocations().length);
881        }
882      }
883    }
884    return -1;
885  }
886
887  /**
   * Read up to len bytes into the buffer buf, starting at offset off.
889   */
890  @Override
891  public synchronized int read(final byte buf[], int off, int len) throws IOException {
892    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
893    TraceScope scope =
894        dfsClient.getPathTraceScope("DFSInputStream#byteArrayRead", src);
895    try {
896      return readWithStrategy(byteArrayReader, off, len);
897    } finally {
898      scope.close();
899    }
900  }
901
902  @Override
903  public synchronized int read(final ByteBuffer buf) throws IOException {
904    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
905    TraceScope scope =
906        dfsClient.getPathTraceScope("DFSInputStream#byteBufferRead", src);
907    try {
908      return readWithStrategy(byteBufferReader, 0, buf.remaining());
909    } finally {
910      scope.close();
911    }
912  }
913
914
915  /**
916   * Add corrupted block replica into map.
917   */
918  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
919      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
920    Set<DatanodeInfo> dnSet = null;
921    if((corruptedBlockMap.containsKey(blk))) {
922      dnSet = corruptedBlockMap.get(blk);
923    }else {
924      dnSet = new HashSet<DatanodeInfo>();
925    }
926    if (!dnSet.contains(node)) {
927      dnSet.add(node);
928      corruptedBlockMap.put(blk, dnSet);
929    }
930  }
931
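  /**
   * Choose a datanode to read the given block from. Retries with a
   * randomized backoff, clearing the dead-node list and re-fetching block
   * locations from the namenode, until a usable node is found or the number
   * of failures exceeds dfsClient.getMaxBlockAcquireFailures(), in which
   * case a BlockMissingException is thrown.
   */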
932  private DNAddrPair chooseDataNode(LocatedBlock block,
933      Collection<DatanodeInfo> ignoredNodes) throws IOException {
934    while (true) {
935      try {
936        return getBestNodeDNAddrPair(block, ignoredNodes);
937      } catch (IOException ie) {
938        String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
939          deadNodes, ignoredNodes);
940        String blockInfo = block.getBlock() + " file=" + src;
941        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
942          String description = "Could not obtain block: " + blockInfo;
943          DFSClient.LOG.warn(description + errMsg
944              + ". Throwing a BlockMissingException");
945          throw new BlockMissingException(src, description,
946              block.getStartOffset());
947        }
948
949        DatanodeInfo[] nodes = block.getLocations();
950        if (nodes == null || nodes.length == 0) {
951          DFSClient.LOG.info("No node available for " + blockInfo);
952        }
953        DFSClient.LOG.info("Could not obtain " + block.getBlock()
954            + " from any node: " + ie + errMsg
955            + ". Will get new block locations from namenode and retry...");
956        try {
957          // Introducing a random factor to the wait time before another retry.
958          // The wait time is dependent on # of failures and a random factor.
959          // At the first time of getting a BlockMissingException, the wait time
960          // is a random number between 0..3000 ms. If the first retry
961          // still fails, we will wait 3000 ms grace period before the 2nd retry.
962          // Also at the second retry, the waiting window is expanded to 6000 ms
963          // alleviating the request rate from the server. Similarly the 3rd retry
964          // will wait 6000ms grace period before retry and the waiting window is
965          // expanded to 9000ms. 
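          // For example, assuming the default base window of 3000 ms:
          //   failures = 0 -> wait is uniform in [0, 3000) ms
          //   failures = 1 -> wait is 3000 ms + uniform in [0, 6000) ms
          //   failures = 2 -> wait is 6000 ms + uniform in [0, 9000) ms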
966          final int timeWindow = dfsClient.getConf().timeWindow;
967          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
968            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
969          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
970          Thread.sleep((long)waitTime);
971        } catch (InterruptedException iex) {
972        }
973        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
974        openInfo();
975        block = getBlockAt(block.getStartOffset());
976        failures++;
977        continue;
978      }
979    }
980  }
981
982  /**
983   * Get the best node from which to stream the data.
984   * @param block LocatedBlock, containing nodes in priority order.
985   * @param ignoredNodes Do not choose nodes in this array (may be null)
986   * @return The DNAddrPair of the best node.
987   * @throws IOException
988   */
989  private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
990      Collection<DatanodeInfo> ignoredNodes) throws IOException {
991    DatanodeInfo[] nodes = block.getLocations();
992    StorageType[] storageTypes = block.getStorageTypes();
993    DatanodeInfo chosenNode = null;
994    StorageType storageType = null;
995    if (nodes != null) {
996      for (int i = 0; i < nodes.length; i++) {
997        if (!deadNodes.containsKey(nodes[i])
998            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
999          chosenNode = nodes[i];
1000          // Storage types are ordered to correspond with nodes, so use the same
1001          // index to get storage type.
1002          if (storageTypes != null && i < storageTypes.length) {
1003            storageType = storageTypes[i];
1004          }
1005          break;
1006        }
1007      }
1008    }
1009    if (chosenNode == null) {
1010      throw new IOException("No live nodes contain block " + block.getBlock() +
1011          " after checking nodes = " + Arrays.toString(nodes) +
1012          ", ignoredNodes = " + ignoredNodes);
1013    }
1014    final String dnAddr =
1015        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
1016    if (DFSClient.LOG.isDebugEnabled()) {
1017      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
1018    }
1019    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
1020    return new DNAddrPair(chosenNode, targetAddr, storageType);
1021  }
1022
1023  private static String getBestNodeDNAddrPairErrorString(
1024      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
1025      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
1026    StringBuilder errMsgr = new StringBuilder(
1027        " No live nodes contain current block ");
1028    errMsgr.append("Block locations:");
1029    for (DatanodeInfo datanode : nodes) {
1030      errMsgr.append(" ");
1031      errMsgr.append(datanode.toString());
1032    }
1033    errMsgr.append(" Dead nodes: ");
1034    for (DatanodeInfo datanode : deadNodes.keySet()) {
1035      errMsgr.append(" ");
1036      errMsgr.append(datanode.toString());
1037    }
1038    if (ignoredNodes != null) {
1039      errMsgr.append(" Ignored nodes: ");
1040      for (DatanodeInfo datanode : ignoredNodes) {
1041        errMsgr.append(" ");
1042        errMsgr.append(datanode.toString());
1043      }
1044    }
1045    return errMsgr.toString();
1046  }
1047
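  /**
   * Read the byte range [start, end] of the given block into buf at offset,
   * retrying against other datanodes until a read succeeds. Per-node error
   * handling (dead-node tracking, corrupt-replica recording) happens inside
   * actualGetFromOneDataNode(); this loop simply moves on to the next node.
   */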
1048  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
1049      byte[] buf, int offset,
1050      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1051      throws IOException {
1052    block = getBlockAt(block.getStartOffset());
1053    while (true) {
1054      DNAddrPair addressPair = chooseDataNode(block, null);
1055      try {
1056        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
1057            corruptedBlockMap);
1058        return;
1059      } catch (IOException e) {
1060        // Ignore. Already processed inside the function.
1061        // Loop through to try the next node.
1062      }
1063    }
1064  }
1065
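  /**
   * Wrap a single-datanode read of the byte range [start, end] in a Callable
   * so it can be submitted to the hedged read thread pool. The bytes are
   * read into the array backing the supplied ByteBuffer, starting at its
   * current position.
   */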
1066  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
1067      final LocatedBlock block, final long start, final long end,
1068      final ByteBuffer bb,
1069      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
1070      final int hedgedReadId) {
1071    final Span parentSpan = Trace.currentSpan();
1072    return new Callable<ByteBuffer>() {
1073      @Override
1074      public ByteBuffer call() throws Exception {
1075        byte[] buf = bb.array();
1076        int offset = bb.position();
1077        TraceScope scope =
1078            Trace.startSpan("hedgedRead" + hedgedReadId, parentSpan);
1079        try {
1080          actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
1081              corruptedBlockMap);
1082          return bb;
1083        } finally {
1084          scope.close();
1085        }
1086      }
1087    };
1088  }
1089
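  /**
   * Read the byte range [start, end] of the given block from a single
   * datanode. Refetches the encryption key or block token once if needed;
   * on a checksum error or connection failure the node is added to the
   * dead-node list and an IOException is thrown so the caller can try
   * another node.
   */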
1090  private void actualGetFromOneDataNode(final DNAddrPair datanode,
1091      LocatedBlock block, final long start, final long end, byte[] buf,
1092      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1093      throws IOException {
1094    DFSClientFaultInjector.get().startFetchFromDatanode();
1095    int refetchToken = 1; // only need to get a new access token once
1096    int refetchEncryptionKey = 1; // only need to get a new encryption key once
1097
1098    while (true) {
1099      // cached block locations may have been updated by chooseDataNode()
1100      // or fetchBlockAt(). Always get the latest list of locations at the
1101      // start of the loop.
1102      CachingStrategy curCachingStrategy;
1103      boolean allowShortCircuitLocalReads;
1104      block = getBlockAt(block.getStartOffset());
1105      synchronized(infoLock) {
1106        curCachingStrategy = cachingStrategy;
1107        allowShortCircuitLocalReads = !shortCircuitForbidden();
1108      }
1109      DatanodeInfo chosenNode = datanode.info;
1110      InetSocketAddress targetAddr = datanode.addr;
1111      StorageType storageType = datanode.storageType;
1112      BlockReader reader = null;
1113
1114      try {
1115        DFSClientFaultInjector.get().fetchFromDatanodeException();
1116        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
1117        int len = (int) (end - start + 1);
1118        reader = new BlockReaderFactory(dfsClient.getConf()).
1119            setInetSocketAddress(targetAddr).
1120            setRemotePeerFactory(dfsClient).
1121            setDatanodeInfo(chosenNode).
1122            setStorageType(storageType).
1123            setFileName(src).
1124            setBlock(block.getBlock()).
1125            setBlockToken(blockToken).
1126            setStartOffset(start).
1127            setVerifyChecksum(verifyChecksum).
1128            setClientName(dfsClient.clientName).
1129            setLength(len).
1130            setCachingStrategy(curCachingStrategy).
1131            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
1132            setClientCacheContext(dfsClient.getClientContext()).
1133            setUserGroupInformation(dfsClient.ugi).
1134            setConfiguration(dfsClient.getConfiguration()).
1135            build();
1136        int nread = reader.readAll(buf, offset, len);
1137        updateReadStatistics(readStatistics, nread, reader);
1138
1139        if (nread != len) {
1140          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
1142        }
1143        DFSClientFaultInjector.get().readFromDatanodeDelay();
1144        return;
1145      } catch (ChecksumException e) {
1146        String msg = "fetchBlockByteRange(). Got a checksum exception for "
1147            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
1148            + chosenNode;
1149        DFSClient.LOG.warn(msg);
1150        // we want to remember what we have tried
1151        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
1152        addToDeadNodes(chosenNode);
1153        throw new IOException(msg);
1154      } catch (IOException e) {
1155        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
1156          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
1157              + "encryption key was invalid when connecting to " + targetAddr
1158              + " : " + e);
1159          // The encryption key used is invalid.
1160          refetchEncryptionKey--;
1161          dfsClient.clearDataEncryptionKey();
1162          continue;
1163        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
1164          refetchToken--;
1165          try {
1166            fetchBlockAt(block.getStartOffset());
1167          } catch (IOException fbae) {
1168            // ignore IOE, since we can retry it later in a loop
1169          }
1170          continue;
1171        } else {
1172          String msg = "Failed to connect to " + targetAddr + " for file "
1173              + src + " for block " + block.getBlock() + ":" + e;
1174          DFSClient.LOG.warn("Connection failure: " + msg, e);
1175          addToDeadNodes(chosenNode);
1176          throw new IOException(msg);
1177        }
1178      } finally {
1179        if (reader != null) {
1180          reader.close();
1181        }
1182      }
1183    }
1184  }
1185
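  // Note: hedged reads are an opt-in client feature. They require a hedged
  // read thread pool to be configured on the DFSClient (commonly via
  // dfs.client.hedged.read.threadpool.size) and use the timeout from
  // dfsClient.getHedgedReadTimeout() (commonly
  // dfs.client.hedged.read.threshold.millis) to decide when to spawn the
  // second read; see DFSConfigKeys for the authoritative property names.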
1186  /**
1187   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
1188   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than a configured amount of
   * time.  We then wait on whichever read returns first.
1191   */
1192  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
1193      long end, byte[] buf, int offset,
1194      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1195      throws IOException {
1196    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
1197    CompletionService<ByteBuffer> hedgedService =
1198        new ExecutorCompletionService<ByteBuffer>(
1199        dfsClient.getHedgedReadsThreadPool());
1200    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
1201    ByteBuffer bb = null;
1202    int len = (int) (end - start + 1);
1203    int hedgedReadId = 0;
1204    block = getBlockAt(block.getStartOffset());
1205    while (true) {
1206      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
1207      hedgedReadOpsLoopNumForTesting++;
1208      DNAddrPair chosenNode = null;
1209      // there is no request already executing.
1210      if (futures.isEmpty()) {
        // chooseDataNode is a commitment. If no node can be chosen, we go
        // back to the NN to re-fetch block locations. Only go here on the
        // first read.
1213        chosenNode = chooseDataNode(block, ignored);
1214        bb = ByteBuffer.wrap(buf, offset, len);
1215        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1216            chosenNode, block, start, end, bb, corruptedBlockMap,
1217            hedgedReadId++);
1218        Future<ByteBuffer> firstRequest = hedgedService
1219            .submit(getFromDataNodeCallable);
1220        futures.add(firstRequest);
1221        try {
1222          Future<ByteBuffer> future = hedgedService.poll(
1223              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
1224          if (future != null) {
1225            future.get();
1226            return;
1227          }
1228          if (DFSClient.LOG.isDebugEnabled()) {
1229            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
1230                + "ms to read from " + chosenNode.info
1231                + "; spawning hedged read");
1232          }
1233          // Ignore this node on next go around.
1234          ignored.add(chosenNode.info);
1235          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1236          continue; // no need to refresh block locations
1237        } catch (InterruptedException e) {
1238          // Ignore
1239        } catch (ExecutionException e) {
1240          // Ignore already logged in the call.
1241        }
1242      } else {
1243        // We are starting up a 'hedged' read. We have a read already
1244        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
1245        // If no nodes to do hedged reads against, pass.
1246        try {
1247          try {
1248            chosenNode = getBestNodeDNAddrPair(block, ignored);
1249          } catch (IOException ioe) {
1250            chosenNode = chooseDataNode(block, ignored);
1251          }
1252          bb = ByteBuffer.allocate(len);
1253          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1254              chosenNode, block, start, end, bb, corruptedBlockMap,
1255              hedgedReadId++);
1256          Future<ByteBuffer> oneMoreRequest = hedgedService
1257              .submit(getFromDataNodeCallable);
1258          futures.add(oneMoreRequest);
1259        } catch (IOException ioe) {
1260          if (DFSClient.LOG.isDebugEnabled()) {
1261            DFSClient.LOG.debug("Failed getting node for hedged read: "
1262                + ioe.getMessage());
1263          }
1264        }
        // Whether or not a new hedged request was submitted above, wait for
        // the fastest of the outstanding requests to complete and use its
        // result.
1267        try {
1268          ByteBuffer result = getFirstToComplete(hedgedService, futures);
1269          // cancel the rest.
1270          cancelAll(futures);
1271          if (result.array() != buf) { // compare the array pointers
1272            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
1273            System.arraycopy(result.array(), result.position(), buf, offset,
1274                len);
1275          } else {
1276            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1277          }
1278          return;
1279        } catch (InterruptedException ie) {
1280          // Ignore and retry
1281        }
        // We got here because of an exception. Ignore this node on the next
        // go-around IFF we found a chosenNode to hedge the read against.
1284        if (chosenNode != null && chosenNode.info != null) {
1285          ignored.add(chosenNode.info);
1286        }
1287      }
1288    }
1289  }
1290
1291  @VisibleForTesting
1292  public long getHedgedReadOpsLoopNumForTesting() {
1293    return hedgedReadOpsLoopNumForTesting;
1294  }
1295
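  /**
   * Wait for the first of the outstanding hedged reads to complete and
   * return its buffer. If the completed future failed or was cancelled, it
   * is removed from the list and an InterruptedException is thrown so that
   * the caller retries with the remaining futures.
   */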
1296  private ByteBuffer getFirstToComplete(
1297      CompletionService<ByteBuffer> hedgedService,
1298      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
1299    if (futures.isEmpty()) {
1300      throw new InterruptedException("let's retry");
1301    }
1302    Future<ByteBuffer> future = null;
1303    try {
1304      future = hedgedService.take();
1305      ByteBuffer bb = future.get();
1306      futures.remove(future);
1307      return bb;
1308    } catch (ExecutionException e) {
1309      // already logged in the Callable
1310      futures.remove(future);
1311    } catch (CancellationException ce) {
1312      // already logged in the Callable
1313      futures.remove(future);
1314    }
1315
1316    throw new InterruptedException("let's retry");
1317  }
1318
1319  private void cancelAll(List<Future<ByteBuffer>> futures) {
1320    for (Future<ByteBuffer> future : futures) {
      // Unfortunately, hdfs reads do not take kindly to interruption.
      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid protobufs -- likely because the read
      // is interrupted before it gets the whole protobuf message.  They also
      // produce verbose WARN logging.  So, for now, do not interrupt a
      // running read.
1326      future.cancel(false);
1327    }
1328  }
1329
1330  /**
1331   * Should the block access token be refetched on an exception
1332   * 
1333   * @param ex Exception received
1334   * @param targetAddr Target datanode address from where exception was received
1335   * @return true if block access token has expired or invalid and it should be
1336   *         refetched
1337   */
1338  private static boolean tokenRefetchNeeded(IOException ex,
1339      InetSocketAddress targetAddr) {
1340    /*
1341     * Get a new access token and retry. Retry is needed in 2 cases. 1)
1342     * When both NN and DN re-started while DFSClient holding a cached
1343     * access token. 2) In the case that NN fails to update its
1344     * access key at pre-set interval (by a wide margin) and
1345     * subsequently restarts. In this case, DN re-registers itself with
1346     * NN and receives a new access key, but DN will delete the old
1347     * access key from its memory since it's considered expired based on
1348     * the estimated expiration date.
1349     */
1350    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1351      DFSClient.LOG.info("Access token was invalid when connecting to "
1352          + targetAddr + " : " + ex);
1353      return true;
1354    }
1355    return false;
1356  }
1357
1358  /**
1359   * Read bytes starting from the specified position.
1360   * 
1361   * @param position start read from this position
1362   * @param buffer read buffer
1363   * @param offset offset into buffer
1364   * @param length number of bytes to read
1365   * 
1366   * @return actual number of bytes read
1367   */
1368  @Override
1369  public int read(long position, byte[] buffer, int offset, int length)
1370      throws IOException {
1371    TraceScope scope =
1372        dfsClient.getPathTraceScope("DFSInputStream#byteArrayPread", src);
1373    try {
1374      return pread(position, buffer, offset, length);
1375    } finally {
1376      scope.close();
1377    }
1378  }
1379
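  /**
   * Positional read: maps [position, position + length) onto the located
   * blocks and fetches each block's byte range independently (with hedged
   * reads if enabled), without touching the stateful read cursor.
   */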
1380  private int pread(long position, byte[] buffer, int offset, int length)
1381      throws IOException {
1382    // sanity checks
1383    dfsClient.checkOpen();
1384    if (closed.get()) {
1385      throw new IOException("Stream closed");
1386    }
1387    failures = 0;
1388    long filelen = getFileLength();
1389    if ((position < 0) || (position >= filelen)) {
1390      return -1;
1391    }
1392    int realLen = length;
1393    if ((position + length) > filelen) {
1394      realLen = (int)(filelen - position);
1395    }
1396    
1397    // determine the block and byte range within the block
1398    // corresponding to position and realLen
1399    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
1400    int remaining = realLen;
1401    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
1402      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
1403    for (LocatedBlock blk : blockRange) {
1404      long targetStart = position - blk.getStartOffset();
1405      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
1406      try {
1407        if (dfsClient.isHedgedReadsEnabled()) {
1408          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
1409              - 1, buffer, offset, corruptedBlockMap);
1410        } else {
1411          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
1412              buffer, offset, corruptedBlockMap);
1413        }
1414      } finally {
1415        // Check and report if any block replicas are corrupted.
1416        // BlockMissingException may be caught if all block replicas are
1417        // corrupted.
1418        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
1419      }
1420
1421      remaining -= bytesToRead;
1422      position += bytesToRead;
1423      offset += bytesToRead;
1424    }
1425    assert remaining == 0 : "Wrong number of bytes read.";
1426    if (dfsClient.stats != null) {
1427      dfsClient.stats.incrementBytesRead(realLen);
1428    }
1429    return realLen;
1430  }
1431  
1432  /**
   * DFSInputStream reports a checksum failure.
   * Case I : the client has tried multiple datanodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted blocks
   * to the namenode.
   * Case II: the client has tried all datanodes, but every attempt failed. We
   * only report if the total number of replicas is 1. We do not report
   * otherwise, since the failure may be caused by the client itself being
   * unable to read, rather than by actual replica corruption.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of datanodes that hold the block replicas
1443   */
1444  private void reportCheckSumFailure(
1445      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
1446      int dataNodeCount) {
1447    if (corruptedBlockMap.isEmpty()) {
1448      return;
1449    }
1450    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
1451        .entrySet().iterator();
1452    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
1453    ExtendedBlock blk = entry.getKey();
1454    Set<DatanodeInfo> dnSet = entry.getValue();
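    // Report only when some, but not all, of the replicas failed (Case I), or
    // when the block has a single replica and that replica failed (Case II).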
1455    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
1456        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
1457      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
1458      int i = 0;
1459      for (DatanodeInfo dn:dnSet) {
1460        locs[i++] = dn;
1461      }
1462      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
1463      dfsClient.reportChecksumFailure(src, lblocks);
1464    }
1465    corruptedBlockMap.clear();
1466  }
1467
1468  @Override
1469  public long skip(long n) throws IOException {
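    // A forward skip is implemented as a seek; the distance is clamped so we
    // never seek past the end of the file.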
    if (n > 0) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if (n + curPos > fileLen) {
        n = fileLen - curPos;
      }
      seek(curPos + n);
      return n;
    }
    return n < 0 ? -1 : 0;
1480  }
1481
1482  /**
1483   * Seek to a new arbitrary location
1484   */
1485  @Override
1486  public synchronized void seek(long targetPos) throws IOException {
1487    if (targetPos > getFileLength()) {
1488      throw new EOFException("Cannot seek after EOF");
1489    }
1490    if (targetPos < 0) {
1491      throw new EOFException("Cannot seek to negative offset");
1492    }
1493    if (closed.get()) {
1494      throw new IOException("Stream is closed!");
1495    }
1496    boolean done = false;
1497    if (pos <= targetPos && targetPos <= blockEnd) {
1498      //
1499      // If this seek is to a positive position in the current
1500      // block, and this piece of data might already be lying in
1501      // the TCP buffer, then just eat up the intervening data.
1502      //
1503      int diff = (int)(targetPos - pos);
1504      if (diff <= blockReader.available()) {
1505        try {
1506          pos += blockReader.skip(diff);
1507          if (pos == targetPos) {
1508            done = true;
1509          } else {
1510            // The range was already checked. If the block reader returns
1511            // something unexpected instead of throwing an exception, it is
1512            // most likely a bug. 
1513            String errMsg = "BlockReader failed to seek to " + 
1514                targetPos + ". Instead, it seeked to " + pos + ".";
1515            DFSClient.LOG.warn(errMsg);
1516            throw new IOException(errMsg);
1517          }
        } catch (IOException e) { // let the following read retry
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Exception while seeking to " + targetPos
1521                + " from " + getCurrentBlock() + " of " + src + " from "
1522                + currentNode, e);
1523          }
1524        }
1525      }
1526    }
1527    if (!done) {
1528      pos = targetPos;
1529      blockEnd = -1;
1530    }
1531  }
1532
1533  /**
1534   * Same as {@link #seekToNewSource(long)} except that it does not exclude
1535   * the current datanode and might connect to the same node.
1536   */
1537  private boolean seekToBlockSource(long targetPos)
1538                                                 throws IOException {
1539    currentNode = blockSeekTo(targetPos);
1540    return true;
1541  }
1542  
1543  /**
   * Seek to the given position on a node other than the current node.
   * Returns true if a node other than the current node was found;
   * returns false if no other node could be found.
1547   */
1548  @Override
1549  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
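    // Temporarily mark the current datanode as dead so that blockSeekTo will
    // prefer a different replica, then restore it unless it was already dead.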
1550    boolean markedDead = deadNodes.containsKey(currentNode);
1551    addToDeadNodes(currentNode);
1552    DatanodeInfo oldNode = currentNode;
1553    DatanodeInfo newNode = blockSeekTo(targetPos);
1554    if (!markedDead) {
      /* Remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's OK. */
1557      deadNodes.remove(oldNode);
1558    }
1559    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
1560      currentNode = newNode;
1561      return true;
1562    } else {
1563      return false;
1564    }
1565  }
1566      
  /**
   * Return the current offset of this stream within the file.
   */
1569  @Override
1570  public synchronized long getPos() throws IOException {
1571    return pos;
1572  }
1573
  /** Return the number of remaining available bytes
   * if it is less than or equal to {@link Integer#MAX_VALUE};
   * otherwise, return {@link Integer#MAX_VALUE}.
   */
1578  @Override
1579  public synchronized int available() throws IOException {
1580    if (closed.get()) {
1581      throw new IOException("Stream closed");
1582    }
1583
1584    final long remaining = getFileLength() - pos;
1585    return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
1586  }
1587
1588  /**
1589   * We definitely don't support marks
1590   */
1591  @Override
1592  public boolean markSupported() {
1593    return false;
1594  }
1595  @Override
1596  public void mark(int readLimit) {
1597  }
1598  @Override
1599  public void reset() throws IOException {
1600    throw new IOException("Mark/reset not supported");
1601  }
1602
1603  /** Utility class to encapsulate data node info and its address. */
1604  private static final class DNAddrPair {
1605    final DatanodeInfo info;
1606    final InetSocketAddress addr;
1607    final StorageType storageType;
1608
1609    DNAddrPair(DatanodeInfo info, InetSocketAddress addr,
1610        StorageType storageType) {
1611      this.info = info;
1612      this.addr = addr;
1613      this.storageType = storageType;
1614    }
1615  }
1616
1617  /**
1618   * Get statistics about the reads which this DFSInputStream has done.
1619   */
1620  public ReadStatistics getReadStatistics() {
1621    synchronized(infoLock) {
1622      return new ReadStatistics(readStatistics);
1623    }
1624  }
1625
1626  /**
1627   * Clear statistics about the reads which this DFSInputStream has done.
1628   */
1629  public void clearReadStatistics() {
1630    synchronized(infoLock) {
1631      readStatistics.clear();
1632    }
1633  }
1634
1635  public FileEncryptionInfo getFileEncryptionInfo() {
1636    synchronized(infoLock) {
1637      return fileEncryptionInfo;
1638    }
1639  }
1640
1641  private void closeCurrentBlockReader() {
1642    if (blockReader == null) return;
1643    // Close the current block reader so that the new caching settings can 
1644    // take effect immediately.
1645    try {
1646      blockReader.close();
1647    } catch (IOException e) {
1648      DFSClient.LOG.error("error closing blockReader", e);
1649    }
1650    blockReader = null;
1651    blockEnd = -1;
1652  }
1653
1654  @Override
1655  public synchronized void setReadahead(Long readahead)
1656      throws IOException {
1657    synchronized (infoLock) {
1658      this.cachingStrategy =
1659          new CachingStrategy.Builder(this.cachingStrategy).setReadahead(readahead).build();
1660    }
1661    closeCurrentBlockReader();
1662  }
1663
1664  @Override
1665  public synchronized void setDropBehind(Boolean dropBehind)
1666      throws IOException {
1667    synchronized (infoLock) {
1668      this.cachingStrategy =
1669          new CachingStrategy.Builder(this.cachingStrategy).setDropBehind(dropBehind).build();
1670    }
1671    closeCurrentBlockReader();
1672  }
1673
1674  /**
1675   * The immutable empty buffer we return when we reach EOF when doing a
1676   * zero-copy read.
1677   */
1678  private static final ByteBuffer EMPTY_BUFFER =
1679    ByteBuffer.allocateDirect(0).asReadOnlyBuffer();
1680
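  /**
   * Read up to {@code maxLength} bytes through the enhanced byte-buffer path.
   * If short-circuit mmap reads are enabled and a mapping is available, the
   * returned buffer is a read-only slice of a memory-mapped region of the
   * block; otherwise the call falls back to an ordinary read into a buffer
   * taken from {@code bufferPool}. An empty buffer is returned when
   * {@code maxLength} is 0, and null is returned at EOF. Every non-null buffer
   * must eventually be handed back via {@link #releaseBuffer(ByteBuffer)}.
   *
   * <p>A minimal, illustrative usage sketch (the {@code in} stream is assumed
   * to be an open DFSInputStream supplied by the caller):
   * <pre>{@code
   *   ByteBufferPool pool = new ElasticByteBufferPool();
   *   ByteBuffer buf = in.read(pool, 1024 * 1024, EnumSet.noneOf(ReadOption.class));
   *   if (buf != null) {
   *     // ... consume buf ...
   *     in.releaseBuffer(buf);  // releases the mapping or returns the buffer to the pool
   *   }
   * }</pre>
   */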
1681  @Override
1682  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
1683      int maxLength, EnumSet<ReadOption> opts) 
1684          throws IOException, UnsupportedOperationException {
1685    if (maxLength == 0) {
1686      return EMPTY_BUFFER;
1687    } else if (maxLength < 0) {
1688      throw new IllegalArgumentException("can't read a negative " +
1689          "number of bytes.");
1690    }
1691    if ((blockReader == null) || (blockEnd == -1)) {
1692      if (pos >= getFileLength()) {
1693        return null;
1694      }
1695      /*
1696       * If we don't have a blockReader, or the one we have has no more bytes
1697       * left to read, we call seekToBlockSource to get a new blockReader and
1698       * recalculate blockEnd.  Note that we assume we're not at EOF here
1699       * (we check this above).
1700       */
1701      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
1702        throw new IOException("failed to allocate new BlockReader " +
1703            "at position " + pos);
1704      }
1705    }
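    // Try the zero-copy (mmap) path first when short-circuit mmap reads are
    // enabled; tryReadZeroCopy returns null if the mapping is not possible.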
1706    ByteBuffer buffer = null;
1707    if (dfsClient.getConf().shortCircuitMmapEnabled) {
1708      buffer = tryReadZeroCopy(maxLength, opts);
1709    }
1710    if (buffer != null) {
1711      return buffer;
1712    }
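    // Zero-copy was not possible; fall back to a normal read into a pooled
    // buffer, and remember the pool so releaseBuffer can return the buffer.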
1713    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
1714    if (buffer != null) {
1715      getExtendedReadBuffers().put(buffer, bufferPool);
1716    }
1717    return buffer;
1718  }
1719
1720  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
1721      EnumSet<ReadOption> opts) throws IOException {
1722    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
1723    // JVM to optimize this function.
1724    final long curPos = pos;
1725    final long curEnd = blockEnd;
1726    final long blockStartInFile = currentLocatedBlock.getStartOffset();
1727    final long blockPos = curPos - blockStartInFile;
1728
1729    // Shorten this read if the end of the block is nearby.
1730    long length63;
1731    if ((curPos + maxLength) <= (curEnd + 1)) {
1732      length63 = maxLength;
1733    } else {
1734      length63 = 1 + curEnd - curPos;
1735      if (length63 <= 0) {
1736        if (DFSClient.LOG.isDebugEnabled()) {
1737          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1738            curPos + " of " + src + "; " + length63 + " bytes left in block.  " +
1739            "blockPos=" + blockPos + "; curPos=" + curPos +
1740            "; curEnd=" + curEnd);
1741        }
1742        return null;
1743      }
1744      if (DFSClient.LOG.isDebugEnabled()) {
1745        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1746            " to " + length63 + " to avoid going more than one byte " +
1747            "past the end of the block.  blockPos=" + blockPos +
1748            "; curPos=" + curPos + "; curEnd=" + curEnd);
1749      }
1750    }
    // Make sure that we don't go beyond 31-bit offsets in the MappedByteBuffer.
1752    int length;
1753    if (blockPos + length63 <= Integer.MAX_VALUE) {
1754      length = (int)length63;
1755    } else {
1756      long length31 = Integer.MAX_VALUE - blockPos;
1757      if (length31 <= 0) {
1758        // Java ByteBuffers can't be longer than 2 GB, because they use
1759        // 4-byte signed integers to represent capacity, etc.
1760        // So we can't mmap the parts of the block higher than the 2 GB offset.
1761        // FIXME: we could work around this with multiple memory maps.
1762        // See HDFS-5101.
1763        if (DFSClient.LOG.isDebugEnabled()) {
1764          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1765            curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
1766            "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
1767        }
1768        return null;
1769      }
1770      length = (int)length31;
1771      if (DFSClient.LOG.isDebugEnabled()) {
1772        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1773            " to " + length + " to avoid 31-bit limit.  " +
1774            "blockPos=" + blockPos + "; curPos=" + curPos +
1775            "; curEnd=" + curEnd);
1776      }
1777    }
1778    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
1779    if (clientMmap == null) {
1780      if (DFSClient.LOG.isDebugEnabled()) {
1781        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1782          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
1783          "null.");
1784      }
1785      return null;
1786    }
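    // Advance the stream position past the mapped range, then expose that
    // range as a read-only slice of the mmap. The ClientMmap is tracked so
    // releaseBuffer can release it; on failure it is closed immediately.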
1787    boolean success = false;
1788    ByteBuffer buffer;
1789    try {
1790      seek(curPos + length);
1791      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
1792      buffer.position((int)blockPos);
1793      buffer.limit((int)(blockPos + length));
1794      getExtendedReadBuffers().put(buffer, clientMmap);
1795      synchronized (infoLock) {
1796        readStatistics.addZeroCopyBytes(length);
1797      }
1798      if (DFSClient.LOG.isDebugEnabled()) {
1799        DFSClient.LOG.debug("readZeroCopy read " + length + 
1800            " bytes from offset " + curPos + " via the zero-copy read " +
1801            "path.  blockEnd = " + blockEnd);
1802      }
1803      success = true;
1804    } finally {
1805      if (!success) {
1806        IOUtils.closeQuietly(clientMmap);
1807      }
1808    }
1809    return buffer;
1810  }
1811
1812  @Override
1813  public synchronized void releaseBuffer(ByteBuffer buffer) {
1814    if (buffer == EMPTY_BUFFER) return;
1815    Object val = getExtendedReadBuffers().remove(buffer);
1816    if (val == null) {
1817      throw new IllegalArgumentException("tried to release a buffer " +
1818          "that was not created by this stream, " + buffer);
1819    }
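    // A ClientMmap value means the buffer came from the zero-copy path and the
    // mapping must be released; a ByteBufferPool value means it was a fallback
    // read and the buffer goes back to its pool.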
1820    if (val instanceof ClientMmap) {
1821      IOUtils.closeQuietly((ClientMmap)val);
1822    } else if (val instanceof ByteBufferPool) {
1823      ((ByteBufferPool)val).putBuffer(buffer);
1824    }
1825  }
1826
1827  @Override
1828  public synchronized void unbuffer() {
1829    closeCurrentBlockReader();
1830  }
1831}