001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.EOFException;
021import java.io.IOException;
022import java.net.InetSocketAddress;
023import java.nio.ByteBuffer;
024import java.util.AbstractMap;
025import java.util.ArrayList;
026import java.util.Collection;
027import java.util.EnumSet;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.Iterator;
031import java.util.List;
032import java.util.Map;
033import java.util.Map.Entry;
034import java.util.Set;
035import java.util.concurrent.Callable;
036import java.util.concurrent.CancellationException;
037import java.util.concurrent.CompletionService;
038import java.util.concurrent.ConcurrentHashMap;
039import java.util.concurrent.ExecutionException;
040import java.util.concurrent.ExecutorCompletionService;
041import java.util.concurrent.Future;
042import java.util.concurrent.TimeUnit;
043import java.util.concurrent.atomic.AtomicLong;
044
045import org.apache.commons.io.IOUtils;
046import org.apache.hadoop.classification.InterfaceAudience;
047import org.apache.hadoop.fs.ByteBufferReadable;
048import org.apache.hadoop.fs.ByteBufferUtil;
049import org.apache.hadoop.fs.CanSetDropBehind;
050import org.apache.hadoop.fs.CanSetReadahead;
051import org.apache.hadoop.fs.ChecksumException;
052import org.apache.hadoop.fs.FSInputStream;
053import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
054import org.apache.hadoop.fs.ReadOption;
055import org.apache.hadoop.fs.UnresolvedLinkException;
056import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
057import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
058import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
059import org.apache.hadoop.hdfs.protocol.LocatedBlock;
060import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
061import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
062import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
063import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
064import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
065import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
066import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
067import org.apache.hadoop.io.ByteBufferPool;
068import org.apache.hadoop.ipc.RPC;
069import org.apache.hadoop.ipc.RemoteException;
070import org.apache.hadoop.net.NetUtils;
071import org.apache.hadoop.security.token.SecretManager.InvalidToken;
072import org.apache.hadoop.security.token.Token;
073import org.apache.hadoop.util.IdentityHashStore;
074
075import com.google.common.annotations.VisibleForTesting;
076
077/****************************************************************
078 * DFSInputStream provides bytes from a named file.  It handles 
079 * negotiation of the namenode and various datanodes as necessary.
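 *
 * A minimal usage sketch (illustrative only; applications normally obtain
 * this stream through FileSystem#open on a DistributedFileSystem, or via
 * DFSClient#open, rather than constructing it directly):
 *
 * <pre>{@code
 *   Configuration conf = new Configuration();
 *   FileSystem fs = FileSystem.get(conf);
 *   try (FSDataInputStream in = fs.open(new Path("/some/file"))) {
 *     byte[] buf = new byte[4096];
 *     int n = in.read(buf, 0, buf.length);  // served by a DFSInputStream
 *   }
 * }</pre>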
080 ****************************************************************/
081@InterfaceAudience.Private
082public class DFSInputStream extends FSInputStream
083implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
084    HasEnhancedByteBufferAccess {
085  @VisibleForTesting
086  public static boolean tcpReadsDisabledForTesting = false;
087  private long hedgedReadOpsLoopNumForTesting = 0;
088  private final DFSClient dfsClient;
089  private boolean closed = false;
090  private final String src;
091  private BlockReader blockReader = null;
092  private final boolean verifyChecksum;
093  private LocatedBlocks locatedBlocks = null;
094  private long lastBlockBeingWrittenLength = 0;
095  private DatanodeInfo currentNode = null;
096  private LocatedBlock currentLocatedBlock = null;
097  private long pos = 0;
098  private long blockEnd = -1;
099  private CachingStrategy cachingStrategy;
100  private final ReadStatistics readStatistics = new ReadStatistics();
101
102  /**
103   * Track the ByteBuffers that we have handed out to readers.
104   * 
105   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
107   */
108  private final IdentityHashStore<ByteBuffer, Object>
109      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
110
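  /**
   * Per-stream read statistics. The counters form a hierarchy: every
   * zero-copy byte is also counted as a short-circuit byte, every
   * short-circuit byte as a local byte, and every local byte as a total
   * byte, so totalBytesRead >= totalLocalBytesRead >=
   * totalShortCircuitBytesRead >= totalZeroCopyBytesRead always holds.
   */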
111  public static class ReadStatistics {
112    public ReadStatistics() {
113      this.totalBytesRead = 0;
114      this.totalLocalBytesRead = 0;
115      this.totalShortCircuitBytesRead = 0;
116      this.totalZeroCopyBytesRead = 0;
117    }
118
119    public ReadStatistics(ReadStatistics rhs) {
120      this.totalBytesRead = rhs.getTotalBytesRead();
121      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
122      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
123      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
124    }
125
126    /**
127     * @return The total bytes read.  This will always be at least as
128     * high as the other numbers, since it includes all of them.
129     */
130    public long getTotalBytesRead() {
131      return totalBytesRead;
132    }
133
134    /**
135     * @return The total local bytes read.  This will always be at least
136     * as high as totalShortCircuitBytesRead, since all short-circuit
137     * reads are also local.
138     */
139    public long getTotalLocalBytesRead() {
140      return totalLocalBytesRead;
141    }
142
143    /**
144     * @return The total short-circuit local bytes read.
145     */
146    public long getTotalShortCircuitBytesRead() {
147      return totalShortCircuitBytesRead;
148    }
149    
150    /**
151     * @return The total number of zero-copy bytes read.
152     */
153    public long getTotalZeroCopyBytesRead() {
154      return totalZeroCopyBytesRead;
155    }
156
157    /**
158     * @return The total number of bytes read which were not local.
159     */
160    public long getRemoteBytesRead() {
161      return totalBytesRead - totalLocalBytesRead;
162    }
163    
164    void addRemoteBytes(long amt) {
165      this.totalBytesRead += amt;
166    }
167
168    void addLocalBytes(long amt) {
169      this.totalBytesRead += amt;
170      this.totalLocalBytesRead += amt;
171    }
172
173    void addShortCircuitBytes(long amt) {
174      this.totalBytesRead += amt;
175      this.totalLocalBytesRead += amt;
176      this.totalShortCircuitBytesRead += amt;
177    }
178
179    void addZeroCopyBytes(long amt) {
180      this.totalBytesRead += amt;
181      this.totalLocalBytesRead += amt;
182      this.totalShortCircuitBytesRead += amt;
183      this.totalZeroCopyBytesRead += amt;
184    }
185    
186    private long totalBytesRead;
187
188    private long totalLocalBytesRead;
189
190    private long totalShortCircuitBytesRead;
191
192    private long totalZeroCopyBytesRead;
193  }
194  
195  /**
196   * This variable tracks the number of failures since the start of the
197   * most recent user-facing operation. That is to say, it should be reset
198   * whenever the user makes a call on this stream, and if at any point
199   * during the retry logic, the failure count exceeds a threshold,
200   * the errors will be thrown back to the operation.
201   *
202   * Specifically this counts the number of times the client has gone
203   * back to the namenode to get a new list of block locations, and is
204   * capped at maxBlockAcquireFailures
205   */
206  private int failures = 0;
207
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly */
210  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
211             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
212  private int buffersize = 1;
213  
214  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'
215
216  void addToDeadNodes(DatanodeInfo dnInfo) {
217    deadNodes.put(dnInfo, dnInfo);
218  }
219  
220  DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
221                 ) throws IOException, UnresolvedLinkException {
222    this.dfsClient = dfsClient;
223    this.verifyChecksum = verifyChecksum;
224    this.buffersize = buffersize;
225    this.src = src;
226    this.cachingStrategy =
227        dfsClient.getDefaultReadCachingStrategy();
228    openInfo();
229  }
230
231  /**
232   * Grab the open-file info from namenode
233   */
234  synchronized void openInfo() throws IOException, UnresolvedLinkException {
235    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
236    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
237    while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report in immediately, so the NN may
      // not yet have the partial block locations needed to compute the
      // length. Let's retry a few times to get the length.
242      if (lastBlockBeingWrittenLength == -1) {
243        DFSClient.LOG.warn("Last block locations not available. "
244            + "Datanodes might not have reported blocks completely."
245            + " Will retry for " + retriesForLastBlockLength + " times");
246        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
247        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
248      } else {
249        break;
250      }
251      retriesForLastBlockLength--;
252    }
253    if (retriesForLastBlockLength == 0) {
254      throw new IOException("Could not obtain the last block locations.");
255    }
256  }
257
258  private void waitFor(int waitTime) throws IOException {
259    try {
260      Thread.sleep(waitTime);
261    } catch (InterruptedException e) {
262      throw new IOException(
263          "Interrupted while getting the last block length.");
264    }
265  }
266
267  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
268    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
269    if (DFSClient.LOG.isDebugEnabled()) {
270      DFSClient.LOG.debug("newInfo = " + newInfo);
271    }
272    if (newInfo == null) {
273      throw new IOException("Cannot open filename " + src);
274    }
275
276    if (locatedBlocks != null) {
277      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
278      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
279      while (oldIter.hasNext() && newIter.hasNext()) {
280        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
281          throw new IOException("Blocklist for " + src + " has changed!");
282        }
283      }
284    }
285    locatedBlocks = newInfo;
286    long lastBlockBeingWrittenLength = 0;
287    if (!locatedBlocks.isLastBlockComplete()) {
288      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
289      if (last != null) {
290        if (last.getLocations().length == 0) {
291          if (last.getBlockSize() == 0) {
292            // if the length is zero, then no data has been written to
293            // datanode. So no need to wait for the locations.
294            return 0;
295          }
296          return -1;
297        }
298        final long len = readBlockLength(last);
299        last.getBlock().setNumBytes(len);
300        lastBlockBeingWrittenLength = len; 
301      }
302    }
303
304    currentNode = null;
305    return lastBlockBeingWrittenLength;
306  }
307
308  /** Read the block length from one of the datanodes. */
309  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
310    assert locatedblock != null : "LocatedBlock cannot be null";
311    int replicaNotFoundCount = locatedblock.getLocations().length;
312    
313    for(DatanodeInfo datanode : locatedblock.getLocations()) {
314      ClientDatanodeProtocol cdp = null;
315      
316      try {
317        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
318            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
319            dfsClient.getConf().connectToDnViaHostname, locatedblock);
320        
321        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
322        
323        if (n >= 0) {
324          return n;
325        }
326      }
327      catch(IOException ioe) {
328        if (ioe instanceof RemoteException &&
329          (((RemoteException) ioe).unwrapRemoteException() instanceof
330            ReplicaNotFoundException)) {
331          // special case : replica might not be on the DN, treat as 0 length
332          replicaNotFoundCount--;
333        }
334        
335        if (DFSClient.LOG.isDebugEnabled()) {
336          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
337              + datanode + " for block " + locatedblock.getBlock(), ioe);
338        }
339      } finally {
340        if (cdp != null) {
341          RPC.stopProxy(cdp);
342        }
343      }
344    }
345
    // The namenode told us about these locations, but none of the datanodes
    // knows about the replica. This means we hit the race between pipeline
    // creation start and end. We require every location to report
    // ReplicaNotFoundException because some other exception could have
    // happened on a DN that does have the replica; in that case we want to
    // report that error instead.
350    if (replicaNotFoundCount == 0) {
351      return 0;
352    }
353
354    throw new IOException("Cannot obtain block length for " + locatedblock);
355  }
356  
357  public synchronized long getFileLength() {
358    return locatedBlocks == null? 0:
359        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
360  }
361
362  // Short circuit local reads are forbidden for files that are
363  // under construction.  See HDFS-2757.
364  synchronized boolean shortCircuitForbidden() {
365    return locatedBlocks.isUnderConstruction();
366  }
367
368  /**
369   * Returns the datanode from which the stream is currently reading.
370   */
371  public DatanodeInfo getCurrentDatanode() {
372    return currentNode;
373  }
374
375  /**
376   * Returns the block containing the target position. 
377   */
378  synchronized public ExtendedBlock getCurrentBlock() {
379    if (currentLocatedBlock == null){
380      return null;
381    }
382    return currentLocatedBlock.getBlock();
383  }
384
385  /**
   * Return the collection of blocks that have already been located.
387   */
388  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
389    return getBlockRange(0, getFileLength());
390  }
391
392  /**
393   * Get block at the specified position.
394   * Fetch it from the namenode if not cached.
395   * 
396   * @param offset block corresponding to this offset in file is returned
397   * @param updatePosition whether to update current position
398   * @return located block
399   * @throws IOException
400   */
401  private synchronized LocatedBlock getBlockAt(long offset,
402      boolean updatePosition) throws IOException {
403    assert (locatedBlocks != null) : "locatedBlocks is null";
404
405    final LocatedBlock blk;
406
407    //check offset
408    if (offset < 0 || offset >= getFileLength()) {
409      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
410          + offset
411          + ", updatePosition=" + updatePosition
412          + ", locatedBlocks=" + locatedBlocks);
413    }
414    else if (offset >= locatedBlocks.getFileLength()) {
415      // offset to the portion of the last block,
416      // which is not known to the name-node yet;
417      // getting the last block 
418      blk = locatedBlocks.getLastLocatedBlock();
419    }
420    else {
421      // search cached blocks first
422      int targetBlockIdx = locatedBlocks.findBlock(offset);
423      if (targetBlockIdx < 0) { // block is not cached
424        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
425        // fetch more blocks
426        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
427        assert (newBlocks != null) : "Could not find target position " + offset;
428        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
429      }
430      blk = locatedBlocks.get(targetBlockIdx);
431    }
432
433    // update current position
434    if (updatePosition) {
435      pos = offset;
436      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
437      currentLocatedBlock = blk;
438    }
439    return blk;
440  }
441
442  /** Fetch a block from namenode and cache it */
443  private synchronized void fetchBlockAt(long offset) throws IOException {
444    int targetBlockIdx = locatedBlocks.findBlock(offset);
445    if (targetBlockIdx < 0) { // block is not cached
446      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
447    }
448    // fetch blocks
449    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
450    if (newBlocks == null) {
451      throw new IOException("Could not find target position " + offset);
452    }
453    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
454  }
455
456  /**
457   * Get blocks in the specified range.
458   * Fetch them from the namenode if not cached. This function
   * does not serve read requests beyond the EOF.
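   *
   * For illustration (assuming a 128 MB block size): a call with
   * offset = 130 MB and length = 10 MB returns the single block covering
   * bytes [128 MB, 256 MB), since the result is the set of whole blocks
   * that cover the requested range.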
460   * @param offset starting offset in file
461   * @param length length of data
   * @return consecutive segment of located blocks
463   * @throws IOException
464   */
465  private synchronized List<LocatedBlock> getBlockRange(long offset,
466      long length)  throws IOException {
467    // getFileLength(): returns total file length
468    // locatedBlocks.getFileLength(): returns length of completed blocks
469    if (offset >= getFileLength()) {
470      throw new IOException("Offset: " + offset +
471        " exceeds file length: " + getFileLength());
472    }
473
474    final List<LocatedBlock> blocks;
475    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
476    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
477    final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
478
479    if (readOffsetWithinCompleteBlk) {
480      //get the blocks of finalized (completed) block range
481      blocks = getFinalizedBlockRange(offset, 
482        Math.min(length, lengthOfCompleteBlk - offset));
483    } else {
484      blocks = new ArrayList<LocatedBlock>(1);
485    }
486
487    // get the blocks from incomplete block range
488    if (readLengthPastCompleteBlk) {
489       blocks.add(locatedBlocks.getLastLocatedBlock());
490    }
491
492    return blocks;
493  }
494
495  /**
496   * Get blocks in the specified range.
497   * Includes only the complete blocks.
498   * Fetch them from the namenode if not cached.
499   */
500  private synchronized List<LocatedBlock> getFinalizedBlockRange(
501      long offset, long length) throws IOException {
502    assert (locatedBlocks != null) : "locatedBlocks is null";
503    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
504    // search cached blocks first
505    int blockIdx = locatedBlocks.findBlock(offset);
506    if (blockIdx < 0) { // block is not cached
507      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
508    }
509    long remaining = length;
510    long curOff = offset;
511    while(remaining > 0) {
512      LocatedBlock blk = null;
513      if(blockIdx < locatedBlocks.locatedBlockCount())
514        blk = locatedBlocks.get(blockIdx);
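      // If the cache has no block for curOff (either we ran past the cached
      // list or there is a gap before the next cached block), fetch more
      // locations from the namenode and retry this offset.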
515      if (blk == null || curOff < blk.getStartOffset()) {
516        LocatedBlocks newBlocks;
517        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
518        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
519        continue;
520      }
521      assert curOff >= blk.getStartOffset() : "Block not found";
522      blockRange.add(blk);
523      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
524      remaining -= bytesRead;
525      curOff += bytesRead;
526      blockIdx++;
527    }
528    return blockRange;
529  }
530
531  /**
   * Open a BlockReader to a DataNode so that the block at the target
   * position can be read. The block ID and the IDs of the destination
   * datanodes are obtained from the namenode.
534   */
535  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
536    if (target >= getFileLength()) {
537      throw new IOException("Attempted to read past end of file");
538    }
539
540    // Will be getting a new BlockReader.
541    if (blockReader != null) {
542      blockReader.close();
543      blockReader = null;
544    }
545
546    //
547    // Connect to best DataNode for desired Block, with potential offset
548    //
549    DatanodeInfo chosenNode = null;
550    int refetchToken = 1; // only need to get a new access token once
551    int refetchEncryptionKey = 1; // only need to get a new encryption key once
552    
553    boolean connectFailedOnce = false;
554
555    while (true) {
556      //
557      // Compute desired block
558      //
559      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + ", expected " + target;
561      long offsetIntoBlock = target - targetBlock.getStartOffset();
562
563      DNAddrPair retval = chooseDataNode(targetBlock, null);
564      chosenNode = retval.info;
565      InetSocketAddress targetAddr = retval.addr;
566
567      try {
568        ExtendedBlock blk = targetBlock.getBlock();
569        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
570        blockReader = new BlockReaderFactory(dfsClient.getConf()).
571            setInetSocketAddress(targetAddr).
572            setRemotePeerFactory(dfsClient).
573            setDatanodeInfo(chosenNode).
574            setFileName(src).
575            setBlock(blk).
576            setBlockToken(accessToken).
577            setStartOffset(offsetIntoBlock).
578            setVerifyChecksum(verifyChecksum).
579            setClientName(dfsClient.clientName).
580            setLength(blk.getNumBytes() - offsetIntoBlock).
581            setCachingStrategy(cachingStrategy).
582            setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
583            setClientCacheContext(dfsClient.getClientContext()).
584            setUserGroupInformation(dfsClient.ugi).
585            setConfiguration(dfsClient.getConfiguration()).
586            build();
587        if(connectFailedOnce) {
588          DFSClient.LOG.info("Successfully connected to " + targetAddr +
589                             " for " + blk);
590        }
591        return chosenNode;
592      } catch (IOException ex) {
593        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
594          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
595              + "encryption key was invalid when connecting to " + targetAddr
596              + " : " + ex);
597          // The encryption key used is invalid.
598          refetchEncryptionKey--;
599          dfsClient.clearDataEncryptionKey();
600        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
601          refetchToken--;
602          fetchBlockAt(target);
603        } else {
604          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block "
            + targetBlock.getBlock() + ", add to deadNodes and continue. " + ex, ex);
607          // Put chosen node into dead list, continue
608          addToDeadNodes(chosenNode);
609        }
610      }
611    }
612  }
613
614  /**
615   * Close it down!
616   */
617  @Override
618  public synchronized void close() throws IOException {
619    if (closed) {
620      return;
621    }
622    dfsClient.checkOpen();
623
624    if (!extendedReadBuffers.isEmpty()) {
625      final StringBuilder builder = new StringBuilder();
626      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
627        private String prefix = "";
628        @Override
629        public void accept(ByteBuffer k, Object v) {
630          builder.append(prefix).append(k);
631          prefix = ", ";
632        }
633      });
634      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
635          "unreleased ByteBuffers allocated by read().  " +
636          "Please release " + builder.toString() + ".");
637    }
638    if (blockReader != null) {
639      blockReader.close();
640      blockReader = null;
641    }
642    super.close();
643    closed = true;
644  }
645
646  @Override
647  public synchronized int read() throws IOException {
648    int ret = read( oneByteBuf, 0, 1 );
649    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
650  }
651
652  /**
653   * Wraps different possible read implementations so that readBuffer can be
654   * strategy-agnostic.
655   */
656  private interface ReaderStrategy {
657    public int doRead(BlockReader blockReader, int off, int len,
658        ReadStatistics readStatistics) throws ChecksumException, IOException;
659  }
660
661  private static void updateReadStatistics(ReadStatistics readStatistics, 
662        int nRead, BlockReader blockReader) {
663    if (nRead <= 0) return;
664    if (blockReader.isShortCircuit()) {
665      readStatistics.addShortCircuitBytes(nRead);
666    } else if (blockReader.isLocal()) {
667      readStatistics.addLocalBytes(nRead);
668    } else {
669      readStatistics.addRemoteBytes(nRead);
670    }
671  }
672  
673  /**
674   * Used to read bytes into a byte[]
675   */
676  private static class ByteArrayStrategy implements ReaderStrategy {
677    final byte[] buf;
678
679    public ByteArrayStrategy(byte[] buf) {
680      this.buf = buf;
681    }
682
683    @Override
684    public int doRead(BlockReader blockReader, int off, int len,
685            ReadStatistics readStatistics) throws ChecksumException, IOException {
686        int nRead = blockReader.read(buf, off, len);
687        updateReadStatistics(readStatistics, nRead, blockReader);
688        return nRead;
689    }
690  }
691
692  /**
693   * Used to read bytes into a user-supplied ByteBuffer
694   */
695  private static class ByteBufferStrategy implements ReaderStrategy {
696    final ByteBuffer buf;
697    ByteBufferStrategy(ByteBuffer buf) {
698      this.buf = buf;
699    }
700
701    @Override
702    public int doRead(BlockReader blockReader, int off, int len,
703        ReadStatistics readStatistics) throws ChecksumException, IOException {
704      int oldpos = buf.position();
705      int oldlimit = buf.limit();
706      boolean success = false;
707      try {
708        int ret = blockReader.read(buf);
709        success = true;
710        updateReadStatistics(readStatistics, ret, blockReader);
711        return ret;
712      } finally {
713        if (!success) {
714          // Reset to original state so that retries work correctly.
715          buf.position(oldpos);
716          buf.limit(oldlimit);
717        }
718      } 
719    }
720  }
721
  /* This is used by regular read() and handles ChecksumExceptions.
723   * name readBuffer() is chosen to imply similarity to readBuffer() in
724   * ChecksumFileSystem
725   */ 
726  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
727      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
728      throws IOException {
729    IOException ioe;
730    
731    /* we retry current node only once. So this is set to true only here.
732     * Intention is to handle one common case of an error that is not a
733     * failure on datanode or client : when DataNode closes the connection
     * since the client is idle. If there are other cases of "non-errors",
     * a datanode might be retried by setting this to true again.
736     */
737    boolean retryCurrentNode = true;
738
739    while (true) {
740      // retry as many times as seekToNewSource allows.
741      try {
742        return reader.doRead(blockReader, off, len, readStatistics);
743      } catch ( ChecksumException ce ) {
744        DFSClient.LOG.warn("Found Checksum error for "
745            + getCurrentBlock() + " from " + currentNode
746            + " at " + ce.getPos());        
747        ioe = ce;
748        retryCurrentNode = false;
749        // we want to remember which block replicas we have tried
750        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
751            corruptedBlockMap);
752      } catch ( IOException e ) {
753        if (!retryCurrentNode) {
754          DFSClient.LOG.warn("Exception while reading from "
755              + getCurrentBlock() + " of " + src + " from "
756              + currentNode, e);
757        }
758        ioe = e;
759      }
760      boolean sourceFound = false;
761      if (retryCurrentNode) {
762        /* possibly retry the same node so that transient errors don't
763         * result in application level failures (e.g. Datanode could have
764         * closed the connection because the client is idle for too long).
765         */ 
766        sourceFound = seekToBlockSource(pos);
767      } else {
768        addToDeadNodes(currentNode);
769        sourceFound = seekToNewSource(pos);
770      }
771      if (!sourceFound) {
772        throw ioe;
773      }
774      retryCurrentNode = false;
775    }
776  }
777
778  private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
779    dfsClient.checkOpen();
780    if (closed) {
781      throw new IOException("Stream closed");
782    }
783    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
784      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
785    failures = 0;
786    if (pos < getFileLength()) {
787      int retries = 2;
788      while (retries > 0) {
789        try {
790          // currentNode can be left as null if previous read had a checksum
791          // error on the same block. See HDFS-3067
792          if (pos > blockEnd || currentNode == null) {
793            currentNode = blockSeekTo(pos);
794          }
795          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
796          if (locatedBlocks.isLastBlockComplete()) {
797            realLen = (int) Math.min(realLen, locatedBlocks.getFileLength());
798          }
799          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
800          
801          if (result >= 0) {
802            pos += result;
803          } else {
            // got an EOS from the reader though we expected more data from it.
805            throw new IOException("Unexpected EOS from the reader");
806          }
807          if (dfsClient.stats != null) {
808            dfsClient.stats.incrementBytesRead(result);
809          }
810          return result;
811        } catch (ChecksumException ce) {
812          throw ce;            
813        } catch (IOException e) {
814          if (retries == 1) {
815            DFSClient.LOG.warn("DFS Read", e);
816          }
817          blockEnd = -1;
818          if (currentNode != null) { addToDeadNodes(currentNode); }
819          if (--retries == 0) {
820            throw e;
821          }
822        } finally {
          // Check whether we need to report corrupted block replicas,
          // whether the read was successful or a ChecksumException occurred.
825          reportCheckSumFailure(corruptedBlockMap, 
826              currentLocatedBlock.getLocations().length);
827        }
828      }
829    }
830    return -1;
831  }
832
833  /**
   * Read up to len bytes from the stream into the given buffer, starting at
   * offset off. Returns the number of bytes actually read, or -1 at EOF.
835   */
836  @Override
837  public synchronized int read(final byte buf[], int off, int len) throws IOException {
838    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
839
840    return readWithStrategy(byteArrayReader, off, len);
841  }
842
843  @Override
844  public synchronized int read(final ByteBuffer buf) throws IOException {
845    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
846
847    return readWithStrategy(byteBufferReader, 0, buf.remaining());
848  }
849
850
851  /**
852   * Add corrupted block replica into map.
853   */
854  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
855      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
856    Set<DatanodeInfo> dnSet = null;
857    if((corruptedBlockMap.containsKey(blk))) {
858      dnSet = corruptedBlockMap.get(blk);
859    }else {
860      dnSet = new HashSet<DatanodeInfo>();
861    }
862    if (!dnSet.contains(node)) {
863      dnSet.add(node);
864      corruptedBlockMap.put(blk, dnSet);
865    }
866  }
867
868  private DNAddrPair chooseDataNode(LocatedBlock block,
869      Collection<DatanodeInfo> ignoredNodes) throws IOException {
870    while (true) {
871      DatanodeInfo[] nodes = block.getLocations();
872      try {
873        return getBestNodeDNAddrPair(nodes, ignoredNodes);
874      } catch (IOException ie) {
875        String errMsg =
876          getBestNodeDNAddrPairErrorString(nodes, deadNodes, ignoredNodes);
877        String blockInfo = block.getBlock() + " file=" + src;
878        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
879          String description = "Could not obtain block: " + blockInfo;
880          DFSClient.LOG.warn(description + errMsg
881              + ". Throwing a BlockMissingException");
882          throw new BlockMissingException(src, description,
883              block.getStartOffset());
884        }
885        
886        if (nodes == null || nodes.length == 0) {
887          DFSClient.LOG.info("No node available for " + blockInfo);
888        }
889        DFSClient.LOG.info("Could not obtain " + block.getBlock()
890            + " from any node: " + ie + errMsg
891            + ". Will get new block locations from namenode and retry...");
892        try {
          // Introduce a random factor into the wait time before another retry.
          // The wait time depends on the number of failures and a random factor.
          // With the default timeWindow of 3000 ms: on the first failure the
          // wait time is a random number between 0..3000 ms. If the first retry
          // still fails, we wait a 3000 ms grace period before the 2nd retry.
          // At the second retry the waiting window is also expanded to 6000 ms,
          // reducing the request rate on the server. Similarly, the 3rd retry
          // waits a 6000 ms grace period before retrying and the waiting window
          // is expanded to 9000 ms.
902          final int timeWindow = dfsClient.getConf().timeWindow;
903          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
904            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
905          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
906          Thread.sleep((long)waitTime);
907        } catch (InterruptedException iex) {
908        }
909        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
910        openInfo();
911        block = getBlockAt(block.getStartOffset(), false);
912        failures++;
913        continue;
914      }
915    }
916  }
917
918  /**
919   * Get the best node.
920   * @param nodes Nodes to choose from.
   * @param ignoredNodes Do not choose nodes in this collection (may be null)
922   * @return The DNAddrPair of the best node.
923   * @throws IOException
924   */
925  private DNAddrPair getBestNodeDNAddrPair(final DatanodeInfo[] nodes,
926      Collection<DatanodeInfo> ignoredNodes) throws IOException {
927    DatanodeInfo chosenNode = bestNode(nodes, deadNodes, ignoredNodes);
928    final String dnAddr =
929        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
930    if (DFSClient.LOG.isDebugEnabled()) {
931      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
932    }
933    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
934    return new DNAddrPair(chosenNode, targetAddr);
935  }
936
937  private static String getBestNodeDNAddrPairErrorString(
938      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
939      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
940    StringBuilder errMsgr = new StringBuilder(
941        " No live nodes contain current block ");
942    errMsgr.append("Block locations:");
943    for (DatanodeInfo datanode : nodes) {
944      errMsgr.append(" ");
945      errMsgr.append(datanode.toString());
946    }
947    errMsgr.append(" Dead nodes: ");
948    for (DatanodeInfo datanode : deadNodes.keySet()) {
949      errMsgr.append(" ");
950      errMsgr.append(datanode.toString());
951    }
952    if (ignoredNodes != null) {
953      errMsgr.append(" Ignored nodes: ");
954      for (DatanodeInfo datanode : ignoredNodes) {
955        errMsgr.append(" ");
956        errMsgr.append(datanode.toString());
957      }
958    }
959    return errMsgr.toString();
960  }
961
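  /**
   * Read the bytes in the inclusive range [start, end] of the given block
   * into buf, beginning at offset. Keeps choosing datanodes via
   * chooseDataNode() and retrying until the read succeeds, or until
   * chooseDataNode() gives up (e.g. by throwing BlockMissingException).
   */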
962  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
963      byte[] buf, int offset,
964      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
965      throws IOException {
966    block = getBlockAt(block.getStartOffset(), false);
967    while (true) {
968      DNAddrPair addressPair = chooseDataNode(block, null);
969      try {
970        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
971            corruptedBlockMap);
972        return;
973      } catch (IOException e) {
974        // Ignore. Already processed inside the function.
975        // Loop through to try the next node.
976      }
977    }
978  }
979
980  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
981      final LocatedBlock block, final long start, final long end,
982      final ByteBuffer bb,
983      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
984    return new Callable<ByteBuffer>() {
985      @Override
986      public ByteBuffer call() throws Exception {
987        byte[] buf = bb.array();
988        int offset = bb.position();
989        actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
990            corruptedBlockMap);
991        return bb;
992      }
993    };
994  }
995
996  private void actualGetFromOneDataNode(final DNAddrPair datanode,
997      LocatedBlock block, final long start, final long end, byte[] buf,
998      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
999      throws IOException {
1000    DFSClientFaultInjector.get().startFetchFromDatanode();
1001    int refetchToken = 1; // only need to get a new access token once
1002    int refetchEncryptionKey = 1; // only need to get a new encryption key once
1003
1004    while (true) {
1005      // cached block locations may have been updated by chooseDataNode()
1006      // or fetchBlockAt(). Always get the latest list of locations at the
1007      // start of the loop.
1008      CachingStrategy curCachingStrategy;
1009      boolean allowShortCircuitLocalReads;
1010      synchronized (this) {
1011        block = getBlockAt(block.getStartOffset(), false);
1012        curCachingStrategy = cachingStrategy;
1013        allowShortCircuitLocalReads = !shortCircuitForbidden();
1014      }
1015      DatanodeInfo chosenNode = datanode.info;
1016      InetSocketAddress targetAddr = datanode.addr;
1017      BlockReader reader = null;
1018
1019      try {
1020        DFSClientFaultInjector.get().fetchFromDatanodeException();
1021        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
1022        int len = (int) (end - start + 1);
1023        reader = new BlockReaderFactory(dfsClient.getConf()).
1024            setInetSocketAddress(targetAddr).
1025            setRemotePeerFactory(dfsClient).
1026            setDatanodeInfo(chosenNode).
1027            setFileName(src).
1028            setBlock(block.getBlock()).
1029            setBlockToken(blockToken).
1030            setStartOffset(start).
1031            setVerifyChecksum(verifyChecksum).
1032            setClientName(dfsClient.clientName).
1033            setLength(len).
1034            setCachingStrategy(curCachingStrategy).
1035            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
1036            setClientCacheContext(dfsClient.getClientContext()).
1037            setUserGroupInformation(dfsClient.ugi).
1038            setConfiguration(dfsClient.getConfiguration()).
1039            build();
1040        int nread = reader.readAll(buf, offset, len);
1041        updateReadStatistics(readStatistics, nread, reader);
1042
1043        if (nread != len) {
1044          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
1046        }
1047        DFSClientFaultInjector.get().readFromDatanodeDelay();
1048        return;
1049      } catch (ChecksumException e) {
1050        String msg = "fetchBlockByteRange(). Got a checksum exception for "
1051            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
1052            + chosenNode;
1053        DFSClient.LOG.warn(msg);
1054        // we want to remember what we have tried
1055        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
1056        addToDeadNodes(chosenNode);
1057        throw new IOException(msg);
1058      } catch (IOException e) {
1059        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
1060          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
1061              + "encryption key was invalid when connecting to " + targetAddr
1062              + " : " + e);
1063          // The encryption key used is invalid.
1064          refetchEncryptionKey--;
1065          dfsClient.clearDataEncryptionKey();
1066          continue;
1067        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
1068          refetchToken--;
1069          try {
1070            fetchBlockAt(block.getStartOffset());
1071          } catch (IOException fbae) {
1072            // ignore IOE, since we can retry it later in a loop
1073          }
1074          continue;
1075        } else {
1076          String msg = "Failed to connect to " + targetAddr + " for file "
1077              + src + " for block " + block.getBlock() + ":" + e;
1078          DFSClient.LOG.warn("Connection failure: " + msg, e);
1079          addToDeadNodes(chosenNode);
1080          throw new IOException(msg);
1081        }
1082      } finally {
1083        if (reader != null) {
1084          reader.close();
1085        }
1086      }
1087    }
1088  }
1089
1090  /**
1091   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
1092   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time.  We then wait on whichever read returns first.
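   *
   * For example, with the hedged read threshold
   * (dfsClient.getHedgedReadTimeout()) set to 500 ms: the read is first
   * submitted to one datanode; if no result arrives within 500 ms, a second
   * request is submitted to a different datanode, and whichever request
   * completes first supplies the result while the others are cancelled.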
1095   */
1096  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
1097      long end, byte[] buf, int offset,
1098      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
1099      throws IOException {
1100    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
1101    CompletionService<ByteBuffer> hedgedService =
1102        new ExecutorCompletionService<ByteBuffer>(
1103        dfsClient.getHedgedReadsThreadPool());
1104    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
1105    ByteBuffer bb = null;
1106    int len = (int) (end - start + 1);
1107    block = getBlockAt(block.getStartOffset(), false);
1108    while (true) {
1109      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
1110      hedgedReadOpsLoopNumForTesting++;
1111      DNAddrPair chosenNode = null;
1112      // there is no request already executing.
1113      if (futures.isEmpty()) {
        // chooseDataNode is a commitment. If no node is available, we go to
        // the NN to re-fetch block locations. Only go here on the first read.
1116        chosenNode = chooseDataNode(block, ignored);
1117        bb = ByteBuffer.wrap(buf, offset, len);
1118        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1119            chosenNode, block, start, end, bb, corruptedBlockMap);
1120        Future<ByteBuffer> firstRequest = hedgedService
1121            .submit(getFromDataNodeCallable);
1122        futures.add(firstRequest);
1123        try {
1124          Future<ByteBuffer> future = hedgedService.poll(
1125              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
1126          if (future != null) {
1127            future.get();
1128            return;
1129          }
1130          if (DFSClient.LOG.isDebugEnabled()) {
1131            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
1132                + "ms to read from " + chosenNode.info
1133                + "; spawning hedged read");
1134          }
1135          // Ignore this node on next go around.
1136          ignored.add(chosenNode.info);
1137          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1138          continue; // no need to refresh block locations
1139        } catch (InterruptedException e) {
1140          // Ignore
1141        } catch (ExecutionException e) {
1142          // Ignore already logged in the call.
1143        }
1144      } else {
1145        // We are starting up a 'hedged' read. We have a read already
1146        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
1147        // If no nodes to do hedged reads against, pass.
1148        try {
1149          try {
1150            chosenNode = getBestNodeDNAddrPair(block.getLocations(), ignored);
1151          } catch (IOException ioe) {
1152            chosenNode = chooseDataNode(block, ignored);
1153          }
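          // Hedged requests read into a freshly allocated buffer rather than
          // the caller's array; if a hedged request wins, its bytes are
          // copied into buf below.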
1154          bb = ByteBuffer.allocate(len);
1155          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
1156              chosenNode, block, start, end, bb, corruptedBlockMap);
1157          Future<ByteBuffer> oneMoreRequest = hedgedService
1158              .submit(getFromDataNodeCallable);
1159          futures.add(oneMoreRequest);
1160        } catch (IOException ioe) {
1161          if (DFSClient.LOG.isDebugEnabled()) {
1162            DFSClient.LOG.debug("Failed getting node for hedged read: "
1163                + ioe.getMessage());
1164          }
1165        }
        // Whether or not the hedged request was submitted, wait for one of
        // the outstanding requests to finish and use its result.
1168        try {
1169          ByteBuffer result = getFirstToComplete(hedgedService, futures);
1170          // cancel the rest.
1171          cancelAll(futures);
1172          if (result.array() != buf) { // compare the array pointers
1173            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
1174            System.arraycopy(result.array(), result.position(), buf, offset,
1175                len);
1176          } else {
1177            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
1178          }
1179          return;
1180        } catch (InterruptedException ie) {
1181          // Ignore and retry
1182        }
1183        // We got here if exception. Ignore this node on next go around IFF
1184        // we found a chosenNode to hedge read against.
1185        if (chosenNode != null && chosenNode.info != null) {
1186          ignored.add(chosenNode.info);
1187        }
1188      }
1189    }
1190  }
1191
1192  @VisibleForTesting
1193  public long getHedgedReadOpsLoopNumForTesting() {
1194    return hedgedReadOpsLoopNumForTesting;
1195  }
1196
1197  private ByteBuffer getFirstToComplete(
1198      CompletionService<ByteBuffer> hedgedService,
1199      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
1200    if (futures.isEmpty()) {
1201      throw new InterruptedException("let's retry");
1202    }
1203    Future<ByteBuffer> future = null;
1204    try {
1205      future = hedgedService.take();
1206      ByteBuffer bb = future.get();
1207      futures.remove(future);
1208      return bb;
1209    } catch (ExecutionException e) {
1210      // already logged in the Callable
1211      futures.remove(future);
1212    } catch (CancellationException ce) {
1213      // already logged in the Callable
1214      futures.remove(future);
1215    }
1216
1217    throw new InterruptedException("let's retry");
1218  }
1219
1220  private void cancelAll(List<Future<ByteBuffer>> futures) {
1221    for (Future<ByteBuffer> future : futures) {
1222      // Unfortunately, hdfs reads do not take kindly to interruption.
1223      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid protobufs -- likely because the read
      // is interrupted before it gets the whole protobuf.  There is also
      // verbose WARN logging.  So, for now, do not interrupt a running read.
1227      future.cancel(false);
1228    }
1229  }
1230
1231  /**
   * Should the block access token be refetched on an exception?
1233   * 
1234   * @param ex Exception received
1235   * @param targetAddr Target datanode address from where exception was received
   * @return true if the block access token has expired or is invalid and it
   *         should be refetched
1238   */
1239  private static boolean tokenRefetchNeeded(IOException ex,
1240      InetSocketAddress targetAddr) {
1241    /*
1242     * Get a new access token and retry. Retry is needed in 2 cases. 1)
1243     * When both NN and DN re-started while DFSClient holding a cached
1244     * access token. 2) In the case that NN fails to update its
1245     * access key at pre-set interval (by a wide margin) and
1246     * subsequently restarts. In this case, DN re-registers itself with
1247     * NN and receives a new access key, but DN will delete the old
1248     * access key from its memory since it's considered expired based on
1249     * the estimated expiration date.
1250     */
1251    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1252      DFSClient.LOG.info("Access token was invalid when connecting to "
1253          + targetAddr + " : " + ex);
1254      return true;
1255    }
1256    return false;
1257  }
1258
1259  /**
1260   * Read bytes starting from the specified position.
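   * This is a positional read: it does not change the stream's current
   * offset, so getPos() is unaffected.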
1261   * 
1262   * @param position start read from this position
1263   * @param buffer read buffer
1264   * @param offset offset into buffer
1265   * @param length number of bytes to read
1266   * 
1267   * @return actual number of bytes read
1268   */
1269  @Override
1270  public int read(long position, byte[] buffer, int offset, int length)
1271    throws IOException {
1272    // sanity checks
1273    dfsClient.checkOpen();
1274    if (closed) {
1275      throw new IOException("Stream closed");
1276    }
1277    failures = 0;
1278    long filelen = getFileLength();
1279    if ((position < 0) || (position >= filelen)) {
1280      return -1;
1281    }
1282    int realLen = length;
1283    if ((position + length) > filelen) {
1284      realLen = (int)(filelen - position);
1285    }
1286    
1287    // determine the block and byte range within the block
1288    // corresponding to position and realLen
1289    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
1290    int remaining = realLen;
1291    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
1292      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
1293    for (LocatedBlock blk : blockRange) {
1294      long targetStart = position - blk.getStartOffset();
1295      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
1296      try {
1297        if (dfsClient.isHedgedReadsEnabled()) {
1298          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
1299              - 1, buffer, offset, corruptedBlockMap);
1300        } else {
1301          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
1302              buffer, offset, corruptedBlockMap);
1303        }
1304      } finally {
1305        // Check and report if any block replicas are corrupted.
1306        // BlockMissingException may be caught if all block replicas are
1307        // corrupted.
1308        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
1309      }
1310
1311      remaining -= bytesToRead;
1312      position += bytesToRead;
1313      offset += bytesToRead;
1314    }
1315    assert remaining == 0 : "Wrong number of bytes read.";
1316    if (dfsClient.stats != null) {
1317      dfsClient.stats.incrementBytesRead(realLen);
1318    }
1319    return realLen;
1320  }
1321  
1322  /**
1323   * DFSInputStream reports checksum failure.
1324   * Case I : client has tried multiple data nodes and at least one of the
1325   * attempts has succeeded. We report the other failures as corrupted block to
1326   * namenode. 
   * Case II: client has tried all the data nodes, but all failed. We
   * only report if the total number of replicas is 1. We do not
   * report otherwise, since the failure may be caused by the client itself
   * being unable to read rather than by corrupt replicas.
1331   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that contain the block replicas
1333   */
1334  private void reportCheckSumFailure(
1335      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
1336      int dataNodeCount) {
1337    if (corruptedBlockMap.isEmpty()) {
1338      return;
1339    }
1340    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
1341        .entrySet().iterator();
1342    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
1343    ExtendedBlock blk = entry.getKey();
1344    Set<DatanodeInfo> dnSet = entry.getValue();
1345    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
1346        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
1347      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
1348      int i = 0;
1349      for (DatanodeInfo dn:dnSet) {
1350        locs[i++] = dn;
1351      }
1352      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
1353      dfsClient.reportChecksumFailure(src, lblocks);
1354    }
1355    corruptedBlockMap.clear();
1356  }
1357
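  /**
   * Skip forward by at most n bytes by seeking. Returns the number of bytes
   * actually skipped (clamped at the remaining file length), 0 when n is 0,
   * and -1 when n is negative.
   */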
1358  @Override
1359  public long skip(long n) throws IOException {
1360    if ( n > 0 ) {
1361      long curPos = getPos();
1362      long fileLen = getFileLength();
1363      if( n+curPos > fileLen ) {
1364        n = fileLen - curPos;
1365      }
1366      seek(curPos+n);
1367      return n;
1368    }
1369    return n < 0 ? -1 : 0;
1370  }
1371
1372  /**
1373   * Seek to a new arbitrary location
1374   */
1375  @Override
1376  public synchronized void seek(long targetPos) throws IOException {
1377    if (targetPos > getFileLength()) {
1378      throw new EOFException("Cannot seek after EOF");
1379    }
1380    if (targetPos < 0) {
1381      throw new EOFException("Cannot seek to negative offset");
1382    }
1383    if (closed) {
1384      throw new IOException("Stream is closed!");
1385    }
1386    boolean done = false;
1387    if (pos <= targetPos && targetPos <= blockEnd) {
1388      //
1389      // If this seek is to a positive position in the current
1390      // block, and this piece of data might already be lying in
1391      // the TCP buffer, then just eat up the intervening data.
1392      //
1393      int diff = (int)(targetPos - pos);
1394      if (diff <= blockReader.available()) {
1395        try {
1396          pos += blockReader.skip(diff);
1397          if (pos == targetPos) {
1398            done = true;
1399          } else {
1400            // The range was already checked. If the block reader returns
1401            // something unexpected instead of throwing an exception, it is
1402            // most likely a bug. 
1403            String errMsg = "BlockReader failed to seek to " + 
1404                targetPos + ". Instead, it seeked to " + pos + ".";
1405            DFSClient.LOG.warn(errMsg);
1406            throw new IOException(errMsg);
1407          }
1408        } catch (IOException e) { // make the following read retry
1409          if (DFSClient.LOG.isDebugEnabled()) {
1410            DFSClient.LOG.debug("Exception while seek to " + targetPos
1411                + " from " + getCurrentBlock() + " of " + src + " from "
1412                + currentNode, e);
1413          }
1414        }
1415      }
1416    }
1417    if (!done) {
1418      pos = targetPos;
1419      blockEnd = -1;
1420    }
1421  }
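  // Illustrative note on the fast path in seek() above (descriptive, not
  // authoritative): a short forward seek within the current block is served by
  // asking the existing BlockReader to skip the intervening bytes, which
  // avoids tearing down and re-opening the datanode connection. For example
  // (hypothetical caller code):
  //
  //   long pos = in.getPos();
  //   in.seek(pos + 128);   // likely served by blockReader.skip(128)
  //   in.seek(0);           // backward seek: blockEnd is reset to -1 and the
  //                         // next read opens a new block reader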
1422
1423  /**
1424   * Same as {@link #seekToNewSource(long)} except that it does not exclude
1425   * the current datanode and might connect to the same node.
1426   */
1427  private synchronized boolean seekToBlockSource(long targetPos)
1428                                                 throws IOException {
1429    currentNode = blockSeekTo(targetPos);
1430    return true;
1431  }
1432  
1433  /**
1434   * Seek to the given position on a node other than the current node.
1435   * Returns true if a node other than the current node was found;
1436   * returns false if no other node could be found.
1437   */
1438  @Override
1439  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
1440    boolean markedDead = deadNodes.containsKey(currentNode);
1441    addToDeadNodes(currentNode);
1442    DatanodeInfo oldNode = currentNode;
1443    DatanodeInfo newNode = blockSeekTo(targetPos);
1444    if (!markedDead) {
1445      /* Remove it from deadNodes. blockSeekTo could have cleared
1446       * deadNodes and added currentNode again. That's ok. */
1447      deadNodes.remove(oldNode);
1448    }
1449    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
1450      currentNode = newNode;
1451      return true;
1452    } else {
1453      return false;
1454    }
1455  }
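  // A hedged usage sketch (hypothetical caller, not from the original code):
  // a caller that keeps hitting read errors can ask the stream to switch to a
  // different replica before retrying.
  //
  //   if (in.seekToNewSource(in.getPos())) {
  //     // now connected to a different datanode; retry the read
  //   } else {
  //     // no alternative replica was available
  //   }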
1456      
1457  /** Return the current position of this stream in the file.
1458   */
1459  @Override
1460  public synchronized long getPos() throws IOException {
1461    return pos;
1462  }
1463
1464  /** Return the number of bytes remaining in the file
1465   * if it is less than or equal to {@link Integer#MAX_VALUE};
1466   * otherwise, return {@link Integer#MAX_VALUE}.
1467   */
1468  @Override
1469  public synchronized int available() throws IOException {
1470    if (closed) {
1471      throw new IOException("Stream closed");
1472    }
1473
1474    final long remaining = getFileLength() - pos;
1475    return remaining <= Integer.MAX_VALUE ? (int) remaining : Integer.MAX_VALUE;
1476  }
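  // A small worked example (illustrative assumption, not from the original
  // source): for a 5 GB file with the stream positioned at offset 0, the true
  // remaining length does not fit in an int, so available() returns
  // Integer.MAX_VALUE; once fewer than Integer.MAX_VALUE bytes remain, the
  // exact count is returned.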
1477
1478  /**
1479   * We definitely don't support marks
1480   */
1481  @Override
1482  public boolean markSupported() {
1483    return false;
1484  }
1485  @Override
1486  public void mark(int readLimit) {
1487  }
1488  @Override
1489  public void reset() throws IOException {
1490    throw new IOException("Mark/reset not supported");
1491  }
1492
1493  /**
1494   * Pick the best node from which to stream the data.
1495   * Entries in <i>nodes</i> are already in priority order.
1496   */
1497  static DatanodeInfo bestNode(DatanodeInfo[] nodes,
1498      AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes,
1499      Collection<DatanodeInfo> ignoredNodes) throws IOException {
1500    if (nodes != null) {
1501      for (int i = 0; i < nodes.length; i++) {
1502        if (!deadNodes.containsKey(nodes[i])
1503            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
1504          return nodes[i];
1505        }
1506      }
1507    }
1508    throw new IOException("No live nodes contain current block");
1509  }
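  // Note (descriptive, based on the javadoc above): because the namenode hands
  // back block locations already sorted by priority, "best" here simply means
  // the first entry that is neither in deadNodes nor in ignoredNodes.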
1510
1511  /** Utility class to encapsulate data node info and its address. */
1512  static class DNAddrPair {
1513    final DatanodeInfo info;
1514    final InetSocketAddress addr;
1515    DNAddrPair(DatanodeInfo info, InetSocketAddress addr) {
1516      this.info = info;
1517      this.addr = addr;
1518    }
1519  }
1520
1521  /**
1522   * Get statistics about the reads which this DFSInputStream has done.
1523   */
1524  public synchronized ReadStatistics getReadStatistics() {
1525    return new ReadStatistics(readStatistics);
1526  }
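  // A hedged usage sketch (caller-side; the open() helper and the accessor
  // names are assumptions about the surrounding client API, not taken from
  // this file): after some reads, the statistics show how many bytes were
  // served by the ordinary, short-circuit, and zero-copy paths.
  //
  //   DFSInputStream in = dfsClient.open("/some/file");
  //   in.read(new byte[4096]);
  //   ReadStatistics stats = in.getReadStatistics();
  //   long total = stats.getTotalBytesRead();             // all bytes read
  //   long zeroCopy = stats.getTotalZeroCopyBytesRead();  // mmap'd bytes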
1527
1528  private synchronized void closeCurrentBlockReader() {
1529    if (blockReader == null) return;
1530    // Close the current block reader so that the new caching settings can 
1531    // take effect immediately.
1532    try {
1533      blockReader.close();
1534    } catch (IOException e) {
1535      DFSClient.LOG.error("error closing blockReader", e);
1536    }
1537    blockReader = null;
1538  }
1539
1540  @Override
1541  public synchronized void setReadahead(Long readahead)
1542      throws IOException {
1543    this.cachingStrategy =
1544        new CachingStrategy.Builder(this.cachingStrategy).
1545            setReadahead(readahead).build();
1546    closeCurrentBlockReader();
1547  }
1548
1549  @Override
1550  public synchronized void setDropBehind(Boolean dropBehind)
1551      throws IOException {
1552    this.cachingStrategy =
1553        new CachingStrategy.Builder(this.cachingStrategy).
1554            setDropBehind(dropBehind).build();
1555    closeCurrentBlockReader();
1556  }
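  // Usage sketch for the two caching hooks above (hypothetical caller code;
  // assumes the stream is accessed through FSDataInputStream, which forwards
  // CanSetReadahead / CanSetDropBehind calls to the wrapped stream):
  //
  //   FSDataInputStream in = fs.open(path);
  //   in.setReadahead(4L * 1024 * 1024);   // request ~4 MB of readahead
  //   in.setDropBehind(true);              // drop cached data behind the read
  //
  // Both setters rebuild the CachingStrategy and close the current block
  // reader, so the new settings take effect when the next block is opened.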
1557
1558  /**
1559   * The immutable empty buffer we return when we reach EOF when doing a
1560   * zero-copy read.
1561   */
1562  private static final ByteBuffer EMPTY_BUFFER =
1563    ByteBuffer.allocateDirect(0).asReadOnlyBuffer();
1564
1565  @Override
1566  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
1567      int maxLength, EnumSet<ReadOption> opts) 
1568          throws IOException, UnsupportedOperationException {
1569    if (maxLength == 0) {
1570      return EMPTY_BUFFER;
1571    } else if (maxLength < 0) {
1572      throw new IllegalArgumentException("can't read a negative " +
1573          "number of bytes.");
1574    }
1575    if ((blockReader == null) || (blockEnd == -1)) {
1576      if (pos >= getFileLength()) {
1577        return null;
1578      }
1579      /*
1580       * If we don't have a blockReader, or the one we have has no more bytes
1581       * left to read, we call seekToBlockSource to get a new blockReader and
1582       * recalculate blockEnd.  Note that we assume we're not at EOF here
1583       * (we check this above).
1584       */
1585      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
1586        throw new IOException("failed to allocate new BlockReader " +
1587            "at position " + pos);
1588      }
1589    }
1590    ByteBuffer buffer = null;
1591    if (dfsClient.getConf().shortCircuitMmapEnabled) {
1592      buffer = tryReadZeroCopy(maxLength, opts);
1593    }
1594    if (buffer != null) {
1595      return buffer;
1596    }
1597    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
1598    if (buffer != null) {
1599      extendedReadBuffers.put(buffer, bufferPool);
1600    }
1601    return buffer;
1602  }
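  // A hedged end-to-end sketch of the zero-copy read API implemented above
  // (caller-side; "fs", "path" and the process() consumer are assumptions, not
  // taken from this file):
  //
  //   ByteBufferPool pool = new ElasticByteBufferPool();
  //   FSDataInputStream in = fs.open(path);
  //   ByteBuffer buf = in.read(pool, 1024 * 1024,
  //       EnumSet.of(ReadOption.SKIP_CHECKSUMS));
  //   try {
  //     if (buf != null) {       // null means EOF was reached
  //       process(buf);
  //     }
  //   } finally {
  //     if (buf != null) {
  //       in.releaseBuffer(buf); // returns the mmap or pooled buffer
  //     }
  //   }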
1603
1604  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
1605      EnumSet<ReadOption> opts) throws IOException {
1606    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
1607    // JVM to optimize this function.
1608    final long curPos = pos;
1609    final long curEnd = blockEnd;
1610    final long blockStartInFile = currentLocatedBlock.getStartOffset();
1611    final long blockPos = curPos - blockStartInFile;
1612
1613    // Shorten this read if the end of the block is nearby.
1614    long length63;
1615    if ((curPos + maxLength) <= (curEnd + 1)) {
1616      length63 = maxLength;
1617    } else {
1618      length63 = 1 + curEnd - curPos;
1619      if (length63 <= 0) {
1620        if (DFSClient.LOG.isDebugEnabled()) {
1621          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1622            curPos + " of " + src + "; " + length63 + " bytes left in block.  " +
1623            "blockPos=" + blockPos + "; curPos=" + curPos +
1624            "; curEnd=" + curEnd);
1625        }
1626        return null;
1627      }
1628      if (DFSClient.LOG.isDebugEnabled()) {
1629        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1630            " to " + length63 + " to avoid going more than one byte " +
1631            "past the end of the block.  blockPos=" + blockPos +
1632            "; curPos=" + curPos + "; curEnd=" + curEnd);
1633      }
1634    }
1635    // Make sure that we don't go beyond 31-bit offsets in the MappedByteBuffer.
1636    int length;
1637    if (blockPos + length63 <= Integer.MAX_VALUE) {
1638      length = (int)length63;
1639    } else {
1640      long length31 = Integer.MAX_VALUE - blockPos;
1641      if (length31 <= 0) {
1642        // Java ByteBuffers can't be longer than 2 GB, because they use
1643        // 4-byte signed integers to represent capacity, etc.
1644        // So we can't mmap the parts of the block higher than the 2 GB offset.
1645        // FIXME: we could work around this with multiple memory maps.
1646        // See HDFS-5101.
1647        if (DFSClient.LOG.isDebugEnabled()) {
1648          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
1649            curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
1650            "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
1651        }
1652        return null;
1653      }
1654      length = (int)length31;
1655      if (DFSClient.LOG.isDebugEnabled()) {
1656        DFSClient.LOG.debug("Reducing read length from " + maxLength +
1657            " to " + length + " to avoid 31-bit limit.  " +
1658            "blockPos=" + blockPos + "; curPos=" + curPos +
1659            "; curEnd=" + curEnd);
1660      }
1661    }
1662    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
1663    if (clientMmap == null) {
1664      if (DFSClient.LOG.isDebugEnabled()) {
1665        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1666          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
1667          "null.");
1668      }
1669      return null;
1670    }
1671    boolean success = false;
1672    ByteBuffer buffer;
1673    try {
1674      seek(curPos + length);
1675      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
1676      buffer.position((int)blockPos);
1677      buffer.limit((int)(blockPos + length));
1678      extendedReadBuffers.put(buffer, clientMmap);
1679      readStatistics.addZeroCopyBytes(length);
1680      if (DFSClient.LOG.isDebugEnabled()) {
1681        DFSClient.LOG.debug("readZeroCopy read " + length + 
1682            " bytes from offset " + curPos + " via the zero-copy read " +
1683            "path.  blockEnd = " + blockEnd);
1684      }
1685      success = true;
1686    } finally {
1687      if (!success) {
1688        IOUtils.closeQuietly(clientMmap);
1689      }
1690    }
1691    return buffer;
1692  }
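  // A worked example of the clamping above (numbers are illustrative): with a
  // 256 MB block, blockStartInFile = 0, curPos = 100 MB and maxLength = 8 MB,
  // the request fits inside the block, so length63 = 8 MB; blockPos + length63
  // is far below Integer.MAX_VALUE, so length = 8 MB as well. Only when a
  // block is larger than 2 GB and blockPos + length63 crosses
  // Integer.MAX_VALUE is the read trimmed to length31 (or handed back to the
  // copying fallback when length31 <= 0), because a MappedByteBuffer cannot be
  // addressed past a 31-bit offset.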
1693
1694  @Override
1695  public synchronized void releaseBuffer(ByteBuffer buffer) {
1696    if (buffer == EMPTY_BUFFER) return;
1697    Object val = extendedReadBuffers.remove(buffer);
1698    if (val == null) {
1699      throw new IllegalArgumentException("tried to release a buffer " +
1700          "that was not created by this stream, " + buffer);
1701    }
1702    if (val instanceof ClientMmap) {
1703      IOUtils.closeQuietly((ClientMmap)val);
1704    } else if (val instanceof ByteBufferPool) {
1705      ((ByteBufferPool)val).putBuffer(buffer);
1706    }
1707  }
1708}