001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs;
019    
020    import java.io.FileInputStream;
021    import java.io.IOException;
022    import java.net.InetSocketAddress;
023    import java.net.Socket;
024    import java.nio.ByteBuffer;
025    import java.util.AbstractMap;
026    import java.util.ArrayList;
027    import java.util.EnumSet;
028    import java.util.HashMap;
029    import java.util.HashSet;
030    import java.util.Iterator;
031    import java.util.List;
032    import java.util.Map;
033    import java.util.Map.Entry;
034    import java.util.Set;
035    import java.util.concurrent.ConcurrentHashMap;
036    
037    import org.apache.commons.io.IOUtils;
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.fs.ByteBufferReadable;
040    import org.apache.hadoop.fs.ByteBufferUtil;
041    import org.apache.hadoop.fs.CanSetDropBehind;
042    import org.apache.hadoop.fs.CanSetReadahead;
043    import org.apache.hadoop.fs.ChecksumException;
044    import org.apache.hadoop.fs.FSInputStream;
045    import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
046    import org.apache.hadoop.fs.ReadOption;
047    import org.apache.hadoop.fs.UnresolvedLinkException;
048    import org.apache.hadoop.hdfs.client.ClientMmap;
049    import org.apache.hadoop.hdfs.net.DomainPeer;
050    import org.apache.hadoop.hdfs.net.Peer;
051    import org.apache.hadoop.hdfs.net.TcpPeerServer;
052    import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
053    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
054    import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
055    import org.apache.hadoop.hdfs.protocol.LocatedBlock;
056    import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
057    import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
058    import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
059    import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
060    import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
061    import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
062    import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
063    import org.apache.hadoop.io.ByteBufferPool;
064    import org.apache.hadoop.ipc.RPC;
065    import org.apache.hadoop.ipc.RemoteException;
066    import org.apache.hadoop.net.NetUtils;
067    import org.apache.hadoop.net.unix.DomainSocket;
068    import org.apache.hadoop.security.AccessControlException;
069    import org.apache.hadoop.security.token.SecretManager.InvalidToken;
070    import org.apache.hadoop.security.token.Token;
071    import org.apache.hadoop.util.IdentityHashStore;
072    
073    import com.google.common.annotations.VisibleForTesting;
074    
075    /****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and various datanodes as necessary.
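 *
 * A minimal usage sketch (illustrative only; streams are normally obtained
 * through DFSClient#open rather than constructed directly):
 * <pre>
 *   DFSInputStream in = dfsClient.open("/some/file");
 *   byte[] buf = new byte[8192];
 *   int nRead = in.read(buf, 0, buf.length);
 *   in.close();
 * </pre>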
078     ****************************************************************/
079    @InterfaceAudience.Private
080    public class DFSInputStream extends FSInputStream
081    implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
082        HasEnhancedByteBufferAccess {
083      @VisibleForTesting
084      static boolean tcpReadsDisabledForTesting = false;
085      private final PeerCache peerCache;
086      private final DFSClient dfsClient;
087      private boolean closed = false;
088      private final String src;
089      private BlockReader blockReader = null;
090      private final boolean verifyChecksum;
091      private LocatedBlocks locatedBlocks = null;
092      private long lastBlockBeingWrittenLength = 0;
093      private DatanodeInfo currentNode = null;
094      private LocatedBlock currentLocatedBlock = null;
095      private long pos = 0;
096      private long blockEnd = -1;
097      private CachingStrategy cachingStrategy;
098      private final ReadStatistics readStatistics = new ReadStatistics();
099    
100      /**
101       * Track the ByteBuffers that we have handed out to readers.
102       * 
103       * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
105       */
106      private final IdentityHashStore<ByteBuffer, Object>
107          extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
108    
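  /**
   * Accumulates statistics about how bytes have been read on this stream:
   * total, local, short-circuit, and zero-copy counts. A snapshot can be
   * taken via the copy constructor.
   */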
109      public static class ReadStatistics {
110        public ReadStatistics() {
111          this.totalBytesRead = 0;
112          this.totalLocalBytesRead = 0;
113          this.totalShortCircuitBytesRead = 0;
114          this.totalZeroCopyBytesRead = 0;
115        }
116    
117        public ReadStatistics(ReadStatistics rhs) {
118          this.totalBytesRead = rhs.getTotalBytesRead();
119          this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
120          this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
121          this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
122        }
123    
124        /**
125         * @return The total bytes read.  This will always be at least as
126         * high as the other numbers, since it includes all of them.
127         */
128        public long getTotalBytesRead() {
129          return totalBytesRead;
130        }
131    
132        /**
133         * @return The total local bytes read.  This will always be at least
134         * as high as totalShortCircuitBytesRead, since all short-circuit
135         * reads are also local.
136         */
137        public long getTotalLocalBytesRead() {
138          return totalLocalBytesRead;
139        }
140    
141        /**
142         * @return The total short-circuit local bytes read.
143         */
144        public long getTotalShortCircuitBytesRead() {
145          return totalShortCircuitBytesRead;
146        }
147        
148        /**
149         * @return The total number of zero-copy bytes read.
150         */
151        public long getTotalZeroCopyBytesRead() {
152          return totalZeroCopyBytesRead;
153        }
154    
155        /**
156         * @return The total number of bytes read which were not local.
157         */
158        public long getRemoteBytesRead() {
159          return totalBytesRead - totalLocalBytesRead;
160        }
161        
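    // The statistics form a strict hierarchy: zero-copy <= short-circuit <=
    // local <= total bytes read. Each mutator below therefore bumps its own
    // counter plus every broader counter that contains it.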
162        void addRemoteBytes(long amt) {
163          this.totalBytesRead += amt;
164        }
165    
166        void addLocalBytes(long amt) {
167          this.totalBytesRead += amt;
168          this.totalLocalBytesRead += amt;
169        }
170    
171        void addShortCircuitBytes(long amt) {
172          this.totalBytesRead += amt;
173          this.totalLocalBytesRead += amt;
174          this.totalShortCircuitBytesRead += amt;
175        }
176    
177        void addZeroCopyBytes(long amt) {
178          this.totalBytesRead += amt;
179          this.totalLocalBytesRead += amt;
180          this.totalShortCircuitBytesRead += amt;
181          this.totalZeroCopyBytesRead += amt;
182        }
183        
184        private long totalBytesRead;
185    
186        private long totalLocalBytesRead;
187    
188        private long totalShortCircuitBytesRead;
189    
190        private long totalZeroCopyBytesRead;
191      }
192      
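  // Cache of open FileInputStream pairs (block data and metadata files) used
  // for short-circuit local reads, so file descriptors can be reused.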
193      private final FileInputStreamCache fileInputStreamCache;
194    
195      /**
196       * This variable tracks the number of failures since the start of the
197       * most recent user-facing operation. That is to say, it should be reset
198       * whenever the user makes a call on this stream, and if at any point
199       * during the retry logic, the failure count exceeds a threshold,
200       * the errors will be thrown back to the operation.
201       *
202       * Specifically this counts the number of times the client has gone
203       * back to the namenode to get a new list of block locations, and is
204       * capped at maxBlockAcquireFailures
205       */
206      private int failures = 0;
207    
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly. */
210      private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
211                 new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
212      private int buffersize = 1;
213      
214      private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'
215    
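  /**
   * Exclude the given datanode from subsequent reads on this stream. The
   * dead-node set is cleared when block locations are refreshed in
   * chooseDataNode().
   */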
216      void addToDeadNodes(DatanodeInfo dnInfo) {
217        deadNodes.put(dnInfo, dnInfo);
218      }
219      
220      DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
221                     ) throws IOException, UnresolvedLinkException {
222        this.dfsClient = dfsClient;
223        this.verifyChecksum = verifyChecksum;
224        this.buffersize = buffersize;
225        this.src = src;
226        this.peerCache = dfsClient.peerCache;
227        this.fileInputStreamCache = new FileInputStreamCache(
228            dfsClient.getConf().shortCircuitStreamsCacheSize,
229            dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
230        this.cachingStrategy =
231            dfsClient.getDefaultReadCachingStrategy();
232        openInfo();
233      }
234    
235      /**
236       * Grab the open-file info from namenode
237       */
238      synchronized void openInfo() throws IOException, UnresolvedLinkException {
239        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
240        int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
241        while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report immediately, so partial block
      // locations will not yet be available from the NN for computing the
      // length. Retry (up to retryTimesForGetLastBlockLength times) to get it.
246          if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
            + " Will retry " + retriesForLastBlockLength + " more time(s)");
250            waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
251            lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
252          } else {
253            break;
254          }
255          retriesForLastBlockLength--;
256        }
257        if (retriesForLastBlockLength == 0) {
258          throw new IOException("Could not obtain the last block locations.");
259        }
260      }
261    
262      private void waitFor(int waitTime) throws IOException {
263        try {
264          Thread.sleep(waitTime);
265        } catch (InterruptedException e) {
266          throw new IOException(
267              "Interrupted while getting the last block length.");
268        }
269      }
270    
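  /**
   * Refresh the cached block locations from the namenode and return the
   * visible length of the last block if it is still under construction
   * (0 if the last block is complete or has no data yet, -1 if its
   * locations are not yet available from the datanodes).
   */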
271      private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
272        final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
273        if (DFSClient.LOG.isDebugEnabled()) {
274          DFSClient.LOG.debug("newInfo = " + newInfo);
275        }
276        if (newInfo == null) {
277          throw new IOException("Cannot open filename " + src);
278        }
279    
280        if (locatedBlocks != null) {
281          Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
282          Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
283          while (oldIter.hasNext() && newIter.hasNext()) {
284            if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
285              throw new IOException("Blocklist for " + src + " has changed!");
286            }
287          }
288        }
289        locatedBlocks = newInfo;
290        long lastBlockBeingWrittenLength = 0;
291        if (!locatedBlocks.isLastBlockComplete()) {
292          final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
293          if (last != null) {
294            if (last.getLocations().length == 0) {
295              if (last.getBlockSize() == 0) {
            // If the length is zero, then no data has been written to the
            // datanode, so there is no need to wait for the locations.
298                return 0;
299              }
300              return -1;
301            }
302            final long len = readBlockLength(last);
303            last.getBlock().setNumBytes(len);
304            lastBlockBeingWrittenLength = len; 
305          }
306        }
307    
308        currentNode = null;
309        return lastBlockBeingWrittenLength;
310      }
311    
312      /** Read the block length from one of the datanodes. */
313      private long readBlockLength(LocatedBlock locatedblock) throws IOException {
314        assert locatedblock != null : "LocatedBlock cannot be null";
315        int replicaNotFoundCount = locatedblock.getLocations().length;
316        
317        for(DatanodeInfo datanode : locatedblock.getLocations()) {
318          ClientDatanodeProtocol cdp = null;
319          
320          try {
321            cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
322                dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
323                dfsClient.getConf().connectToDnViaHostname, locatedblock);
324            
325            final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
326            
327            if (n >= 0) {
328              return n;
329            }
330          }
331          catch(IOException ioe) {
332            if (ioe instanceof RemoteException &&
333              (((RemoteException) ioe).unwrapRemoteException() instanceof
334                ReplicaNotFoundException)) {
335              // special case : replica might not be on the DN, treat as 0 length
336              replicaNotFoundCount--;
337            }
338            
339            if (DFSClient.LOG.isDebugEnabled()) {
340              DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
341                  + datanode + " for block " + locatedblock.getBlock(), ioe);
342            }
343          } finally {
344            if (cdp != null) {
345              RPC.stopProxy(cdp);
346            }
347          }
348        }
349    
    // The namenode told us about these locations, but none of the datanodes
    // know about the replica. This means we hit the race between pipeline
    // creation start and end. We require every location to report
    // ReplicaNotFoundException because some other exception could have
    // happened on a DN that does have the replica, and we want to surface
    // that error rather than silently treating the block as zero-length.
354        if (replicaNotFoundCount == 0) {
355          return 0;
356        }
357    
358        throw new IOException("Cannot obtain block length for " + locatedblock);
359      }
360      
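  /**
   * @return the file length as currently known: the length of all completed
   *         blocks plus the length of the last block still being written,
   *         or 0 if block locations have not been fetched yet.
   */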
361      public synchronized long getFileLength() {
362        return locatedBlocks == null? 0:
363            locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
364      }
365    
366      // Short circuit local reads are forbidden for files that are
367      // under construction.  See HDFS-2757.
368      synchronized boolean shortCircuitForbidden() {
369        return locatedBlocks.isUnderConstruction();
370      }
371    
372      /**
373       * Returns the datanode from which the stream is currently reading.
374       */
375      public DatanodeInfo getCurrentDatanode() {
376        return currentNode;
377      }
378    
379      /**
380       * Returns the block containing the target position. 
381       */
382      synchronized public ExtendedBlock getCurrentBlock() {
383        if (currentLocatedBlock == null){
384          return null;
385        }
386        return currentLocatedBlock.getBlock();
387      }
388    
389      /**
   * Return the collection of blocks that have already been located.
391       */
392      public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
393        return getBlockRange(0, getFileLength());
394      }
395    
396      /**
397       * Get block at the specified position.
398       * Fetch it from the namenode if not cached.
399       * 
   * @param offset the offset in the file whose enclosing block is requested
   * @param updatePosition whether to update current position
   * @return located block
   * @throws IOException if the block at the given offset cannot be obtained
404       */
405      private synchronized LocatedBlock getBlockAt(long offset,
406          boolean updatePosition) throws IOException {
407        assert (locatedBlocks != null) : "locatedBlocks is null";
408    
409        final LocatedBlock blk;
410    
411        //check offset
412        if (offset < 0 || offset >= getFileLength()) {
413          throw new IOException("offset < 0 || offset >= getFileLength(), offset="
414              + offset
415              + ", updatePosition=" + updatePosition
416              + ", locatedBlocks=" + locatedBlocks);
417        }
418        else if (offset >= locatedBlocks.getFileLength()) {
419          // offset to the portion of the last block,
420          // which is not known to the name-node yet;
421          // getting the last block 
422          blk = locatedBlocks.getLastLocatedBlock();
423        }
424        else {
425          // search cached blocks first
426          int targetBlockIdx = locatedBlocks.findBlock(offset);
427          if (targetBlockIdx < 0) { // block is not cached
428            targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
429            // fetch more blocks
430            final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
431            assert (newBlocks != null) : "Could not find target position " + offset;
432            locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
433          }
434          blk = locatedBlocks.get(targetBlockIdx);
435        }
436    
437        // update current position
438        if (updatePosition) {
439          pos = offset;
440          blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
441          currentLocatedBlock = blk;
442        }
443        return blk;
444      }
445    
446      /** Fetch a block from namenode and cache it */
447      private synchronized void fetchBlockAt(long offset) throws IOException {
448        int targetBlockIdx = locatedBlocks.findBlock(offset);
449        if (targetBlockIdx < 0) { // block is not cached
450          targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
451        }
452        // fetch blocks
453        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
454        if (newBlocks == null) {
455          throw new IOException("Could not find target position " + offset);
456        }
457        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
458      }
459    
460      /**
461       * Get blocks in the specified range.
462       * Fetch them from the namenode if not cached. This function
463       * will not get a read request beyond the EOF.
   * @param offset starting offset of the range, in bytes from the file start
   * @param length length of the range, in bytes
   * @return consecutive segment of located blocks covering the range
   * @throws IOException if the offset is at or beyond the end of the file
468       */
469      private synchronized List<LocatedBlock> getBlockRange(long offset, 
470                                                            long length) 
471                                                          throws IOException {
472        // getFileLength(): returns total file length
473        // locatedBlocks.getFileLength(): returns length of completed blocks
474        if (offset >= getFileLength()) {
475          throw new IOException("Offset: " + offset +
476            " exceeds file length: " + getFileLength());
477        }
478    
479        final List<LocatedBlock> blocks;
480        final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
481        final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
482        final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
483    
484        if (readOffsetWithinCompleteBlk) {
485          //get the blocks of finalized (completed) block range
486          blocks = getFinalizedBlockRange(offset, 
487            Math.min(length, lengthOfCompleteBlk - offset));
488        } else {
489          blocks = new ArrayList<LocatedBlock>(1);
490        }
491    
492        // get the blocks from incomplete block range
493        if (readLengthPastCompleteBlk) {
494           blocks.add(locatedBlocks.getLastLocatedBlock());
495        }
496    
497        return blocks;
498      }
499    
500      /**
501       * Get blocks in the specified range.
502       * Includes only the complete blocks.
503       * Fetch them from the namenode if not cached.
504       */
505      private synchronized List<LocatedBlock> getFinalizedBlockRange(
506          long offset, long length) throws IOException {
507        assert (locatedBlocks != null) : "locatedBlocks is null";
508        List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
509        // search cached blocks first
510        int blockIdx = locatedBlocks.findBlock(offset);
511        if (blockIdx < 0) { // block is not cached
512          blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
513        }
514        long remaining = length;
515        long curOff = offset;
516        while(remaining > 0) {
517          LocatedBlock blk = null;
518          if(blockIdx < locatedBlocks.locatedBlockCount())
519            blk = locatedBlocks.get(blockIdx);
520          if (blk == null || curOff < blk.getStartOffset()) {
521            LocatedBlocks newBlocks;
522            newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
523            locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
524            continue;
525          }
526          assert curOff >= blk.getStartOffset() : "Block not found";
527          blockRange.add(blk);
528          long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
529          remaining -= bytesRead;
530          curOff += bytesRead;
531          blockIdx++;
532        }
533        return blockRange;
534      }
535    
536      /**
   * Open a BlockReader to a DataNode so that the target block can be read.
   * We get the block ID and the IDs of the destination datanodes from the namenode.
539       */
540      private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
541        if (target >= getFileLength()) {
542          throw new IOException("Attempted to read past end of file");
543        }
544    
545        // Will be getting a new BlockReader.
546        if (blockReader != null) {
547          blockReader.close();
548          blockReader = null;
549        }
550    
551        //
552        // Connect to best DataNode for desired Block, with potential offset
553        //
554        DatanodeInfo chosenNode = null;
555        int refetchToken = 1; // only need to get a new access token once
556        int refetchEncryptionKey = 1; // only need to get a new encryption key once
557        
558        boolean connectFailedOnce = false;
559    
560        while (true) {
561          //
562          // Compute desired block
563          //
564          LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + " expected " + target;
566          long offsetIntoBlock = target - targetBlock.getStartOffset();
567    
568          DNAddrPair retval = chooseDataNode(targetBlock);
569          chosenNode = retval.info;
570          InetSocketAddress targetAddr = retval.addr;
571    
572          try {
573            ExtendedBlock blk = targetBlock.getBlock();
574            Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
575            blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
576                accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
577                buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy);
578            if(connectFailedOnce) {
579              DFSClient.LOG.info("Successfully connected to " + targetAddr +
580                                 " for " + blk);
581            }
582            return chosenNode;
583          } catch (AccessControlException ex) {
584            DFSClient.LOG.warn("Short circuit access failed " + ex);
585            dfsClient.disableLegacyBlockReaderLocal();
586            continue;
587          } catch (IOException ex) {
588            if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
589              DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
590                  + "encryption key was invalid when connecting to " + targetAddr
591                  + " : " + ex);
592              // The encryption key used is invalid.
593              refetchEncryptionKey--;
594              dfsClient.clearDataEncryptionKey();
595            } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
596              refetchToken--;
597              fetchBlockAt(target);
598            } else {
599              connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block "
              + targetBlock.getBlock() + ", add to deadNodes and continue. " + ex, ex);
602              // Put chosen node into dead list, continue
603              addToDeadNodes(chosenNode);
604            }
605          }
606        }
607      }
608    
609      /**
610       * Close it down!
611       */
612      @Override
613      public synchronized void close() throws IOException {
614        if (closed) {
615          return;
616        }
617        dfsClient.checkOpen();
618    
619        if (!extendedReadBuffers.isEmpty()) {
620          final StringBuilder builder = new StringBuilder();
621          extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
622            private String prefix = "";
623            @Override
624            public void accept(ByteBuffer k, Object v) {
625              builder.append(prefix).append(k);
626              prefix = ", ";
627            }
628          });
629          DFSClient.LOG.warn("closing file " + src + ", but there are still " +
630              "unreleased ByteBuffers allocated by read().  " +
631              "Please release " + builder.toString() + ".");
632        }
633        if (blockReader != null) {
634          blockReader.close();
635          blockReader = null;
636        }
637        super.close();
638        fileInputStreamCache.close();
639        closed = true;
640      }
641    
642      @Override
643      public synchronized int read() throws IOException {
644        int ret = read( oneByteBuf, 0, 1 );
645        return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
646      }
647    
648      /**
649       * Wraps different possible read implementations so that readBuffer can be
650       * strategy-agnostic.
651       */
652      private interface ReaderStrategy {
653        public int doRead(BlockReader blockReader, int off, int len,
654            ReadStatistics readStatistics) throws ChecksumException, IOException;
655      }
656    
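  /**
   * Attribute nRead bytes to the appropriate counters, depending on whether
   * the supplying BlockReader is short-circuit, local, or remote. Does
   * nothing when nRead is not positive.
   */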
657      private static void updateReadStatistics(ReadStatistics readStatistics, 
658            int nRead, BlockReader blockReader) {
659        if (nRead <= 0) return;
660        if (blockReader.isShortCircuit()) {
661          readStatistics.totalBytesRead += nRead;
662          readStatistics.totalLocalBytesRead += nRead;
663          readStatistics.totalShortCircuitBytesRead += nRead;
664        } else if (blockReader.isLocal()) {
665          readStatistics.totalBytesRead += nRead;
666          readStatistics.totalLocalBytesRead += nRead;
667        } else {
668          readStatistics.totalBytesRead += nRead;
669        }
670      }
671      
672      /**
673       * Used to read bytes into a byte[]
674       */
675      private static class ByteArrayStrategy implements ReaderStrategy {
676        final byte[] buf;
677    
678        public ByteArrayStrategy(byte[] buf) {
679          this.buf = buf;
680        }
681    
682        @Override
683        public int doRead(BlockReader blockReader, int off, int len,
684                ReadStatistics readStatistics) throws ChecksumException, IOException {
685            int nRead = blockReader.read(buf, off, len);
686            updateReadStatistics(readStatistics, nRead, blockReader);
687            return nRead;
688        }
689      }
690    
691      /**
692       * Used to read bytes into a user-supplied ByteBuffer
693       */
694      private static class ByteBufferStrategy implements ReaderStrategy {
695        final ByteBuffer buf;
696        ByteBufferStrategy(ByteBuffer buf) {
697          this.buf = buf;
698        }
699    
700        @Override
701        public int doRead(BlockReader blockReader, int off, int len,
702            ReadStatistics readStatistics) throws ChecksumException, IOException {
703          int oldpos = buf.position();
704          int oldlimit = buf.limit();
705          boolean success = false;
706          try {
707            int ret = blockReader.read(buf);
708            success = true;
709            updateReadStatistics(readStatistics, ret, blockReader);
710            return ret;
711          } finally {
712            if (!success) {
713              // Reset to original state so that retries work correctly.
714              buf.position(oldpos);
715              buf.limit(oldlimit);
716            }
717          } 
718        }
719      }
720    
  /* This is used by regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
725      private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
726          Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
727          throws IOException {
728        IOException ioe;
729        
    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure of the datanode or client: the DataNode closing the connection
     * because the client has been idle. If there are other cases of
     * "non-errors" then a datanode might be retried by setting this to true
     * again.
     */
736        boolean retryCurrentNode = true;
737    
738        while (true) {
739          // retry as many times as seekToNewSource allows.
740          try {
741            return reader.doRead(blockReader, off, len, readStatistics);
742          } catch ( ChecksumException ce ) {
743            DFSClient.LOG.warn("Found Checksum error for "
744                + getCurrentBlock() + " from " + currentNode
745                + " at " + ce.getPos());        
746            ioe = ce;
747            retryCurrentNode = false;
748            // we want to remember which block replicas we have tried
749            addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
750                corruptedBlockMap);
751          } catch ( IOException e ) {
752            if (!retryCurrentNode) {
753              DFSClient.LOG.warn("Exception while reading from "
754                  + getCurrentBlock() + " of " + src + " from "
755                  + currentNode, e);
756            }
757            ioe = e;
758          }
759          boolean sourceFound = false;
760          if (retryCurrentNode) {
761            /* possibly retry the same node so that transient errors don't
762             * result in application level failures (e.g. Datanode could have
763             * closed the connection because the client is idle for too long).
764             */ 
765            sourceFound = seekToBlockSource(pos);
766          } else {
767            addToDeadNodes(currentNode);
768            sourceFound = seekToNewSource(pos);
769          }
770          if (!sourceFound) {
771            throw ioe;
772          }
773          retryCurrentNode = false;
774        }
775      }
776    
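  /**
   * Core sequential-read loop shared by the byte[] and ByteBuffer read paths:
   * positions the stream on the block containing pos, delegates the actual
   * read to readBuffer(), and on non-checksum IOExceptions marks the current
   * node dead and retries once.
   */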
777      private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
778        dfsClient.checkOpen();
779        if (closed) {
780          throw new IOException("Stream closed");
781        }
782        Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
783          = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
784        failures = 0;
785        if (pos < getFileLength()) {
786          int retries = 2;
787          while (retries > 0) {
788            try {
789              // currentNode can be left as null if previous read had a checksum
790              // error on the same block. See HDFS-3067
791              if (pos > blockEnd || currentNode == null) {
792                currentNode = blockSeekTo(pos);
793              }
794              int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
795              if (locatedBlocks.isLastBlockComplete()) {
796                realLen = (int) Math.min(realLen, locatedBlocks.getFileLength());
797              }
798              int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
799              
800              if (result >= 0) {
801                pos += result;
802              } else {
            // got an EOS from the reader though we expect more data on it.
804                throw new IOException("Unexpected EOS from the reader");
805              }
806              if (dfsClient.stats != null && result != -1) {
807                dfsClient.stats.incrementBytesRead(result);
808              }
809              return result;
810            } catch (ChecksumException ce) {
811              throw ce;            
812            } catch (IOException e) {
813              if (retries == 1) {
814                DFSClient.LOG.warn("DFS Read", e);
815              }
816              blockEnd = -1;
817              if (currentNode != null) { addToDeadNodes(currentNode); }
818              if (--retries == 0) {
819                throw e;
820              }
821            } finally {
          // Check if we need to report block replica corruption, whether the
          // read was successful or a ChecksumException occurred.
824              reportCheckSumFailure(corruptedBlockMap, 
825                  currentLocatedBlock.getLocations().length);
826            }
827          }
828        }
829        return -1;
830      }
831    
832      /**
   * Read up to len bytes into buf, starting at offset off.
834       */
835      @Override
836      public synchronized int read(final byte buf[], int off, int len) throws IOException {
837        ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
838    
839        return readWithStrategy(byteArrayReader, off, len);
840      }
841    
842      @Override
843      public synchronized int read(final ByteBuffer buf) throws IOException {
844        ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
845    
846        return readWithStrategy(byteBufferReader, 0, buf.remaining());
847      }
848    
849    
850      /**
851       * Add corrupted block replica into map.
852       * @param corruptedBlockMap 
853       */
854      private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
855          Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
856        Set<DatanodeInfo> dnSet = null;
    if (corruptedBlockMap.containsKey(blk)) {
      dnSet = corruptedBlockMap.get(blk);
    } else {
      dnSet = new HashSet<DatanodeInfo>();
    }
862        if (!dnSet.contains(node)) {
863          dnSet.add(node);
864          corruptedBlockMap.put(blk, dnSet);
865        }
866      }
867          
868      private DNAddrPair chooseDataNode(LocatedBlock block)
869        throws IOException {
870        while (true) {
871          DatanodeInfo[] nodes = block.getLocations();
872          try {
873            DatanodeInfo chosenNode = bestNode(nodes, deadNodes);
874            final String dnAddr =
875                chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
876            if (DFSClient.LOG.isDebugEnabled()) {
877              DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
878            }
879            InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
880            return new DNAddrPair(chosenNode, targetAddr);
881          } catch (IOException ie) {
882            String blockInfo = block.getBlock() + " file=" + src;
883            if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
884              throw new BlockMissingException(src, "Could not obtain block: " + blockInfo,
885                                              block.getStartOffset());
886            }
887            
888            if (nodes == null || nodes.length == 0) {
889              DFSClient.LOG.info("No node available for " + blockInfo);
890            }
891            DFSClient.LOG.info("Could not obtain " + block.getBlock()
892                + " from any node: " + ie
893                + ". Will get new block locations from namenode and retry...");
894            try {
          // Introduce a random factor into the wait time before another retry.
          // The wait time depends on the number of failures plus a random factor.
          // On the first BlockMissingException the wait time is a random number
          // between 0 and 3000 ms. If the first retry still fails, we wait a
          // 3000 ms grace period before the 2nd retry, and the waiting window
          // expands to 6000 ms to reduce the request rate on the server.
          // Similarly, the 3rd retry waits a 6000 ms grace period before
          // retrying and the waiting window expands to 9000 ms.
904              final int timeWindow = dfsClient.getConf().timeWindow;
905              double waitTime = timeWindow * failures +       // grace period for the last round of attempt
906                timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
907              DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
908              Thread.sleep((long)waitTime);
909            } catch (InterruptedException iex) {
910            }
911            deadNodes.clear(); //2nd option is to remove only nodes[blockId]
912            openInfo();
913            block = getBlockAt(block.getStartOffset(), false);
914            failures++;
915            continue;
916          }
917        }
918      } 
919          
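  /**
   * Read the byte range [start, end] of the given block from a single
   * datanode into buf at the given offset, choosing another replica and
   * retrying on failure. Used by the positional-read (pread) code path.
   */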
920      private void fetchBlockByteRange(LocatedBlock block, long start, long end,
921          byte[] buf, int offset,
922          Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
923          throws IOException {
924        //
925        // Connect to best DataNode for desired Block, with potential offset
926        //
927        int refetchToken = 1; // only need to get a new access token once
928        int refetchEncryptionKey = 1; // only need to get a new encryption key once
929        
930        while (true) {
931          // cached block locations may have been updated by chooseDataNode()
932          // or fetchBlockAt(). Always get the latest list of locations at the 
933          // start of the loop.
934          CachingStrategy curCachingStrategy;
935          synchronized (this) {
936            block = getBlockAt(block.getStartOffset(), false);
937            curCachingStrategy = cachingStrategy;
938          }
939          DNAddrPair retval = chooseDataNode(block);
940          DatanodeInfo chosenNode = retval.info;
941          InetSocketAddress targetAddr = retval.addr;
942          BlockReader reader = null;
943              
944          try {
945            Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
946                
947            int len = (int) (end - start + 1);
948            reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
949                blockToken, start, len, buffersize, verifyChecksum,
950                dfsClient.clientName, curCachingStrategy);
951            int nread = reader.readAll(buf, offset, len);
952            if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
955            }
956            return;
957          } catch (ChecksumException e) {
958            DFSClient.LOG.warn("fetchBlockByteRange(). Got a checksum exception for " +
959                     src + " at " + block.getBlock() + ":" + 
960                     e.getPos() + " from " + chosenNode);
961            // we want to remember what we have tried
962            addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
963          } catch (AccessControlException ex) {
964            DFSClient.LOG.warn("Short circuit access failed " + ex);
965            dfsClient.disableLegacyBlockReaderLocal();
966            continue;
967          } catch (IOException e) {
968            if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
969              DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
970                  + "encryption key was invalid when connecting to " + targetAddr
971                  + " : " + e);
972              // The encryption key used is invalid.
973              refetchEncryptionKey--;
974              dfsClient.clearDataEncryptionKey();
975              continue;
976            } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
977              refetchToken--;
978              fetchBlockAt(block.getStartOffset());
979              continue;
980            } else {
981              DFSClient.LOG.warn("Failed to connect to " + targetAddr + 
982                  " for file " + src + " for block " + block.getBlock() + ":" + e);
983              if (DFSClient.LOG.isDebugEnabled()) {
984                DFSClient.LOG.debug("Connection failure ", e);
985              }
986            }
987          } finally {
988            if (reader != null) {
989              reader.close();
990            }
991          }
992          // Put chosen node into dead list, continue
993          addToDeadNodes(chosenNode);
994        }
995      }
996    
997      /**
   * Should the block access token be refetched on an exception?
   * 
   * @param ex Exception received
   * @param targetAddr Target datanode address from which the exception was received
   * @return true if the block access token has expired or is invalid and should
   *         be refetched
1004       */
1005      private static boolean tokenRefetchNeeded(IOException ex,
1006          InetSocketAddress targetAddr) {
1007        /*
1008         * Get a new access token and retry. Retry is needed in 2 cases. 1)
1009         * When both NN and DN re-started while DFSClient holding a cached
1010         * access token. 2) In the case that NN fails to update its
1011         * access key at pre-set interval (by a wide margin) and
1012         * subsequently restarts. In this case, DN re-registers itself with
1013         * NN and receives a new access key, but DN will delete the old
1014         * access key from its memory since it's considered expired based on
1015         * the estimated expiration date.
1016         */
1017        if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1018          DFSClient.LOG.info("Access token was invalid when connecting to "
1019              + targetAddr + " : " + ex);
1020          return true;
1021        }
1022        return false;
1023      }
1024    
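  /**
   * Open a new TCP connection to the given datanode address and wrap it in a
   * Peer, applying data transfer encryption when an encryption key is in use.
   * The socket is closed if constructing the Peer fails.
   */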
1025      private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
1026        Peer peer = null;
1027        boolean success = false;
1028        Socket sock = null;
1029        try {
1030          sock = dfsClient.socketFactory.createSocket();
1031          NetUtils.connect(sock, addr,
1032            dfsClient.getRandomLocalInterfaceAddr(),
1033            dfsClient.getConf().socketTimeout);
1034          peer = TcpPeerServer.peerFromSocketAndKey(sock, 
1035              dfsClient.getDataEncryptionKey());
1036          success = true;
1037          return peer;
1038        } finally {
1039          if (!success) {
1040            IOUtils.closeQuietly(peer);
1041            IOUtils.closeQuietly(sock);
1042          }
1043        }
1044      }
1045    
1046      /**
1047       * Retrieve a BlockReader suitable for reading.
1048       * This method will reuse the cached connection to the DN if appropriate.
1049       * Otherwise, it will create a new connection.
1050       * Throwing an IOException from this method is basically equivalent to 
1051       * declaring the DataNode bad, so we try to connect a lot of different ways
1052       * before doing that.
1053       *
1054       * @param dnAddr  Address of the datanode
1055       * @param chosenNode Chosen datanode information
1056       * @param file  File location
1057       * @param block  The Block object
1058       * @param blockToken  The access token for security
1059       * @param startOffset  The read offset, relative to block head
1060       * @param len  The number of bytes to read
1061       * @param bufferSize  The IO buffer size (not the client buffer size)
1062       * @param verifyChecksum  Whether to verify checksum
1063       * @param clientName  Client name
   * @param curCachingStrategy  caching strategy to use
1065       * @return New BlockReader instance
1066       */
1067      protected BlockReader getBlockReader(InetSocketAddress dnAddr,
1068                                           DatanodeInfo chosenNode,
1069                                           String file,
1070                                           ExtendedBlock block,
1071                                           Token<BlockTokenIdentifier> blockToken,
1072                                           long startOffset,
1073                                           long len,
1074                                           int bufferSize,
1075                                           boolean verifyChecksum,
1076                                           String clientName,
1077                                           CachingStrategy curCachingStrategy)
1078          throws IOException {
1079        // Firstly, we check to see if we have cached any file descriptors for
1080        // local blocks.  If so, we can just re-use those file descriptors.
1081        FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block);
1082        if (fis != null) {
1083          if (DFSClient.LOG.isDebugEnabled()) {
1084            DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
1085                "the FileInputStreamCache.");
1086          }
1087          return new BlockReaderLocal.Builder(dfsClient.getConf()).
1088              setFilename(file).
1089              setBlock(block).
1090              setStartOffset(startOffset).
1091              setStreams(fis).
1092              setDatanodeID(chosenNode).
1093              setVerifyChecksum(verifyChecksum).
1094              setBlockMetadataHeader(BlockMetadataHeader.
1095                  preadHeader(fis[1].getChannel())).
1096              setFileInputStreamCache(fileInputStreamCache).
1097              setCachingStrategy(curCachingStrategy).
1098              build();
1099        }
1100        
1101        // If the legacy local block reader is enabled and we are reading a local
1102        // block, try to create a BlockReaderLocalLegacy.  The legacy local block
1103        // reader implements local reads in the style first introduced by HDFS-2246.
1104        if ((dfsClient.useLegacyBlockReaderLocal()) &&
1105            DFSClient.isLocalAddress(dnAddr) &&
1106            (!shortCircuitForbidden())) {
1107          try {
1108            return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient,
1109                clientName, block, blockToken, chosenNode, startOffset);
1110          } catch (IOException e) {
1111            DFSClient.LOG.warn("error creating legacy BlockReaderLocal.  " +
1112                "Disabling legacy local reads.", e);
1113            dfsClient.disableLegacyBlockReaderLocal();
1114          }
1115        }
1116    
1117        // Look for cached domain peers.
1118        int cacheTries = 0;
1119        DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
1120        BlockReader reader = null;
1121        final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
1122        for (; cacheTries < nCachedConnRetry; ++cacheTries) {
1123          Peer peer = peerCache.get(chosenNode, true);
1124          if (peer == null) break;
1125          try {
1126            boolean allowShortCircuitLocalReads = dfsClient.getConf().
1127                shortCircuitLocalReads && (!shortCircuitForbidden());
1128            reader = BlockReaderFactory.newBlockReader(
1129                dfsClient.getConf(), file, block, blockToken, startOffset,
1130                len, verifyChecksum, clientName, peer, chosenNode, 
1131                dsFactory, peerCache, fileInputStreamCache,
1132                allowShortCircuitLocalReads, curCachingStrategy);
1133            return reader;
1134          } catch (IOException ex) {
1135            DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
1136                "Closing stale " + peer, ex);
1137          } finally {
1138            if (reader == null) {
1139              IOUtils.closeQuietly(peer);
1140            }
1141          }
1142        }
1143    
1144        // Try to create a DomainPeer.
1145        DomainSocket domSock = dsFactory.create(dnAddr, this);
1146        if (domSock != null) {
1147          Peer peer = new DomainPeer(domSock);
1148          try {
1149            boolean allowShortCircuitLocalReads = dfsClient.getConf().
1150                shortCircuitLocalReads && (!shortCircuitForbidden());
1151            reader = BlockReaderFactory.newBlockReader(
1152                dfsClient.getConf(), file, block, blockToken, startOffset,
1153                len, verifyChecksum, clientName, peer, chosenNode,
1154                dsFactory, peerCache, fileInputStreamCache,
1155                allowShortCircuitLocalReads, curCachingStrategy);
1156            return reader;
1157          } catch (IOException e) {
1158            DFSClient.LOG.warn("failed to connect to " + domSock, e);
1159          } finally {
1160            if (reader == null) {
          // If the Peer that we got the error from was a DomainPeer,
          // mark the socket path as bad, so that newDataSocket will not try
          // to re-open this socket for a while.
          dsFactory.disableDomainSocketPath(domSock.getPath());
          IOUtils.closeQuietly(peer);
1166            }
1167          }
1168        }
1169    
1170        // Look for cached peers.
1171        for (; cacheTries < nCachedConnRetry; ++cacheTries) {
1172          Peer peer = peerCache.get(chosenNode, false);
1173          if (peer == null) break;
1174          try {
1175            reader = BlockReaderFactory.newBlockReader(
1176                dfsClient.getConf(), file, block, blockToken, startOffset,
1177                len, verifyChecksum, clientName, peer, chosenNode, 
1178                dsFactory, peerCache, fileInputStreamCache, false,
1179                curCachingStrategy);
1180            return reader;
1181          } catch (IOException ex) {
1182            DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
1183              peer, ex);
1184          } finally {
1185            if (reader == null) {
1186              IOUtils.closeQuietly(peer);
1187            }
1188          }
1189        }
1190        if (tcpReadsDisabledForTesting) {
1191          throw new IOException("TCP reads are disabled.");
1192        }
1193        // Try to create a new remote peer.
1194        Peer peer = newTcpPeer(dnAddr);
1195        try {
1196          reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file,
1197              block, blockToken, startOffset, len, verifyChecksum, clientName,
1198              peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false,
1199            curCachingStrategy);
1200          return reader;
1201        } catch (IOException ex) {
1202          DFSClient.LOG.debug(
1203              "Exception while getting block reader, closing stale " + peer, ex);
1204          throw ex;
    } finally {
      if (reader == null) {
        IOUtils.closeQuietly(peer);
      }
    }
1210      }
1211    
1212    
1213      /**
1214       * Read bytes starting from the specified position.
1215       * 
1216       * @param position start read from this position
1217       * @param buffer read buffer
1218       * @param offset offset into buffer
1219       * @param length number of bytes to read
1220       * 
1221       * @return actual number of bytes read
1222       */
1223      @Override
1224      public int read(long position, byte[] buffer, int offset, int length)
1225        throws IOException {
1226        // sanity checks
1227        dfsClient.checkOpen();
1228        if (closed) {
1229          throw new IOException("Stream closed");
1230        }
1231        failures = 0;
1232        long filelen = getFileLength();
1233        if ((position < 0) || (position >= filelen)) {
1234          return -1;
1235        }
1236        int realLen = length;
1237        if ((position + length) > filelen) {
1238          realLen = (int)(filelen - position);
1239        }
1240        
1241        // determine the block and byte range within the block
1242        // corresponding to position and realLen
1243        List<LocatedBlock> blockRange = getBlockRange(position, realLen);
1244        int remaining = realLen;
1245        Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
1246          = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
1247        for (LocatedBlock blk : blockRange) {
1248          long targetStart = position - blk.getStartOffset();
1249          long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
1250          try {
1251            fetchBlockByteRange(blk, targetStart, 
1252                targetStart + bytesToRead - 1, buffer, offset, corruptedBlockMap);
1253          } finally {
1254            // Check and report if any block replicas are corrupted.
1255            // BlockMissingException may be caught if all block replicas are
1256            // corrupted.
1257            reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
1258          }
1259    
1260          remaining -= bytesToRead;
1261          position += bytesToRead;
1262          offset += bytesToRead;
1263        }
1264        assert remaining == 0 : "Wrong number of bytes read.";
1265        if (dfsClient.stats != null) {
1266          dfsClient.stats.incrementBytesRead(realLen);
1267        }
1268        return realLen;
1269      }
1270      
1271      /**
   * DFSInputStream reports checksum failure.
   * Case I : the client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted block
   * replicas to the namenode.
   * Case II: the client has tried all data nodes, but all attempts failed. We
   * only report if the total number of replicas is 1. We do not report
   * otherwise, since the failures may be due to the client itself being unable
   * to read (rather than the replicas being corrupt).
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that contain the block replicas
1282       */
1283      private void reportCheckSumFailure(
1284          Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
1285          int dataNodeCount) {
1286        if (corruptedBlockMap.isEmpty()) {
1287          return;
1288        }
1289        Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
1290            .entrySet().iterator();
1291        Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
1292        ExtendedBlock blk = entry.getKey();
1293        Set<DatanodeInfo> dnSet = entry.getValue();
1294        if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
1295            || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
1296          DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
1297          int i = 0;
1298          for (DatanodeInfo dn:dnSet) {
1299            locs[i++] = dn;
1300          }
1301          LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
1302          dfsClient.reportChecksumFailure(src, lblocks);
1303        }
1304        corruptedBlockMap.clear();
1305      }
1306    
1307      @Override
1308      public long skip(long n) throws IOException {
1309        if (n > 0) {
1310          long curPos = getPos();
1311          long fileLen = getFileLength();
1312          if (n + curPos > fileLen) {
1313            n = fileLen - curPos;
1314          }
1315          seek(curPos + n);
1316          return n;
1317        }
1318        return n < 0 ? -1 : 0;
1319      }
1320    
1321      /**
1322       * Seek to a new arbitrary location
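           *
           * <p>Illustrative sketch ({@code in} is assumed to be an open
           * {@code FSDataInputStream} over this file; it is not defined in this class).
           * A short forward seek within the current block may just skip bytes that are
           * already buffered, while any other seek forces a new block reader on the
           * next read:
           * <pre>{@code
           * in.seek(0L);                  // back to the start of the file
           * long skipped = in.skip(512L); // bounded by the remaining file length
           * in.seek(in.getPos() + 128L);  // short forward seek; may reuse buffered data
           * }</pre>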
1323       */
1324      @Override
1325      public synchronized void seek(long targetPos) throws IOException {
1326        if (targetPos > getFileLength()) {
1327          throw new IOException("Cannot seek after EOF");
1328        }
1329        if (targetPos < 0) {
1330          throw new IOException("Cannot seek to negative offset");
1331        }
1332        if (closed) {
1333          throw new IOException("Stream is closed!");
1334        }
1335        boolean done = false;
1336        if (pos <= targetPos && targetPos <= blockEnd) {
1337          //
1338          // If this seek is to a position within the current block that lies
1339          // ahead of the current offset, the intervening data may already be
1340          // sitting in the TCP buffer, so just skip past it on the block reader.
1341          //
1342          int diff = (int)(targetPos - pos);
1343          if (diff <= blockReader.available()) {
1344            try {
1345              pos += blockReader.skip(diff);
1346              if (pos == targetPos) {
1347                done = true;
1348              }
1349            } catch (IOException e) { // make the following read retry
1350              if (DFSClient.LOG.isDebugEnabled()) {
1351                DFSClient.LOG.debug("Exception while seeking to " + targetPos
1352                    + " from " + getCurrentBlock() + " of " + src + " from "
1353                    + currentNode, e);
1354              }
1355            }
1356          }
1357        }
1358        if (!done) {
1359          pos = targetPos;
1360          blockEnd = -1;
1361        }
1362      }
1363    
1364      /**
1365       * Same as {@link #seekToNewSource(long)} except that it does not exclude
1366       * the current datanode and might connect to the same node.
1367       */
1368      private synchronized boolean seekToBlockSource(long targetPos)
1369                                                     throws IOException {
1370        currentNode = blockSeekTo(targetPos);
1371        return true;
1372      }
1373      
1374      /**
1375       * Seek to the given position on a node other than the current node.  If
1376       * a node other than the current node is found, then returns true.
1377       * If another node could not be found, then returns false.
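           *
           * <p>Illustrative sketch ({@code in} is assumed to be an
           * {@code FSDataInputStream} over this file; it is not defined in this class).
           * Callers typically use this after a read error to retry the same offset
           * from a different replica:
           * <pre>{@code
           * long failedPos = in.getPos();
           * if (!in.seekToNewSource(failedPos)) {
           *   // no alternative datanode was found; a retry would reach the same node
           * }
           * }</pre>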
1378       */
1379      @Override
1380      public synchronized boolean seekToNewSource(long targetPos) throws IOException {
1381        boolean markedDead = deadNodes.containsKey(currentNode);
1382        addToDeadNodes(currentNode);
1383        DatanodeInfo oldNode = currentNode;
1384        DatanodeInfo newNode = blockSeekTo(targetPos);
1385        if (!markedDead) {
1386          /* remove it from deadNodes. blockSeekTo could have cleared
1387           * deadNodes and added currentNode again. That's OK. */
1388          deadNodes.remove(oldNode);
1389        }
1390        if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
1391          currentNode = newNode;
1392          return true;
1393        } else {
1394          return false;
1395        }
1396      }
1397          
1398      /** Return the current position within the file. */
1400      @Override
1401      public synchronized long getPos() throws IOException {
1402        return pos;
1403      }
1404    
1405      /** Return the size of the remaining available bytes
1406       * if the size is less than or equal to {@link Integer#MAX_VALUE},
1407       * otherwise, return {@link Integer#MAX_VALUE}.
1408       */
1409      @Override
1410      public synchronized int available() throws IOException {
1411        if (closed) {
1412          throw new IOException("Stream closed");
1413        }
1414    
1415        final long remaining = getFileLength() - pos;
1416        return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
1417      }
1418    
1419      /**
1420       * We definitely don't support marks
1421       */
1422      @Override
1423      public boolean markSupported() {
1424        return false;
1425      }
1426      @Override
1427      public void mark(int readLimit) {
1428      }
1429      @Override
1430      public void reset() throws IOException {
1431        throw new IOException("Mark/reset not supported");
1432      }
1433    
1434      /**
1435       * Pick the best node from which to stream the data.
1436       * Entries in <i>nodes</i> are already in priority order.
1437       */
1438      static DatanodeInfo bestNode(DatanodeInfo nodes[], 
1439                                   AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes)
1440                                   throws IOException {
1441        if (nodes != null) { 
1442          for (int i = 0; i < nodes.length; i++) {
1443            if (!deadNodes.containsKey(nodes[i])) {
1444              return nodes[i];
1445            }
1446          }
1447        }
1448        throw new IOException("No live nodes contain current block");
1449      }
1450    
1451      /** Utility class to encapsulate data node info and its address. */
1452      static class DNAddrPair {
1453        DatanodeInfo info;
1454        InetSocketAddress addr;
1455        DNAddrPair(DatanodeInfo info, InetSocketAddress addr) {
1456          this.info = info;
1457          this.addr = addr;
1458        }
1459      }
1460    
1461      /**
1462       * Get statistics about the reads which this DFSInputStream has done.
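           *
           * <p>Illustrative sketch (hedged: it assumes the stream was opened through a
           * {@code DistributedFileSystem}, so the cast to {@code HdfsDataInputStream}
           * below is valid; {@code fs} and {@code path} are not defined in this class):
           * <pre>{@code
           * HdfsDataInputStream in = (HdfsDataInputStream) fs.open(path);
           * byte[] buf = new byte[8192];
           * in.read(buf, 0, buf.length);
           * DFSInputStream.ReadStatistics stats = in.getReadStatistics();
           * long totalBytes = stats.getTotalBytesRead();
           * long shortCircuitBytes = stats.getTotalShortCircuitBytesRead();
           * }</pre>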
1463       */
1464      public synchronized ReadStatistics getReadStatistics() {
1465        return new ReadStatistics(readStatistics);
1466      }
1467    
1468      private synchronized void closeCurrentBlockReader() {
1469        if (blockReader == null) return;
1470        // Close the current block reader so that the new caching settings can 
1471        // take effect immediately.
1472        try {
1473          blockReader.close();
1474        } catch (IOException e) {
1475          DFSClient.LOG.error("error closing blockReader", e);
1476        }
1477        blockReader = null;
1478      }
1479    
1480      @Override
1481      public synchronized void setReadahead(Long readahead)
1482          throws IOException {
1483        this.cachingStrategy =
1484            new CachingStrategy.Builder(this.cachingStrategy).
1485                setReadahead(readahead).build();
1486        closeCurrentBlockReader();
1487      }
1488    
1489      @Override
1490      public synchronized void setDropBehind(Boolean dropBehind)
1491          throws IOException {
1492        this.cachingStrategy =
1493            new CachingStrategy.Builder(this.cachingStrategy).
1494                setDropBehind(dropBehind).build();
1495        closeCurrentBlockReader();
1496      }
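
          /*
           * Illustrative sketch of how callers typically reach the two setters above
           * (hedged: "fs" and "path" are assumptions; FSDataInputStream forwards its
           * CanSetReadahead / CanSetDropBehind calls to the wrapped input stream):
           *
           *   FSDataInputStream in = fs.open(path);
           *   in.setReadahead(4L * 1024 * 1024); // request ~4 MB of readahead
           *   in.setDropBehind(true);            // drop cached data behind the reader
           *
           * Both setters close the current block reader so the updated caching
           * strategy takes effect on the next read.
           */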
1497    
1498      @Override
1499      public synchronized ByteBuffer read(ByteBufferPool bufferPool,
1500          int maxLength, EnumSet<ReadOption> opts) 
1501              throws IOException, UnsupportedOperationException {
1502        assert(maxLength > 0);
1503        if (((blockReader == null) || (blockEnd == -1)) &&
1504              (pos < getFileLength())) {
1505          /*
1506           * If we don't have a blockReader, or the one we have has no more bytes
1507           * left to read, we call seekToBlockSource to get a new blockReader and
1508           * recalculate blockEnd.  Note that we assume we're not at EOF here
1509           * (we check this above).
1510           */
1511          if ((!seekToBlockSource(pos)) || (blockReader == null)) {
1512            throw new IOException("failed to allocate new BlockReader " +
1513                "at position " + pos);
1514          }
1515        }
1516        ByteBuffer buffer = tryReadZeroCopy(maxLength, opts);
1517        if (buffer != null) {
1518          return buffer;
1519        }
1520        buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
1521        if (buffer != null) {
1522          extendedReadBuffers.put(buffer, bufferPool);
1523        }
1524        return buffer;
1525      }
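
          /*
           * Illustrative zero-copy usage sketch (hedged: ElasticByteBufferPool, "fs",
           * and "path" are assumptions about the caller, not part of this class).
           * Every non-null buffer returned by the enhanced read must be handed back
           * through releaseBuffer() so the backing mmap or pooled buffer is reclaimed:
           *
           *   ByteBufferPool pool = new ElasticByteBufferPool();
           *   FSDataInputStream in = fs.open(path);
           *   ByteBuffer buf = in.read(pool, 1024 * 1024,
           *       EnumSet.of(ReadOption.SKIP_CHECKSUMS));
           *   try {
           *     // consume buf; it may be a read-only mmap slice or a pooled buffer
           *   } finally {
           *     if (buf != null) {
           *       in.releaseBuffer(buf);
           *     }
           *   }
           */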
1526    
1527      private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
1528          EnumSet<ReadOption> opts) throws IOException {
1529        // Java ByteBuffers can't be longer than 2 GB, because they use
1530        // 4-byte signed integers to represent capacity, etc.
1531        // So we can't mmap the parts of the block higher than the 2 GB offset.
1532        // FIXME: we could work around this with multiple memory maps.
1533        // See HDFS-5101.
1534        long blockEnd32 = Math.min(Integer.MAX_VALUE, blockEnd);
1535        long curPos = pos;
1536        long blockLeft = blockEnd32 - curPos + 1;
1537        if (blockLeft <= 0) {
1538          if (DFSClient.LOG.isDebugEnabled()) {
1539            DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1540              curPos + " of " + src + "; blockLeft = " + blockLeft +
1541              "; blockEnd32 = " + blockEnd32 + ", blockEnd = " + blockEnd +
1542              "; maxLength = " + maxLength);
1543          }
1544          return null;
1545        }
1546        int length = Math.min((int)blockLeft, maxLength);
1547        long blockStartInFile = currentLocatedBlock.getStartOffset();
1548        long blockPos = curPos - blockStartInFile;
1549        long limit = blockPos + length;
1550        ClientMmap clientMmap =
1551            blockReader.getClientMmap(opts, dfsClient.getMmapManager());
1552        if (clientMmap == null) {
1553          if (DFSClient.LOG.isDebugEnabled()) {
1554            DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1555              curPos + " of " + src + "; BlockReader#getClientMmap returned " +
1556              "null.");
1557          }
1558          return null;
1559        }
1560        seek(pos + length);
1561        ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
1562        buffer.position((int)blockPos);
1563        buffer.limit((int)limit);
1564        clientMmap.ref();
1565        extendedReadBuffers.put(buffer, clientMmap);
1566        readStatistics.addZeroCopyBytes(length);
1567        if (DFSClient.LOG.isDebugEnabled()) {
1568          DFSClient.LOG.debug("readZeroCopy read " + length + " bytes from " +
1569              "offset " + curPos + " via the zero-copy read path.  " +
1570              "blockEnd = " + blockEnd);
1571        }
1572        return buffer;
1573      }
1574    
1575      @Override
1576      public synchronized void releaseBuffer(ByteBuffer buffer) {
1577        Object val = extendedReadBuffers.remove(buffer);
1578        if (val == null) {
1579          throw new IllegalArgumentException("tried to release a buffer " +
1580              "that was not created by this stream, " + buffer);
1581        }
1582        if (val instanceof ClientMmap) {
1583          ((ClientMmap)val).unref();
1584        } else if (val instanceof ByteBufferPool) {
1585          ((ByteBufferPool)val).putBuffer(buffer);
1586        }
1587      }
1588    }