/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.token.Token;

import com.google.common.annotations.VisibleForTesting;

/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and the various datanodes as necessary.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream implements ByteBufferReadable {
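  // Minimal usage sketch (hedged): applications normally obtain this stream
  // through the FileSystem API rather than constructing it directly.  The
  // path below is illustrative only.
  //
  //   FileSystem fs = FileSystem.get(conf);
  //   try (FSDataInputStream in = fs.open(new Path("/user/example/data"))) {
  //     byte[] buf = new byte[4096];
  //     int n = in.read(buf);  // delegates to DFSInputStream.read(byte[],int,int)
  //   }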
  @VisibleForTesting
  static boolean tcpReadsDisabledForTesting = false;
  private final PeerCache peerCache;
  private final DFSClient dfsClient;
  private boolean closed = false;
  private final String src;
  private BlockReader blockReader = null;
  private final boolean verifyChecksum;
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private final ReadStatistics readStatistics = new ReadStatistics();

  public static class ReadStatistics {
    public ReadStatistics() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
    }

    /**
     * @return The total bytes read.  This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read.  This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;
  }
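  // Hedged example of how the statistics above can be consumed; the
  // dfsIn variable is assumed to exist in the caller's scope:
  //
  //   ReadStatistics stats = dfsIn.getReadStatistics();  // defensive copy
  //   long remote = stats.getRemoteBytesRead();          // total - local
  //   long shortCircuit = stats.getTotalShortCircuitBytesRead();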

  private final FileInputStreamCache fileInputStreamCache;

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation. That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic, the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
  private int buffersize = 1;

  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, int buffersize,
      boolean verifyChecksum) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.buffersize = buffersize;
    this.src = src;
    this.peerCache = dfsClient.peerCache;
    this.fileInputStreamCache = new FileInputStreamCache(
        dfsClient.getConf().shortCircuitStreamsCacheSize,
        dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
    openInfo();
  }

  /**
   * Grab the open-file info from the namenode.
   */
  synchronized void openInfo() throws IOException, UnresolvedLinkException {
    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
    int retriesForLastBlockLength = 3;
    while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report in immediately. At this time
      // partial block locations will not be available with the NN for
      // getting the length. Let's retry 3 times to get the length.
      if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
            + " Will retry " + retriesForLastBlockLength + " more times");
        waitFor(4000);
        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
      } else {
        break;
      }
      retriesForLastBlockLength--;
    }
    if (retriesForLastBlockLength == 0) {
      throw new IOException("Could not obtain the last block locations.");
    }
  }
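  // Note: with 3 retries and a fixed 4000 ms wait between attempts,
  // openInfo() blocks for at most ~12 seconds before giving up on a
  // last block whose locations have not yet been reported.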

  private void waitFor(int waitTime) throws IOException {
    try {
      Thread.sleep(waitTime);
    } catch (InterruptedException e) {
      throw new IOException(
          "Interrupted while getting the last block length.");
    }
  }

  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("newInfo = " + newInfo);
    }
    if (newInfo == null) {
      throw new IOException("Cannot open filename " + src);
    }

    if (locatedBlocks != null) {
      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
      while (oldIter.hasNext() && newIter.hasNext()) {
        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
          throw new IOException("Blocklist for " + src + " has changed!");
        }
      }
    }
    locatedBlocks = newInfo;
    long lastBlockBeingWrittenLength = 0;
    if (!locatedBlocks.isLastBlockComplete()) {
      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
      if (last != null) {
        if (last.getLocations().length == 0) {
          return -1;
        }
        final long len = readBlockLength(last);
        last.getBlock().setNumBytes(len);
        lastBlockBeingWrittenLength = len;
      }
    }

    currentNode = null;
    return lastBlockBeingWrittenLength;
  }

  /** Read the block length from one of the datanodes. */
  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
    assert locatedblock != null : "LocatedBlock cannot be null";
    int replicaNotFoundCount = locatedblock.getLocations().length;

    for (DatanodeInfo datanode : locatedblock.getLocations()) {
      ClientDatanodeProtocol cdp = null;

      try {
        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
            dfsClient.getConf().connectToDnViaHostname, locatedblock);

        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());

        if (n >= 0) {
          return n;
        }
      } catch (IOException ioe) {
        if (ioe instanceof RemoteException &&
            (((RemoteException) ioe).unwrapRemoteException() instanceof
              ReplicaNotFoundException)) {
          // special case : replica might not be on the DN, treat as 0 length
          replicaNotFoundCount--;
        }

        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
              + datanode + " for block " + locatedblock.getBlock(), ioe);
        }
      } finally {
        if (cdp != null) {
          RPC.stopProxy(cdp);
        }
      }
    }

    // The namenode told us about these locations, but none of the datanodes
    // knows about the replica. This means we hit the race between pipeline
    // creation start and end. We require every location to have reported
    // ReplicaNotFoundException, because some other exception could have
    // happened on a DN that actually has the replica, and we want to
    // surface that error instead.
    if (replicaNotFoundCount == 0) {
      return 0;
    }

    throw new IOException("Cannot obtain block length for " + locatedblock);
  }

  public synchronized long getFileLength() {
    return locatedBlocks == null ? 0 :
        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
  }

  // Short circuit local reads are forbidden for files that are
  // under construction.  See HDFS-2757.
  synchronized boolean shortCircuitForbidden() {
    return locatedBlocks.isUnderConstruction();
  }

  /**
   * Returns the datanode from which the stream is currently reading.
   */
  public DatanodeInfo getCurrentDatanode() {
    return currentNode;
  }

  /**
   * Returns the block containing the target position.
   */
  public synchronized ExtendedBlock getCurrentBlock() {
    if (currentLocatedBlock == null) {
      return null;
    }
    return currentLocatedBlock.getBlock();
  }

  /**
   * Return the collection of blocks that have already been located.
   */
  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
    return getBlockRange(0, getFileLength());
  }

  /**
   * Get block at the specified position.
   * Fetch it from the namenode if not cached.
   *
   * @param offset file offset whose containing block is wanted
   * @param updatePosition whether to update current position
   * @return located block
   * @throws IOException
   */
  private synchronized LocatedBlock getBlockAt(long offset,
      boolean updatePosition) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";

    final LocatedBlock blk;

    // check offset
    if (offset < 0 || offset >= getFileLength()) {
      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
          + offset
          + ", updatePosition=" + updatePosition
          + ", locatedBlocks=" + locatedBlocks);
    } else if (offset >= locatedBlocks.getFileLength()) {
      // offset is in the portion of the last block
      // which is not yet known to the name-node;
      // get the last block
      blk = locatedBlocks.getLastLocatedBlock();
    } else {
      // search cached blocks first
      int targetBlockIdx = locatedBlocks.findBlock(offset);
      if (targetBlockIdx < 0) { // block is not cached
        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
        // fetch more blocks
        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
        assert (newBlocks != null) : "Could not find target position " + offset;
        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
      }
      blk = locatedBlocks.get(targetBlockIdx);
    }

    // update current position
    if (updatePosition) {
      pos = offset;
      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
      currentLocatedBlock = blk;
    }
    return blk;
  }

  /** Fetch a block from namenode and cache it */
  private synchronized void fetchBlockAt(long offset) throws IOException {
    int targetBlockIdx = locatedBlocks.findBlock(offset);
    if (targetBlockIdx < 0) { // block is not cached
      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
    }
    // fetch blocks
    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
    if (newBlocks == null) {
      throw new IOException("Could not find target position " + offset);
    }
    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
  }
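  // Note: unlike getBlockAt(), fetchBlockAt() always round-trips to the
  // namenode even when the block is already cached.  Callers rely on this
  // to refresh stale locations and block tokens; see the
  // InvalidBlockTokenException handling in blockSeekTo() and
  // fetchBlockByteRange() below.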

  /**
   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * will not get a read request beyond the EOF.
   * @param offset starting offset in the file
   * @param length length of the range
   * @return consecutive segment of located blocks
   * @throws IOException
   */
  private synchronized List<LocatedBlock> getBlockRange(long offset,
      long length) throws IOException {
    // getFileLength(): returns total file length
    // locatedBlocks.getFileLength(): returns length of completed blocks
    if (offset >= getFileLength()) {
      throw new IOException("Offset: " + offset +
        " exceeds file length: " + getFileLength());
    }

    final List<LocatedBlock> blocks;
    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
    final boolean readLengthPastCompleteBlk =
        offset + length > lengthOfCompleteBlk;

    if (readOffsetWithinCompleteBlk) {
      // get the blocks of the finalized (completed) block range
      blocks = getFinalizedBlockRange(offset,
          Math.min(length, lengthOfCompleteBlk - offset));
    } else {
      blocks = new ArrayList<LocatedBlock>(1);
    }

    // get the blocks from the incomplete block range
    if (readLengthPastCompleteBlk) {
      blocks.add(locatedBlocks.getLastLocatedBlock());
    }

    return blocks;
  }
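  // Worked example (illustrative numbers): for a file with two finalized
  // 128 MB blocks plus a 1 MB block still under construction,
  // getBlockRange(offset = 255 MB, length = 2 MB) returns the second
  // finalized block (covering the first 1 MB of the request) plus the
  // under-construction last block, because offset + length crosses
  // locatedBlocks.getFileLength().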

  /**
   * Get blocks in the specified range.
   * Includes only the complete blocks.
   * Fetch them from the namenode if not cached.
   */
  private synchronized List<LocatedBlock> getFinalizedBlockRange(
      long offset, long length) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";
    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
    // search cached blocks first
    int blockIdx = locatedBlocks.findBlock(offset);
    if (blockIdx < 0) { // block is not cached
      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
    }
    long remaining = length;
    long curOff = offset;
    while (remaining > 0) {
      LocatedBlock blk = null;
      if (blockIdx < locatedBlocks.locatedBlockCount()) {
        blk = locatedBlocks.get(blockIdx);
      }
      if (blk == null || curOff < blk.getStartOffset()) {
        LocatedBlocks newBlocks;
        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
        continue;
      }
      assert curOff >= blk.getStartOffset() : "Block not found";
      blockRange.add(blk);
      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
      remaining -= bytesRead;
      curOff += bytesRead;
      blockIdx++;
    }
    return blockRange;
  }

  /**
   * Open a BlockReader to a DataNode so that the block can be read.
   * We get the block ID and the IDs of the destinations from the namenode
   * at startup.
   */
  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
    if (target >= getFileLength()) {
      throw new IOException("Attempted to read past end of file");
    }

    // Will be getting a new BlockReader.
    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }

    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    DatanodeInfo chosenNode = null;
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    boolean connectFailedOnce = false;

    while (true) {
      //
      // Compute desired block
      //
      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target == pos) : "Wrong position " + pos + " expect " + target;
      long offsetIntoBlock = target - targetBlock.getStartOffset();

      DNAddrPair retval = chooseDataNode(targetBlock);
      chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;

      try {
        ExtendedBlock blk = targetBlock.getBlock();
        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
        blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
            accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
            buffersize, verifyChecksum, dfsClient.clientName);
        if (connectFailedOnce) {
          DFSClient.LOG.info("Successfully connected to " + targetAddr +
                             " for " + blk);
        }
        return chosenNode;
      } catch (AccessControlException ex) {
        DFSClient.LOG.warn("Short circuit access failed " + ex);
        dfsClient.disableLegacyBlockReaderLocal();
        continue;
      } catch (IOException ex) {
        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + ex);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (ex instanceof InvalidBlockTokenException && refetchToken > 0) {
          DFSClient.LOG.info("Will fetch a new access token and retry, "
              + "access token was invalid when connecting to " + targetAddr
              + " : " + ex);
          /*
           * Get a new access token and retry. Retry is needed in 2 cases. 1)
           * When both NN and DN re-started while DFSClient holding a cached
           * access token. 2) In the case that NN fails to update its
           * access key at pre-set interval (by a wide margin) and
           * subsequently restarts. In this case, DN re-registers itself with
           * NN and receives a new access key, but DN will delete the old
           * access key from its memory since it's considered expired based on
           * the estimated expiration date.
           */
          refetchToken--;
          fetchBlockAt(target);
        } else {
          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block "
              + targetBlock.getBlock() + ", add to deadNodes and continue. "
              + ex, ex);
          // Put chosen node into dead list, continue
          addToDeadNodes(chosenNode);
        }
      }
    }
  }

  /**
   * Close it down!
   */
  @Override
  public synchronized void close() throws IOException {
    if (closed) {
      return;
    }
    dfsClient.checkOpen();

    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }
    super.close();
    fileInputStreamCache.close();
    closed = true;
  }

  @Override
  public synchronized int read() throws IOException {
    int ret = read(oneByteBuf, 0, 1);
    return (ret <= 0) ? -1 : (oneByteBuf[0] & 0xff);
  }
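  // The single-byte read() above reuses the preallocated oneByteBuf to avoid
  // a per-call allocation; it returns -1 at EOF, otherwise the byte value
  // widened to an int in the range 0..255 (hence the & 0xff).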

  /**
   * Wraps different possible read implementations so that readBuffer can be
   * strategy-agnostic.
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException;
  }

  private static void updateReadStatistics(ReadStatistics readStatistics,
        int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    if (blockReader.isShortCircuit()) {
      readStatistics.totalBytesRead += nRead;
      readStatistics.totalLocalBytesRead += nRead;
      readStatistics.totalShortCircuitBytesRead += nRead;
    } else if (blockReader.isLocal()) {
      readStatistics.totalBytesRead += nRead;
      readStatistics.totalLocalBytesRead += nRead;
    } else {
      readStatistics.totalBytesRead += nRead;
    }
  }
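  // Invariant maintained above: the counters nest, i.e.
  // totalShortCircuitBytesRead <= totalLocalBytesRead <= totalBytesRead,
  // because every short-circuit read is local and every local read counts
  // toward the total.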

  /**
   * Used to read bytes into a byte[].
   */
  private static class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int nRead = blockReader.read(buf, off, len);
      updateReadStatistics(readStatistics, nRead, blockReader);
      return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer.
   */
  private static class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }

  /* This is used by the regular read() path and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* we retry the current node only once. So this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or the client : when the DataNode closes the
     * connection because the client is idle. If there are other cases of
     * "non-errors" then a datanode might be retried by setting this to true
     * again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
      try {
        return reader.doRead(blockReader, off, len, readStatistics);
      } catch (ChecksumException ce) {
        DFSClient.LOG.warn("Found Checksum error for "
            + getCurrentBlock() + " from " + currentNode
            + " at " + ce.getPos());
        ioe = ce;
        retryCurrentNode = false;
        // we want to remember which block replicas we have tried
        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
            corruptedBlockMap);
      } catch (IOException e) {
        if (!retryCurrentNode) {
          DFSClient.LOG.warn("Exception while reading from "
              + getCurrentBlock() + " of " + src + " from "
              + currentNode, e);
        }
        ioe = e;
      }
      boolean sourceFound = false;
      if (retryCurrentNode) {
        /* possibly retry the same node so that transient errors don't
         * result in application level failures (e.g. the DataNode could
         * have closed the connection because the client was idle for too
         * long).
         */
        sourceFound = seekToBlockSource(pos);
      } else {
        addToDeadNodes(currentNode);
        sourceFound = seekToNewSource(pos);
      }
      if (!sourceFound) {
        throw ioe;
      }
      retryCurrentNode = false;
    }
  }

  private int readWithStrategy(ReaderStrategy strategy, int off, int len)
      throws IOException {
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    failures = 0;
    if (pos < getFileLength()) {
      int retries = 2;
      while (retries > 0) {
        try {
          // currentNode can be left as null if the previous read had a
          // checksum error on the same block. See HDFS-3067
          if (pos > blockEnd || currentNode == null) {
            currentNode = blockSeekTo(pos);
          }
          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);

          if (result >= 0) {
            pos += result;
          } else {
            // got an EOS from the reader though we expect more data on it.
            throw new IOException("Unexpected EOS from the reader");
          }
          if (dfsClient.stats != null && result != -1) {
            dfsClient.stats.incrementBytesRead(result);
          }
          return result;
        } catch (ChecksumException ce) {
          throw ce;
        } catch (IOException e) {
          if (retries == 1) {
            DFSClient.LOG.warn("DFS Read", e);
          }
          blockEnd = -1;
          if (currentNode != null) { addToDeadNodes(currentNode); }
          if (--retries == 0) {
            throw e;
          }
        } finally {
          // Check whether block replica corruption needs to be reported,
          // whether the read was successful or a ChecksumException occurred.
          reportCheckSumFailure(corruptedBlockMap,
              currentLocatedBlock.getLocations().length);
        }
      }
    }
    return -1;
  }

  /**
   * Read up to len bytes into the given buffer starting at off. May return
   * fewer than len bytes; returns -1 at end of stream.
   */
  @Override
  public synchronized int read(final byte[] buf, int off, int len) throws IOException {
    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);

    return readWithStrategy(byteArrayReader, off, len);
  }

  @Override
  public synchronized int read(final ByteBuffer buf) throws IOException {
    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);

    return readWithStrategy(byteBufferReader, 0, buf.remaining());
  }

  /**
   * Add a corrupted block replica into the map.
   * @param corruptedBlockMap map from block to the set of datanodes holding
   *                          corrupt replicas of it
   */
  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    Set<DatanodeInfo> dnSet = null;
    if (corruptedBlockMap.containsKey(blk)) {
      dnSet = corruptedBlockMap.get(blk);
    } else {
      dnSet = new HashSet<DatanodeInfo>();
    }
    if (!dnSet.contains(node)) {
      dnSet.add(node);
      corruptedBlockMap.put(blk, dnSet);
    }
  }

  private DNAddrPair chooseDataNode(LocatedBlock block)
      throws IOException {
    while (true) {
      DatanodeInfo[] nodes = block.getLocations();
      try {
        DatanodeInfo chosenNode = bestNode(nodes, deadNodes);
        final String dnAddr =
            chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
        }
        InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
        return new DNAddrPair(chosenNode, targetAddr);
      } catch (IOException ie) {
        String blockInfo = block.getBlock() + " file=" + src;
        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
          throw new BlockMissingException(src, "Could not obtain block: "
              + blockInfo, block.getStartOffset());
        }

        if (nodes == null || nodes.length == 0) {
          DFSClient.LOG.info("No node available for " + blockInfo);
        }
        DFSClient.LOG.info("Could not obtain " + block.getBlock()
            + " from any node: " + ie
            + ". Will get new block locations from namenode and retry...");
        try {
          // Introducing a random factor to the wait time before another retry.
          // The wait time is dependent on # of failures and a random factor.
          // At the first time of getting a BlockMissingException, the wait time
          // is a random number between 0..3000 ms. If the first retry
          // still fails, we will wait a 3000 ms grace period before the 2nd
          // retry. Also at the second retry, the waiting window is expanded
          // to 6000 ms, alleviating the request rate to the server. Similarly
          // the 3rd retry will wait a 6000 ms grace period before retrying,
          // and the waiting window is expanded to 9000 ms.
          final int timeWindow = dfsClient.getConf().timeWindow;
          // grace period for the last round of attempts, plus an expanding
          // randomized time window for each failure
          double waitTime = timeWindow * failures +
              timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble();
          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1)
              + " IOException, will wait for " + waitTime + " msec.");
          Thread.sleep((long)waitTime);
        } catch (InterruptedException iex) {
          // ignored; fall through and retry
        }
        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
        openInfo();
        block = getBlockAt(block.getStartOffset(), false);
        failures++;
        continue;
      }
    }
  }

  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
      byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    while (true) {
      // cached block locations may have been updated by chooseDataNode()
      // or fetchBlockAt(). Always get the latest list of locations at the
      // start of the loop.
      block = getBlockAt(block.getStartOffset(), false);
      DNAddrPair retval = chooseDataNode(block);
      DatanodeInfo chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;
      BlockReader reader = null;

      try {
        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();

        int len = (int) (end - start + 1);
        reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
            blockToken, start, len, buffersize, verifyChecksum,
            dfsClient.clientName);
        int nread = reader.readAll(buf, offset, len);
        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
        }
        return;
      } catch (ChecksumException e) {
        DFSClient.LOG.warn("fetchBlockByteRange(). Got a checksum exception for "
            + src + " at " + block.getBlock() + ":"
            + e.getPos() + " from " + chosenNode);
        // we want to remember what we have tried
        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
      } catch (AccessControlException ex) {
        DFSClient.LOG.warn("Short circuit access failed " + ex);
        dfsClient.disableLegacyBlockReaderLocal();
        continue;
      } catch (IOException e) {
        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + e);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (e instanceof InvalidBlockTokenException && refetchToken > 0) {
          DFSClient.LOG.info("Will get a new access token and retry, "
              + "access token was invalid when connecting to " + targetAddr
              + " : " + e);
          refetchToken--;
          fetchBlockAt(block.getStartOffset());
          continue;
        } else {
          DFSClient.LOG.warn("Failed to connect to " + targetAddr +
              " for file " + src + " for block " + block.getBlock() + ":" + e);
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Connection failure ", e);
          }
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
      // Put chosen node into dead list, continue
      addToDeadNodes(chosenNode);
    }
  }

  private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
    Peer peer = null;
    boolean success = false;
    Socket sock = null;
    try {
      sock = dfsClient.socketFactory.createSocket();
      NetUtils.connect(sock, addr,
        dfsClient.getRandomLocalInterfaceAddr(),
        dfsClient.getConf().socketTimeout);
      peer = TcpPeerServer.peerFromSocketAndKey(sock,
          dfsClient.getDataEncryptionKey());
      success = true;
      return peer;
    } finally {
      if (!success) {
        IOUtils.closeQuietly(peer);
        IOUtils.closeQuietly(sock);
      }
    }
  }
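  // The success-flag / finally pattern above guarantees that on any failure
  // path (socket creation, connect, or encryption handshake) both the Peer
  // wrapper and the raw Socket are closed; IOUtils.closeQuietly tolerates
  // null arguments, so partially-initialized state is safe to pass in.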

  /**
   * Retrieve a BlockReader suitable for reading.
   * This method will reuse the cached connection to the DN if appropriate.
   * Otherwise, it will create a new connection.
   * Throwing an IOException from this method is basically equivalent to
   * declaring the DataNode bad, so we try to connect a lot of different ways
   * before doing that.
   *
   * @param dnAddr  Address of the datanode
   * @param chosenNode Chosen datanode information
   * @param file  File location
   * @param block  The Block object
   * @param blockToken  The access token for security
   * @param startOffset  The read offset, relative to block head
   * @param len  The number of bytes to read
   * @param bufferSize  The IO buffer size (not the client buffer size)
   * @param verifyChecksum  Whether to verify checksum
   * @param clientName  Client name
   * @return New BlockReader instance
   */
  protected BlockReader getBlockReader(InetSocketAddress dnAddr,
                                       DatanodeInfo chosenNode,
                                       String file,
                                       ExtendedBlock block,
                                       Token<BlockTokenIdentifier> blockToken,
                                       long startOffset,
                                       long len,
                                       int bufferSize,
                                       boolean verifyChecksum,
                                       String clientName)
      throws IOException {
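    // Reader-selection order, summarizing the steps below:
    //   1. cached local file descriptors  -> BlockReaderLocal
    //   2. legacy local reads (HDFS-2246) -> BlockReaderLocalLegacy
    //   3. cached domain-socket peers
    //   4. a freshly created DomainPeer
    //   5. cached TCP peers
    //   6. a freshly created TCP peer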
    // Firstly, we check to see if we have cached any file descriptors for
    // local blocks.  If so, we can just re-use those file descriptors.
    FileInputStream[] fis = fileInputStreamCache.get(chosenNode, block);
    if (fis != null) {
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
            "the FileInputStreamCache.");
      }
      return new BlockReaderLocal(dfsClient.getConf(), file,
        block, startOffset, len, fis[0], fis[1], chosenNode, verifyChecksum,
        fileInputStreamCache);
    }

    // If the legacy local block reader is enabled and we are reading a local
    // block, try to create a BlockReaderLocalLegacy.  The legacy local block
    // reader implements local reads in the style first introduced by HDFS-2246.
    if ((dfsClient.useLegacyBlockReaderLocal()) &&
        DFSClient.isLocalAddress(dnAddr) &&
        (!shortCircuitForbidden())) {
      try {
        return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient,
            clientName, block, blockToken, chosenNode, startOffset);
      } catch (IOException e) {
        DFSClient.LOG.warn("error creating legacy BlockReaderLocal.  " +
            "Disabling legacy local reads.", e);
        dfsClient.disableLegacyBlockReaderLocal();
      }
    }

    // Look for cached domain peers.
    int cacheTries = 0;
    DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
    BlockReader reader = null;
    final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
      Peer peer = peerCache.get(chosenNode, true);
      if (peer == null) break;
      try {
        boolean allowShortCircuitLocalReads = dfsClient.getConf().
            shortCircuitLocalReads && (!shortCircuitForbidden());
        reader = BlockReaderFactory.newBlockReader(
            dfsClient.getConf(), file, block, blockToken, startOffset,
            len, verifyChecksum, clientName, peer, chosenNode,
            dsFactory, peerCache, fileInputStreamCache,
            allowShortCircuitLocalReads);
        return reader;
      } catch (IOException ex) {
        DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
            "Closing stale " + peer, ex);
      } finally {
        if (reader == null) {
          IOUtils.closeQuietly(peer);
        }
      }
    }

    // Try to create a DomainPeer.
    DomainSocket domSock = dsFactory.create(dnAddr, this);
    if (domSock != null) {
      Peer peer = new DomainPeer(domSock);
      try {
        boolean allowShortCircuitLocalReads = dfsClient.getConf().
            shortCircuitLocalReads && (!shortCircuitForbidden());
        reader = BlockReaderFactory.newBlockReader(
            dfsClient.getConf(), file, block, blockToken, startOffset,
            len, verifyChecksum, clientName, peer, chosenNode,
            dsFactory, peerCache, fileInputStreamCache,
            allowShortCircuitLocalReads);
        return reader;
      } catch (IOException e) {
        DFSClient.LOG.warn("failed to connect to " + domSock, e);
      } finally {
        if (reader == null) {
          // If the Peer that we got the error from was a DomainPeer,
          // mark the socket path as bad, so that newDataSocket will not try
          // to re-open this socket for a while.
          dsFactory.disableDomainSocketPath(domSock.getPath());
          IOUtils.closeQuietly(peer);
        }
      }
    }

    // Look for cached peers.
    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
      Peer peer = peerCache.get(chosenNode, false);
      if (peer == null) break;
      try {
        reader = BlockReaderFactory.newBlockReader(
            dfsClient.getConf(), file, block, blockToken, startOffset,
            len, verifyChecksum, clientName, peer, chosenNode,
            dsFactory, peerCache, fileInputStreamCache, false);
        return reader;
      } catch (IOException ex) {
        DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
          peer, ex);
      } finally {
        if (reader == null) {
          IOUtils.closeQuietly(peer);
        }
      }
    }
    if (tcpReadsDisabledForTesting) {
      throw new IOException("TCP reads are disabled.");
    }
    // Try to create a new remote peer.
    Peer peer = newTcpPeer(dnAddr);
    return BlockReaderFactory.newBlockReader(
        dfsClient.getConf(), file, block, blockToken, startOffset,
        len, verifyChecksum, clientName, peer, chosenNode,
        dsFactory, peerCache, fileInputStreamCache, false);
  }

  /**
   * Read bytes starting from the specified position.
   *
   * @param position start read from this position
   * @param buffer read buffer
   * @param offset offset into buffer
   * @param length number of bytes to read
   *
   * @return actual number of bytes read
   */
  @Override
  public int read(long position, byte[] buffer, int offset, int length)
      throws IOException {
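    // Note: this positioned read is independent of the stream's seek
    // pointer; it fetches each block range directly via
    // fetchBlockByteRange() and never moves pos or blockEnd.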
    // sanity checks
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    failures = 0;
    long filelen = getFileLength();
    if ((position < 0) || (position >= filelen)) {
      return -1;
    }
    int realLen = length;
    if ((position + length) > filelen) {
      realLen = (int)(filelen - position);
    }

    // determine the block and byte range within the block
    // corresponding to position and realLen
    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
    int remaining = realLen;
    Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    for (LocatedBlock blk : blockRange) {
      long targetStart = position - blk.getStartOffset();
      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
      try {
        fetchBlockByteRange(blk, targetStart,
            targetStart + bytesToRead - 1, buffer, offset, corruptedBlockMap);
      } finally {
        // Check and report if any block replicas are corrupted.
        // BlockMissingException may be caught if all block replicas are
        // corrupted.
        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
      }

      remaining -= bytesToRead;
      position += bytesToRead;
      offset += bytesToRead;
    }
    assert remaining == 0 : "Wrong number of bytes read.";
    if (dfsClient.stats != null) {
      dfsClient.stats.incrementBytesRead(realLen);
    }
    return realLen;
  }

  /**
   * DFSInputStream reports checksum failure.
   * Case I : client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted blocks
   * to the namenode.
   * Case II: client has tried out all data nodes, but all failed. We
   * only report if the total number of replicas is 1. We do not
   * report otherwise, since the failure may be because the client itself
   * is unable to read, not because the replicas are corrupt.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that hold the block replicas
   */
  private void reportCheckSumFailure(
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      int dataNodeCount) {
    if (corruptedBlockMap.isEmpty()) {
      return;
    }
    // Note: only the first entry of the map is examined and (possibly)
    // reported; the map is cleared afterwards either way.
    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
        .entrySet().iterator();
    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
    ExtendedBlock blk = entry.getKey();
    Set<DatanodeInfo> dnSet = entry.getValue();
    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
      int i = 0;
      for (DatanodeInfo dn : dnSet) {
        locs[i++] = dn;
      }
      LocatedBlock[] lblocks = { new LocatedBlock(blk, locs) };
      dfsClient.reportChecksumFailure(src, lblocks);
    }
    corruptedBlockMap.clear();
  }

  @Override
  public long skip(long n) throws IOException {
    if (n > 0) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if (n + curPos > fileLen) {
        n = fileLen - curPos;
      }
      seek(curPos + n);
      return n;
    }
    return n < 0 ? -1 : 0;
  }
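  // Because skip() is implemented on top of seek(), skipping within the
  // current block can still take the fast path in seek() that consumes
  // already-buffered bytes instead of reconnecting to a datanode.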

  /**
   * Seek to a new arbitrary location.
   */
  @Override
  public synchronized void seek(long targetPos) throws IOException {
    if (targetPos > getFileLength()) {
      throw new IOException("Cannot seek after EOF");
    }
    if (targetPos < 0) {
      throw new IOException("Cannot seek to negative offset");
    }
    if (closed) {
      throw new IOException("Stream is closed!");
    }
    boolean done = false;
    if (pos <= targetPos && targetPos <= blockEnd) {
      //
      // If this seek is to a positive position in the current
      // block, and this piece of data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
      //
      int diff = (int)(targetPos - pos);
      if (diff <= blockReader.available()) {
        try {
          pos += blockReader.skip(diff);
          if (pos == targetPos) {
            done = true;
          }
        } catch (IOException e) { // make the following read retry
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Exception while seek to " + targetPos
                + " from " + getCurrentBlock() + " of " + src + " from "
                + currentNode, e);
          }
        }
      }
    }
    if (!done) {
      pos = targetPos;
      blockEnd = -1;
    }
  }

  /**
   * Same as {@link #seekToNewSource(long)} except that it does not exclude
   * the current datanode and might connect to the same node.
   */
  private synchronized boolean seekToBlockSource(long targetPos)
      throws IOException {
    currentNode = blockSeekTo(targetPos);
    return true;
  }

  /**
   * Seek to the given position on a node other than the current node.  If
   * a node other than the current node is found, then returns true.
   * If another node could not be found, then returns false.
   */
  @Override
  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
    boolean markedDead = deadNodes.containsKey(currentNode);
    addToDeadNodes(currentNode);
    DatanodeInfo oldNode = currentNode;
    DatanodeInfo newNode = blockSeekTo(targetPos);
    if (!markedDead) {
      /* remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's OK. */
      deadNodes.remove(oldNode);
    }
    if (!oldNode.getStorageID().equals(newNode.getStorageID())) {
      currentNode = newNode;
      return true;
    } else {
      return false;
    }
  }

  /**
   * Return the current offset in the stream.
   */
  @Override
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /** Return the size of the remaining available bytes
   * if the size is less than or equal to {@link Integer#MAX_VALUE},
   * otherwise, return {@link Integer#MAX_VALUE}.
   */
  @Override
  public synchronized int available() throws IOException {
    if (closed) {
      throw new IOException("Stream closed");
    }

    final long remaining = getFileLength() - pos;
    return remaining <= Integer.MAX_VALUE ? (int)remaining : Integer.MAX_VALUE;
  }

  /**
   * We definitely don't support marks.
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  @Override
  public void mark(int readLimit) {
  }

  @Override
  public void reset() throws IOException {
    throw new IOException("Mark/reset not supported");
  }

  /**
   * Pick the best node from which to stream the data.
   * Entries in <i>nodes</i> are already in priority order.
   */
  static DatanodeInfo bestNode(DatanodeInfo[] nodes,
      AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes)
      throws IOException {
    if (nodes != null) {
      for (int i = 0; i < nodes.length; i++) {
        if (!deadNodes.containsKey(nodes[i])) {
          return nodes[i];
        }
      }
    }
    throw new IOException("No live nodes contain current block");
  }

  /** Utility class to encapsulate datanode info and its address. */
  static class DNAddrPair {
    DatanodeInfo info;
    InetSocketAddress addr;
    DNAddrPair(DatanodeInfo info, InetSocketAddress addr) {
      this.info = info;
      this.addr = addr;
    }
  }

  /**
   * Get statistics about the reads which this DFSInputStream has done.
   */
  public synchronized ReadStatistics getReadStatistics() {
    return new ReadStatistics(readStatistics);
  }
}