/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.EOFException;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ByteBufferUtil;
import org.apache.hadoop.fs.CanSetDropBehind;
import org.apache.hadoop.fs.CanSetReadahead;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileEncryptionInfo;
import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.IdentityHashStore;

import com.google.common.annotations.VisibleForTesting;

/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation of the namenode and various datanodes as necessary.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream
    implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
        HasEnhancedByteBufferAccess {
  @VisibleForTesting
  public static boolean tcpReadsDisabledForTesting = false;
  private long hedgedReadOpsLoopNumForTesting = 0;
  private final DFSClient dfsClient;
  private boolean closed = false;
  private final String src;
  private BlockReader blockReader = null;
  private final boolean verifyChecksum;
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private FileEncryptionInfo fileEncryptionInfo = null;
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private CachingStrategy cachingStrategy;
  private final ReadStatistics readStatistics = new ReadStatistics();

  /**
   * Track the ByteBuffers that we have handed out to readers.
   *
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
   */
  private final IdentityHashStore<ByteBuffer, Object>
      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);

  public static class ReadStatistics {
    public ReadStatistics() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
      this.totalZeroCopyBytesRead = 0;
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
    }

    /**
     * @return The total bytes read.  This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read.  This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of zero-copy bytes read.
     */
    public long getTotalZeroCopyBytesRead() {
      return totalZeroCopyBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    void addZeroCopyBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
      this.totalZeroCopyBytesRead += amt;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;

    private long totalZeroCopyBytesRead;
  }

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation. That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic, the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;

  /* XXX Use of ConcurrentHashMap is a temporary fix. Parallel accesses to
   * DFSInputStream (through multiple threads) still need to be handled
   * properly. */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
      new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
  private int buffersize = 1;

  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, int buffersize,
      boolean verifyChecksum) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.buffersize = buffersize;
    this.src = src;
    this.cachingStrategy =
        dfsClient.getDefaultReadCachingStrategy();
    openInfo();
  }

  /**
   * Grab the open-file info from the namenode.
   */
  synchronized void openInfo() throws IOException, UnresolvedLinkException {
    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
    int retriesForLastBlockLength =
        dfsClient.getConf().retryTimesForGetLastBlockLength;
    while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report in immediately. At that point
      // partial block locations are not yet available on the NN for
      // computing the length, so retry a limited number of times.
      if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
            + " Will retry for " + retriesForLastBlockLength + " times");
        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
      } else {
        break;
      }
      retriesForLastBlockLength--;
    }
    if (retriesForLastBlockLength == 0) {
      throw new IOException("Could not obtain the last block locations.");
    }
  }

  private void waitFor(int waitTime) throws IOException {
    try {
      Thread.sleep(waitTime);
    } catch (InterruptedException e) {
      throw new IOException(
          "Interrupted while getting the last block length.");
    }
  }

  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("newInfo = " + newInfo);
    }
    if (newInfo == null) {
      throw new IOException("Cannot open filename " + src);
    }

    if (locatedBlocks != null) {
      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
      while (oldIter.hasNext() && newIter.hasNext()) {
        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
          throw new IOException("Blocklist for " + src + " has changed!");
        }
      }
    }
    locatedBlocks = newInfo;
    long lastBlockBeingWrittenLength = 0;
    if (!locatedBlocks.isLastBlockComplete()) {
      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
      if (last != null) {
        if (last.getLocations().length == 0) {
          if (last.getBlockSize() == 0) {
            // If the length is zero, then no data has been written to
            // any datanode yet, so there is no need to wait for the locations.
            return 0;
          }
          return -1;
        }
        final long len = readBlockLength(last);
        last.getBlock().setNumBytes(len);
        lastBlockBeingWrittenLength = len;
      }
    }

    fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo();

    currentNode = null;
    return lastBlockBeingWrittenLength;
  }

  /** Read the block length from one of the datanodes. */
  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
    assert locatedblock != null : "LocatedBlock cannot be null";
    int replicaNotFoundCount = locatedblock.getLocations().length;

    for (DatanodeInfo datanode : locatedblock.getLocations()) {
      ClientDatanodeProtocol cdp = null;

      try {
        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
            dfsClient.getConf().connectToDnViaHostname, locatedblock);

        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());

        if (n >= 0) {
          return n;
        }
      } catch (IOException ioe) {
        if (ioe instanceof RemoteException &&
            (((RemoteException) ioe).unwrapRemoteException() instanceof
                ReplicaNotFoundException)) {
          // special case : replica might not be on the DN, treat as 0 length
          replicaNotFoundCount--;
        }

        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
              + datanode + " for block " + locatedblock.getBlock(), ioe);
        }
      } finally {
        if (cdp != null) {
          RPC.stopProxy(cdp);
        }
      }
    }

    // The namenode told us about these locations, but none of the datanodes
    // know about the replica, which means we hit the race between the start
    // and the end of pipeline creation. We only treat the block as zero-length
    // if every located datanode reported ReplicaNotFound, because some other
    // exception could have happened on a DN that does hold the replica, and
    // we want to report that error instead.
    if (replicaNotFoundCount == 0) {
      return 0;
    }

    throw new IOException("Cannot obtain block length for " + locatedblock);
  }

  public synchronized long getFileLength() {
    return locatedBlocks == null? 0:
        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
  }

  // Short circuit local reads are forbidden for files that are
  // under construction.  See HDFS-2757.
  synchronized boolean shortCircuitForbidden() {
    return locatedBlocks.isUnderConstruction();
  }

  /**
   * Returns the datanode from which the stream is currently reading.
   */
  public DatanodeInfo getCurrentDatanode() {
    return currentNode;
  }

  /**
   * Returns the block containing the target position.
   */
  synchronized public ExtendedBlock getCurrentBlock() {
    if (currentLocatedBlock == null) {
      return null;
    }
    return currentLocatedBlock.getBlock();
  }

  /**
   * Return the collection of blocks that have already been located.
   */
  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
    return getBlockRange(0, getFileLength());
  }

  /**
   * Get the block at the specified position.
   * Fetch it from the namenode if not cached.
   *
   * @param offset the block corresponding to this offset in the file is returned
   * @param updatePosition whether to update the current position
   * @return located block
   * @throws IOException
   */
  private synchronized LocatedBlock getBlockAt(long offset,
      boolean updatePosition) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";

    final LocatedBlock blk;

    // check offset
    if (offset < 0 || offset >= getFileLength()) {
      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
          + offset
          + ", updatePosition=" + updatePosition
          + ", locatedBlocks=" + locatedBlocks);
    }
    else if (offset >= locatedBlocks.getFileLength()) {
      // offset to the portion of the last block,
      // which is not known to the name-node yet;
      // getting the last block
      blk = locatedBlocks.getLastLocatedBlock();
    }
    else {
      // search cached blocks first
      int targetBlockIdx = locatedBlocks.findBlock(offset);
      if (targetBlockIdx < 0) { // block is not cached
        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
        // fetch more blocks
        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
        assert (newBlocks != null) : "Could not find target position " + offset;
        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
      }
      blk = locatedBlocks.get(targetBlockIdx);
    }

    // update current position
    if (updatePosition) {
      pos = offset;
      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
      currentLocatedBlock = blk;
    }
    return blk;
  }

  /** Fetch a block from the namenode and cache it. */
  private synchronized void fetchBlockAt(long offset) throws IOException {
    int targetBlockIdx = locatedBlocks.findBlock(offset);
    if (targetBlockIdx < 0) { // block is not cached
      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
    }
    // fetch blocks
    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
    if (newBlocks == null) {
      throw new IOException("Could not find target position " + offset);
    }
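    // Splice the newly fetched locations into the cached block list at the
    // computed insertion point so that later lookups can find them.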
    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
  }

  /**
   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * will not get a read request beyond the EOF.
   * @param offset starting offset in file
   * @param length length of data
   * @return consecutive segment of located blocks
   * @throws IOException
   */
  private synchronized List<LocatedBlock> getBlockRange(long offset,
      long length) throws IOException {
    // getFileLength(): returns total file length
    // locatedBlocks.getFileLength(): returns length of completed blocks
    if (offset >= getFileLength()) {
      throw new IOException("Offset: " + offset +
          " exceeds file length: " + getFileLength());
    }

    final List<LocatedBlock> blocks;
    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
    final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;

    if (readOffsetWithinCompleteBlk) {
      // get the blocks of the finalized (completed) block range
      blocks = getFinalizedBlockRange(offset,
          Math.min(length, lengthOfCompleteBlk - offset));
    } else {
      blocks = new ArrayList<LocatedBlock>(1);
    }

    // get the blocks from the incomplete block range
    if (readLengthPastCompleteBlk) {
      blocks.add(locatedBlocks.getLastLocatedBlock());
    }

    return blocks;
  }

  /**
   * Get blocks in the specified range.
   * Includes only the complete blocks.
   * Fetch them from the namenode if not cached.
   */
  private synchronized List<LocatedBlock> getFinalizedBlockRange(
      long offset, long length) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";
    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
    // search cached blocks first
    int blockIdx = locatedBlocks.findBlock(offset);
    if (blockIdx < 0) { // block is not cached
      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
    }
    long remaining = length;
    long curOff = offset;
    while (remaining > 0) {
      LocatedBlock blk = null;
      if (blockIdx < locatedBlocks.locatedBlockCount())
        blk = locatedBlocks.get(blockIdx);
      if (blk == null || curOff < blk.getStartOffset()) {
        LocatedBlocks newBlocks;
        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
        continue;
      }
      assert curOff >= blk.getStartOffset() : "Block not found";
      blockRange.add(blk);
      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
      remaining -= bytesRead;
      curOff += bytesRead;
      blockIdx++;
    }
    return blockRange;
  }

  /**
   * Open a DataInputStream to a DataNode so that it can be read from.
   * We get the block ID and the IDs of the destinations at startup,
   * from the namenode.
   */
  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
    if (target >= getFileLength()) {
      throw new IOException("Attempted to read past end of file");
    }

    // Will be getting a new BlockReader.
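    // Close any reader that is still open for the previous block before
    // connecting to the datanode chosen for the new target offset.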
    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }

    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    DatanodeInfo chosenNode = null;
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    boolean connectFailedOnce = false;

    while (true) {
      //
      // Compute desired block
      //
      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
      long offsetIntoBlock = target - targetBlock.getStartOffset();

      DNAddrPair retval = chooseDataNode(targetBlock, null);
      chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;
      StorageType storageType = retval.storageType;

      try {
        ExtendedBlock blk = targetBlock.getBlock();
        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
        blockReader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setStorageType(storageType).
            setFileName(src).
            setBlock(blk).
            setBlockToken(accessToken).
            setStartOffset(offsetIntoBlock).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(blk.getNumBytes() - offsetIntoBlock).
            setCachingStrategy(cachingStrategy).
            setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        if (connectFailedOnce) {
          DFSClient.LOG.info("Successfully connected to " + targetAddr +
              " for " + blk);
        }
        return chosenNode;
      } catch (IOException ex) {
        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + ex);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
          refetchToken--;
          fetchBlockAt(target);
        } else {
          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
              + ", add to deadNodes and continue. " + ex, ex);
          // Put chosen node into dead list, continue
          addToDeadNodes(chosenNode);
        }
      }
    }
  }

  /**
   * Close it down!
   */
  @Override
  public synchronized void close() throws IOException {
    if (closed) {
      return;
    }
    dfsClient.checkOpen();

    if (!extendedReadBuffers.isEmpty()) {
      final StringBuilder builder = new StringBuilder();
      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
        private String prefix = "";
        @Override
        public void accept(ByteBuffer k, Object v) {
          builder.append(prefix).append(k);
          prefix = ", ";
        }
      });
      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
          "unreleased ByteBuffers allocated by read(). " +
          "Please release " + builder.toString() + ".");
    }
    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }
    super.close();
    closed = true;
  }

  @Override
  public synchronized int read() throws IOException {
    int ret = read( oneByteBuf, 0, 1 );
    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
  }

  /**
   * Wraps different possible read implementations so that readBuffer can be
   * strategy-agnostic.
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException;
  }

  private static void updateReadStatistics(ReadStatistics readStatistics,
      int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    if (blockReader.isShortCircuit()) {
      readStatistics.addShortCircuitBytes(nRead);
    } else if (blockReader.isLocal()) {
      readStatistics.addLocalBytes(nRead);
    } else {
      readStatistics.addRemoteBytes(nRead);
    }
  }

  /**
   * Used to read bytes into a byte[].
   */
  private static class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int nRead = blockReader.read(buf, off, len);
      updateReadStatistics(readStatistics, nRead, blockReader);
      return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer.
   */
  private static class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }

  /* This is used by the regular read() path and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure of the datanode or the client: the DataNode closing the
     * connection because the client has been idle. If there are other such
     * "non-error" cases, a datanode might be retried by setting this to true
     * again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
      try {
        return reader.doRead(blockReader, off, len, readStatistics);
      } catch ( ChecksumException ce ) {
        DFSClient.LOG.warn("Found Checksum error for "
            + getCurrentBlock() + " from " + currentNode
            + " at " + ce.getPos());
        ioe = ce;
        retryCurrentNode = false;
        // we want to remember which block replicas we have tried
        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
            corruptedBlockMap);
      } catch ( IOException e ) {
        if (!retryCurrentNode) {
          DFSClient.LOG.warn("Exception while reading from "
              + getCurrentBlock() + " of " + src + " from "
              + currentNode, e);
        }
        ioe = e;
      }
      boolean sourceFound = false;
      if (retryCurrentNode) {
        /* possibly retry the same node so that transient errors don't
         * result in application level failures (e.g. Datanode could have
         * closed the connection because the client is idle for too long).
         */
        sourceFound = seekToBlockSource(pos);
      } else {
        addToDeadNodes(currentNode);
        sourceFound = seekToNewSource(pos);
      }
      if (!sourceFound) {
        throw ioe;
      }
      retryCurrentNode = false;
    }
  }

  private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    failures = 0;
    if (pos < getFileLength()) {
      int retries = 2;
      while (retries > 0) {
        try {
          // currentNode can be left as null if previous read had a checksum
          // error on the same block. See HDFS-3067
          if (pos > blockEnd || currentNode == null) {
            currentNode = blockSeekTo(pos);
          }
          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
          if (locatedBlocks.isLastBlockComplete()) {
            realLen = (int) Math.min(realLen, locatedBlocks.getFileLength());
          }
          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);

          if (result >= 0) {
            pos += result;
          } else {
            // got an EOS from the reader even though we expect more data.
            throw new IOException("Unexpected EOS from the reader");
          }
          if (dfsClient.stats != null) {
            dfsClient.stats.incrementBytesRead(result);
          }
          return result;
        } catch (ChecksumException ce) {
          throw ce;
        } catch (IOException e) {
          if (retries == 1) {
            DFSClient.LOG.warn("DFS Read", e);
          }
          blockEnd = -1;
          if (currentNode != null) { addToDeadNodes(currentNode); }
          if (--retries == 0) {
            throw e;
          }
        } finally {
          // Report corrupt block replicas (if any), whether the read
          // succeeded or a ChecksumException occurred.
          reportCheckSumFailure(corruptedBlockMap,
              currentLocatedBlock.getLocations().length);
        }
      }
    }
    return -1;
  }

  /**
   * Read the entire buffer.
   */
  @Override
  public synchronized int read(final byte buf[], int off, int len) throws IOException {
    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);

    return readWithStrategy(byteArrayReader, off, len);
  }

  @Override
  public synchronized int read(final ByteBuffer buf) throws IOException {
    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);

    return readWithStrategy(byteBufferReader, 0, buf.remaining());
  }


  /**
   * Add a corrupted block replica into the map.
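   * The map is later consumed by reportCheckSumFailure(), which reports the
   * corrupt replicas back to the namenode.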
   */
  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    Set<DatanodeInfo> dnSet = null;
    if ((corruptedBlockMap.containsKey(blk))) {
      dnSet = corruptedBlockMap.get(blk);
    } else {
      dnSet = new HashSet<DatanodeInfo>();
    }
    if (!dnSet.contains(node)) {
      dnSet.add(node);
      corruptedBlockMap.put(blk, dnSet);
    }
  }

  private DNAddrPair chooseDataNode(LocatedBlock block,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    while (true) {
      try {
        return getBestNodeDNAddrPair(block, ignoredNodes);
      } catch (IOException ie) {
        String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
            deadNodes, ignoredNodes);
        String blockInfo = block.getBlock() + " file=" + src;
        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
          String description = "Could not obtain block: " + blockInfo;
          DFSClient.LOG.warn(description + errMsg
              + ". Throwing a BlockMissingException");
          throw new BlockMissingException(src, description,
              block.getStartOffset());
        }

        DatanodeInfo[] nodes = block.getLocations();
        if (nodes == null || nodes.length == 0) {
          DFSClient.LOG.info("No node available for " + blockInfo);
        }
        DFSClient.LOG.info("Could not obtain " + block.getBlock()
            + " from any node: " + ie + errMsg
            + ". Will get new block locations from namenode and retry...");
        try {
          // Introduce a random factor into the wait time before another retry.
          // The wait time depends on the number of failures and a random factor.
          // The first time a BlockMissingException is caught, the wait time
          // is a random number between 0..3000 ms. If the first retry
          // still fails, we will wait 3000 ms grace period before the 2nd retry.
          // Also at the second retry, the waiting window is expanded to 6000 ms
          // to alleviate the request rate on the server. Similarly, the 3rd retry
          // waits a 6000 ms grace period before retrying, and the waiting window
          // is expanded to 9000 ms.
          final int timeWindow = dfsClient.getConf().timeWindow;
          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
              timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1)
              + " IOException, will wait for " + waitTime + " msec.");
          Thread.sleep((long)waitTime);
        } catch (InterruptedException iex) {
        }
        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
        openInfo();
        block = getBlockAt(block.getStartOffset(), false);
        failures++;
        continue;
      }
    }
  }

  /**
   * Get the best node from which to stream the data.
   * @param block LocatedBlock, containing nodes in priority order.
   * @param ignoredNodes Do not choose nodes in this array (may be null)
   * @return The DNAddrPair of the best node.
   * @throws IOException
   */
  private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    DatanodeInfo[] nodes = block.getLocations();
    StorageType[] storageTypes = block.getStorageTypes();
    DatanodeInfo chosenNode = null;
    StorageType storageType = null;
    if (nodes != null) {
      for (int i = 0; i < nodes.length; i++) {
        if (!deadNodes.containsKey(nodes[i])
            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
          chosenNode = nodes[i];
          // Storage types are ordered to correspond with nodes, so use the same
          // index to get the storage type.
          if (storageTypes != null && i < storageTypes.length) {
            storageType = storageTypes[i];
          }
          break;
        }
      }
    }
    if (chosenNode == null) {
      throw new IOException("No live nodes contain block " + block.getBlock() +
          " after checking nodes = " + Arrays.toString(nodes) +
          ", ignoredNodes = " + ignoredNodes);
    }
    final String dnAddr =
        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
    }
    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
    return new DNAddrPair(chosenNode, targetAddr, storageType);
  }

  private static String getBestNodeDNAddrPairErrorString(
      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
    StringBuilder errMsgr = new StringBuilder(
        " No live nodes contain current block ");
    errMsgr.append("Block locations:");
    for (DatanodeInfo datanode : nodes) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    errMsgr.append(" Dead nodes: ");
    for (DatanodeInfo datanode : deadNodes.keySet()) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    if (ignoredNodes != null) {
      errMsgr.append(" Ignored nodes: ");
      for (DatanodeInfo datanode : ignoredNodes) {
        errMsgr.append(" ");
        errMsgr.append(datanode.toString());
      }
    }
    return errMsgr.toString();
  }

  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
      byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    block = getBlockAt(block.getStartOffset(), false);
    while (true) {
      DNAddrPair addressPair = chooseDataNode(block, null);
      try {
        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
            corruptedBlockMap);
        return;
      } catch (IOException e) {
        // Ignore. Already processed inside the function.
        // Loop through to try the next node.
      }
    }
  }

  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
      final LocatedBlock block, final long start, final long end,
      final ByteBuffer bb,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    return new Callable<ByteBuffer>() {
      @Override
      public ByteBuffer call() throws Exception {
        byte[] buf = bb.array();
        int offset = bb.position();
        actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
            corruptedBlockMap);
        return bb;
      }
    };
  }

  private void actualGetFromOneDataNode(final DNAddrPair datanode,
      LocatedBlock block, final long start, final long end, byte[] buf,
      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    DFSClientFaultInjector.get().startFetchFromDatanode();
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    while (true) {
      // Cached block locations may have been updated by chooseDataNode()
      // or fetchBlockAt(). Always get the latest list of locations at the
      // start of the loop.
      CachingStrategy curCachingStrategy;
      boolean allowShortCircuitLocalReads;
      synchronized (this) {
        block = getBlockAt(block.getStartOffset(), false);
        curCachingStrategy = cachingStrategy;
        allowShortCircuitLocalReads = !shortCircuitForbidden();
      }
      DatanodeInfo chosenNode = datanode.info;
      InetSocketAddress targetAddr = datanode.addr;
      StorageType storageType = datanode.storageType;
      BlockReader reader = null;

      try {
        DFSClientFaultInjector.get().fetchFromDatanodeException();
        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
        int len = (int) (end - start + 1);
        reader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setStorageType(storageType).
            setFileName(src).
            setBlock(block.getBlock()).
            setBlockToken(blockToken).
            setStartOffset(start).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(len).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        int nread = reader.readAll(buf, offset, len);
        updateReadStatistics(readStatistics, nread, reader);

        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
              "expected " + len + ", got " + nread);
        }
        DFSClientFaultInjector.get().readFromDatanodeDelay();
        return;
      } catch (ChecksumException e) {
        String msg = "fetchBlockByteRange(). Got a checksum exception for "
            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
            + chosenNode;
        DFSClient.LOG.warn(msg);
        // we want to remember what we have tried
        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
        addToDeadNodes(chosenNode);
        throw new IOException(msg);
      } catch (IOException e) {
        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + e);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
          continue;
        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
          refetchToken--;
          try {
            fetchBlockAt(block.getStartOffset());
          } catch (IOException fbae) {
            // ignore IOE, since we can retry it later in a loop
          }
          continue;
        } else {
          String msg = "Failed to connect to " + targetAddr + " for file "
              + src + " for block " + block.getBlock() + ":" + e;
          DFSClient.LOG.warn("Connection failure: " + msg, e);
          addToDeadNodes(chosenNode);
          throw new IOException(msg);
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  }

  /**
   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time. We then wait on whichever read returns first.
   */
  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
      long end, byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
    CompletionService<ByteBuffer> hedgedService =
        new ExecutorCompletionService<ByteBuffer>(
            dfsClient.getHedgedReadsThreadPool());
    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
    ByteBuffer bb = null;
    int len = (int) (end - start + 1);
    block = getBlockAt(block.getStartOffset(), false);
    while (true) {
      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
      hedgedReadOpsLoopNumForTesting++;
      DNAddrPair chosenNode = null;
      // there is no request already executing.
      if (futures.isEmpty()) {
        // chooseDataNode is a commitment. If there is no node, we go to
        // the NN to re-fetch block locations. Only go here on the first read.
        chosenNode = chooseDataNode(block, ignored);
        bb = ByteBuffer.wrap(buf, offset, len);
        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
            chosenNode, block, start, end, bb, corruptedBlockMap);
        Future<ByteBuffer> firstRequest = hedgedService
            .submit(getFromDataNodeCallable);
        futures.add(firstRequest);
        try {
          Future<ByteBuffer> future = hedgedService.poll(
              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
          if (future != null) {
            future.get();
            return;
          }
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
                + "ms to read from " + chosenNode.info
                + "; spawning hedged read");
          }
          // Ignore this node on the next go around.
          ignored.add(chosenNode.info);
          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          continue; // no need to refresh block locations
        } catch (InterruptedException e) {
          // Ignore
        } catch (ExecutionException e) {
          // Ignore; already logged in the call.
        }
      } else {
        // We are starting up a 'hedged' read. We have a read already
        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
        // If there are no nodes to do hedged reads against, pass.
        try {
          try {
            chosenNode = getBestNodeDNAddrPair(block, ignored);
          } catch (IOException ioe) {
            chosenNode = chooseDataNode(block, ignored);
          }
          bb = ByteBuffer.allocate(len);
          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
              chosenNode, block, start, end, bb, corruptedBlockMap);
          Future<ByteBuffer> oneMoreRequest = hedgedService
              .submit(getFromDataNodeCallable);
          futures.add(oneMoreRequest);
        } catch (IOException ioe) {
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Failed getting node for hedged read: "
                + ioe.getMessage());
          }
        }
        // Whether or not a new hedged request was submitted, wait for one of
        // the outstanding reads to complete and use its result.
        try {
          ByteBuffer result = getFirstToComplete(hedgedService, futures);
          // cancel the rest.
          cancelAll(futures);
          if (result.array() != buf) { // compare the array pointers
            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
            System.arraycopy(result.array(), result.position(), buf, offset,
                len);
          } else {
            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          }
          return;
        } catch (InterruptedException ie) {
          // Ignore and retry
        }
        // We get here on an exception. Ignore this node on the next go around
        // IFF we found a chosenNode to hedge the read against.
        if (chosenNode != null && chosenNode.info != null) {
          ignored.add(chosenNode.info);
        }
      }
    }
  }

  @VisibleForTesting
  public long getHedgedReadOpsLoopNumForTesting() {
    return hedgedReadOpsLoopNumForTesting;
  }

  private ByteBuffer getFirstToComplete(
      CompletionService<ByteBuffer> hedgedService,
      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
    if (futures.isEmpty()) {
      throw new InterruptedException("let's retry");
    }
    Future<ByteBuffer> future = null;
    try {
      future = hedgedService.take();
      ByteBuffer bb = future.get();
      futures.remove(future);
      return bb;
    } catch (ExecutionException e) {
      // already logged in the Callable
      futures.remove(future);
    } catch (CancellationException ce) {
      // already logged in the Callable
      futures.remove(future);
    }

    throw new InterruptedException("let's retry");
  }

  private void cancelAll(List<Future<ByteBuffer>> futures) {
    for (Future<ByteBuffer> future : futures) {
      // Unfortunately, hdfs reads do not take kindly to interruption.
      // Threads return a variety of interrupted-type exceptions, but also
      // complaints about invalid pbs -- likely because a read gets
      // interrupted before it receives the whole pb. It also causes verbose
      // WARN logging. So, for now, do not interrupt a running read.
      future.cancel(false);
    }
  }

  /**
   * Should the block access token be refetched on an exception?
   *
   * @param ex Exception received
   * @param targetAddr Target datanode address from where exception was received
   * @return true if the block access token has expired or is invalid and
   *         should be refetched
   */
  private static boolean tokenRefetchNeeded(IOException ex,
      InetSocketAddress targetAddr) {
    /*
     * Get a new access token and retry. Retry is needed in 2 cases. 1)
     * When both NN and DN re-started while DFSClient holding a cached
     * access token. 2) In the case that NN fails to update its
     * access key at pre-set interval (by a wide margin) and
     * subsequently restarts. In this case, DN re-registers itself with
     * NN and receives a new access key, but DN will delete the old
     * access key from its memory since it's considered expired based on
     * the estimated expiration date.
     */
    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
      DFSClient.LOG.info("Access token was invalid when connecting to "
          + targetAddr + " : " + ex);
      return true;
    }
    return false;
  }

  /**
   * Read bytes starting from the specified position.
   *
   * @param position start read from this position
   * @param buffer read buffer
   * @param offset offset into buffer
   * @param length number of bytes to read
   *
   * @return actual number of bytes read
   */
  @Override
  public int read(long position, byte[] buffer, int offset, int length)
      throws IOException {
    // sanity checks
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    failures = 0;
    long filelen = getFileLength();
    if ((position < 0) || (position >= filelen)) {
      return -1;
    }
    int realLen = length;
    if ((position + length) > filelen) {
      realLen = (int)(filelen - position);
    }

    // determine the block and byte range within the block
    // corresponding to position and realLen
    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
    int remaining = realLen;
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    for (LocatedBlock blk : blockRange) {
      long targetStart = position - blk.getStartOffset();
      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
      try {
        if (dfsClient.isHedgedReadsEnabled()) {
          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
              - 1, buffer, offset, corruptedBlockMap);
        } else {
          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
              buffer, offset, corruptedBlockMap);
        }
      } finally {
        // Check and report if any block replicas are corrupted.
        // BlockMissingException may be caught if all block replicas are
        // corrupted.
        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
      }

      remaining -= bytesToRead;
      position += bytesToRead;
      offset += bytesToRead;
    }
    assert remaining == 0 : "Wrong number of bytes read.";
    if (dfsClient.stats != null) {
      dfsClient.stats.incrementBytesRead(realLen);
    }
    return realLen;
  }

  /**
   * DFSInputStream reports checksum failure.
   * Case I : client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted blocks
   * to the namenode.
   * Case II: client has tried out all data nodes, but all failed. We
   * only report if the total number of replicas is 1. We do not
   * report otherwise, since the failure may be due to the client itself
   * being unable to read.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that contain the block replicas
   */
  private void reportCheckSumFailure(
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      int dataNodeCount) {
    if (corruptedBlockMap.isEmpty()) {
      return;
    }
    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
        .entrySet().iterator();
    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
    ExtendedBlock blk = entry.getKey();
    Set<DatanodeInfo> dnSet = entry.getValue();
    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
      int i = 0;
      for (DatanodeInfo dn:dnSet) {
        locs[i++] = dn;
      }
      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
      dfsClient.reportChecksumFailure(src, lblocks);
    }
    corruptedBlockMap.clear();
  }

  @Override
  public long skip(long n) throws IOException {
    if ( n > 0 ) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if( n+curPos > fileLen ) {
        n = fileLen - curPos;
      }
      seek(curPos+n);
      return n;
    }
    return n < 0 ? -1 : 0;
  }

  /**
   * Seek to a new arbitrary location.
   */
  @Override
  public synchronized void seek(long targetPos) throws IOException {
    if (targetPos > getFileLength()) {
      throw new EOFException("Cannot seek after EOF");
    }
    if (targetPos < 0) {
      throw new EOFException("Cannot seek to negative offset");
    }
    if (closed) {
      throw new IOException("Stream is closed!");
    }
    boolean done = false;
    if (pos <= targetPos && targetPos <= blockEnd) {
      //
      // If this seek is to a positive position in the current
      // block, and this piece of data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
      //
      int diff = (int)(targetPos - pos);
      if (diff <= blockReader.available()) {
        try {
          pos += blockReader.skip(diff);
          if (pos == targetPos) {
            done = true;
          } else {
            // The range was already checked. If the block reader returns
            // something unexpected instead of throwing an exception, it is
            // most likely a bug.
            String errMsg = "BlockReader failed to seek to " +
                targetPos + ". Instead, it seeked to " + pos + ".";
            DFSClient.LOG.warn(errMsg);
            throw new IOException(errMsg);
          }
        } catch (IOException e) { // make the following read retry
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Exception while seek to " + targetPos
                + " from " + getCurrentBlock() + " of " + src + " from "
                + currentNode, e);
          }
        }
      }
    }
    if (!done) {
      pos = targetPos;
      blockEnd = -1;
    }
  }

  /**
   * Same as {@link #seekToNewSource(long)} except that it does not exclude
   * the current datanode and might connect to the same node.
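   * readBuffer() uses this to retry transient errors (for example, a datanode
   * closing an idle connection) against the same datanode.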
   */
  private synchronized boolean seekToBlockSource(long targetPos)
      throws IOException {
    currentNode = blockSeekTo(targetPos);
    return true;
  }

  /**
   * Seek to given position on a node other than the current node.  If
   * a node other than the current node is found, then returns true.
   * If another node could not be found, then returns false.
   */
  @Override
  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
    boolean markedDead = deadNodes.containsKey(currentNode);
    addToDeadNodes(currentNode);
    DatanodeInfo oldNode = currentNode;
    DatanodeInfo newNode = blockSeekTo(targetPos);
    if (!markedDead) {
      /* remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's ok. */
      deadNodes.remove(oldNode);
    }
    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
      currentNode = newNode;
      return true;
    } else {
      return false;
    }
  }

  /**
   * Return the current position in this stream.
   */
  @Override
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /** Return the size of the remaining available bytes
   * if the size is less than or equal to {@link Integer#MAX_VALUE},
   * otherwise, return {@link Integer#MAX_VALUE}.
   */
  @Override
  public synchronized int available() throws IOException {
    if (closed) {
      throw new IOException("Stream closed");
    }

    final long remaining = getFileLength() - pos;
    return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
  }

  /**
   * We definitely don't support marks.
   */
  @Override
  public boolean markSupported() {
    return false;
  }
  @Override
  public void mark(int readLimit) {
  }
  @Override
  public void reset() throws IOException {
    throw new IOException("Mark/reset not supported");
  }

  /** Utility class to encapsulate data node info and its address. */
  private static final class DNAddrPair {
    final DatanodeInfo info;
    final InetSocketAddress addr;
    final StorageType storageType;

    DNAddrPair(DatanodeInfo info, InetSocketAddress addr,
        StorageType storageType) {
      this.info = info;
      this.addr = addr;
      this.storageType = storageType;
    }
  }

  /**
   * Get statistics about the reads which this DFSInputStream has done.
   */
  public synchronized ReadStatistics getReadStatistics() {
    return new ReadStatistics(readStatistics);
  }

  public synchronized FileEncryptionInfo getFileEncryptionInfo() {
    return fileEncryptionInfo;
  }

  private synchronized void closeCurrentBlockReader() {
    if (blockReader == null) return;
    // Close the current block reader so that the new caching settings can
    // take effect immediately.
    try {
      blockReader.close();
    } catch (IOException e) {
      DFSClient.LOG.error("error closing blockReader", e);
    }
    blockReader = null;
  }

  @Override
  public synchronized void setReadahead(Long readahead)
      throws IOException {
    this.cachingStrategy =
        new CachingStrategy.Builder(this.cachingStrategy).
            setReadahead(readahead).build();
    closeCurrentBlockReader();
  }

  @Override
  public synchronized void setDropBehind(Boolean dropBehind)
      throws IOException {
    this.cachingStrategy =
        new CachingStrategy.Builder(this.cachingStrategy).
            setDropBehind(dropBehind).build();
    closeCurrentBlockReader();
  }

  /**
   * The immutable empty buffer we return when we reach EOF when doing a
   * zero-copy read.
   */
  private static final ByteBuffer EMPTY_BUFFER =
      ByteBuffer.allocateDirect(0).asReadOnlyBuffer();

  @Override
  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
      int maxLength, EnumSet<ReadOption> opts)
          throws IOException, UnsupportedOperationException {
    if (maxLength == 0) {
      return EMPTY_BUFFER;
    } else if (maxLength < 0) {
      throw new IllegalArgumentException("can't read a negative " +
          "number of bytes.");
    }
    if ((blockReader == null) || (blockEnd == -1)) {
      if (pos >= getFileLength()) {
        return null;
      }
      /*
       * If we don't have a blockReader, or the one we have has no more bytes
       * left to read, we call seekToBlockSource to get a new blockReader and
       * recalculate blockEnd.  Note that we assume we're not at EOF here
       * (we check this above).
       */
      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
        throw new IOException("failed to allocate new BlockReader " +
            "at position " + pos);
      }
    }
    ByteBuffer buffer = null;
    if (dfsClient.getConf().shortCircuitMmapEnabled) {
      buffer = tryReadZeroCopy(maxLength, opts);
    }
    if (buffer != null) {
      return buffer;
    }
    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
    if (buffer != null) {
      extendedReadBuffers.put(buffer, bufferPool);
    }
    return buffer;
  }

  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
      EnumSet<ReadOption> opts) throws IOException {
    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
    // JVM to optimize this function.
    final long curPos = pos;
    final long curEnd = blockEnd;
    final long blockStartInFile = currentLocatedBlock.getStartOffset();
    final long blockPos = curPos - blockStartInFile;

    // Shorten this read if the end of the block is nearby.
    long length63;
    if ((curPos + maxLength) <= (curEnd + 1)) {
      length63 = maxLength;
    } else {
      length63 = 1 + curEnd - curPos;
      if (length63 <= 0) {
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
              curPos + " of " + src + "; " + length63 + " bytes left in block. " +
              "blockPos=" + blockPos + "; curPos=" + curPos +
              "; curEnd=" + curEnd);
        }
        return null;
      }
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length63 + " to avoid going more than one byte " +
            "past the end of the block.  blockPos=" + blockPos +
            "; curPos=" + curPos + "; curEnd=" + curEnd);
      }
    }
    // Make sure we don't go beyond 31-bit offsets in the MappedByteBuffer.
    int length;
    if (blockPos + length63 <= Integer.MAX_VALUE) {
      length = (int)length63;
    } else {
      long length31 = Integer.MAX_VALUE - blockPos;
      if (length31 <= 0) {
        // Java ByteBuffers can't be longer than 2 GB, because they use
        // 4-byte signed integers to represent capacity, etc.
        // So we can't mmap the parts of the block higher than the 2 GB offset.
        // FIXME: we could work around this with multiple memory maps.
        // See HDFS-5101.
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
              curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
              "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
        }
        return null;
      }
      length = (int)length31;
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length + " to avoid 31-bit limit.  " +
            "blockPos=" + blockPos + "; curPos=" + curPos +
            "; curEnd=" + curEnd);
      }
    }
    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
    if (clientMmap == null) {
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
            curPos + " of " + src + "; BlockReader#getClientMmap returned " +
            "null.");
      }
      return null;
    }
    boolean success = false;
    ByteBuffer buffer;
    try {
      seek(curPos + length);
      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
      buffer.position((int)blockPos);
      buffer.limit((int)(blockPos + length));
      extendedReadBuffers.put(buffer, clientMmap);
      readStatistics.addZeroCopyBytes(length);
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("readZeroCopy read " + length +
            " bytes from offset " + curPos + " via the zero-copy read " +
            "path.  blockEnd = " + blockEnd);
      }
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeQuietly(clientMmap);
      }
    }
    return buffer;
  }

  @Override
  public synchronized void releaseBuffer(ByteBuffer buffer) {
    if (buffer == EMPTY_BUFFER) return;
    Object val = extendedReadBuffers.remove(buffer);
    if (val == null) {
      throw new IllegalArgumentException("tried to release a buffer " +
          "that was not created by this stream, " + buffer);
    }
    if (val instanceof ClientMmap) {
      IOUtils.closeQuietly((ClientMmap)val);
    } else if (val instanceof ByteBufferPool) {
      ((ByteBufferPool)val).putBuffer(buffer);
    }
  }
}