/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.EOFException;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ByteBufferUtil;
import org.apache.hadoop.fs.CanSetDropBehind;
import org.apache.hadoop.fs.CanSetReadahead;
import org.apache.hadoop.fs.CanUnbuffer;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileEncryptionInfo;
import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.hdfs.shortcircuit.ClientMmap;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.IdentityHashStore;
import org.apache.htrace.Span;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;

import com.google.common.annotations.VisibleForTesting;
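
/*
 * Illustrative usage (a hedged sketch, not part of the original source): a
 * DFSInputStream is normally obtained through DFSClient#open() rather than
 * constructed directly, and supports both stateful and positional reads.
 * The path below is hypothetical, and the exact open() overloads available
 * depend on the DFSClient version in use.
 *
 *   DFSInputStream in = dfsClient.open("/user/example/data.bin");
 *   byte[] buf = new byte[4096];
 *   int n = in.read(buf, 0, buf.length);        // stateful (sequential) read
 *   int m = in.read(8192L, buf, 0, buf.length); // positional read (pread)
 *   in.close();
 */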

/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and various datanodes as necessary.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream
    implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
        HasEnhancedByteBufferAccess, CanUnbuffer {
  @VisibleForTesting
  public static boolean tcpReadsDisabledForTesting = false;
  private long hedgedReadOpsLoopNumForTesting = 0;
  private final DFSClient dfsClient;
  private AtomicBoolean closed = new AtomicBoolean(false);
  private final String src;
  private final boolean verifyChecksum;

  // state used by stateful read only:
  // (protected by lock on this)
  ////
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private BlockReader blockReader = null;
  ////

  // state shared by stateful and positional read:
  // (protected by lock on infoLock)
  ////
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private FileEncryptionInfo fileEncryptionInfo = null;
  private CachingStrategy cachingStrategy;
  ////

  private final ReadStatistics readStatistics = new ReadStatistics();
  // lock for state shared between read and pread
  // Note: Never acquire a lock on <this> with this lock held to avoid deadlocks
  //       (it's OK to acquire this lock when the lock on <this> is held)
  private final Object infoLock = new Object();

  /**
   * Track the ByteBuffers that we have handed out to readers.
   *
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
   */
  private IdentityHashStore<ByteBuffer, Object> extendedReadBuffers;

  private synchronized IdentityHashStore<ByteBuffer, Object>
        getExtendedReadBuffers() {
    if (extendedReadBuffers == null) {
      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
    }
    return extendedReadBuffers;
  }

  public static class ReadStatistics {
    public ReadStatistics() {
      clear();
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
    }

    /**
     * @return The total bytes read.  This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read.  This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of zero-copy bytes read.
     */
    public long getTotalZeroCopyBytesRead() {
      return totalZeroCopyBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    void addZeroCopyBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
      this.totalZeroCopyBytesRead += amt;
    }

    void clear() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
      this.totalZeroCopyBytesRead = 0;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;

    private long totalZeroCopyBytesRead;
  }

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation.  That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic, the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;

  /* XXX Use of ConcurrentHashMap is a temporary fix.  Need to fix
   * parallel accesses to DFSInputStream (through preads) properly. */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
      new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();

  private byte[] oneByteBuf; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, boolean verifyChecksum
                 ) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.src = src;
    synchronized (infoLock) {
      this.cachingStrategy = dfsClient.getDefaultReadCachingStrategy();
    }
    openInfo();
  }

  /**
   * Grab the open-file info from the namenode.
   */
  void openInfo() throws IOException, UnresolvedLinkException {
    synchronized(infoLock) {
      lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
      int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
      while (retriesForLastBlockLength > 0) {
        // Getting last block length as -1 is a special case. When the cluster
        // restarts, DNs may not report immediately. At this time partial block
        // locations will not be available with the NN for getting the length.
        // Let's retry a few times to get the length.
        if (lastBlockBeingWrittenLength == -1) {
          DFSClient.LOG.warn("Last block locations not available. "
              + "Datanodes might not have reported blocks completely."
              + " Will retry for " + retriesForLastBlockLength + " times");
279 + " Will retry for " + retriesForLastBlockLength + " times"); 280 waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength); 281 lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength(); 282 } else { 283 break; 284 } 285 retriesForLastBlockLength--; 286 } 287 if (retriesForLastBlockLength == 0) { 288 throw new IOException("Could not obtain the last block locations."); 289 } 290 } 291 } 292 293 private void waitFor(int waitTime) throws IOException { 294 try { 295 Thread.sleep(waitTime); 296 } catch (InterruptedException e) { 297 throw new IOException( 298 "Interrupted while getting the last block length."); 299 } 300 } 301 302 private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException { 303 final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0); 304 if (DFSClient.LOG.isDebugEnabled()) { 305 DFSClient.LOG.debug("newInfo = " + newInfo); 306 } 307 if (newInfo == null) { 308 throw new IOException("Cannot open filename " + src); 309 } 310 311 if (locatedBlocks != null) { 312 Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator(); 313 Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator(); 314 while (oldIter.hasNext() && newIter.hasNext()) { 315 if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) { 316 throw new IOException("Blocklist for " + src + " has changed!"); 317 } 318 } 319 } 320 locatedBlocks = newInfo; 321 long lastBlockBeingWrittenLength = 0; 322 if (!locatedBlocks.isLastBlockComplete()) { 323 final LocatedBlock last = locatedBlocks.getLastLocatedBlock(); 324 if (last != null) { 325 if (last.getLocations().length == 0) { 326 if (last.getBlockSize() == 0) { 327 // if the length is zero, then no data has been written to 328 // datanode. So no need to wait for the locations. 329 return 0; 330 } 331 return -1; 332 } 333 final long len = readBlockLength(last); 334 last.getBlock().setNumBytes(len); 335 lastBlockBeingWrittenLength = len; 336 } 337 } 338 339 fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo(); 340 341 return lastBlockBeingWrittenLength; 342 } 343 344 /** Read the block length from one of the datanodes. */ 345 private long readBlockLength(LocatedBlock locatedblock) throws IOException { 346 assert locatedblock != null : "LocatedBlock cannot be null"; 347 int replicaNotFoundCount = locatedblock.getLocations().length; 348 349 for(DatanodeInfo datanode : locatedblock.getLocations()) { 350 ClientDatanodeProtocol cdp = null; 351 352 try { 353 cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode, 354 dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout, 355 dfsClient.getConf().connectToDnViaHostname, locatedblock); 356 357 final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock()); 358 359 if (n >= 0) { 360 return n; 361 } 362 } 363 catch(IOException ioe) { 364 if (ioe instanceof RemoteException && 365 (((RemoteException) ioe).unwrapRemoteException() instanceof 366 ReplicaNotFoundException)) { 367 // special case : replica might not be on the DN, treat as 0 length 368 replicaNotFoundCount--; 369 } 370 371 if (DFSClient.LOG.isDebugEnabled()) { 372 DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode " 373 + datanode + " for block " + locatedblock.getBlock(), ioe); 374 } 375 } finally { 376 if (cdp != null) { 377 RPC.stopProxy(cdp); 378 } 379 } 380 } 381 382 // Namenode told us about these locations, but none know about the replica 383 // means that we hit the race between pipeline creation start and end. 
    // The namenode told us about these locations, but none of them knows about
    // the replica; this means we hit the race between pipeline creation start
    // and end. We require that every location reports ReplicaNotFound, because
    // some other exception could have happened on a DN that does have the
    // replica, and we want to report that error instead.
    if (replicaNotFoundCount == 0) {
      return 0;
    }

    throw new IOException("Cannot obtain block length for " + locatedblock);
  }

  public long getFileLength() {
    synchronized(infoLock) {
      return locatedBlocks == null? 0:
          locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
    }
  }

  // Short circuit local reads are forbidden for files that are
  // under construction.  See HDFS-2757.
  boolean shortCircuitForbidden() {
    synchronized(infoLock) {
      return locatedBlocks.isUnderConstruction();
    }
  }

  /**
   * Returns the datanode from which the stream is currently reading.
   */
  public synchronized DatanodeInfo getCurrentDatanode() {
    return currentNode;
  }

  /**
   * Returns the block containing the target position.
   */
  synchronized public ExtendedBlock getCurrentBlock() {
    if (currentLocatedBlock == null){
      return null;
    }
    return currentLocatedBlock.getBlock();
  }

  /**
   * Return the collection of blocks that have already been located.
   */
  public List<LocatedBlock> getAllBlocks() throws IOException {
    return getBlockRange(0, getFileLength());
  }

  /**
   * Get the block at the specified position.
   * Fetch it from the namenode if not cached.
   *
   * @param offset block corresponding to this offset in file is returned
   * @return located block
   * @throws IOException
   */
  private LocatedBlock getBlockAt(long offset) throws IOException {
    synchronized(infoLock) {
      assert (locatedBlocks != null) : "locatedBlocks is null";

      final LocatedBlock blk;

      //check offset
      if (offset < 0 || offset >= getFileLength()) {
        throw new IOException("offset < 0 || offset >= getFileLength(), offset="
            + offset
            + ", locatedBlocks=" + locatedBlocks);
      }
      else if (offset >= locatedBlocks.getFileLength()) {
        // offset to the portion of the last block,
        // which is not known to the name-node yet;
        // getting the last block
        blk = locatedBlocks.getLastLocatedBlock();
      }
      else {
        // search cached blocks first
        int targetBlockIdx = locatedBlocks.findBlock(offset);
        if (targetBlockIdx < 0) { // block is not cached
          targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
          // fetch more blocks
          final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
          assert (newBlocks != null) : "Could not find target position " + offset;
          locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
        }
        blk = locatedBlocks.get(targetBlockIdx);
      }
      return blk;
    }
  }

  /** Fetch a block from the namenode and cache it */
  private void fetchBlockAt(long offset) throws IOException {
    synchronized(infoLock) {
      int targetBlockIdx = locatedBlocks.findBlock(offset);
      if (targetBlockIdx < 0) { // block is not cached
        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
      }
      // fetch blocks
      final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
      if (newBlocks == null) {
        throw new IOException("Could not find target position " + offset);
      }
      locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
    }
  }
  /**
   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * will not get a read request beyond the EOF.
   * @param offset starting offset in file
   * @param length length of data
   * @return consecutive segment of located blocks
   * @throws IOException
   */
  private List<LocatedBlock> getBlockRange(long offset,
      long length) throws IOException {
    // getFileLength(): returns total file length
    // locatedBlocks.getFileLength(): returns length of completed blocks
    if (offset >= getFileLength()) {
      throw new IOException("Offset: " + offset +
          " exceeds file length: " + getFileLength());
    }
    synchronized(infoLock) {
      final List<LocatedBlock> blocks;
      final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
      final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
      final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;

      if (readOffsetWithinCompleteBlk) {
        //get the blocks of finalized (completed) block range
        blocks = getFinalizedBlockRange(offset,
            Math.min(length, lengthOfCompleteBlk - offset));
      } else {
        blocks = new ArrayList<LocatedBlock>(1);
      }

      // get the blocks from the incomplete block range
      if (readLengthPastCompleteBlk) {
        blocks.add(locatedBlocks.getLastLocatedBlock());
      }

      return blocks;
    }
  }

  /**
   * Get blocks in the specified range.
   * Includes only the complete blocks.
   * Fetch them from the namenode if not cached.
   */
  private List<LocatedBlock> getFinalizedBlockRange(
      long offset, long length) throws IOException {
    synchronized(infoLock) {
      assert (locatedBlocks != null) : "locatedBlocks is null";
      List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
      // search cached blocks first
      int blockIdx = locatedBlocks.findBlock(offset);
      if (blockIdx < 0) { // block is not cached
        blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
      }
      long remaining = length;
      long curOff = offset;
      while(remaining > 0) {
        LocatedBlock blk = null;
        if(blockIdx < locatedBlocks.locatedBlockCount())
          blk = locatedBlocks.get(blockIdx);
        if (blk == null || curOff < blk.getStartOffset()) {
          LocatedBlocks newBlocks;
          newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
          locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
          continue;
        }
        assert curOff >= blk.getStartOffset() : "Block not found";
        blockRange.add(blk);
        long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
        remaining -= bytesRead;
        curOff += bytesRead;
        blockIdx++;
      }
      return blockRange;
    }
  }

  /**
   * Open a DataInputStream to a DataNode so that it can be read from.
   * We get the block ID and the IDs of the destinations at startup, from the namenode.
   */
  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
    if (target >= getFileLength()) {
      throw new IOException("Attempted to read past end of file");
    }

    // Will be getting a new BlockReader.
    closeCurrentBlockReader();

    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    DatanodeInfo chosenNode = null;
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    boolean connectFailedOnce = false;

    while (true) {
      //
      // Compute desired block
      //
      LocatedBlock targetBlock = getBlockAt(target);

      // update current position
      this.pos = target;
      this.blockEnd = targetBlock.getStartOffset() +
          targetBlock.getBlockSize() - 1;
      this.currentLocatedBlock = targetBlock;

      assert (target==pos) : "Wrong position " + pos + " expect " + target;
      long offsetIntoBlock = target - targetBlock.getStartOffset();

      DNAddrPair retval = chooseDataNode(targetBlock, null);
      chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;
      StorageType storageType = retval.storageType;

      try {
        ExtendedBlock blk = targetBlock.getBlock();
        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
        CachingStrategy curCachingStrategy;
        boolean shortCircuitForbidden;
        synchronized(infoLock) {
          curCachingStrategy = cachingStrategy;
          shortCircuitForbidden = shortCircuitForbidden();
        }
        blockReader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setStorageType(storageType).
            setFileName(src).
            setBlock(blk).
            setBlockToken(accessToken).
            setStartOffset(offsetIntoBlock).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(blk.getNumBytes() - offsetIntoBlock).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(!shortCircuitForbidden).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        if(connectFailedOnce) {
          DFSClient.LOG.info("Successfully connected to " + targetAddr +
                             " for " + blk);
        }
        return chosenNode;
      } catch (IOException ex) {
        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + ex);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
          refetchToken--;
          fetchBlockAt(target);
        } else {
          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
              + ", add to deadNodes and continue. " + ex, ex);
          // Put chosen node into dead list, continue
          addToDeadNodes(chosenNode);
        }
      }
    }
  }
  /**
   * Close it down!
   */
  @Override
  public synchronized void close() throws IOException {
    if (!closed.compareAndSet(false, true)) {
      DFSClient.LOG.warn("DFSInputStream has been closed already");
      return;
    }
    dfsClient.checkOpen();

    if ((extendedReadBuffers != null) && (!extendedReadBuffers.isEmpty())) {
      final StringBuilder builder = new StringBuilder();
      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
        private String prefix = "";
        @Override
        public void accept(ByteBuffer k, Object v) {
          builder.append(prefix).append(k);
          prefix = ", ";
        }
      });
      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
          "unreleased ByteBuffers allocated by read(). " +
          "Please release " + builder.toString() + ".");
    }
    closeCurrentBlockReader();
    super.close();
  }

  @Override
  public synchronized int read() throws IOException {
    if (oneByteBuf == null) {
      oneByteBuf = new byte[1];
    }
    int ret = read( oneByteBuf, 0, 1 );
    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
  }

  /**
   * Wraps different possible read implementations so that readBuffer can be
   * strategy-agnostic.
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len)
        throws ChecksumException, IOException;
  }

  private void updateReadStatistics(ReadStatistics readStatistics,
      int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    synchronized(infoLock) {
      if (blockReader.isShortCircuit()) {
        readStatistics.addShortCircuitBytes(nRead);
      } else if (blockReader.isLocal()) {
        readStatistics.addLocalBytes(nRead);
      } else {
        readStatistics.addRemoteBytes(nRead);
      }
    }
  }

  /**
   * Used to read bytes into a byte[]
   */
  private class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len)
        throws ChecksumException, IOException {
      int nRead = blockReader.read(buf, off, len);
      updateReadStatistics(readStatistics, nRead, blockReader);
      return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer
   */
  private class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len)
        throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }

  /* This is used by the regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* We retry the current node only once. So this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or client: the DataNode closing the connection
     * because the client is idle. If there are other cases of "non-errors",
     * then a datanode might be retried by setting this to true again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
      try {
        return reader.doRead(blockReader, off, len);
      } catch ( ChecksumException ce ) {
        DFSClient.LOG.warn("Found Checksum error for "
            + getCurrentBlock() + " from " + currentNode
            + " at " + ce.getPos());
        ioe = ce;
        retryCurrentNode = false;
        // we want to remember which block replicas we have tried
        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
            corruptedBlockMap);
      } catch ( IOException e ) {
        if (!retryCurrentNode) {
          DFSClient.LOG.warn("Exception while reading from "
              + getCurrentBlock() + " of " + src + " from "
              + currentNode, e);
        }
        ioe = e;
      }
      boolean sourceFound = false;
      if (retryCurrentNode) {
        /* possibly retry the same node so that transient errors don't
         * result in application level failures (e.g. Datanode could have
         * closed the connection because the client is idle for too long).
         */
        sourceFound = seekToBlockSource(pos);
      } else {
        addToDeadNodes(currentNode);
        sourceFound = seekToNewSource(pos);
      }
      if (!sourceFound) {
        throw ioe;
      }
      retryCurrentNode = false;
    }
  }

  private synchronized int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
    dfsClient.checkOpen();
    if (closed.get()) {
      throw new IOException("Stream closed");
    }
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    failures = 0;
    if (pos < getFileLength()) {
      int retries = 2;
      while (retries > 0) {
        try {
          // currentNode can be left as null if previous read had a checksum
          // error on the same block. See HDFS-3067
          if (pos > blockEnd || currentNode == null) {
            currentNode = blockSeekTo(pos);
          }
          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
          synchronized(infoLock) {
            if (locatedBlocks.isLastBlockComplete()) {
              realLen = (int) Math.min(realLen,
                  locatedBlocks.getFileLength() - pos);
            }
          }
          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);

          if (result >= 0) {
            pos += result;
          } else {
            // got an EOS from the reader though we expect more data from it.
            throw new IOException("Unexpected EOS from the reader");
          }
          if (dfsClient.stats != null) {
            dfsClient.stats.incrementBytesRead(result);
          }
          return result;
        } catch (ChecksumException ce) {
          throw ce;
        } catch (IOException e) {
          if (retries == 1) {
            DFSClient.LOG.warn("DFS Read", e);
          }
          blockEnd = -1;
          if (currentNode != null) { addToDeadNodes(currentNode); }
          if (--retries == 0) {
            throw e;
          }
        } finally {
          // Check whether we need to report block replica corruption, both when
          // the read was successful and when a ChecksumException occurred.
          reportCheckSumFailure(corruptedBlockMap,
              currentLocatedBlock.getLocations().length);
        }
      }
    }
    return -1;
  }

  /**
   * Read the entire buffer.
   */
  @Override
  public synchronized int read(final byte buf[], int off, int len) throws IOException {
    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
    TraceScope scope =
        dfsClient.getPathTraceScope("DFSInputStream#byteArrayRead", src);
    try {
      return readWithStrategy(byteArrayReader, off, len);
    } finally {
      scope.close();
    }
  }

  @Override
  public synchronized int read(final ByteBuffer buf) throws IOException {
    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
    TraceScope scope =
        dfsClient.getPathTraceScope("DFSInputStream#byteBufferRead", src);
    try {
      return readWithStrategy(byteBufferReader, 0, buf.remaining());
    } finally {
      scope.close();
    }
  }

  /**
   * Add a corrupted block replica into the map.
   */
  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    Set<DatanodeInfo> dnSet = null;
    if((corruptedBlockMap.containsKey(blk))) {
      dnSet = corruptedBlockMap.get(blk);
    }else {
      dnSet = new HashSet<DatanodeInfo>();
    }
    if (!dnSet.contains(node)) {
      dnSet.add(node);
      corruptedBlockMap.put(blk, dnSet);
    }
  }

  private DNAddrPair chooseDataNode(LocatedBlock block,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    while (true) {
      try {
        return getBestNodeDNAddrPair(block, ignoredNodes);
      } catch (IOException ie) {
        String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
            deadNodes, ignoredNodes);
        String blockInfo = block.getBlock() + " file=" + src;
        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
          String description = "Could not obtain block: " + blockInfo;
          DFSClient.LOG.warn(description + errMsg
              + ". Throwing a BlockMissingException");
          throw new BlockMissingException(src, description,
              block.getStartOffset());
        }

        DatanodeInfo[] nodes = block.getLocations();
        if (nodes == null || nodes.length == 0) {
          DFSClient.LOG.info("No node available for " + blockInfo);
        }
        DFSClient.LOG.info("Could not obtain " + block.getBlock()
            + " from any node: " + ie + errMsg
            + ". Will get new block locations from namenode and retry...");
        try {
          // Introducing a random factor to the wait time before another retry.
          // The wait time is dependent on the # of failures and a random factor.
          // On first getting a BlockMissingException, the wait time is a random
          // number between 0..3000 ms. If the first retry still fails, we wait
          // a 3000 ms grace period before the 2nd retry, and the waiting window
          // is expanded to 6000 ms to alleviate the request rate to the server.
          // Similarly, the 3rd retry waits a 6000 ms grace period before retry
          // and the waiting window is expanded to 9000 ms.
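          // A worked illustration of the formula below (assuming the default
          // timeWindow of 3000 ms; the actual value comes from configuration):
          //   failures == 0:  wait is uniform in [0, 3000) ms
          //   failures == 1:  wait is 3000 ms + uniform in [0, 6000) ms
          //   failures == 2:  wait is 6000 ms + uniform in [0, 9000) ms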
          final int timeWindow = dfsClient.getConf().timeWindow;
          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
              timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
          Thread.sleep((long)waitTime);
        } catch (InterruptedException iex) {
        }
        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
        openInfo();
        block = getBlockAt(block.getStartOffset());
        failures++;
        continue;
      }
    }
  }

  /**
   * Get the best node from which to stream the data.
   * @param block LocatedBlock, containing nodes in priority order.
   * @param ignoredNodes Do not choose nodes in this array (may be null)
   * @return The DNAddrPair of the best node.
   * @throws IOException
   */
  private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    DatanodeInfo[] nodes = block.getLocations();
    StorageType[] storageTypes = block.getStorageTypes();
    DatanodeInfo chosenNode = null;
    StorageType storageType = null;
    if (nodes != null) {
      for (int i = 0; i < nodes.length; i++) {
        if (!deadNodes.containsKey(nodes[i])
            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
          chosenNode = nodes[i];
          // Storage types are ordered to correspond with nodes, so use the same
          // index to get storage type.
          if (storageTypes != null && i < storageTypes.length) {
            storageType = storageTypes[i];
          }
          break;
        }
      }
    }
    if (chosenNode == null) {
      throw new IOException("No live nodes contain block " + block.getBlock() +
          " after checking nodes = " + Arrays.toString(nodes) +
          ", ignoredNodes = " + ignoredNodes);
    }
    final String dnAddr =
        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
    }
    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
    return new DNAddrPair(chosenNode, targetAddr, storageType);
  }

  private static String getBestNodeDNAddrPairErrorString(
      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
    StringBuilder errMsgr = new StringBuilder(
        " No live nodes contain current block ");
    errMsgr.append("Block locations:");
    for (DatanodeInfo datanode : nodes) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    errMsgr.append(" Dead nodes: ");
    for (DatanodeInfo datanode : deadNodes.keySet()) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    if (ignoredNodes != null) {
      errMsgr.append(" Ignored nodes: ");
      for (DatanodeInfo datanode : ignoredNodes) {
        errMsgr.append(" ");
        errMsgr.append(datanode.toString());
      }
    }
    return errMsgr.toString();
  }

  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
      byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    block = getBlockAt(block.getStartOffset());
    while (true) {
      DNAddrPair addressPair = chooseDataNode(block, null);
      try {
        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
            corruptedBlockMap);
        return;
      } catch (IOException e) {
        // Ignore. Already processed inside the function.
        // Loop through to try the next node.
      }
    }
  }

  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
      final LocatedBlock block, final long start, final long end,
      final ByteBuffer bb,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      final int hedgedReadId) {
    final Span parentSpan = Trace.currentSpan();
    return new Callable<ByteBuffer>() {
      @Override
      public ByteBuffer call() throws Exception {
        byte[] buf = bb.array();
        int offset = bb.position();
        TraceScope scope =
            Trace.startSpan("hedgedRead" + hedgedReadId, parentSpan);
        try {
          actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
              corruptedBlockMap);
          return bb;
        } finally {
          scope.close();
        }
      }
    };
  }

  private void actualGetFromOneDataNode(final DNAddrPair datanode,
      LocatedBlock block, final long start, final long end, byte[] buf,
      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    DFSClientFaultInjector.get().startFetchFromDatanode();
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    while (true) {
      // cached block locations may have been updated by chooseDataNode()
      // or fetchBlockAt(). Always get the latest list of locations at the
      // start of the loop.
      CachingStrategy curCachingStrategy;
      boolean allowShortCircuitLocalReads;
      block = getBlockAt(block.getStartOffset());
      synchronized(infoLock) {
        curCachingStrategy = cachingStrategy;
        allowShortCircuitLocalReads = !shortCircuitForbidden();
      }
      DatanodeInfo chosenNode = datanode.info;
      InetSocketAddress targetAddr = datanode.addr;
      StorageType storageType = datanode.storageType;
      BlockReader reader = null;

      try {
        DFSClientFaultInjector.get().fetchFromDatanodeException();
        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
        int len = (int) (end - start + 1);
        reader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setStorageType(storageType).
            setFileName(src).
            setBlock(block.getBlock()).
            setBlockToken(blockToken).
            setStartOffset(start).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(len).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        int nread = reader.readAll(buf, offset, len);
        updateReadStatistics(readStatistics, nread, reader);

        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
              "expected " + len + ", got " + nread);
        }
        DFSClientFaultInjector.get().readFromDatanodeDelay();
        return;
      } catch (ChecksumException e) {
        String msg = "fetchBlockByteRange(). Got a checksum exception for "
            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
            + chosenNode;
        DFSClient.LOG.warn(msg);
        // we want to remember what we have tried
        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
        addToDeadNodes(chosenNode);
        throw new IOException(msg);
      } catch (IOException e) {
        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + e);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
          continue;
        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
          refetchToken--;
          try {
            fetchBlockAt(block.getStartOffset());
          } catch (IOException fbae) {
            // ignore IOE, since we can retry it later in a loop
          }
          continue;
        } else {
          String msg = "Failed to connect to " + targetAddr + " for file "
              + src + " for block " + block.getBlock() + ":" + e;
          DFSClient.LOG.warn("Connection failure: " + msg, e);
          addToDeadNodes(chosenNode);
          throw new IOException(msg);
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  }

  /**
   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time. We then wait on whichever read returns first.
   */
  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
      long end, byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
    CompletionService<ByteBuffer> hedgedService =
        new ExecutorCompletionService<ByteBuffer>(
            dfsClient.getHedgedReadsThreadPool());
    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
    ByteBuffer bb = null;
    int len = (int) (end - start + 1);
    int hedgedReadId = 0;
    block = getBlockAt(block.getStartOffset());
    while (true) {
      // see HDFS-6591, this metric is used to verify/catch unnecessary loops
      hedgedReadOpsLoopNumForTesting++;
      DNAddrPair chosenNode = null;
      // there is no request already executing.
      if (futures.isEmpty()) {
        // chooseDataNode is a commitment. If no node, we go to
        // the NN to reget block locations. Only go here on first read.
        chosenNode = chooseDataNode(block, ignored);
        bb = ByteBuffer.wrap(buf, offset, len);
        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
            chosenNode, block, start, end, bb, corruptedBlockMap,
            hedgedReadId++);
        Future<ByteBuffer> firstRequest = hedgedService
            .submit(getFromDataNodeCallable);
        futures.add(firstRequest);
        try {
          Future<ByteBuffer> future = hedgedService.poll(
              dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
          if (future != null) {
            future.get();
            return;
          }
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout()
                + "ms to read from " + chosenNode.info
                + "; spawning hedged read");
          }
          // Ignore this node on next go around.
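          // (Adding the node to 'ignored' means the hedged request spawned on
          // the next iteration is directed at a different replica, when one is
          // available.)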
          ignored.add(chosenNode.info);
          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          continue; // no need to refresh block locations
        } catch (InterruptedException e) {
          // Ignore
        } catch (ExecutionException e) {
          // Ignore already logged in the call.
        }
      } else {
        // We are starting up a 'hedged' read. We have a read already
        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
        // If no nodes to do hedged reads against, pass.
        try {
          try {
            chosenNode = getBestNodeDNAddrPair(block, ignored);
          } catch (IOException ioe) {
            chosenNode = chooseDataNode(block, ignored);
          }
          bb = ByteBuffer.allocate(len);
          Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
              chosenNode, block, start, end, bb, corruptedBlockMap,
              hedgedReadId++);
          Future<ByteBuffer> oneMoreRequest = hedgedService
              .submit(getFromDataNodeCallable);
          futures.add(oneMoreRequest);
        } catch (IOException ioe) {
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Failed getting node for hedged read: "
                + ioe.getMessage());
          }
        }
        // if not succeeded. Submit callables for each datanode in a loop, wait
        // for a fixed interval and get the result from the fastest one.
        try {
          ByteBuffer result = getFirstToComplete(hedgedService, futures);
          // cancel the rest.
          cancelAll(futures);
          if (result.array() != buf) { // compare the array pointers
            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
            System.arraycopy(result.array(), result.position(), buf, offset,
                len);
          } else {
            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          }
          return;
        } catch (InterruptedException ie) {
          // Ignore and retry
        }
        // We got here if exception. Ignore this node on next go around IFF
        // we found a chosenNode to hedge read against.
        if (chosenNode != null && chosenNode.info != null) {
          ignored.add(chosenNode.info);
        }
      }
    }
  }

  @VisibleForTesting
  public long getHedgedReadOpsLoopNumForTesting() {
    return hedgedReadOpsLoopNumForTesting;
  }

  private ByteBuffer getFirstToComplete(
      CompletionService<ByteBuffer> hedgedService,
      ArrayList<Future<ByteBuffer>> futures) throws InterruptedException {
    if (futures.isEmpty()) {
      throw new InterruptedException("let's retry");
    }
    Future<ByteBuffer> future = null;
    try {
      future = hedgedService.take();
      ByteBuffer bb = future.get();
      futures.remove(future);
      return bb;
    } catch (ExecutionException e) {
      // already logged in the Callable
      futures.remove(future);
    } catch (CancellationException ce) {
      // already logged in the Callable
      futures.remove(future);
    }

    throw new InterruptedException("let's retry");
  }

  private void cancelAll(List<Future<ByteBuffer>> futures) {
    for (Future<ByteBuffer> future : futures) {
      // Unfortunately, hdfs reads do not take kindly to interruption.
      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid pbs -- likely because the read
      // is interrupted before it gets the whole pb. Also verbose WARN
      // logging. So, for now, do not interrupt the running read.
      future.cancel(false);
    }
  }

  /**
   * Should the block access token be refetched on an exception?
   *
   * @param ex Exception received
   * @param targetAddr Target datanode address from where exception was received
   * @return true if the block access token has expired or is invalid and it
   *         should be refetched
   */
  private static boolean tokenRefetchNeeded(IOException ex,
      InetSocketAddress targetAddr) {
    /*
     * Get a new access token and retry. Retry is needed in 2 cases. 1)
     * When both NN and DN re-started while DFSClient holding a cached
     * access token. 2) In the case that NN fails to update its
     * access key at pre-set interval (by a wide margin) and
     * subsequently restarts. In this case, DN re-registers itself with
     * NN and receives a new access key, but DN will delete the old
     * access key from its memory since it's considered expired based on
     * the estimated expiration date.
     */
    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
      DFSClient.LOG.info("Access token was invalid when connecting to "
          + targetAddr + " : " + ex);
      return true;
    }
    return false;
  }

  /**
   * Read bytes starting from the specified position.
   *
   * @param position start read from this position
   * @param buffer read buffer
   * @param offset offset into buffer
   * @param length number of bytes to read
   *
   * @return actual number of bytes read
   */
  @Override
  public int read(long position, byte[] buffer, int offset, int length)
      throws IOException {
    TraceScope scope =
        dfsClient.getPathTraceScope("DFSInputStream#byteArrayPread", src);
    try {
      return pread(position, buffer, offset, length);
    } finally {
      scope.close();
    }
  }

  private int pread(long position, byte[] buffer, int offset, int length)
      throws IOException {
    // sanity checks
    dfsClient.checkOpen();
    if (closed.get()) {
      throw new IOException("Stream closed");
    }
    failures = 0;
    long filelen = getFileLength();
    if ((position < 0) || (position >= filelen)) {
      return -1;
    }
    int realLen = length;
    if ((position + length) > filelen) {
      realLen = (int)(filelen - position);
    }

    // determine the block and byte range within the block
    // corresponding to position and realLen
    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
    int remaining = realLen;
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
        = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    for (LocatedBlock blk : blockRange) {
      long targetStart = position - blk.getStartOffset();
      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
      try {
        if (dfsClient.isHedgedReadsEnabled()) {
          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
              - 1, buffer, offset, corruptedBlockMap);
        } else {
          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
              buffer, offset, corruptedBlockMap);
        }
      } finally {
        // Check and report if any block replicas are corrupted.
        // BlockMissingException may be caught if all block replicas are
        // corrupted.
        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
      }

      remaining -= bytesToRead;
      position += bytesToRead;
      offset += bytesToRead;
    }
    assert remaining == 0 : "Wrong number of bytes read.";
    if (dfsClient.stats != null) {
      dfsClient.stats.incrementBytesRead(realLen);
    }
    return realLen;
  }

  /**
   * DFSInputStream reports checksum failure.
   * Case I : the client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted block
   * replicas to the namenode.
   * Case II: the client has tried all data nodes and all failed. We only
   * report if the total number of replicas is 1. We do not report otherwise,
   * since the failure may be due to the client itself being unable to read.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that contain the block replicas
   */
  private void reportCheckSumFailure(
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      int dataNodeCount) {
    if (corruptedBlockMap.isEmpty()) {
      return;
    }
    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
        .entrySet().iterator();
    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
    ExtendedBlock blk = entry.getKey();
    Set<DatanodeInfo> dnSet = entry.getValue();
    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
      int i = 0;
      for (DatanodeInfo dn:dnSet) {
        locs[i++] = dn;
      }
      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
      dfsClient.reportChecksumFailure(src, lblocks);
    }
    corruptedBlockMap.clear();
  }

  @Override
  public long skip(long n) throws IOException {
    if ( n > 0 ) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if( n+curPos > fileLen ) {
        n = fileLen - curPos;
      }
      seek(curPos+n);
      return n;
    }
    return n < 0 ? -1 : 0;
  }

  /**
   * Seek to a new arbitrary location.
   */
  @Override
  public synchronized void seek(long targetPos) throws IOException {
    if (targetPos > getFileLength()) {
      throw new EOFException("Cannot seek after EOF");
    }
    if (targetPos < 0) {
      throw new EOFException("Cannot seek to negative offset");
    }
    if (closed.get()) {
      throw new IOException("Stream is closed!");
    }
    boolean done = false;
    if (pos <= targetPos && targetPos <= blockEnd) {
      //
      // If this seek is to a positive position in the current
      // block, and this piece of data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
      //
      int diff = (int)(targetPos - pos);
      if (diff <= blockReader.available()) {
        try {
          pos += blockReader.skip(diff);
          if (pos == targetPos) {
            done = true;
          } else {
            // The range was already checked. If the block reader returns
            // something unexpected instead of throwing an exception, it is
            // most likely a bug.
            String errMsg = "BlockReader failed to seek to " +
                targetPos + ". Instead, it seeked to " + pos + ".";
            DFSClient.LOG.warn(errMsg);
            throw new IOException(errMsg);
          }
        } catch (IOException e) {//make following read to retry
          if(DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Exception while seek to " + targetPos
                + " from " + getCurrentBlock() + " of " + src + " from "
                + currentNode, e);
          }
        }
      }
    }
    if (!done) {
      pos = targetPos;
      blockEnd = -1;
    }
  }

  /**
   * Same as {@link #seekToNewSource(long)} except that it does not exclude
   * the current datanode and might connect to the same node.
   */
  private boolean seekToBlockSource(long targetPos)
      throws IOException {
    currentNode = blockSeekTo(targetPos);
    return true;
  }

  /**
   * Seek to given position on a node other than the current node.  If
   * a node other than the current node is found, then returns true.
   * If another node could not be found, then returns false.
   */
  @Override
  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
    boolean markedDead = deadNodes.containsKey(currentNode);
    addToDeadNodes(currentNode);
    DatanodeInfo oldNode = currentNode;
    DatanodeInfo newNode = blockSeekTo(targetPos);
    if (!markedDead) {
      /* remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's ok. */
      deadNodes.remove(oldNode);
    }
    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
      currentNode = newNode;
      return true;
    } else {
      return false;
    }
  }

  /**
   */
  @Override
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /** Return the size of the remaining available bytes
   * if the size is less than or equal to {@link Integer#MAX_VALUE},
   * otherwise, return {@link Integer#MAX_VALUE}.
   */
  @Override
  public synchronized int available() throws IOException {
    if (closed.get()) {
      throw new IOException("Stream closed");
    }

    final long remaining = getFileLength() - pos;
    return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
  }

  /**
   * We definitely don't support marks.
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  @Override
  public void mark(int readLimit) {
  }

  @Override
  public void reset() throws IOException {
    throw new IOException("Mark/reset not supported");
  }

  /** Utility class to encapsulate data node info and its address. */
  private static final class DNAddrPair {
    final DatanodeInfo info;
    final InetSocketAddress addr;
    final StorageType storageType;

    DNAddrPair(DatanodeInfo info, InetSocketAddress addr,
        StorageType storageType) {
      this.info = info;
      this.addr = addr;
      this.storageType = storageType;
    }
  }

  /**
   * Get statistics about the reads which this DFSInputStream has done.
   */
  public ReadStatistics getReadStatistics() {
    synchronized(infoLock) {
      return new ReadStatistics(readStatistics);
    }
  }

  /**
   * Clear statistics about the reads which this DFSInputStream has done.
1628 */ 1629 public void clearReadStatistics() { 1630 synchronized(infoLock) { 1631 readStatistics.clear(); 1632 } 1633 } 1634 1635 public FileEncryptionInfo getFileEncryptionInfo() { 1636 synchronized(infoLock) { 1637 return fileEncryptionInfo; 1638 } 1639 } 1640 1641 private void closeCurrentBlockReader() { 1642 if (blockReader == null) return; 1643 // Close the current block reader so that the new caching settings can 1644 // take effect immediately. 1645 try { 1646 blockReader.close(); 1647 } catch (IOException e) { 1648 DFSClient.LOG.error("error closing blockReader", e); 1649 } 1650 blockReader = null; 1651 blockEnd = -1; 1652 } 1653 1654 @Override 1655 public synchronized void setReadahead(Long readahead) 1656 throws IOException { 1657 synchronized (infoLock) { 1658 this.cachingStrategy = 1659 new CachingStrategy.Builder(this.cachingStrategy).setReadahead(readahead).build(); 1660 } 1661 closeCurrentBlockReader(); 1662 } 1663 1664 @Override 1665 public synchronized void setDropBehind(Boolean dropBehind) 1666 throws IOException { 1667 synchronized (infoLock) { 1668 this.cachingStrategy = 1669 new CachingStrategy.Builder(this.cachingStrategy).setDropBehind(dropBehind).build(); 1670 } 1671 closeCurrentBlockReader(); 1672 } 1673 1674 /** 1675 * The immutable empty buffer we return when we reach EOF when doing a 1676 * zero-copy read. 1677 */ 1678 private static final ByteBuffer EMPTY_BUFFER = 1679 ByteBuffer.allocateDirect(0).asReadOnlyBuffer(); 1680 1681 @Override 1682 public synchronized ByteBuffer read(ByteBufferPool bufferPool, 1683 int maxLength, EnumSet<ReadOption> opts) 1684 throws IOException, UnsupportedOperationException { 1685 if (maxLength == 0) { 1686 return EMPTY_BUFFER; 1687 } else if (maxLength < 0) { 1688 throw new IllegalArgumentException("can't read a negative " + 1689 "number of bytes."); 1690 } 1691 if ((blockReader == null) || (blockEnd == -1)) { 1692 if (pos >= getFileLength()) { 1693 return null; 1694 } 1695 /* 1696 * If we don't have a blockReader, or the one we have has no more bytes 1697 * left to read, we call seekToBlockSource to get a new blockReader and 1698 * recalculate blockEnd. Note that we assume we're not at EOF here 1699 * (we check this above). 1700 */ 1701 if ((!seekToBlockSource(pos)) || (blockReader == null)) { 1702 throw new IOException("failed to allocate new BlockReader " + 1703 "at position " + pos); 1704 } 1705 } 1706 ByteBuffer buffer = null; 1707 if (dfsClient.getConf().shortCircuitMmapEnabled) { 1708 buffer = tryReadZeroCopy(maxLength, opts); 1709 } 1710 if (buffer != null) { 1711 return buffer; 1712 } 1713 buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength); 1714 if (buffer != null) { 1715 getExtendedReadBuffers().put(buffer, bufferPool); 1716 } 1717 return buffer; 1718 } 1719 1720 private synchronized ByteBuffer tryReadZeroCopy(int maxLength, 1721 EnumSet<ReadOption> opts) throws IOException { 1722 // Copy 'pos' and 'blockEnd' to local variables to make it easier for the 1723 // JVM to optimize this function. 1724 final long curPos = pos; 1725 final long curEnd = blockEnd; 1726 final long blockStartInFile = currentLocatedBlock.getStartOffset(); 1727 final long blockPos = curPos - blockStartInFile; 1728 1729 // Shorten this read if the end of the block is nearby. 
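// For example (hypothetical numbers, not taken from the source): with curPos = 100,
// curEnd = 149 and maxLength = 200, the request would run past the block, so it is
// clamped to 1 + curEnd - curPos = 50 bytes; if that clamped value were <= 0 there
// would be nothing left in the block to map, and the zero-copy attempt returns null.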
1730 long length63; 1731 if ((curPos + maxLength) <= (curEnd + 1)) { 1732 length63 = maxLength; 1733 } else { 1734 length63 = 1 + curEnd - curPos; 1735 if (length63 <= 0) { 1736 if (DFSClient.LOG.isDebugEnabled()) { 1737 DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " + 1738 curPos + " of " + src + "; " + length63 + " bytes left in block. " + 1739 "blockPos=" + blockPos + "; curPos=" + curPos + 1740 "; curEnd=" + curEnd); 1741 } 1742 return null; 1743 } 1744 if (DFSClient.LOG.isDebugEnabled()) { 1745 DFSClient.LOG.debug("Reducing read length from " + maxLength + 1746 " to " + length63 + " to avoid going more than one byte " + 1747 "past the end of the block. blockPos=" + blockPos + 1748 "; curPos=" + curPos + "; curEnd=" + curEnd); 1749 } 1750 } 1751 // Make sure that we don't go beyond 31-bit offsets in the MappedByteBuffer. 1752 int length; 1753 if (blockPos + length63 <= Integer.MAX_VALUE) { 1754 length = (int)length63; 1755 } else { 1756 long length31 = Integer.MAX_VALUE - blockPos; 1757 if (length31 <= 0) { 1758 // Java ByteBuffers can't be longer than 2 GB, because they use 1759 // 4-byte signed integers to represent capacity, etc. 1760 // So we can't mmap the parts of the block higher than the 2 GB offset. 1761 // FIXME: we could work around this with multiple memory maps. 1762 // See HDFS-5101. 1763 if (DFSClient.LOG.isDebugEnabled()) { 1764 DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " + 1765 curPos + " of " + src + "; 31-bit MappedByteBuffer limit " + 1766 "exceeded. blockPos=" + blockPos + ", curEnd=" + curEnd); 1767 } 1768 return null; 1769 } 1770 length = (int)length31; 1771 if (DFSClient.LOG.isDebugEnabled()) { 1772 DFSClient.LOG.debug("Reducing read length from " + maxLength + 1773 " to " + length + " to avoid 31-bit limit. " + 1774 "blockPos=" + blockPos + "; curPos=" + curPos + 1775 "; curEnd=" + curEnd); 1776 } 1777 } 1778 final ClientMmap clientMmap = blockReader.getClientMmap(opts); 1779 if (clientMmap == null) { 1780 if (DFSClient.LOG.isDebugEnabled()) { 1781 DFSClient.LOG.debug("unable to perform a zero-copy read from offset " + 1782 curPos + " of " + src + "; BlockReader#getClientMmap returned " + 1783 "null."); 1784 } 1785 return null; 1786 } 1787 boolean success = false; 1788 ByteBuffer buffer; 1789 try { 1790 seek(curPos + length); 1791 buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer(); 1792 buffer.position((int)blockPos); 1793 buffer.limit((int)(blockPos + length)); 1794 getExtendedReadBuffers().put(buffer, clientMmap); 1795 synchronized (infoLock) { 1796 readStatistics.addZeroCopyBytes(length); 1797 } 1798 if (DFSClient.LOG.isDebugEnabled()) { 1799 DFSClient.LOG.debug("readZeroCopy read " + length + 1800 " bytes from offset " + curPos + " via the zero-copy read " + 1801 "path. 
blockEnd = " + blockEnd); 1802 } 1803 success = true; 1804 } finally { 1805 if (!success) { 1806 IOUtils.closeQuietly(clientMmap); 1807 } 1808 } 1809 return buffer; 1810 } 1811 1812 @Override 1813 public synchronized void releaseBuffer(ByteBuffer buffer) { 1814 if (buffer == EMPTY_BUFFER) return; 1815 Object val = getExtendedReadBuffers().remove(buffer); 1816 if (val == null) { 1817 throw new IllegalArgumentException("tried to release a buffer " + 1818 "that was not created by this stream, " + buffer); 1819 } 1820 if (val instanceof ClientMmap) { 1821 IOUtils.closeQuietly((ClientMmap)val); 1822 } else if (val instanceof ByteBufferPool) { 1823 ((ByteBufferPool)val).putBuffer(buffer); 1824 } 1825 } 1826 1827 @Override 1828 public synchronized void unbuffer() { 1829 closeCurrentBlockReader(); 1830 } 1831}
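/*
 * Editor's sketch (illustrative only, not part of the original class): one way a
 * caller might exercise the enhanced byte-buffer read path implemented above,
 * going through FSDataInputStream, which delegates to DFSInputStream's
 * read(ByteBufferPool, int, EnumSet<ReadOption>) and releaseBuffer(ByteBuffer).
 * The file path, the pool choice and the 1 MB request size are assumptions made
 * for the example; the point it shows is that every non-null buffer returned by
 * read(...) must be handed back via releaseBuffer(...) so the underlying mmap or
 * pooled buffer is released.
 *
 *   // fs: an already-opened FileSystem (e.g. DistributedFileSystem) instance
 *   FSDataInputStream in = fs.open(new Path("/tmp/example"));   // hypothetical file
 *   ByteBufferPool pool = new ElasticByteBufferPool();
 *   try {
 *     ByteBuffer buf;
 *     // read(...) returns null at EOF, and falls back to a pooled copy when a
 *     // zero-copy mmap is not possible (see read(...) and tryReadZeroCopy above).
 *     while ((buf = in.read(pool, 1024 * 1024,
 *         EnumSet.of(ReadOption.SKIP_CHECKSUMS))) != null) {
 *       consume(buf);                  // hypothetical caller-side processing
 *       in.releaseBuffer(buf);         // hands the mmap or pooled buffer back
 *     }
 *   } finally {
 *     in.close();
 *   }
 */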
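/*
 * Editor's sketch (illustrative only): reading back the per-stream counters kept
 * by getReadStatistics()/clearReadStatistics() above. It assumes the stream
 * returned by DistributedFileSystem#open can be cast to HdfsDataInputStream,
 * which exposes the ReadStatistics of the wrapped DFSInputStream; everything
 * other than the ReadStatistics getters shown here is an assumption.
 *
 *   HdfsDataInputStream in = (HdfsDataInputStream) fs.open(new Path("/tmp/example"));
 *   // ... perform some stateful or positional reads ...
 *   DFSInputStream.ReadStatistics stats = in.getReadStatistics();
 *   long total        = stats.getTotalBytesRead();
 *   long local        = stats.getTotalLocalBytesRead();
 *   long shortCircuit = stats.getTotalShortCircuitBytesRead();
 *   long zeroCopy     = stats.getTotalZeroCopyBytesRead();
 */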