001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs; 019 020import java.io.EOFException; 021import java.io.IOException; 022import java.net.InetSocketAddress; 023import java.nio.ByteBuffer; 024import java.util.AbstractMap; 025import java.util.ArrayList; 026import java.util.Arrays; 027import java.util.Collection; 028import java.util.EnumSet; 029import java.util.HashMap; 030import java.util.HashSet; 031import java.util.Iterator; 032import java.util.LinkedList; 033import java.util.List; 034import java.util.Map; 035import java.util.Map.Entry; 036import java.util.Set; 037import java.util.concurrent.Callable; 038import java.util.concurrent.CancellationException; 039import java.util.concurrent.CompletionService; 040import java.util.concurrent.ConcurrentHashMap; 041import java.util.concurrent.ExecutionException; 042import java.util.concurrent.ExecutorCompletionService; 043import java.util.concurrent.Future; 044import java.util.concurrent.TimeUnit; 045import java.util.concurrent.atomic.AtomicBoolean; 046 047import org.apache.commons.io.IOUtils; 048import org.apache.hadoop.classification.InterfaceAudience; 049import org.apache.hadoop.fs.ByteBufferReadable; 050import org.apache.hadoop.fs.ByteBufferUtil; 051import org.apache.hadoop.fs.CanSetDropBehind; 052import org.apache.hadoop.fs.CanSetReadahead; 053import org.apache.hadoop.fs.CanUnbuffer; 054import org.apache.hadoop.fs.ChecksumException; 055import org.apache.hadoop.fs.FSInputStream; 056import org.apache.hadoop.fs.HasEnhancedByteBufferAccess; 057import org.apache.hadoop.fs.ReadOption; 058import org.apache.hadoop.fs.StorageType; 059import org.apache.hadoop.fs.UnresolvedLinkException; 060import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; 061import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 062import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 063import org.apache.hadoop.fs.FileEncryptionInfo; 064import org.apache.hadoop.hdfs.protocol.LocatedBlock; 065import org.apache.hadoop.hdfs.protocol.LocatedBlocks; 066import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException; 067import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; 068import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException; 069import org.apache.hadoop.hdfs.server.datanode.CachingStrategy; 070import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; 071import org.apache.hadoop.hdfs.shortcircuit.ClientMmap; 072import org.apache.hadoop.io.ByteBufferPool; 073import org.apache.hadoop.ipc.RPC; 074import org.apache.hadoop.ipc.RemoteException; 075import org.apache.hadoop.ipc.RetriableException; 076import org.apache.hadoop.net.NetUtils; 077import 
org.apache.hadoop.security.token.SecretManager.InvalidToken; 078import org.apache.hadoop.security.token.Token; 079import org.apache.hadoop.util.IdentityHashStore; 080import org.apache.hadoop.util.StopWatch; 081import org.apache.htrace.Span; 082import org.apache.htrace.Trace; 083import org.apache.htrace.TraceScope; 084 085import com.google.common.annotations.VisibleForTesting; 086 087/**************************************************************** 088 * DFSInputStream provides bytes from a named file. It handles 089 * negotiation of the namenode and various datanodes as necessary. 090 ****************************************************************/ 091@InterfaceAudience.Private 092public class DFSInputStream extends FSInputStream 093implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, 094 HasEnhancedByteBufferAccess, CanUnbuffer { 095 @VisibleForTesting 096 public static boolean tcpReadsDisabledForTesting = false; 097 private long hedgedReadOpsLoopNumForTesting = 0; 098 private final DFSClient dfsClient; 099 private AtomicBoolean closed = new AtomicBoolean(false); 100 private final String src; 101 private final boolean verifyChecksum; 102 103 // state by stateful read only: 104 // (protected by lock on this) 105 ///// 106 private DatanodeInfo currentNode = null; 107 private LocatedBlock currentLocatedBlock = null; 108 private long pos = 0; 109 private long blockEnd = -1; 110 private BlockReader blockReader = null; 111 //// 112 113 // state shared by stateful and positional read: 114 // (protected by lock on infoLock) 115 //// 116 private LocatedBlocks locatedBlocks = null; 117 private long lastBlockBeingWrittenLength = 0; 118 private FileEncryptionInfo fileEncryptionInfo = null; 119 private CachingStrategy cachingStrategy; 120 //// 121 122 private final ReadStatistics readStatistics = new ReadStatistics(); 123 // lock for state shared between read and pread 124 // Note: Never acquire a lock on <this> with this lock held to avoid deadlocks 125 // (it's OK to acquire this lock when the lock on <this> is held) 126 private final Object infoLock = new Object(); 127 128 /** 129 * Track the ByteBuffers that we have handed out to readers. 130 * 131 * The value type can be either ByteBufferPool or ClientMmap, depending on 132 * whether we this is a memory-mapped buffer or not. 133 */ 134 private IdentityHashStore<ByteBuffer, Object> extendedReadBuffers; 135 136 private synchronized IdentityHashStore<ByteBuffer, Object> 137 getExtendedReadBuffers() { 138 if (extendedReadBuffers == null) { 139 extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0); 140 } 141 return extendedReadBuffers; 142 } 143 144 public static class ReadStatistics { 145 public ReadStatistics() { 146 clear(); 147 } 148 149 public ReadStatistics(ReadStatistics rhs) { 150 this.totalBytesRead = rhs.getTotalBytesRead(); 151 this.totalLocalBytesRead = rhs.getTotalLocalBytesRead(); 152 this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead(); 153 this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead(); 154 } 155 156 /** 157 * @return The total bytes read. This will always be at least as 158 * high as the other numbers, since it includes all of them. 159 */ 160 public long getTotalBytesRead() { 161 return totalBytesRead; 162 } 163 164 /** 165 * @return The total local bytes read. This will always be at least 166 * as high as totalShortCircuitBytesRead, since all short-circuit 167 * reads are also local. 
168 */ 169 public long getTotalLocalBytesRead() { 170 return totalLocalBytesRead; 171 } 172 173 /** 174 * @return The total short-circuit local bytes read. 175 */ 176 public long getTotalShortCircuitBytesRead() { 177 return totalShortCircuitBytesRead; 178 } 179 180 /** 181 * @return The total number of zero-copy bytes read. 182 */ 183 public long getTotalZeroCopyBytesRead() { 184 return totalZeroCopyBytesRead; 185 } 186 187 /** 188 * @return The total number of bytes read which were not local. 189 */ 190 public long getRemoteBytesRead() { 191 return totalBytesRead - totalLocalBytesRead; 192 } 193 194 void addRemoteBytes(long amt) { 195 this.totalBytesRead += amt; 196 } 197 198 void addLocalBytes(long amt) { 199 this.totalBytesRead += amt; 200 this.totalLocalBytesRead += amt; 201 } 202 203 void addShortCircuitBytes(long amt) { 204 this.totalBytesRead += amt; 205 this.totalLocalBytesRead += amt; 206 this.totalShortCircuitBytesRead += amt; 207 } 208 209 void addZeroCopyBytes(long amt) { 210 this.totalBytesRead += amt; 211 this.totalLocalBytesRead += amt; 212 this.totalShortCircuitBytesRead += amt; 213 this.totalZeroCopyBytesRead += amt; 214 } 215 216 void clear() { 217 this.totalBytesRead = 0; 218 this.totalLocalBytesRead = 0; 219 this.totalShortCircuitBytesRead = 0; 220 this.totalZeroCopyBytesRead = 0; 221 } 222 223 private long totalBytesRead; 224 225 private long totalLocalBytesRead; 226 227 private long totalShortCircuitBytesRead; 228 229 private long totalZeroCopyBytesRead; 230 } 231 232 /** 233 * This variable tracks the number of failures since the start of the 234 * most recent user-facing operation. That is to say, it should be reset 235 * whenever the user makes a call on this stream, and if at any point 236 * during the retry logic, the failure count exceeds a threshold, 237 * the errors will be thrown back to the operation. 238 * 239 * Specifically this counts the number of times the client has gone 240 * back to the namenode to get a new list of block locations, and is 241 * capped at maxBlockAcquireFailures 242 */ 243 private int failures = 0; 244 245 /* XXX Use of CocurrentHashMap is temp fix. Need to fix 246 * parallel accesses to DFSInputStream (through ptreads) properly */ 247 private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes = 248 new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>(); 249 250 private byte[] oneByteBuf; // used for 'int read()' 251 252 void addToDeadNodes(DatanodeInfo dnInfo) { 253 deadNodes.put(dnInfo, dnInfo); 254 } 255 256 DFSInputStream(DFSClient dfsClient, String src, boolean verifyChecksum 257 ) throws IOException, UnresolvedLinkException { 258 this.dfsClient = dfsClient; 259 this.verifyChecksum = verifyChecksum; 260 this.src = src; 261 synchronized (infoLock) { 262 this.cachingStrategy = dfsClient.getDefaultReadCachingStrategy(); 263 } 264 openInfo(); 265 } 266 267 /** 268 * Grab the open-file info from namenode 269 */ 270 void openInfo() throws IOException, UnresolvedLinkException { 271 synchronized(infoLock) { 272 lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength(); 273 int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength; 274 while (retriesForLastBlockLength > 0) { 275 // Getting last block length as -1 is a special case. When cluster 276 // restarts, DNs may not report immediately. At this time partial block 277 // locations will not be available with NN for getting the length. Lets 278 // retry for 3 times to get the length. 
279 if (lastBlockBeingWrittenLength == -1) { 280 DFSClient.LOG.warn("Last block locations not available. " 281 + "Datanodes might not have reported blocks completely." 282 + " Will retry for " + retriesForLastBlockLength + " times"); 283 waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength); 284 lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength(); 285 } else { 286 break; 287 } 288 retriesForLastBlockLength--; 289 } 290 if (retriesForLastBlockLength == 0) { 291 throw new IOException("Could not obtain the last block locations."); 292 } 293 } 294 } 295 296 private void waitFor(int waitTime) throws IOException { 297 try { 298 Thread.sleep(waitTime); 299 } catch (InterruptedException e) { 300 throw new IOException( 301 "Interrupted while getting the last block length."); 302 } 303 } 304 305 private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException { 306 final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0); 307 if (DFSClient.LOG.isDebugEnabled()) { 308 DFSClient.LOG.debug("newInfo = " + newInfo); 309 } 310 if (newInfo == null) { 311 throw new IOException("Cannot open filename " + src); 312 } 313 314 if (locatedBlocks != null) { 315 Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator(); 316 Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator(); 317 while (oldIter.hasNext() && newIter.hasNext()) { 318 if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) { 319 throw new IOException("Blocklist for " + src + " has changed!"); 320 } 321 } 322 } 323 locatedBlocks = newInfo; 324 long lastBlockBeingWrittenLength = 0; 325 if (!locatedBlocks.isLastBlockComplete()) { 326 final LocatedBlock last = locatedBlocks.getLastLocatedBlock(); 327 if (last != null) { 328 if (last.getLocations().length == 0) { 329 if (last.getBlockSize() == 0) { 330 // if the length is zero, then no data has been written to 331 // datanode. So no need to wait for the locations. 332 return 0; 333 } 334 return -1; 335 } 336 final long len = readBlockLength(last); 337 last.getBlock().setNumBytes(len); 338 lastBlockBeingWrittenLength = len; 339 } 340 } 341 342 fileEncryptionInfo = locatedBlocks.getFileEncryptionInfo(); 343 344 return lastBlockBeingWrittenLength; 345 } 346 347 /** Read the block length from one of the datanodes. */ 348 private long readBlockLength(LocatedBlock locatedblock) throws IOException { 349 assert locatedblock != null : "LocatedBlock cannot be null"; 350 int replicaNotFoundCount = locatedblock.getLocations().length; 351 352 final int timeout = dfsClient.getConf().socketTimeout; 353 LinkedList<DatanodeInfo> nodeList = new LinkedList<DatanodeInfo>( 354 Arrays.asList(locatedblock.getLocations())); 355 LinkedList<DatanodeInfo> retryList = new LinkedList<DatanodeInfo>(); 356 boolean isRetry = false; 357 StopWatch sw = new StopWatch(); 358 while (nodeList.size() > 0) { 359 DatanodeInfo datanode = nodeList.pop(); 360 ClientDatanodeProtocol cdp = null; 361 try { 362 cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode, 363 dfsClient.getConfiguration(), timeout, 364 dfsClient.getConf().connectToDnViaHostname, locatedblock); 365 366 final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock()); 367 368 if (n >= 0) { 369 return n; 370 } 371 } catch (IOException ioe) { 372 if (ioe instanceof RemoteException) { 373 if (((RemoteException) ioe).unwrapRemoteException() instanceof 374 ReplicaNotFoundException) { 375 // replica is not on the DN. We will treat it as 0 length 376 // if no one actually has a replica. 
377 replicaNotFoundCount--; 378 } else if (((RemoteException) ioe).unwrapRemoteException() instanceof 379 RetriableException) { 380 // add to the list to be retried if necessary. 381 retryList.add(datanode); 382 } 383 } 384 385 if (DFSClient.LOG.isDebugEnabled()) { 386 DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode " 387 + datanode + " for block " + locatedblock.getBlock(), ioe); 388 } 389 } finally { 390 if (cdp != null) { 391 RPC.stopProxy(cdp); 392 } 393 } 394 395 // Ran out of nodes, but there are retriable nodes. 396 if (nodeList.size() == 0 && retryList.size() > 0) { 397 nodeList.addAll(retryList); 398 retryList.clear(); 399 isRetry = true; 400 } 401 402 if (isRetry) { 403 // start the stop watch if not already running. 404 if (!sw.isRunning()) { 405 sw.start(); 406 } 407 try { 408 Thread.sleep(500); // delay between retries. 409 } catch (InterruptedException e) { 410 throw new IOException("Interrupted while getting the length."); 411 } 412 } 413 414 // see if we ran out of retry time 415 if (sw.isRunning() && sw.now(TimeUnit.MILLISECONDS) > timeout) { 416 break; 417 } 418 } 419 420 // Namenode told us about these locations, but none know about the replica 421 // means that we hit the race between pipeline creation start and end. 422 // we require all 3 because some other exception could have happened 423 // on a DN that has it. we want to report that error 424 if (replicaNotFoundCount == 0) { 425 return 0; 426 } 427 428 throw new IOException("Cannot obtain block length for " + locatedblock); 429 } 430 431 public long getFileLength() { 432 synchronized(infoLock) { 433 return locatedBlocks == null? 0: 434 locatedBlocks.getFileLength() + lastBlockBeingWrittenLength; 435 } 436 } 437 438 // Short circuit local reads are forbidden for files that are 439 // under construction. See HDFS-2757. 440 boolean shortCircuitForbidden() { 441 synchronized(infoLock) { 442 return locatedBlocks.isUnderConstruction(); 443 } 444 } 445 446 /** 447 * Returns the datanode from which the stream is currently reading. 448 */ 449 public synchronized DatanodeInfo getCurrentDatanode() { 450 return currentNode; 451 } 452 453 /** 454 * Returns the block containing the target position. 455 */ 456 synchronized public ExtendedBlock getCurrentBlock() { 457 if (currentLocatedBlock == null){ 458 return null; 459 } 460 return currentLocatedBlock.getBlock(); 461 } 462 463 /** 464 * Return collection of blocks that has already been located. 465 */ 466 public List<LocatedBlock> getAllBlocks() throws IOException { 467 return getBlockRange(0, getFileLength()); 468 } 469 470 /** 471 * Get block at the specified position. 472 * Fetch it from the namenode if not cached. 
473 * 474 * @param offset block corresponding to this offset in file is returned 475 * @return located block 476 * @throws IOException 477 */ 478 private LocatedBlock getBlockAt(long offset) throws IOException { 479 synchronized(infoLock) { 480 assert (locatedBlocks != null) : "locatedBlocks is null"; 481 482 final LocatedBlock blk; 483 484 //check offset 485 if (offset < 0 || offset >= getFileLength()) { 486 throw new IOException("offset < 0 || offset >= getFileLength(), offset=" 487 + offset 488 + ", locatedBlocks=" + locatedBlocks); 489 } 490 else if (offset >= locatedBlocks.getFileLength()) { 491 // offset to the portion of the last block, 492 // which is not known to the name-node yet; 493 // getting the last block 494 blk = locatedBlocks.getLastLocatedBlock(); 495 } 496 else { 497 // search cached blocks first 498 int targetBlockIdx = locatedBlocks.findBlock(offset); 499 if (targetBlockIdx < 0) { // block is not cached 500 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 501 // fetch more blocks 502 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 503 assert (newBlocks != null) : "Could not find target position " + offset; 504 locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks()); 505 } 506 blk = locatedBlocks.get(targetBlockIdx); 507 } 508 return blk; 509 } 510 } 511 512 /** Fetch a block from namenode and cache it */ 513 private void fetchBlockAt(long offset) throws IOException { 514 synchronized(infoLock) { 515 int targetBlockIdx = locatedBlocks.findBlock(offset); 516 if (targetBlockIdx < 0) { // block is not cached 517 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 518 } 519 // fetch blocks 520 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 521 if (newBlocks == null) { 522 throw new IOException("Could not find target position " + offset); 523 } 524 locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks()); 525 } 526 } 527 528 /** 529 * Get blocks in the specified range. 530 * Fetch them from the namenode if not cached. This function 531 * will not get a read request beyond the EOF. 532 * @param offset starting offset in file 533 * @param length length of data 534 * @return consequent segment of located blocks 535 * @throws IOException 536 */ 537 private List<LocatedBlock> getBlockRange(long offset, 538 long length) throws IOException { 539 // getFileLength(): returns total file length 540 // locatedBlocks.getFileLength(): returns length of completed blocks 541 if (offset >= getFileLength()) { 542 throw new IOException("Offset: " + offset + 543 " exceeds file length: " + getFileLength()); 544 } 545 synchronized(infoLock) { 546 final List<LocatedBlock> blocks; 547 final long lengthOfCompleteBlk = locatedBlocks.getFileLength(); 548 final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk; 549 final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk; 550 551 if (readOffsetWithinCompleteBlk) { 552 //get the blocks of finalized (completed) block range 553 blocks = getFinalizedBlockRange(offset, 554 Math.min(length, lengthOfCompleteBlk - offset)); 555 } else { 556 blocks = new ArrayList<LocatedBlock>(1); 557 } 558 559 // get the blocks from incomplete block range 560 if (readLengthPastCompleteBlk) { 561 blocks.add(locatedBlocks.getLastLocatedBlock()); 562 } 563 564 return blocks; 565 } 566 } 567 568 /** 569 * Get blocks in the specified range. 570 * Includes only the complete blocks. 571 * Fetch them from the namenode if not cached. 
572 */ 573 private List<LocatedBlock> getFinalizedBlockRange( 574 long offset, long length) throws IOException { 575 synchronized(infoLock) { 576 assert (locatedBlocks != null) : "locatedBlocks is null"; 577 List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>(); 578 // search cached blocks first 579 int blockIdx = locatedBlocks.findBlock(offset); 580 if (blockIdx < 0) { // block is not cached 581 blockIdx = LocatedBlocks.getInsertIndex(blockIdx); 582 } 583 long remaining = length; 584 long curOff = offset; 585 while(remaining > 0) { 586 LocatedBlock blk = null; 587 if(blockIdx < locatedBlocks.locatedBlockCount()) 588 blk = locatedBlocks.get(blockIdx); 589 if (blk == null || curOff < blk.getStartOffset()) { 590 LocatedBlocks newBlocks; 591 newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining); 592 locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks()); 593 continue; 594 } 595 assert curOff >= blk.getStartOffset() : "Block not found"; 596 blockRange.add(blk); 597 long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff; 598 remaining -= bytesRead; 599 curOff += bytesRead; 600 blockIdx++; 601 } 602 return blockRange; 603 } 604 } 605 606 /** 607 * Open a DataInputStream to a DataNode so that it can be read from. 608 * We get block ID and the IDs of the destinations at startup, from the namenode. 609 */ 610 private synchronized DatanodeInfo blockSeekTo(long target) throws IOException { 611 if (target >= getFileLength()) { 612 throw new IOException("Attempted to read past end of file"); 613 } 614 615 // Will be getting a new BlockReader. 616 closeCurrentBlockReader(); 617 618 // 619 // Connect to best DataNode for desired Block, with potential offset 620 // 621 DatanodeInfo chosenNode = null; 622 int refetchToken = 1; // only need to get a new access token once 623 int refetchEncryptionKey = 1; // only need to get a new encryption key once 624 625 boolean connectFailedOnce = false; 626 627 while (true) { 628 // 629 // Compute desired block 630 // 631 LocatedBlock targetBlock = getBlockAt(target); 632 633 // update current position 634 this.pos = target; 635 this.blockEnd = targetBlock.getStartOffset() + 636 targetBlock.getBlockSize() - 1; 637 this.currentLocatedBlock = targetBlock; 638 639 assert (target==pos) : "Wrong postion " + pos + " expect " + target; 640 long offsetIntoBlock = target - targetBlock.getStartOffset(); 641 642 DNAddrPair retval = chooseDataNode(targetBlock, null); 643 chosenNode = retval.info; 644 InetSocketAddress targetAddr = retval.addr; 645 StorageType storageType = retval.storageType; 646 647 try { 648 ExtendedBlock blk = targetBlock.getBlock(); 649 Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken(); 650 CachingStrategy curCachingStrategy; 651 boolean shortCircuitForbidden; 652 synchronized(infoLock) { 653 curCachingStrategy = cachingStrategy; 654 shortCircuitForbidden = shortCircuitForbidden(); 655 } 656 blockReader = new BlockReaderFactory(dfsClient.getConf()). 657 setInetSocketAddress(targetAddr). 658 setRemotePeerFactory(dfsClient). 659 setDatanodeInfo(chosenNode). 660 setStorageType(storageType). 661 setFileName(src). 662 setBlock(blk). 663 setBlockToken(accessToken). 664 setStartOffset(offsetIntoBlock). 665 setVerifyChecksum(verifyChecksum). 666 setClientName(dfsClient.clientName). 667 setLength(blk.getNumBytes() - offsetIntoBlock). 668 setCachingStrategy(curCachingStrategy). 669 setAllowShortCircuitLocalReads(!shortCircuitForbidden). 670 setClientCacheContext(dfsClient.getClientContext()). 
671 setUserGroupInformation(dfsClient.ugi). 672 setConfiguration(dfsClient.getConfiguration()). 673 build(); 674 if(connectFailedOnce) { 675 DFSClient.LOG.info("Successfully connected to " + targetAddr + 676 " for " + blk); 677 } 678 return chosenNode; 679 } catch (IOException ex) { 680 if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { 681 DFSClient.LOG.info("Will fetch a new encryption key and retry, " 682 + "encryption key was invalid when connecting to " + targetAddr 683 + " : " + ex); 684 // The encryption key used is invalid. 685 refetchEncryptionKey--; 686 dfsClient.clearDataEncryptionKey(); 687 } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) { 688 refetchToken--; 689 fetchBlockAt(target); 690 } else { 691 connectFailedOnce = true; 692 DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block" 693 + ", add to deadNodes and continue. " + ex, ex); 694 // Put chosen node into dead list, continue 695 addToDeadNodes(chosenNode); 696 } 697 } 698 } 699 } 700 701 /** 702 * Close it down! 703 */ 704 @Override 705 public synchronized void close() throws IOException { 706 if (!closed.compareAndSet(false, true)) { 707 DFSClient.LOG.debug("DFSInputStream has been closed already"); 708 return; 709 } 710 dfsClient.checkOpen(); 711 712 if ((extendedReadBuffers != null) && (!extendedReadBuffers.isEmpty())) { 713 final StringBuilder builder = new StringBuilder(); 714 extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() { 715 private String prefix = ""; 716 @Override 717 public void accept(ByteBuffer k, Object v) { 718 builder.append(prefix).append(k); 719 prefix = ", "; 720 } 721 }); 722 DFSClient.LOG.warn("closing file " + src + ", but there are still " + 723 "unreleased ByteBuffers allocated by read(). " + 724 "Please release " + builder.toString() + "."); 725 } 726 closeCurrentBlockReader(); 727 super.close(); 728 } 729 730 @Override 731 public synchronized int read() throws IOException { 732 if (oneByteBuf == null) { 733 oneByteBuf = new byte[1]; 734 } 735 int ret = read( oneByteBuf, 0, 1 ); 736 return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff); 737 } 738 739 /** 740 * Wraps different possible read implementations so that readBuffer can be 741 * strategy-agnostic. 
742 */ 743 private interface ReaderStrategy { 744 public int doRead(BlockReader blockReader, int off, int len) 745 throws ChecksumException, IOException; 746 } 747 748 private void updateReadStatistics(ReadStatistics readStatistics, 749 int nRead, BlockReader blockReader) { 750 if (nRead <= 0) return; 751 synchronized(infoLock) { 752 if (blockReader.isShortCircuit()) { 753 readStatistics.addShortCircuitBytes(nRead); 754 } else if (blockReader.isLocal()) { 755 readStatistics.addLocalBytes(nRead); 756 } else { 757 readStatistics.addRemoteBytes(nRead); 758 } 759 } 760 } 761 762 /** 763 * Used to read bytes into a byte[] 764 */ 765 private class ByteArrayStrategy implements ReaderStrategy { 766 final byte[] buf; 767 768 public ByteArrayStrategy(byte[] buf) { 769 this.buf = buf; 770 } 771 772 @Override 773 public int doRead(BlockReader blockReader, int off, int len) 774 throws ChecksumException, IOException { 775 int nRead = blockReader.read(buf, off, len); 776 updateReadStatistics(readStatistics, nRead, blockReader); 777 return nRead; 778 } 779 } 780 781 /** 782 * Used to read bytes into a user-supplied ByteBuffer 783 */ 784 private class ByteBufferStrategy implements ReaderStrategy { 785 final ByteBuffer buf; 786 ByteBufferStrategy(ByteBuffer buf) { 787 this.buf = buf; 788 } 789 790 @Override 791 public int doRead(BlockReader blockReader, int off, int len) 792 throws ChecksumException, IOException { 793 int oldpos = buf.position(); 794 int oldlimit = buf.limit(); 795 boolean success = false; 796 try { 797 int ret = blockReader.read(buf); 798 success = true; 799 updateReadStatistics(readStatistics, ret, blockReader); 800 return ret; 801 } finally { 802 if (!success) { 803 // Reset to original state so that retries work correctly. 804 buf.position(oldpos); 805 buf.limit(oldlimit); 806 } 807 } 808 } 809 } 810 811 /* This is a used by regular read() and handles ChecksumExceptions. 812 * name readBuffer() is chosen to imply similarity to readBuffer() in 813 * ChecksumFileSystem 814 */ 815 private synchronized int readBuffer(ReaderStrategy reader, int off, int len, 816 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 817 throws IOException { 818 IOException ioe; 819 820 /* we retry current node only once. So this is set to true only here. 821 * Intention is to handle one common case of an error that is not a 822 * failure on datanode or client : when DataNode closes the connection 823 * since client is idle. If there are other cases of "non-errors" then 824 * then a datanode might be retried by setting this to true again. 825 */ 826 boolean retryCurrentNode = true; 827 828 while (true) { 829 // retry as many times as seekToNewSource allows. 830 try { 831 return reader.doRead(blockReader, off, len); 832 } catch ( ChecksumException ce ) { 833 DFSClient.LOG.warn("Found Checksum error for " 834 + getCurrentBlock() + " from " + currentNode 835 + " at " + ce.getPos()); 836 ioe = ce; 837 retryCurrentNode = false; 838 // we want to remember which block replicas we have tried 839 addIntoCorruptedBlockMap(getCurrentBlock(), currentNode, 840 corruptedBlockMap); 841 } catch ( IOException e ) { 842 if (!retryCurrentNode) { 843 DFSClient.LOG.warn("Exception while reading from " 844 + getCurrentBlock() + " of " + src + " from " 845 + currentNode, e); 846 } 847 ioe = e; 848 } 849 boolean sourceFound = false; 850 if (retryCurrentNode) { 851 /* possibly retry the same node so that transient errors don't 852 * result in application level failures (e.g. 
Datanode could have 853 * closed the connection because the client is idle for too long). 854 */ 855 sourceFound = seekToBlockSource(pos); 856 } else { 857 addToDeadNodes(currentNode); 858 sourceFound = seekToNewSource(pos); 859 } 860 if (!sourceFound) { 861 throw ioe; 862 } 863 retryCurrentNode = false; 864 } 865 } 866 867 private synchronized int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException { 868 dfsClient.checkOpen(); 869 if (closed.get()) { 870 throw new IOException("Stream closed"); 871 } 872 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 873 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 874 failures = 0; 875 if (pos < getFileLength()) { 876 int retries = 2; 877 while (retries > 0) { 878 try { 879 // currentNode can be left as null if previous read had a checksum 880 // error on the same block. See HDFS-3067 881 if (pos > blockEnd || currentNode == null) { 882 currentNode = blockSeekTo(pos); 883 } 884 int realLen = (int) Math.min(len, (blockEnd - pos + 1L)); 885 synchronized(infoLock) { 886 if (locatedBlocks.isLastBlockComplete()) { 887 realLen = (int) Math.min(realLen, 888 locatedBlocks.getFileLength() - pos); 889 } 890 } 891 int result = readBuffer(strategy, off, realLen, corruptedBlockMap); 892 893 if (result >= 0) { 894 pos += result; 895 } else { 896 // got a EOS from reader though we expect more data on it. 897 throw new IOException("Unexpected EOS from the reader"); 898 } 899 if (dfsClient.stats != null) { 900 dfsClient.stats.incrementBytesRead(result); 901 } 902 return result; 903 } catch (ChecksumException ce) { 904 throw ce; 905 } catch (IOException e) { 906 if (retries == 1) { 907 DFSClient.LOG.warn("DFS Read", e); 908 } 909 blockEnd = -1; 910 if (currentNode != null) { addToDeadNodes(currentNode); } 911 if (--retries == 0) { 912 throw e; 913 } 914 } finally { 915 // Check if need to report block replicas corruption either read 916 // was successful or ChecksumException occured. 917 reportCheckSumFailure(corruptedBlockMap, 918 currentLocatedBlock.getLocations().length); 919 } 920 } 921 } 922 return -1; 923 } 924 925 /** 926 * Read the entire buffer. 927 */ 928 @Override 929 public synchronized int read(final byte buf[], int off, int len) throws IOException { 930 ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf); 931 TraceScope scope = 932 dfsClient.getPathTraceScope("DFSInputStream#byteArrayRead", src); 933 try { 934 return readWithStrategy(byteArrayReader, off, len); 935 } finally { 936 scope.close(); 937 } 938 } 939 940 @Override 941 public synchronized int read(final ByteBuffer buf) throws IOException { 942 ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf); 943 TraceScope scope = 944 dfsClient.getPathTraceScope("DFSInputStream#byteBufferRead", src); 945 try { 946 return readWithStrategy(byteBufferReader, 0, buf.remaining()); 947 } finally { 948 scope.close(); 949 } 950 } 951 952 953 /** 954 * Add corrupted block replica into map. 
955 */ 956 private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 957 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) { 958 Set<DatanodeInfo> dnSet = null; 959 if((corruptedBlockMap.containsKey(blk))) { 960 dnSet = corruptedBlockMap.get(blk); 961 }else { 962 dnSet = new HashSet<DatanodeInfo>(); 963 } 964 if (!dnSet.contains(node)) { 965 dnSet.add(node); 966 corruptedBlockMap.put(blk, dnSet); 967 } 968 } 969 970 private DNAddrPair chooseDataNode(LocatedBlock block, 971 Collection<DatanodeInfo> ignoredNodes) throws IOException { 972 while (true) { 973 try { 974 return getBestNodeDNAddrPair(block, ignoredNodes); 975 } catch (IOException ie) { 976 String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(), 977 deadNodes, ignoredNodes); 978 String blockInfo = block.getBlock() + " file=" + src; 979 if (failures >= dfsClient.getMaxBlockAcquireFailures()) { 980 String description = "Could not obtain block: " + blockInfo; 981 DFSClient.LOG.warn(description + errMsg 982 + ". Throwing a BlockMissingException"); 983 throw new BlockMissingException(src, description, 984 block.getStartOffset()); 985 } 986 987 DatanodeInfo[] nodes = block.getLocations(); 988 if (nodes == null || nodes.length == 0) { 989 DFSClient.LOG.info("No node available for " + blockInfo); 990 } 991 DFSClient.LOG.info("Could not obtain " + block.getBlock() 992 + " from any node: " + ie + errMsg 993 + ". Will get new block locations from namenode and retry..."); 994 try { 995 // Introducing a random factor to the wait time before another retry. 996 // The wait time is dependent on # of failures and a random factor. 997 // At the first time of getting a BlockMissingException, the wait time 998 // is a random number between 0..3000 ms. If the first retry 999 // still fails, we will wait 3000 ms grace period before the 2nd retry. 1000 // Also at the second retry, the waiting window is expanded to 6000 ms 1001 // alleviating the request rate from the server. Similarly the 3rd retry 1002 // will wait 6000ms grace period before retry and the waiting window is 1003 // expanded to 9000ms. 1004 final int timeWindow = dfsClient.getConf().timeWindow; 1005 double waitTime = timeWindow * failures + // grace period for the last round of attempt 1006 timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure 1007 DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec."); 1008 Thread.sleep((long)waitTime); 1009 } catch (InterruptedException iex) { 1010 } 1011 deadNodes.clear(); //2nd option is to remove only nodes[blockId] 1012 openInfo(); 1013 block = getBlockAt(block.getStartOffset()); 1014 failures++; 1015 continue; 1016 } 1017 } 1018 } 1019 1020 /** 1021 * Get the best node from which to stream the data. 1022 * @param block LocatedBlock, containing nodes in priority order. 1023 * @param ignoredNodes Do not choose nodes in this array (may be null) 1024 * @return The DNAddrPair of the best node. 
1025 * @throws IOException 1026 */ 1027 private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block, 1028 Collection<DatanodeInfo> ignoredNodes) throws IOException { 1029 DatanodeInfo[] nodes = block.getLocations(); 1030 StorageType[] storageTypes = block.getStorageTypes(); 1031 DatanodeInfo chosenNode = null; 1032 StorageType storageType = null; 1033 if (nodes != null) { 1034 for (int i = 0; i < nodes.length; i++) { 1035 if (!deadNodes.containsKey(nodes[i]) 1036 && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) { 1037 chosenNode = nodes[i]; 1038 // Storage types are ordered to correspond with nodes, so use the same 1039 // index to get storage type. 1040 if (storageTypes != null && i < storageTypes.length) { 1041 storageType = storageTypes[i]; 1042 } 1043 break; 1044 } 1045 } 1046 } 1047 if (chosenNode == null) { 1048 throw new IOException("No live nodes contain block " + block.getBlock() + 1049 " after checking nodes = " + Arrays.toString(nodes) + 1050 ", ignoredNodes = " + ignoredNodes); 1051 } 1052 final String dnAddr = 1053 chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname); 1054 if (DFSClient.LOG.isDebugEnabled()) { 1055 DFSClient.LOG.debug("Connecting to datanode " + dnAddr); 1056 } 1057 InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr); 1058 return new DNAddrPair(chosenNode, targetAddr, storageType); 1059 } 1060 1061 private static String getBestNodeDNAddrPairErrorString( 1062 DatanodeInfo nodes[], AbstractMap<DatanodeInfo, 1063 DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) { 1064 StringBuilder errMsgr = new StringBuilder( 1065 " No live nodes contain current block "); 1066 errMsgr.append("Block locations:"); 1067 for (DatanodeInfo datanode : nodes) { 1068 errMsgr.append(" "); 1069 errMsgr.append(datanode.toString()); 1070 } 1071 errMsgr.append(" Dead nodes: "); 1072 for (DatanodeInfo datanode : deadNodes.keySet()) { 1073 errMsgr.append(" "); 1074 errMsgr.append(datanode.toString()); 1075 } 1076 if (ignoredNodes != null) { 1077 errMsgr.append(" Ignored nodes: "); 1078 for (DatanodeInfo datanode : ignoredNodes) { 1079 errMsgr.append(" "); 1080 errMsgr.append(datanode.toString()); 1081 } 1082 } 1083 return errMsgr.toString(); 1084 } 1085 1086 private void fetchBlockByteRange(LocatedBlock block, long start, long end, 1087 byte[] buf, int offset, 1088 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 1089 throws IOException { 1090 block = getBlockAt(block.getStartOffset()); 1091 while (true) { 1092 DNAddrPair addressPair = chooseDataNode(block, null); 1093 try { 1094 actualGetFromOneDataNode(addressPair, block, start, end, buf, offset, 1095 corruptedBlockMap); 1096 return; 1097 } catch (IOException e) { 1098 // Ignore. Already processed inside the function. 1099 // Loop through to try the next node. 
1100 } 1101 } 1102 } 1103 1104 private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode, 1105 final LocatedBlock block, final long start, final long end, 1106 final ByteBuffer bb, 1107 final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 1108 final int hedgedReadId) { 1109 final Span parentSpan = Trace.currentSpan(); 1110 return new Callable<ByteBuffer>() { 1111 @Override 1112 public ByteBuffer call() throws Exception { 1113 byte[] buf = bb.array(); 1114 int offset = bb.position(); 1115 TraceScope scope = 1116 Trace.startSpan("hedgedRead" + hedgedReadId, parentSpan); 1117 try { 1118 actualGetFromOneDataNode(datanode, block, start, end, buf, offset, 1119 corruptedBlockMap); 1120 return bb; 1121 } finally { 1122 scope.close(); 1123 } 1124 } 1125 }; 1126 } 1127 1128 private void actualGetFromOneDataNode(final DNAddrPair datanode, 1129 LocatedBlock block, final long start, final long end, byte[] buf, 1130 int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 1131 throws IOException { 1132 DFSClientFaultInjector.get().startFetchFromDatanode(); 1133 int refetchToken = 1; // only need to get a new access token once 1134 int refetchEncryptionKey = 1; // only need to get a new encryption key once 1135 1136 while (true) { 1137 // cached block locations may have been updated by chooseDataNode() 1138 // or fetchBlockAt(). Always get the latest list of locations at the 1139 // start of the loop. 1140 CachingStrategy curCachingStrategy; 1141 boolean allowShortCircuitLocalReads; 1142 block = getBlockAt(block.getStartOffset()); 1143 synchronized(infoLock) { 1144 curCachingStrategy = cachingStrategy; 1145 allowShortCircuitLocalReads = !shortCircuitForbidden(); 1146 } 1147 DatanodeInfo chosenNode = datanode.info; 1148 InetSocketAddress targetAddr = datanode.addr; 1149 StorageType storageType = datanode.storageType; 1150 BlockReader reader = null; 1151 1152 try { 1153 DFSClientFaultInjector.get().fetchFromDatanodeException(); 1154 Token<BlockTokenIdentifier> blockToken = block.getBlockToken(); 1155 int len = (int) (end - start + 1); 1156 reader = new BlockReaderFactory(dfsClient.getConf()). 1157 setInetSocketAddress(targetAddr). 1158 setRemotePeerFactory(dfsClient). 1159 setDatanodeInfo(chosenNode). 1160 setStorageType(storageType). 1161 setFileName(src). 1162 setBlock(block.getBlock()). 1163 setBlockToken(blockToken). 1164 setStartOffset(start). 1165 setVerifyChecksum(verifyChecksum). 1166 setClientName(dfsClient.clientName). 1167 setLength(len). 1168 setCachingStrategy(curCachingStrategy). 1169 setAllowShortCircuitLocalReads(allowShortCircuitLocalReads). 1170 setClientCacheContext(dfsClient.getClientContext()). 1171 setUserGroupInformation(dfsClient.ugi). 1172 setConfiguration(dfsClient.getConfiguration()). 1173 build(); 1174 int nread = reader.readAll(buf, offset, len); 1175 updateReadStatistics(readStatistics, nread, reader); 1176 1177 if (nread != len) { 1178 throw new IOException("truncated return from reader.read(): " + 1179 "excpected " + len + ", got " + nread); 1180 } 1181 DFSClientFaultInjector.get().readFromDatanodeDelay(); 1182 return; 1183 } catch (ChecksumException e) { 1184 String msg = "fetchBlockByteRange(). 
Got a checksum exception for " 1185 + src + " at " + block.getBlock() + ":" + e.getPos() + " from " 1186 + chosenNode; 1187 DFSClient.LOG.warn(msg); 1188 // we want to remember what we have tried 1189 addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap); 1190 addToDeadNodes(chosenNode); 1191 throw new IOException(msg); 1192 } catch (IOException e) { 1193 if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { 1194 DFSClient.LOG.info("Will fetch a new encryption key and retry, " 1195 + "encryption key was invalid when connecting to " + targetAddr 1196 + " : " + e); 1197 // The encryption key used is invalid. 1198 refetchEncryptionKey--; 1199 dfsClient.clearDataEncryptionKey(); 1200 continue; 1201 } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) { 1202 refetchToken--; 1203 try { 1204 fetchBlockAt(block.getStartOffset()); 1205 } catch (IOException fbae) { 1206 // ignore IOE, since we can retry it later in a loop 1207 } 1208 continue; 1209 } else { 1210 String msg = "Failed to connect to " + targetAddr + " for file " 1211 + src + " for block " + block.getBlock() + ":" + e; 1212 DFSClient.LOG.warn("Connection failure: " + msg, e); 1213 addToDeadNodes(chosenNode); 1214 throw new IOException(msg); 1215 } 1216 } finally { 1217 if (reader != null) { 1218 reader.close(); 1219 } 1220 } 1221 } 1222 } 1223 1224 /** 1225 * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[], 1226 * int, Map)} except we start up a second, parallel, 'hedged' read 1227 * if the first read is taking longer than configured amount of 1228 * time. We then wait on which ever read returns first. 1229 */ 1230 private void hedgedFetchBlockByteRange(LocatedBlock block, long start, 1231 long end, byte[] buf, int offset, 1232 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 1233 throws IOException { 1234 ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>(); 1235 CompletionService<ByteBuffer> hedgedService = 1236 new ExecutorCompletionService<ByteBuffer>( 1237 dfsClient.getHedgedReadsThreadPool()); 1238 ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>(); 1239 ByteBuffer bb = null; 1240 int len = (int) (end - start + 1); 1241 int hedgedReadId = 0; 1242 block = getBlockAt(block.getStartOffset()); 1243 while (true) { 1244 // see HDFS-6591, this metric is used to verify/catch unnecessary loops 1245 hedgedReadOpsLoopNumForTesting++; 1246 DNAddrPair chosenNode = null; 1247 // there is no request already executing. 1248 if (futures.isEmpty()) { 1249 // chooseDataNode is a commitment. If no node, we go to 1250 // the NN to reget block locations. Only go here on first read. 
1251 chosenNode = chooseDataNode(block, ignored); 1252 bb = ByteBuffer.allocate(len); 1253 Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode( 1254 chosenNode, block, start, end, bb, corruptedBlockMap, 1255 hedgedReadId++); 1256 Future<ByteBuffer> firstRequest = hedgedService 1257 .submit(getFromDataNodeCallable); 1258 futures.add(firstRequest); 1259 try { 1260 Future<ByteBuffer> future = hedgedService.poll( 1261 dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS); 1262 if (future != null) { 1263 ByteBuffer result = future.get(); 1264 System.arraycopy(result.array(), result.position(), buf, offset, 1265 len); 1266 return; 1267 } 1268 if (DFSClient.LOG.isDebugEnabled()) { 1269 DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout() 1270 + "ms to read from " + chosenNode.info 1271 + "; spawning hedged read"); 1272 } 1273 // Ignore this node on next go around. 1274 ignored.add(chosenNode.info); 1275 dfsClient.getHedgedReadMetrics().incHedgedReadOps(); 1276 continue; // no need to refresh block locations 1277 } catch (InterruptedException e) { 1278 // Ignore 1279 } catch (ExecutionException e) { 1280 // Ignore already logged in the call. 1281 } 1282 } else { 1283 // We are starting up a 'hedged' read. We have a read already 1284 // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode. 1285 // If no nodes to do hedged reads against, pass. 1286 try { 1287 try { 1288 chosenNode = getBestNodeDNAddrPair(block, ignored); 1289 } catch (IOException ioe) { 1290 chosenNode = chooseDataNode(block, ignored); 1291 } 1292 bb = ByteBuffer.allocate(len); 1293 Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode( 1294 chosenNode, block, start, end, bb, corruptedBlockMap, 1295 hedgedReadId++); 1296 Future<ByteBuffer> oneMoreRequest = hedgedService 1297 .submit(getFromDataNodeCallable); 1298 futures.add(oneMoreRequest); 1299 } catch (IOException ioe) { 1300 if (DFSClient.LOG.isDebugEnabled()) { 1301 DFSClient.LOG.debug("Failed getting node for hedged read: " 1302 + ioe.getMessage()); 1303 } 1304 } 1305 // if not succeeded. Submit callables for each datanode in a loop, wait 1306 // for a fixed interval and get the result from the fastest one. 1307 try { 1308 ByteBuffer result = getFirstToComplete(hedgedService, futures); 1309 // cancel the rest. 1310 cancelAll(futures); 1311 dfsClient.getHedgedReadMetrics().incHedgedReadWins(); 1312 System.arraycopy(result.array(), result.position(), buf, offset, 1313 len); 1314 return; 1315 } catch (InterruptedException ie) { 1316 // Ignore and retry 1317 } 1318 // We got here if exception. Ignore this node on next go around IFF 1319 // we found a chosenNode to hedge read against. 
1320 if (chosenNode != null && chosenNode.info != null) { 1321 ignored.add(chosenNode.info); 1322 } 1323 } 1324 } 1325 } 1326 1327 @VisibleForTesting 1328 public long getHedgedReadOpsLoopNumForTesting() { 1329 return hedgedReadOpsLoopNumForTesting; 1330 } 1331 1332 private ByteBuffer getFirstToComplete( 1333 CompletionService<ByteBuffer> hedgedService, 1334 ArrayList<Future<ByteBuffer>> futures) throws InterruptedException { 1335 if (futures.isEmpty()) { 1336 throw new InterruptedException("let's retry"); 1337 } 1338 Future<ByteBuffer> future = null; 1339 try { 1340 future = hedgedService.take(); 1341 ByteBuffer bb = future.get(); 1342 futures.remove(future); 1343 return bb; 1344 } catch (ExecutionException e) { 1345 // already logged in the Callable 1346 futures.remove(future); 1347 } catch (CancellationException ce) { 1348 // already logged in the Callable 1349 futures.remove(future); 1350 } 1351 1352 throw new InterruptedException("let's retry"); 1353 } 1354 1355 private void cancelAll(List<Future<ByteBuffer>> futures) { 1356 for (Future<ByteBuffer> future : futures) { 1357 // Unfortunately, hdfs reads do not take kindly to interruption. 1358 // Threads return a variety of interrupted-type exceptions but 1359 // also complaints about invalid pbs -- likely because read 1360 // is interrupted before gets whole pb. Also verbose WARN 1361 // logging. So, for now, do not interrupt running read. 1362 future.cancel(false); 1363 } 1364 } 1365 1366 /** 1367 * Should the block access token be refetched on an exception 1368 * 1369 * @param ex Exception received 1370 * @param targetAddr Target datanode address from where exception was received 1371 * @return true if block access token has expired or invalid and it should be 1372 * refetched 1373 */ 1374 private static boolean tokenRefetchNeeded(IOException ex, 1375 InetSocketAddress targetAddr) { 1376 /* 1377 * Get a new access token and retry. Retry is needed in 2 cases. 1) 1378 * When both NN and DN re-started while DFSClient holding a cached 1379 * access token. 2) In the case that NN fails to update its 1380 * access key at pre-set interval (by a wide margin) and 1381 * subsequently restarts. In this case, DN re-registers itself with 1382 * NN and receives a new access key, but DN will delete the old 1383 * access key from its memory since it's considered expired based on 1384 * the estimated expiration date. 1385 */ 1386 if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) { 1387 DFSClient.LOG.info("Access token was invalid when connecting to " 1388 + targetAddr + " : " + ex); 1389 return true; 1390 } 1391 return false; 1392 } 1393 1394 /** 1395 * Read bytes starting from the specified position. 
1396 * 1397 * @param position start read from this position 1398 * @param buffer read buffer 1399 * @param offset offset into buffer 1400 * @param length number of bytes to read 1401 * 1402 * @return actual number of bytes read 1403 */ 1404 @Override 1405 public int read(long position, byte[] buffer, int offset, int length) 1406 throws IOException { 1407 TraceScope scope = 1408 dfsClient.getPathTraceScope("DFSInputStream#byteArrayPread", src); 1409 try { 1410 return pread(position, buffer, offset, length); 1411 } finally { 1412 scope.close(); 1413 } 1414 } 1415 1416 private int pread(long position, byte[] buffer, int offset, int length) 1417 throws IOException { 1418 // sanity checks 1419 dfsClient.checkOpen(); 1420 if (closed.get()) { 1421 throw new IOException("Stream closed"); 1422 } 1423 failures = 0; 1424 long filelen = getFileLength(); 1425 if ((position < 0) || (position >= filelen)) { 1426 return -1; 1427 } 1428 int realLen = length; 1429 if ((position + length) > filelen) { 1430 realLen = (int)(filelen - position); 1431 } 1432 1433 // determine the block and byte range within the block 1434 // corresponding to position and realLen 1435 List<LocatedBlock> blockRange = getBlockRange(position, realLen); 1436 int remaining = realLen; 1437 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 1438 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 1439 for (LocatedBlock blk : blockRange) { 1440 long targetStart = position - blk.getStartOffset(); 1441 long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart); 1442 try { 1443 if (dfsClient.isHedgedReadsEnabled()) { 1444 hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead 1445 - 1, buffer, offset, corruptedBlockMap); 1446 } else { 1447 fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1, 1448 buffer, offset, corruptedBlockMap); 1449 } 1450 } finally { 1451 // Check and report if any block replicas are corrupted. 1452 // BlockMissingException may be caught if all block replicas are 1453 // corrupted. 1454 reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length); 1455 } 1456 1457 remaining -= bytesToRead; 1458 position += bytesToRead; 1459 offset += bytesToRead; 1460 } 1461 assert remaining == 0 : "Wrong number of bytes read."; 1462 if (dfsClient.stats != null) { 1463 dfsClient.stats.incrementBytesRead(realLen); 1464 } 1465 return realLen; 1466 } 1467 1468 /** 1469 * DFSInputStream reports checksum failure. 1470 * Case I : client has tried multiple data nodes and at least one of the 1471 * attempts has succeeded. We report the other failures as corrupted block to 1472 * namenode. 1473 * Case II: client has tried out all data nodes, but all failed. We 1474 * only report if the total number of replica is 1. We do not 1475 * report otherwise since this maybe due to the client is a handicapped client 1476 * (who can not read). 
1477 * @param corruptedBlockMap map of corrupted blocks 1478 * @param dataNodeCount number of data nodes who contains the block replicas 1479 */ 1480 private void reportCheckSumFailure( 1481 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 1482 int dataNodeCount) { 1483 if (corruptedBlockMap.isEmpty()) { 1484 return; 1485 } 1486 Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap 1487 .entrySet().iterator(); 1488 Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next(); 1489 ExtendedBlock blk = entry.getKey(); 1490 Set<DatanodeInfo> dnSet = entry.getValue(); 1491 if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0)) 1492 || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) { 1493 DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()]; 1494 int i = 0; 1495 for (DatanodeInfo dn:dnSet) { 1496 locs[i++] = dn; 1497 } 1498 LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) }; 1499 dfsClient.reportChecksumFailure(src, lblocks); 1500 } 1501 corruptedBlockMap.clear(); 1502 } 1503 1504 @Override 1505 public long skip(long n) throws IOException { 1506 if ( n > 0 ) { 1507 long curPos = getPos(); 1508 long fileLen = getFileLength(); 1509 if( n+curPos > fileLen ) { 1510 n = fileLen - curPos; 1511 } 1512 seek(curPos+n); 1513 return n; 1514 } 1515 return n < 0 ? -1 : 0; 1516 } 1517 1518 /** 1519 * Seek to a new arbitrary location 1520 */ 1521 @Override 1522 public synchronized void seek(long targetPos) throws IOException { 1523 if (targetPos > getFileLength()) { 1524 throw new EOFException("Cannot seek after EOF"); 1525 } 1526 if (targetPos < 0) { 1527 throw new EOFException("Cannot seek to negative offset"); 1528 } 1529 if (closed.get()) { 1530 throw new IOException("Stream is closed!"); 1531 } 1532 boolean done = false; 1533 if (pos <= targetPos && targetPos <= blockEnd) { 1534 // 1535 // If this seek is to a positive position in the current 1536 // block, and this piece of data might already be lying in 1537 // the TCP buffer, then just eat up the intervening data. 1538 // 1539 int diff = (int)(targetPos - pos); 1540 if (diff <= blockReader.available()) { 1541 try { 1542 pos += blockReader.skip(diff); 1543 if (pos == targetPos) { 1544 done = true; 1545 } else { 1546 // The range was already checked. If the block reader returns 1547 // something unexpected instead of throwing an exception, it is 1548 // most likely a bug. 1549 String errMsg = "BlockReader failed to seek to " + 1550 targetPos + ". Instead, it seeked to " + pos + "."; 1551 DFSClient.LOG.warn(errMsg); 1552 throw new IOException(errMsg); 1553 } 1554 } catch (IOException e) {//make following read to retry 1555 if(DFSClient.LOG.isDebugEnabled()) { 1556 DFSClient.LOG.debug("Exception while seek to " + targetPos 1557 + " from " + getCurrentBlock() + " of " + src + " from " 1558 + currentNode, e); 1559 } 1560 } 1561 } 1562 } 1563 if (!done) { 1564 pos = targetPos; 1565 blockEnd = -1; 1566 } 1567 } 1568 1569 /** 1570 * Same as {@link #seekToNewSource(long)} except that it does not exclude 1571 * the current datanode and might connect to the same node. 1572 */ 1573 private boolean seekToBlockSource(long targetPos) 1574 throws IOException { 1575 currentNode = blockSeekTo(targetPos); 1576 return true; 1577 } 1578 1579 /** 1580 * Seek to given position on a node other than the current node. If 1581 * a node other than the current node is found, then returns true. 1582 * If another node could not be found, then returns false. 
1583 */ 1584 @Override 1585 public synchronized boolean seekToNewSource(long targetPos) throws IOException { 1586 boolean markedDead = deadNodes.containsKey(currentNode); 1587 addToDeadNodes(currentNode); 1588 DatanodeInfo oldNode = currentNode; 1589 DatanodeInfo newNode = blockSeekTo(targetPos); 1590 if (!markedDead) { 1591 /* remove it from deadNodes. blockSeekTo could have cleared 1592 * deadNodes and added currentNode again. Thats ok. */ 1593 deadNodes.remove(oldNode); 1594 } 1595 if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) { 1596 currentNode = newNode; 1597 return true; 1598 } else { 1599 return false; 1600 } 1601 } 1602 1603 /** 1604 */ 1605 @Override 1606 public synchronized long getPos() throws IOException { 1607 return pos; 1608 } 1609 1610 /** Return the size of the remaining available bytes 1611 * if the size is less than or equal to {@link Integer#MAX_VALUE}, 1612 * otherwise, return {@link Integer#MAX_VALUE}. 1613 */ 1614 @Override 1615 public synchronized int available() throws IOException { 1616 if (closed.get()) { 1617 throw new IOException("Stream closed"); 1618 } 1619 1620 final long remaining = getFileLength() - pos; 1621 return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE; 1622 } 1623 1624 /** 1625 * We definitely don't support marks 1626 */ 1627 @Override 1628 public boolean markSupported() { 1629 return false; 1630 } 1631 @Override 1632 public void mark(int readLimit) { 1633 } 1634 @Override 1635 public void reset() throws IOException { 1636 throw new IOException("Mark/reset not supported"); 1637 } 1638 1639 /** Utility class to encapsulate data node info and its address. */ 1640 private static final class DNAddrPair { 1641 final DatanodeInfo info; 1642 final InetSocketAddress addr; 1643 final StorageType storageType; 1644 1645 DNAddrPair(DatanodeInfo info, InetSocketAddress addr, 1646 StorageType storageType) { 1647 this.info = info; 1648 this.addr = addr; 1649 this.storageType = storageType; 1650 } 1651 } 1652 1653 /** 1654 * Get statistics about the reads which this DFSInputStream has done. 1655 */ 1656 public ReadStatistics getReadStatistics() { 1657 synchronized(infoLock) { 1658 return new ReadStatistics(readStatistics); 1659 } 1660 } 1661 1662 /** 1663 * Clear statistics about the reads which this DFSInputStream has done. 1664 */ 1665 public void clearReadStatistics() { 1666 synchronized(infoLock) { 1667 readStatistics.clear(); 1668 } 1669 } 1670 1671 public FileEncryptionInfo getFileEncryptionInfo() { 1672 synchronized(infoLock) { 1673 return fileEncryptionInfo; 1674 } 1675 } 1676 1677 private void closeCurrentBlockReader() { 1678 if (blockReader == null) return; 1679 // Close the current block reader so that the new caching settings can 1680 // take effect immediately. 
  private void closeCurrentBlockReader() {
    if (blockReader == null) return;
    // Close the current block reader so that the new caching settings can
    // take effect immediately.
    try {
      blockReader.close();
    } catch (IOException e) {
      DFSClient.LOG.error("error closing blockReader", e);
    }
    blockReader = null;
    blockEnd = -1;
  }

  @Override
  public synchronized void setReadahead(Long readahead)
      throws IOException {
    synchronized (infoLock) {
      this.cachingStrategy =
          new CachingStrategy.Builder(this.cachingStrategy).setReadahead(readahead).build();
    }
    closeCurrentBlockReader();
  }

  @Override
  public synchronized void setDropBehind(Boolean dropBehind)
      throws IOException {
    synchronized (infoLock) {
      this.cachingStrategy =
          new CachingStrategy.Builder(this.cachingStrategy).setDropBehind(dropBehind).build();
    }
    closeCurrentBlockReader();
  }

  /**
   * The immutable empty buffer we return when we reach EOF during a
   * zero-copy read.
   */
  private static final ByteBuffer EMPTY_BUFFER =
      ByteBuffer.allocateDirect(0).asReadOnlyBuffer();

  @Override
  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
      int maxLength, EnumSet<ReadOption> opts)
      throws IOException, UnsupportedOperationException {
    if (maxLength == 0) {
      return EMPTY_BUFFER;
    } else if (maxLength < 0) {
      throw new IllegalArgumentException("can't read a negative " +
          "number of bytes.");
    }
    if ((blockReader == null) || (blockEnd == -1)) {
      if (pos >= getFileLength()) {
        return null;
      }
      /*
       * If we don't have a blockReader, or the one we have has no more bytes
       * left to read, we call seekToBlockSource to get a new blockReader and
       * recalculate blockEnd. Note that we assume we're not at EOF here
       * (we check this above).
       */
      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
        throw new IOException("failed to allocate new BlockReader " +
            "at position " + pos);
      }
    }
    ByteBuffer buffer = null;
    if (dfsClient.getConf().shortCircuitMmapEnabled) {
      buffer = tryReadZeroCopy(maxLength, opts);
    }
    if (buffer != null) {
      return buffer;
    }
    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
    if (buffer != null) {
      getExtendedReadBuffers().put(buffer, bufferPool);
    }
    return buffer;
  }
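  /*
   * Illustrative sketch (not part of this class): the enhanced byte-buffer
   * read above is usually invoked through FSDataInputStream. The pool choice
   * and read options below are assumptions for the example; whatever buffer
   * is returned must be handed back via releaseBuffer() so the mmap or
   * pooled buffer can be released.
   *
   *   ByteBufferPool pool = new ElasticByteBufferPool();
   *   ByteBuffer buf = in.read(pool, 4096, EnumSet.of(ReadOption.SKIP_CHECKSUMS));
   *   try {
   *     // buf == null means EOF; otherwise consume buf.remaining() bytes
   *   } finally {
   *     if (buf != null) {
   *       in.releaseBuffer(buf);
   *     }
   *   }
   */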
blockPos=" + blockPos + 1784 "; curPos=" + curPos + "; curEnd=" + curEnd); 1785 } 1786 } 1787 // Make sure that don't go beyond 31-bit offsets in the MappedByteBuffer. 1788 int length; 1789 if (blockPos + length63 <= Integer.MAX_VALUE) { 1790 length = (int)length63; 1791 } else { 1792 long length31 = Integer.MAX_VALUE - blockPos; 1793 if (length31 <= 0) { 1794 // Java ByteBuffers can't be longer than 2 GB, because they use 1795 // 4-byte signed integers to represent capacity, etc. 1796 // So we can't mmap the parts of the block higher than the 2 GB offset. 1797 // FIXME: we could work around this with multiple memory maps. 1798 // See HDFS-5101. 1799 if (DFSClient.LOG.isDebugEnabled()) { 1800 DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " + 1801 curPos + " of " + src + "; 31-bit MappedByteBuffer limit " + 1802 "exceeded. blockPos=" + blockPos + ", curEnd=" + curEnd); 1803 } 1804 return null; 1805 } 1806 length = (int)length31; 1807 if (DFSClient.LOG.isDebugEnabled()) { 1808 DFSClient.LOG.debug("Reducing read length from " + maxLength + 1809 " to " + length + " to avoid 31-bit limit. " + 1810 "blockPos=" + blockPos + "; curPos=" + curPos + 1811 "; curEnd=" + curEnd); 1812 } 1813 } 1814 final ClientMmap clientMmap = blockReader.getClientMmap(opts); 1815 if (clientMmap == null) { 1816 if (DFSClient.LOG.isDebugEnabled()) { 1817 DFSClient.LOG.debug("unable to perform a zero-copy read from offset " + 1818 curPos + " of " + src + "; BlockReader#getClientMmap returned " + 1819 "null."); 1820 } 1821 return null; 1822 } 1823 boolean success = false; 1824 ByteBuffer buffer; 1825 try { 1826 seek(curPos + length); 1827 buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer(); 1828 buffer.position((int)blockPos); 1829 buffer.limit((int)(blockPos + length)); 1830 getExtendedReadBuffers().put(buffer, clientMmap); 1831 synchronized (infoLock) { 1832 readStatistics.addZeroCopyBytes(length); 1833 } 1834 if (DFSClient.LOG.isDebugEnabled()) { 1835 DFSClient.LOG.debug("readZeroCopy read " + length + 1836 " bytes from offset " + curPos + " via the zero-copy read " + 1837 "path. blockEnd = " + blockEnd); 1838 } 1839 success = true; 1840 } finally { 1841 if (!success) { 1842 IOUtils.closeQuietly(clientMmap); 1843 } 1844 } 1845 return buffer; 1846 } 1847 1848 @Override 1849 public synchronized void releaseBuffer(ByteBuffer buffer) { 1850 if (buffer == EMPTY_BUFFER) return; 1851 Object val = getExtendedReadBuffers().remove(buffer); 1852 if (val == null) { 1853 throw new IllegalArgumentException("tried to release a buffer " + 1854 "that was not created by this stream, " + buffer); 1855 } 1856 if (val instanceof ClientMmap) { 1857 IOUtils.closeQuietly((ClientMmap)val); 1858 } else if (val instanceof ByteBufferPool) { 1859 ((ByteBufferPool)val).putBuffer(buffer); 1860 } 1861 } 1862 1863 @Override 1864 public synchronized void unbuffer() { 1865 closeCurrentBlockReader(); 1866 } 1867}