001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs; 019 020import java.io.EOFException; 021import java.io.IOException; 022import java.net.InetSocketAddress; 023import java.nio.ByteBuffer; 024import java.util.AbstractMap; 025import java.util.ArrayList; 026import java.util.Collection; 027import java.util.EnumSet; 028import java.util.HashMap; 029import java.util.HashSet; 030import java.util.Iterator; 031import java.util.List; 032import java.util.Map; 033import java.util.Map.Entry; 034import java.util.Set; 035import java.util.concurrent.Callable; 036import java.util.concurrent.CancellationException; 037import java.util.concurrent.CompletionService; 038import java.util.concurrent.ConcurrentHashMap; 039import java.util.concurrent.ExecutionException; 040import java.util.concurrent.ExecutorCompletionService; 041import java.util.concurrent.Future; 042import java.util.concurrent.TimeUnit; 043import java.util.concurrent.atomic.AtomicLong; 044 045import org.apache.commons.io.IOUtils; 046import org.apache.hadoop.classification.InterfaceAudience; 047import org.apache.hadoop.fs.ByteBufferReadable; 048import org.apache.hadoop.fs.ByteBufferUtil; 049import org.apache.hadoop.fs.CanSetDropBehind; 050import org.apache.hadoop.fs.CanSetReadahead; 051import org.apache.hadoop.fs.ChecksumException; 052import org.apache.hadoop.fs.FSInputStream; 053import org.apache.hadoop.fs.HasEnhancedByteBufferAccess; 054import org.apache.hadoop.fs.ReadOption; 055import org.apache.hadoop.fs.UnresolvedLinkException; 056import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol; 057import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 058import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 059import org.apache.hadoop.hdfs.protocol.LocatedBlock; 060import org.apache.hadoop.hdfs.protocol.LocatedBlocks; 061import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException; 062import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; 063import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException; 064import org.apache.hadoop.hdfs.server.datanode.CachingStrategy; 065import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; 066import org.apache.hadoop.hdfs.shortcircuit.ClientMmap; 067import org.apache.hadoop.io.ByteBufferPool; 068import org.apache.hadoop.ipc.RPC; 069import org.apache.hadoop.ipc.RemoteException; 070import org.apache.hadoop.net.NetUtils; 071import org.apache.hadoop.security.token.SecretManager.InvalidToken; 072import org.apache.hadoop.security.token.Token; 073import org.apache.hadoop.util.IdentityHashStore; 074 075import com.google.common.annotations.VisibleForTesting; 076 077/**************************************************************** 
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation of the namenode and various datanodes as necessary.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream
implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
    HasEnhancedByteBufferAccess {
  @VisibleForTesting
  public static boolean tcpReadsDisabledForTesting = false;
  private long hedgedReadOpsLoopNumForTesting = 0;
  private final DFSClient dfsClient;
  private boolean closed = false;
  private final String src;
  private BlockReader blockReader = null;
  private final boolean verifyChecksum;
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private CachingStrategy cachingStrategy;
  private final ReadStatistics readStatistics = new ReadStatistics();

  /**
   * Track the ByteBuffers that we have handed out to readers.
   *
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
   */
  private final IdentityHashStore<ByteBuffer, Object>
      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);

  public static class ReadStatistics {
    public ReadStatistics() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
      this.totalZeroCopyBytesRead = 0;
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
    }

    /**
     * @return The total bytes read.  This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read.  This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of zero-copy bytes read.
     */
    public long getTotalZeroCopyBytesRead() {
      return totalZeroCopyBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    void addZeroCopyBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
      this.totalZeroCopyBytesRead += amt;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;

    private long totalZeroCopyBytesRead;
  }

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation. That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic, the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;

  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly. */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
      new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
  private int buffersize = 1;

  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
                 ) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.buffersize = buffersize;
    this.src = src;
    this.cachingStrategy =
        dfsClient.getDefaultReadCachingStrategy();
    openInfo();
  }

  /**
   * Grab the open-file info from namenode.
   */
  synchronized void openInfo() throws IOException, UnresolvedLinkException {
    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
    while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report immediately. At this time
      // partial block locations will not be available with the NN for
      // getting the length. Let's retry a few times to get the length.
      if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
245 + " Will retry for " + retriesForLastBlockLength + " times"); 246 waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength); 247 lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength(); 248 } else { 249 break; 250 } 251 retriesForLastBlockLength--; 252 } 253 if (retriesForLastBlockLength == 0) { 254 throw new IOException("Could not obtain the last block locations."); 255 } 256 } 257 258 private void waitFor(int waitTime) throws IOException { 259 try { 260 Thread.sleep(waitTime); 261 } catch (InterruptedException e) { 262 throw new IOException( 263 "Interrupted while getting the last block length."); 264 } 265 } 266 267 private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException { 268 final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0); 269 if (DFSClient.LOG.isDebugEnabled()) { 270 DFSClient.LOG.debug("newInfo = " + newInfo); 271 } 272 if (newInfo == null) { 273 throw new IOException("Cannot open filename " + src); 274 } 275 276 if (locatedBlocks != null) { 277 Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator(); 278 Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator(); 279 while (oldIter.hasNext() && newIter.hasNext()) { 280 if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) { 281 throw new IOException("Blocklist for " + src + " has changed!"); 282 } 283 } 284 } 285 locatedBlocks = newInfo; 286 long lastBlockBeingWrittenLength = 0; 287 if (!locatedBlocks.isLastBlockComplete()) { 288 final LocatedBlock last = locatedBlocks.getLastLocatedBlock(); 289 if (last != null) { 290 if (last.getLocations().length == 0) { 291 if (last.getBlockSize() == 0) { 292 // if the length is zero, then no data has been written to 293 // datanode. So no need to wait for the locations. 294 return 0; 295 } 296 return -1; 297 } 298 final long len = readBlockLength(last); 299 last.getBlock().setNumBytes(len); 300 lastBlockBeingWrittenLength = len; 301 } 302 } 303 304 currentNode = null; 305 return lastBlockBeingWrittenLength; 306 } 307 308 /** Read the block length from one of the datanodes. */ 309 private long readBlockLength(LocatedBlock locatedblock) throws IOException { 310 assert locatedblock != null : "LocatedBlock cannot be null"; 311 int replicaNotFoundCount = locatedblock.getLocations().length; 312 313 for(DatanodeInfo datanode : locatedblock.getLocations()) { 314 ClientDatanodeProtocol cdp = null; 315 316 try { 317 cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode, 318 dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout, 319 dfsClient.getConf().connectToDnViaHostname, locatedblock); 320 321 final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock()); 322 323 if (n >= 0) { 324 return n; 325 } 326 } 327 catch(IOException ioe) { 328 if (ioe instanceof RemoteException && 329 (((RemoteException) ioe).unwrapRemoteException() instanceof 330 ReplicaNotFoundException)) { 331 // special case : replica might not be on the DN, treat as 0 length 332 replicaNotFoundCount--; 333 } 334 335 if (DFSClient.LOG.isDebugEnabled()) { 336 DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode " 337 + datanode + " for block " + locatedblock.getBlock(), ioe); 338 } 339 } finally { 340 if (cdp != null) { 341 RPC.stopProxy(cdp); 342 } 343 } 344 } 345 346 // Namenode told us about these locations, but none know about the replica 347 // means that we hit the race between pipeline creation start and end. 
348 // we require all 3 because some other exception could have happened 349 // on a DN that has it. we want to report that error 350 if (replicaNotFoundCount == 0) { 351 return 0; 352 } 353 354 throw new IOException("Cannot obtain block length for " + locatedblock); 355 } 356 357 public synchronized long getFileLength() { 358 return locatedBlocks == null? 0: 359 locatedBlocks.getFileLength() + lastBlockBeingWrittenLength; 360 } 361 362 // Short circuit local reads are forbidden for files that are 363 // under construction. See HDFS-2757. 364 synchronized boolean shortCircuitForbidden() { 365 return locatedBlocks.isUnderConstruction(); 366 } 367 368 /** 369 * Returns the datanode from which the stream is currently reading. 370 */ 371 public DatanodeInfo getCurrentDatanode() { 372 return currentNode; 373 } 374 375 /** 376 * Returns the block containing the target position. 377 */ 378 synchronized public ExtendedBlock getCurrentBlock() { 379 if (currentLocatedBlock == null){ 380 return null; 381 } 382 return currentLocatedBlock.getBlock(); 383 } 384 385 /** 386 * Return collection of blocks that has already been located. 387 */ 388 public synchronized List<LocatedBlock> getAllBlocks() throws IOException { 389 return getBlockRange(0, getFileLength()); 390 } 391 392 /** 393 * Get block at the specified position. 394 * Fetch it from the namenode if not cached. 395 * 396 * @param offset block corresponding to this offset in file is returned 397 * @param updatePosition whether to update current position 398 * @return located block 399 * @throws IOException 400 */ 401 private synchronized LocatedBlock getBlockAt(long offset, 402 boolean updatePosition) throws IOException { 403 assert (locatedBlocks != null) : "locatedBlocks is null"; 404 405 final LocatedBlock blk; 406 407 //check offset 408 if (offset < 0 || offset >= getFileLength()) { 409 throw new IOException("offset < 0 || offset >= getFileLength(), offset=" 410 + offset 411 + ", updatePosition=" + updatePosition 412 + ", locatedBlocks=" + locatedBlocks); 413 } 414 else if (offset >= locatedBlocks.getFileLength()) { 415 // offset to the portion of the last block, 416 // which is not known to the name-node yet; 417 // getting the last block 418 blk = locatedBlocks.getLastLocatedBlock(); 419 } 420 else { 421 // search cached blocks first 422 int targetBlockIdx = locatedBlocks.findBlock(offset); 423 if (targetBlockIdx < 0) { // block is not cached 424 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 425 // fetch more blocks 426 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 427 assert (newBlocks != null) : "Could not find target position " + offset; 428 locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks()); 429 } 430 blk = locatedBlocks.get(targetBlockIdx); 431 } 432 433 // update current position 434 if (updatePosition) { 435 pos = offset; 436 blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1; 437 currentLocatedBlock = blk; 438 } 439 return blk; 440 } 441 442 /** Fetch a block from namenode and cache it */ 443 private synchronized void fetchBlockAt(long offset) throws IOException { 444 int targetBlockIdx = locatedBlocks.findBlock(offset); 445 if (targetBlockIdx < 0) { // block is not cached 446 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 447 } 448 // fetch blocks 449 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 450 if (newBlocks == null) { 451 throw new IOException("Could not find target position " + offset); 452 } 453 
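    // Merge the newly fetched locations into the cached block list at the
    // insertion point computed above.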
locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks()); 454 } 455 456 /** 457 * Get blocks in the specified range. 458 * Fetch them from the namenode if not cached. This function 459 * will not get a read request beyond the EOF. 460 * @param offset starting offset in file 461 * @param length length of data 462 * @return consequent segment of located blocks 463 * @throws IOException 464 */ 465 private synchronized List<LocatedBlock> getBlockRange(long offset, 466 long length) throws IOException { 467 // getFileLength(): returns total file length 468 // locatedBlocks.getFileLength(): returns length of completed blocks 469 if (offset >= getFileLength()) { 470 throw new IOException("Offset: " + offset + 471 " exceeds file length: " + getFileLength()); 472 } 473 474 final List<LocatedBlock> blocks; 475 final long lengthOfCompleteBlk = locatedBlocks.getFileLength(); 476 final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk; 477 final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk; 478 479 if (readOffsetWithinCompleteBlk) { 480 //get the blocks of finalized (completed) block range 481 blocks = getFinalizedBlockRange(offset, 482 Math.min(length, lengthOfCompleteBlk - offset)); 483 } else { 484 blocks = new ArrayList<LocatedBlock>(1); 485 } 486 487 // get the blocks from incomplete block range 488 if (readLengthPastCompleteBlk) { 489 blocks.add(locatedBlocks.getLastLocatedBlock()); 490 } 491 492 return blocks; 493 } 494 495 /** 496 * Get blocks in the specified range. 497 * Includes only the complete blocks. 498 * Fetch them from the namenode if not cached. 499 */ 500 private synchronized List<LocatedBlock> getFinalizedBlockRange( 501 long offset, long length) throws IOException { 502 assert (locatedBlocks != null) : "locatedBlocks is null"; 503 List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>(); 504 // search cached blocks first 505 int blockIdx = locatedBlocks.findBlock(offset); 506 if (blockIdx < 0) { // block is not cached 507 blockIdx = LocatedBlocks.getInsertIndex(blockIdx); 508 } 509 long remaining = length; 510 long curOff = offset; 511 while(remaining > 0) { 512 LocatedBlock blk = null; 513 if(blockIdx < locatedBlocks.locatedBlockCount()) 514 blk = locatedBlocks.get(blockIdx); 515 if (blk == null || curOff < blk.getStartOffset()) { 516 LocatedBlocks newBlocks; 517 newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining); 518 locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks()); 519 continue; 520 } 521 assert curOff >= blk.getStartOffset() : "Block not found"; 522 blockRange.add(blk); 523 long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff; 524 remaining -= bytesRead; 525 curOff += bytesRead; 526 blockIdx++; 527 } 528 return blockRange; 529 } 530 531 /** 532 * Open a DataInputStream to a DataNode so that it can be read from. 533 * We get block ID and the IDs of the destinations at startup, from the namenode. 534 */ 535 private synchronized DatanodeInfo blockSeekTo(long target) throws IOException { 536 if (target >= getFileLength()) { 537 throw new IOException("Attempted to read past end of file"); 538 } 539 540 // Will be getting a new BlockReader. 
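    // Close any reader still open against the previous block before
    // connecting to a (possibly different) datanode.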
541 if (blockReader != null) { 542 blockReader.close(); 543 blockReader = null; 544 } 545 546 // 547 // Connect to best DataNode for desired Block, with potential offset 548 // 549 DatanodeInfo chosenNode = null; 550 int refetchToken = 1; // only need to get a new access token once 551 int refetchEncryptionKey = 1; // only need to get a new encryption key once 552 553 boolean connectFailedOnce = false; 554 555 while (true) { 556 // 557 // Compute desired block 558 // 559 LocatedBlock targetBlock = getBlockAt(target, true); 560 assert (target==pos) : "Wrong postion " + pos + " expect " + target; 561 long offsetIntoBlock = target - targetBlock.getStartOffset(); 562 563 DNAddrPair retval = chooseDataNode(targetBlock, null); 564 chosenNode = retval.info; 565 InetSocketAddress targetAddr = retval.addr; 566 567 try { 568 ExtendedBlock blk = targetBlock.getBlock(); 569 Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken(); 570 blockReader = new BlockReaderFactory(dfsClient.getConf()). 571 setInetSocketAddress(targetAddr). 572 setRemotePeerFactory(dfsClient). 573 setDatanodeInfo(chosenNode). 574 setFileName(src). 575 setBlock(blk). 576 setBlockToken(accessToken). 577 setStartOffset(offsetIntoBlock). 578 setVerifyChecksum(verifyChecksum). 579 setClientName(dfsClient.clientName). 580 setLength(blk.getNumBytes() - offsetIntoBlock). 581 setCachingStrategy(cachingStrategy). 582 setAllowShortCircuitLocalReads(!shortCircuitForbidden()). 583 setClientCacheContext(dfsClient.getClientContext()). 584 setUserGroupInformation(dfsClient.ugi). 585 setConfiguration(dfsClient.getConfiguration()). 586 build(); 587 if(connectFailedOnce) { 588 DFSClient.LOG.info("Successfully connected to " + targetAddr + 589 " for " + blk); 590 } 591 return chosenNode; 592 } catch (IOException ex) { 593 if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { 594 DFSClient.LOG.info("Will fetch a new encryption key and retry, " 595 + "encryption key was invalid when connecting to " + targetAddr 596 + " : " + ex); 597 // The encryption key used is invalid. 598 refetchEncryptionKey--; 599 dfsClient.clearDataEncryptionKey(); 600 } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) { 601 refetchToken--; 602 fetchBlockAt(target); 603 } else { 604 connectFailedOnce = true; 605 DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block" 606 + ", add to deadNodes and continue. " + ex, ex); 607 // Put chosen node into dead list, continue 608 addToDeadNodes(chosenNode); 609 } 610 } 611 } 612 } 613 614 /** 615 * Close it down! 616 */ 617 @Override 618 public synchronized void close() throws IOException { 619 if (closed) { 620 return; 621 } 622 dfsClient.checkOpen(); 623 624 if (!extendedReadBuffers.isEmpty()) { 625 final StringBuilder builder = new StringBuilder(); 626 extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() { 627 private String prefix = ""; 628 @Override 629 public void accept(ByteBuffer k, Object v) { 630 builder.append(prefix).append(k); 631 prefix = ", "; 632 } 633 }); 634 DFSClient.LOG.warn("closing file " + src + ", but there are still " + 635 "unreleased ByteBuffers allocated by read(). " + 636 "Please release " + builder.toString() + "."); 637 } 638 if (blockReader != null) { 639 blockReader.close(); 640 blockReader = null; 641 } 642 super.close(); 643 closed = true; 644 } 645 646 @Override 647 public synchronized int read() throws IOException { 648 int ret = read( oneByteBuf, 0, 1 ); 649 return ( ret <= 0 ) ? 
        -1 : (oneByteBuf[0] & 0xff);
  }

  /**
   * Wraps different possible read implementations so that readBuffer can be
   * strategy-agnostic.
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException;
  }

  private static void updateReadStatistics(ReadStatistics readStatistics,
      int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    if (blockReader.isShortCircuit()) {
      readStatistics.addShortCircuitBytes(nRead);
    } else if (blockReader.isLocal()) {
      readStatistics.addLocalBytes(nRead);
    } else {
      readStatistics.addRemoteBytes(nRead);
    }
  }

  /**
   * Used to read bytes into a byte[]
   */
  private static class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int nRead = blockReader.read(buf, off, len);
      updateReadStatistics(readStatistics, nRead, blockReader);
      return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer
   */
  private static class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }

  /* This is used by the regular read() path and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or the client: the DataNode closing the
     * connection because the client has been idle. If there are other cases
     * of "non-errors" then a datanode might be retried by setting this to
     * true again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
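      // A ChecksumException marks the current replica as corrupt and forces a
      // switch to a new source; any other IOException is retried once against
      // the same node before that node is added to deadNodes.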
741 try { 742 return reader.doRead(blockReader, off, len, readStatistics); 743 } catch ( ChecksumException ce ) { 744 DFSClient.LOG.warn("Found Checksum error for " 745 + getCurrentBlock() + " from " + currentNode 746 + " at " + ce.getPos()); 747 ioe = ce; 748 retryCurrentNode = false; 749 // we want to remember which block replicas we have tried 750 addIntoCorruptedBlockMap(getCurrentBlock(), currentNode, 751 corruptedBlockMap); 752 } catch ( IOException e ) { 753 if (!retryCurrentNode) { 754 DFSClient.LOG.warn("Exception while reading from " 755 + getCurrentBlock() + " of " + src + " from " 756 + currentNode, e); 757 } 758 ioe = e; 759 } 760 boolean sourceFound = false; 761 if (retryCurrentNode) { 762 /* possibly retry the same node so that transient errors don't 763 * result in application level failures (e.g. Datanode could have 764 * closed the connection because the client is idle for too long). 765 */ 766 sourceFound = seekToBlockSource(pos); 767 } else { 768 addToDeadNodes(currentNode); 769 sourceFound = seekToNewSource(pos); 770 } 771 if (!sourceFound) { 772 throw ioe; 773 } 774 retryCurrentNode = false; 775 } 776 } 777 778 private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException { 779 dfsClient.checkOpen(); 780 if (closed) { 781 throw new IOException("Stream closed"); 782 } 783 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 784 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 785 failures = 0; 786 if (pos < getFileLength()) { 787 int retries = 2; 788 while (retries > 0) { 789 try { 790 // currentNode can be left as null if previous read had a checksum 791 // error on the same block. See HDFS-3067 792 if (pos > blockEnd || currentNode == null) { 793 currentNode = blockSeekTo(pos); 794 } 795 int realLen = (int) Math.min(len, (blockEnd - pos + 1L)); 796 if (locatedBlocks.isLastBlockComplete()) { 797 realLen = (int) Math.min(realLen, locatedBlocks.getFileLength()); 798 } 799 int result = readBuffer(strategy, off, realLen, corruptedBlockMap); 800 801 if (result >= 0) { 802 pos += result; 803 } else { 804 // got a EOS from reader though we expect more data on it. 805 throw new IOException("Unexpected EOS from the reader"); 806 } 807 if (dfsClient.stats != null) { 808 dfsClient.stats.incrementBytesRead(result); 809 } 810 return result; 811 } catch (ChecksumException ce) { 812 throw ce; 813 } catch (IOException e) { 814 if (retries == 1) { 815 DFSClient.LOG.warn("DFS Read", e); 816 } 817 blockEnd = -1; 818 if (currentNode != null) { addToDeadNodes(currentNode); } 819 if (--retries == 0) { 820 throw e; 821 } 822 } finally { 823 // Check if need to report block replicas corruption either read 824 // was successful or ChecksumException occured. 825 reportCheckSumFailure(corruptedBlockMap, 826 currentLocatedBlock.getLocations().length); 827 } 828 } 829 } 830 return -1; 831 } 832 833 /** 834 * Read the entire buffer. 835 */ 836 @Override 837 public synchronized int read(final byte buf[], int off, int len) throws IOException { 838 ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf); 839 840 return readWithStrategy(byteArrayReader, off, len); 841 } 842 843 @Override 844 public synchronized int read(final ByteBuffer buf) throws IOException { 845 ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf); 846 847 return readWithStrategy(byteBufferReader, 0, buf.remaining()); 848 } 849 850 851 /** 852 * Add corrupted block replica into map. 
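   * The map is keyed by block and accumulates every datanode that served a
   * corrupt replica, so that reportCheckSumFailure() can report them to the
   * namenode in a single call.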
853 */ 854 private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 855 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) { 856 Set<DatanodeInfo> dnSet = null; 857 if((corruptedBlockMap.containsKey(blk))) { 858 dnSet = corruptedBlockMap.get(blk); 859 }else { 860 dnSet = new HashSet<DatanodeInfo>(); 861 } 862 if (!dnSet.contains(node)) { 863 dnSet.add(node); 864 corruptedBlockMap.put(blk, dnSet); 865 } 866 } 867 868 private DNAddrPair chooseDataNode(LocatedBlock block, 869 Collection<DatanodeInfo> ignoredNodes) throws IOException { 870 while (true) { 871 DatanodeInfo[] nodes = block.getLocations(); 872 try { 873 return getBestNodeDNAddrPair(nodes, ignoredNodes); 874 } catch (IOException ie) { 875 String errMsg = 876 getBestNodeDNAddrPairErrorString(nodes, deadNodes, ignoredNodes); 877 String blockInfo = block.getBlock() + " file=" + src; 878 if (failures >= dfsClient.getMaxBlockAcquireFailures()) { 879 String description = "Could not obtain block: " + blockInfo; 880 DFSClient.LOG.warn(description + errMsg 881 + ". Throwing a BlockMissingException"); 882 throw new BlockMissingException(src, description, 883 block.getStartOffset()); 884 } 885 886 if (nodes == null || nodes.length == 0) { 887 DFSClient.LOG.info("No node available for " + blockInfo); 888 } 889 DFSClient.LOG.info("Could not obtain " + block.getBlock() 890 + " from any node: " + ie + errMsg 891 + ". Will get new block locations from namenode and retry..."); 892 try { 893 // Introducing a random factor to the wait time before another retry. 894 // The wait time is dependent on # of failures and a random factor. 895 // At the first time of getting a BlockMissingException, the wait time 896 // is a random number between 0..3000 ms. If the first retry 897 // still fails, we will wait 3000 ms grace period before the 2nd retry. 898 // Also at the second retry, the waiting window is expanded to 6000 ms 899 // alleviating the request rate from the server. Similarly the 3rd retry 900 // will wait 6000ms grace period before retry and the waiting window is 901 // expanded to 9000ms. 902 final int timeWindow = dfsClient.getConf().timeWindow; 903 double waitTime = timeWindow * failures + // grace period for the last round of attempt 904 timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure 905 DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec."); 906 Thread.sleep((long)waitTime); 907 } catch (InterruptedException iex) { 908 } 909 deadNodes.clear(); //2nd option is to remove only nodes[blockId] 910 openInfo(); 911 block = getBlockAt(block.getStartOffset(), false); 912 failures++; 913 continue; 914 } 915 } 916 } 917 918 /** 919 * Get the best node. 920 * @param nodes Nodes to choose from. 921 * @param ignoredNodes Do not chose nodes in this array (may be null) 922 * @return The DNAddrPair of the best node. 
923 * @throws IOException 924 */ 925 private DNAddrPair getBestNodeDNAddrPair(final DatanodeInfo[] nodes, 926 Collection<DatanodeInfo> ignoredNodes) throws IOException { 927 DatanodeInfo chosenNode = bestNode(nodes, deadNodes, ignoredNodes); 928 final String dnAddr = 929 chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname); 930 if (DFSClient.LOG.isDebugEnabled()) { 931 DFSClient.LOG.debug("Connecting to datanode " + dnAddr); 932 } 933 InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr); 934 return new DNAddrPair(chosenNode, targetAddr); 935 } 936 937 private static String getBestNodeDNAddrPairErrorString( 938 DatanodeInfo nodes[], AbstractMap<DatanodeInfo, 939 DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) { 940 StringBuilder errMsgr = new StringBuilder( 941 " No live nodes contain current block "); 942 errMsgr.append("Block locations:"); 943 for (DatanodeInfo datanode : nodes) { 944 errMsgr.append(" "); 945 errMsgr.append(datanode.toString()); 946 } 947 errMsgr.append(" Dead nodes: "); 948 for (DatanodeInfo datanode : deadNodes.keySet()) { 949 errMsgr.append(" "); 950 errMsgr.append(datanode.toString()); 951 } 952 if (ignoredNodes != null) { 953 errMsgr.append(" Ignored nodes: "); 954 for (DatanodeInfo datanode : ignoredNodes) { 955 errMsgr.append(" "); 956 errMsgr.append(datanode.toString()); 957 } 958 } 959 return errMsgr.toString(); 960 } 961 962 private void fetchBlockByteRange(LocatedBlock block, long start, long end, 963 byte[] buf, int offset, 964 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 965 throws IOException { 966 block = getBlockAt(block.getStartOffset(), false); 967 while (true) { 968 DNAddrPair addressPair = chooseDataNode(block, null); 969 try { 970 actualGetFromOneDataNode(addressPair, block, start, end, buf, offset, 971 corruptedBlockMap); 972 return; 973 } catch (IOException e) { 974 // Ignore. Already processed inside the function. 975 // Loop through to try the next node. 976 } 977 } 978 } 979 980 private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode, 981 final LocatedBlock block, final long start, final long end, 982 final ByteBuffer bb, 983 final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) { 984 return new Callable<ByteBuffer>() { 985 @Override 986 public ByteBuffer call() throws Exception { 987 byte[] buf = bb.array(); 988 int offset = bb.position(); 989 actualGetFromOneDataNode(datanode, block, start, end, buf, offset, 990 corruptedBlockMap); 991 return bb; 992 } 993 }; 994 } 995 996 private void actualGetFromOneDataNode(final DNAddrPair datanode, 997 LocatedBlock block, final long start, final long end, byte[] buf, 998 int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 999 throws IOException { 1000 DFSClientFaultInjector.get().startFetchFromDatanode(); 1001 int refetchToken = 1; // only need to get a new access token once 1002 int refetchEncryptionKey = 1; // only need to get a new encryption key once 1003 1004 while (true) { 1005 // cached block locations may have been updated by chooseDataNode() 1006 // or fetchBlockAt(). Always get the latest list of locations at the 1007 // start of the loop. 
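      // Take a consistent snapshot of the caching strategy and the
      // short-circuit permission while holding the lock; the read itself
      // runs without it.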
      CachingStrategy curCachingStrategy;
      boolean allowShortCircuitLocalReads;
      synchronized (this) {
        block = getBlockAt(block.getStartOffset(), false);
        curCachingStrategy = cachingStrategy;
        allowShortCircuitLocalReads = !shortCircuitForbidden();
      }
      DatanodeInfo chosenNode = datanode.info;
      InetSocketAddress targetAddr = datanode.addr;
      BlockReader reader = null;

      try {
        DFSClientFaultInjector.get().fetchFromDatanodeException();
        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
        int len = (int) (end - start + 1);
        reader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setFileName(src).
            setBlock(block.getBlock()).
            setBlockToken(blockToken).
            setStartOffset(start).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(len).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        int nread = reader.readAll(buf, offset, len);
        updateReadStatistics(readStatistics, nread, reader);

        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
              "expected " + len + ", got " + nread);
        }
        DFSClientFaultInjector.get().readFromDatanodeDelay();
        return;
      } catch (ChecksumException e) {
        String msg = "fetchBlockByteRange(). Got a checksum exception for "
            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
            + chosenNode;
        DFSClient.LOG.warn(msg);
        // we want to remember what we have tried
        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
        addToDeadNodes(chosenNode);
        throw new IOException(msg);
      } catch (IOException e) {
        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + e);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
          continue;
        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
          refetchToken--;
          try {
            fetchBlockAt(block.getStartOffset());
          } catch (IOException fbae) {
            // ignore IOE, since we can retry it later in a loop
          }
          continue;
        } else {
          String msg = "Failed to connect to " + targetAddr + " for file "
              + src + " for block " + block.getBlock() + ":" + e;
          DFSClient.LOG.warn("Connection failure: " + msg, e);
          addToDeadNodes(chosenNode);
          throw new IOException(msg);
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  }

  /**
   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time. We then wait on whichever read returns first.
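   * Once a winner returns, any remaining in-flight requests are cancelled
   * without being interrupted (see cancelAll()).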
1095 */ 1096 private void hedgedFetchBlockByteRange(LocatedBlock block, long start, 1097 long end, byte[] buf, int offset, 1098 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 1099 throws IOException { 1100 ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>(); 1101 CompletionService<ByteBuffer> hedgedService = 1102 new ExecutorCompletionService<ByteBuffer>( 1103 dfsClient.getHedgedReadsThreadPool()); 1104 ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>(); 1105 ByteBuffer bb = null; 1106 int len = (int) (end - start + 1); 1107 block = getBlockAt(block.getStartOffset(), false); 1108 while (true) { 1109 // see HDFS-6591, this metric is used to verify/catch unnecessary loops 1110 hedgedReadOpsLoopNumForTesting++; 1111 DNAddrPair chosenNode = null; 1112 // there is no request already executing. 1113 if (futures.isEmpty()) { 1114 // chooseDataNode is a commitment. If no node, we go to 1115 // the NN to reget block locations. Only go here on first read. 1116 chosenNode = chooseDataNode(block, ignored); 1117 bb = ByteBuffer.wrap(buf, offset, len); 1118 Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode( 1119 chosenNode, block, start, end, bb, corruptedBlockMap); 1120 Future<ByteBuffer> firstRequest = hedgedService 1121 .submit(getFromDataNodeCallable); 1122 futures.add(firstRequest); 1123 try { 1124 Future<ByteBuffer> future = hedgedService.poll( 1125 dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS); 1126 if (future != null) { 1127 future.get(); 1128 return; 1129 } 1130 if (DFSClient.LOG.isDebugEnabled()) { 1131 DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout() 1132 + "ms to read from " + chosenNode.info 1133 + "; spawning hedged read"); 1134 } 1135 // Ignore this node on next go around. 1136 ignored.add(chosenNode.info); 1137 dfsClient.getHedgedReadMetrics().incHedgedReadOps(); 1138 continue; // no need to refresh block locations 1139 } catch (InterruptedException e) { 1140 // Ignore 1141 } catch (ExecutionException e) { 1142 // Ignore already logged in the call. 1143 } 1144 } else { 1145 // We are starting up a 'hedged' read. We have a read already 1146 // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode. 1147 // If no nodes to do hedged reads against, pass. 1148 try { 1149 try { 1150 chosenNode = getBestNodeDNAddrPair(block.getLocations(), ignored); 1151 } catch (IOException ioe) { 1152 chosenNode = chooseDataNode(block, ignored); 1153 } 1154 bb = ByteBuffer.allocate(len); 1155 Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode( 1156 chosenNode, block, start, end, bb, corruptedBlockMap); 1157 Future<ByteBuffer> oneMoreRequest = hedgedService 1158 .submit(getFromDataNodeCallable); 1159 futures.add(oneMoreRequest); 1160 } catch (IOException ioe) { 1161 if (DFSClient.LOG.isDebugEnabled()) { 1162 DFSClient.LOG.debug("Failed getting node for hedged read: " 1163 + ioe.getMessage()); 1164 } 1165 } 1166 // if not succeeded. Submit callables for each datanode in a loop, wait 1167 // for a fixed interval and get the result from the fastest one. 1168 try { 1169 ByteBuffer result = getFirstToComplete(hedgedService, futures); 1170 // cancel the rest. 
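          // getFirstToComplete() returned a winner; cancel the losers and, if
          // the winning buffer is not the caller's array, copy the data back.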
1171 cancelAll(futures); 1172 if (result.array() != buf) { // compare the array pointers 1173 dfsClient.getHedgedReadMetrics().incHedgedReadWins(); 1174 System.arraycopy(result.array(), result.position(), buf, offset, 1175 len); 1176 } else { 1177 dfsClient.getHedgedReadMetrics().incHedgedReadOps(); 1178 } 1179 return; 1180 } catch (InterruptedException ie) { 1181 // Ignore and retry 1182 } 1183 // We got here if exception. Ignore this node on next go around IFF 1184 // we found a chosenNode to hedge read against. 1185 if (chosenNode != null && chosenNode.info != null) { 1186 ignored.add(chosenNode.info); 1187 } 1188 } 1189 } 1190 } 1191 1192 @VisibleForTesting 1193 public long getHedgedReadOpsLoopNumForTesting() { 1194 return hedgedReadOpsLoopNumForTesting; 1195 } 1196 1197 private ByteBuffer getFirstToComplete( 1198 CompletionService<ByteBuffer> hedgedService, 1199 ArrayList<Future<ByteBuffer>> futures) throws InterruptedException { 1200 if (futures.isEmpty()) { 1201 throw new InterruptedException("let's retry"); 1202 } 1203 Future<ByteBuffer> future = null; 1204 try { 1205 future = hedgedService.take(); 1206 ByteBuffer bb = future.get(); 1207 futures.remove(future); 1208 return bb; 1209 } catch (ExecutionException e) { 1210 // already logged in the Callable 1211 futures.remove(future); 1212 } catch (CancellationException ce) { 1213 // already logged in the Callable 1214 futures.remove(future); 1215 } 1216 1217 throw new InterruptedException("let's retry"); 1218 } 1219 1220 private void cancelAll(List<Future<ByteBuffer>> futures) { 1221 for (Future<ByteBuffer> future : futures) { 1222 // Unfortunately, hdfs reads do not take kindly to interruption. 1223 // Threads return a variety of interrupted-type exceptions but 1224 // also complaints about invalid pbs -- likely because read 1225 // is interrupted before gets whole pb. Also verbose WARN 1226 // logging. So, for now, do not interrupt running read. 1227 future.cancel(false); 1228 } 1229 } 1230 1231 /** 1232 * Should the block access token be refetched on an exception 1233 * 1234 * @param ex Exception received 1235 * @param targetAddr Target datanode address from where exception was received 1236 * @return true if block access token has expired or invalid and it should be 1237 * refetched 1238 */ 1239 private static boolean tokenRefetchNeeded(IOException ex, 1240 InetSocketAddress targetAddr) { 1241 /* 1242 * Get a new access token and retry. Retry is needed in 2 cases. 1) 1243 * When both NN and DN re-started while DFSClient holding a cached 1244 * access token. 2) In the case that NN fails to update its 1245 * access key at pre-set interval (by a wide margin) and 1246 * subsequently restarts. In this case, DN re-registers itself with 1247 * NN and receives a new access key, but DN will delete the old 1248 * access key from its memory since it's considered expired based on 1249 * the estimated expiration date. 1250 */ 1251 if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) { 1252 DFSClient.LOG.info("Access token was invalid when connecting to " 1253 + targetAddr + " : " + ex); 1254 return true; 1255 } 1256 return false; 1257 } 1258 1259 /** 1260 * Read bytes starting from the specified position. 
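   * Unlike the stateful read() path, this positional read does not move the
   * stream offset; when hedged reads are enabled it may issue parallel
   * requests to more than one datanode.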
1261 * 1262 * @param position start read from this position 1263 * @param buffer read buffer 1264 * @param offset offset into buffer 1265 * @param length number of bytes to read 1266 * 1267 * @return actual number of bytes read 1268 */ 1269 @Override 1270 public int read(long position, byte[] buffer, int offset, int length) 1271 throws IOException { 1272 // sanity checks 1273 dfsClient.checkOpen(); 1274 if (closed) { 1275 throw new IOException("Stream closed"); 1276 } 1277 failures = 0; 1278 long filelen = getFileLength(); 1279 if ((position < 0) || (position >= filelen)) { 1280 return -1; 1281 } 1282 int realLen = length; 1283 if ((position + length) > filelen) { 1284 realLen = (int)(filelen - position); 1285 } 1286 1287 // determine the block and byte range within the block 1288 // corresponding to position and realLen 1289 List<LocatedBlock> blockRange = getBlockRange(position, realLen); 1290 int remaining = realLen; 1291 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 1292 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 1293 for (LocatedBlock blk : blockRange) { 1294 long targetStart = position - blk.getStartOffset(); 1295 long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart); 1296 try { 1297 if (dfsClient.isHedgedReadsEnabled()) { 1298 hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead 1299 - 1, buffer, offset, corruptedBlockMap); 1300 } else { 1301 fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1, 1302 buffer, offset, corruptedBlockMap); 1303 } 1304 } finally { 1305 // Check and report if any block replicas are corrupted. 1306 // BlockMissingException may be caught if all block replicas are 1307 // corrupted. 1308 reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length); 1309 } 1310 1311 remaining -= bytesToRead; 1312 position += bytesToRead; 1313 offset += bytesToRead; 1314 } 1315 assert remaining == 0 : "Wrong number of bytes read."; 1316 if (dfsClient.stats != null) { 1317 dfsClient.stats.incrementBytesRead(realLen); 1318 } 1319 return realLen; 1320 } 1321 1322 /** 1323 * DFSInputStream reports checksum failure. 1324 * Case I : client has tried multiple data nodes and at least one of the 1325 * attempts has succeeded. We report the other failures as corrupted block to 1326 * namenode. 1327 * Case II: client has tried out all data nodes, but all failed. We 1328 * only report if the total number of replica is 1. We do not 1329 * report otherwise since this maybe due to the client is a handicapped client 1330 * (who can not read). 
1331 * @param corruptedBlockMap map of corrupted blocks 1332 * @param dataNodeCount number of data nodes who contains the block replicas 1333 */ 1334 private void reportCheckSumFailure( 1335 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 1336 int dataNodeCount) { 1337 if (corruptedBlockMap.isEmpty()) { 1338 return; 1339 } 1340 Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap 1341 .entrySet().iterator(); 1342 Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next(); 1343 ExtendedBlock blk = entry.getKey(); 1344 Set<DatanodeInfo> dnSet = entry.getValue(); 1345 if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0)) 1346 || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) { 1347 DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()]; 1348 int i = 0; 1349 for (DatanodeInfo dn:dnSet) { 1350 locs[i++] = dn; 1351 } 1352 LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) }; 1353 dfsClient.reportChecksumFailure(src, lblocks); 1354 } 1355 corruptedBlockMap.clear(); 1356 } 1357 1358 @Override 1359 public long skip(long n) throws IOException { 1360 if ( n > 0 ) { 1361 long curPos = getPos(); 1362 long fileLen = getFileLength(); 1363 if( n+curPos > fileLen ) { 1364 n = fileLen - curPos; 1365 } 1366 seek(curPos+n); 1367 return n; 1368 } 1369 return n < 0 ? -1 : 0; 1370 } 1371 1372 /** 1373 * Seek to a new arbitrary location 1374 */ 1375 @Override 1376 public synchronized void seek(long targetPos) throws IOException { 1377 if (targetPos > getFileLength()) { 1378 throw new EOFException("Cannot seek after EOF"); 1379 } 1380 if (targetPos < 0) { 1381 throw new EOFException("Cannot seek to negative offset"); 1382 } 1383 if (closed) { 1384 throw new IOException("Stream is closed!"); 1385 } 1386 boolean done = false; 1387 if (pos <= targetPos && targetPos <= blockEnd) { 1388 // 1389 // If this seek is to a positive position in the current 1390 // block, and this piece of data might already be lying in 1391 // the TCP buffer, then just eat up the intervening data. 1392 // 1393 int diff = (int)(targetPos - pos); 1394 if (diff <= blockReader.available()) { 1395 try { 1396 pos += blockReader.skip(diff); 1397 if (pos == targetPos) { 1398 done = true; 1399 } else { 1400 // The range was already checked. If the block reader returns 1401 // something unexpected instead of throwing an exception, it is 1402 // most likely a bug. 1403 String errMsg = "BlockReader failed to seek to " + 1404 targetPos + ". Instead, it seeked to " + pos + "."; 1405 DFSClient.LOG.warn(errMsg); 1406 throw new IOException(errMsg); 1407 } 1408 } catch (IOException e) {//make following read to retry 1409 if(DFSClient.LOG.isDebugEnabled()) { 1410 DFSClient.LOG.debug("Exception while seek to " + targetPos 1411 + " from " + getCurrentBlock() + " of " + src + " from " 1412 + currentNode, e); 1413 } 1414 } 1415 } 1416 } 1417 if (!done) { 1418 pos = targetPos; 1419 blockEnd = -1; 1420 } 1421 } 1422 1423 /** 1424 * Same as {@link #seekToNewSource(long)} except that it does not exclude 1425 * the current datanode and might connect to the same node. 1426 */ 1427 private synchronized boolean seekToBlockSource(long targetPos) 1428 throws IOException { 1429 currentNode = blockSeekTo(targetPos); 1430 return true; 1431 } 1432 1433 /** 1434 * Seek to given position on a node other than the current node. If 1435 * a node other than the current node is found, then returns true. 1436 * If another node could not be found, then returns false. 
1437 */ 1438 @Override 1439 public synchronized boolean seekToNewSource(long targetPos) throws IOException { 1440 boolean markedDead = deadNodes.containsKey(currentNode); 1441 addToDeadNodes(currentNode); 1442 DatanodeInfo oldNode = currentNode; 1443 DatanodeInfo newNode = blockSeekTo(targetPos); 1444 if (!markedDead) { 1445 /* remove it from deadNodes. blockSeekTo could have cleared 1446 * deadNodes and added currentNode again. Thats ok. */ 1447 deadNodes.remove(oldNode); 1448 } 1449 if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) { 1450 currentNode = newNode; 1451 return true; 1452 } else { 1453 return false; 1454 } 1455 } 1456 1457 /** 1458 */ 1459 @Override 1460 public synchronized long getPos() throws IOException { 1461 return pos; 1462 } 1463 1464 /** Return the size of the remaining available bytes 1465 * if the size is less than or equal to {@link Integer#MAX_VALUE}, 1466 * otherwise, return {@link Integer#MAX_VALUE}. 1467 */ 1468 @Override 1469 public synchronized int available() throws IOException { 1470 if (closed) { 1471 throw new IOException("Stream closed"); 1472 } 1473 1474 final long remaining = getFileLength() - pos; 1475 return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE; 1476 } 1477 1478 /** 1479 * We definitely don't support marks 1480 */ 1481 @Override 1482 public boolean markSupported() { 1483 return false; 1484 } 1485 @Override 1486 public void mark(int readLimit) { 1487 } 1488 @Override 1489 public void reset() throws IOException { 1490 throw new IOException("Mark/reset not supported"); 1491 } 1492 1493 /** 1494 * Pick the best node from which to stream the data. 1495 * Entries in <i>nodes</i> are already in the priority order 1496 */ 1497 static DatanodeInfo bestNode(DatanodeInfo nodes[], 1498 AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes, 1499 Collection<DatanodeInfo> ignoredNodes) throws IOException { 1500 if (nodes != null) { 1501 for (int i = 0; i < nodes.length; i++) { 1502 if (!deadNodes.containsKey(nodes[i]) 1503 && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) { 1504 return nodes[i]; 1505 } 1506 } 1507 } 1508 throw new IOException("No live nodes contain current block"); 1509 } 1510 1511 /** Utility class to encapsulate data node info and its address. */ 1512 static class DNAddrPair { 1513 final DatanodeInfo info; 1514 final InetSocketAddress addr; 1515 DNAddrPair(DatanodeInfo info, InetSocketAddress addr) { 1516 this.info = info; 1517 this.addr = addr; 1518 } 1519 } 1520 1521 /** 1522 * Get statistics about the reads which this DFSInputStream has done. 1523 */ 1524 public synchronized ReadStatistics getReadStatistics() { 1525 return new ReadStatistics(readStatistics); 1526 } 1527 1528 private synchronized void closeCurrentBlockReader() { 1529 if (blockReader == null) return; 1530 // Close the current block reader so that the new caching settings can 1531 // take effect immediately. 1532 try { 1533 blockReader.close(); 1534 } catch (IOException e) { 1535 DFSClient.LOG.error("error closing blockReader", e); 1536 } 1537 blockReader = null; 1538 } 1539 1540 @Override 1541 public synchronized void setReadahead(Long readahead) 1542 throws IOException { 1543 this.cachingStrategy = 1544 new CachingStrategy.Builder(this.cachingStrategy). 1545 setReadahead(readahead).build(); 1546 closeCurrentBlockReader(); 1547 } 1548 1549 @Override 1550 public synchronized void setDropBehind(Boolean dropBehind) 1551 throws IOException { 1552 this.cachingStrategy = 1553 new CachingStrategy.Builder(this.cachingStrategy). 
1554 setDropBehind(dropBehind).build(); 1555 closeCurrentBlockReader(); 1556 } 1557 1558 /** 1559 * The immutable empty buffer we return when we reach EOF when doing a 1560 * zero-copy read. 1561 */ 1562 private static final ByteBuffer EMPTY_BUFFER = 1563 ByteBuffer.allocateDirect(0).asReadOnlyBuffer(); 1564 1565 @Override 1566 public synchronized ByteBuffer read(ByteBufferPool bufferPool, 1567 int maxLength, EnumSet<ReadOption> opts) 1568 throws IOException, UnsupportedOperationException { 1569 if (maxLength == 0) { 1570 return EMPTY_BUFFER; 1571 } else if (maxLength < 0) { 1572 throw new IllegalArgumentException("can't read a negative " + 1573 "number of bytes."); 1574 } 1575 if ((blockReader == null) || (blockEnd == -1)) { 1576 if (pos >= getFileLength()) { 1577 return null; 1578 } 1579 /* 1580 * If we don't have a blockReader, or the one we have has no more bytes 1581 * left to read, we call seekToBlockSource to get a new blockReader and 1582 * recalculate blockEnd. Note that we assume we're not at EOF here 1583 * (we check this above). 1584 */ 1585 if ((!seekToBlockSource(pos)) || (blockReader == null)) { 1586 throw new IOException("failed to allocate new BlockReader " + 1587 "at position " + pos); 1588 } 1589 } 1590 ByteBuffer buffer = null; 1591 if (dfsClient.getConf().shortCircuitMmapEnabled) { 1592 buffer = tryReadZeroCopy(maxLength, opts); 1593 } 1594 if (buffer != null) { 1595 return buffer; 1596 } 1597 buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength); 1598 if (buffer != null) { 1599 extendedReadBuffers.put(buffer, bufferPool); 1600 } 1601 return buffer; 1602 } 1603 1604 private synchronized ByteBuffer tryReadZeroCopy(int maxLength, 1605 EnumSet<ReadOption> opts) throws IOException { 1606 // Copy 'pos' and 'blockEnd' to local variables to make it easier for the 1607 // JVM to optimize this function. 1608 final long curPos = pos; 1609 final long curEnd = blockEnd; 1610 final long blockStartInFile = currentLocatedBlock.getStartOffset(); 1611 final long blockPos = curPos - blockStartInFile; 1612 1613 // Shorten this read if the end of the block is nearby. 1614 long length63; 1615 if ((curPos + maxLength) <= (curEnd + 1)) { 1616 length63 = maxLength; 1617 } else { 1618 length63 = 1 + curEnd - curPos; 1619 if (length63 <= 0) { 1620 if (DFSClient.LOG.isDebugEnabled()) { 1621 DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " + 1622 curPos + " of " + src + "; " + length63 + " bytes left in block. " + 1623 "blockPos=" + blockPos + "; curPos=" + curPos + 1624 "; curEnd=" + curEnd); 1625 } 1626 return null; 1627 } 1628 if (DFSClient.LOG.isDebugEnabled()) { 1629 DFSClient.LOG.debug("Reducing read length from " + maxLength + 1630 " to " + length63 + " to avoid going more than one byte " + 1631 "past the end of the block. blockPos=" + blockPos + 1632 "; curPos=" + curPos + "; curEnd=" + curEnd); 1633 } 1634 } 1635 // Make sure that don't go beyond 31-bit offsets in the MappedByteBuffer. 1636 int length; 1637 if (blockPos + length63 <= Integer.MAX_VALUE) { 1638 length = (int)length63; 1639 } else { 1640 long length31 = Integer.MAX_VALUE - blockPos; 1641 if (length31 <= 0) { 1642 // Java ByteBuffers can't be longer than 2 GB, because they use 1643 // 4-byte signed integers to represent capacity, etc. 1644 // So we can't mmap the parts of the block higher than the 2 GB offset. 1645 // FIXME: we could work around this with multiple memory maps. 1646 // See HDFS-5101. 
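        // Reaching this branch means blockPos itself is at or beyond the
        // 2 GB mark, so nothing in the requested range can be mapped from
        // the current position.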
1647 if (DFSClient.LOG.isDebugEnabled()) { 1648 DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " + 1649 curPos + " of " + src + "; 31-bit MappedByteBuffer limit " + 1650 "exceeded. blockPos=" + blockPos + ", curEnd=" + curEnd); 1651 } 1652 return null; 1653 } 1654 length = (int)length31; 1655 if (DFSClient.LOG.isDebugEnabled()) { 1656 DFSClient.LOG.debug("Reducing read length from " + maxLength + 1657 " to " + length + " to avoid 31-bit limit. " + 1658 "blockPos=" + blockPos + "; curPos=" + curPos + 1659 "; curEnd=" + curEnd); 1660 } 1661 } 1662 final ClientMmap clientMmap = blockReader.getClientMmap(opts); 1663 if (clientMmap == null) { 1664 if (DFSClient.LOG.isDebugEnabled()) { 1665 DFSClient.LOG.debug("unable to perform a zero-copy read from offset " + 1666 curPos + " of " + src + "; BlockReader#getClientMmap returned " + 1667 "null."); 1668 } 1669 return null; 1670 } 1671 boolean success = false; 1672 ByteBuffer buffer; 1673 try { 1674 seek(curPos + length); 1675 buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer(); 1676 buffer.position((int)blockPos); 1677 buffer.limit((int)(blockPos + length)); 1678 extendedReadBuffers.put(buffer, clientMmap); 1679 readStatistics.addZeroCopyBytes(length); 1680 if (DFSClient.LOG.isDebugEnabled()) { 1681 DFSClient.LOG.debug("readZeroCopy read " + length + 1682 " bytes from offset " + curPos + " via the zero-copy read " + 1683 "path. blockEnd = " + blockEnd); 1684 } 1685 success = true; 1686 } finally { 1687 if (!success) { 1688 IOUtils.closeQuietly(clientMmap); 1689 } 1690 } 1691 return buffer; 1692 } 1693 1694 @Override 1695 public synchronized void releaseBuffer(ByteBuffer buffer) { 1696 if (buffer == EMPTY_BUFFER) return; 1697 Object val = extendedReadBuffers.remove(buffer); 1698 if (val == null) { 1699 throw new IllegalArgumentException("tried to release a buffer " + 1700 "that was not created by this stream, " + buffer); 1701 } 1702 if (val instanceof ClientMmap) { 1703 IOUtils.closeQuietly((ClientMmap)val); 1704 } else if (val instanceof ByteBufferPool) { 1705 ((ByteBufferPool)val).putBuffer(buffer); 1706 } 1707 } 1708}
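/*
 * Usage sketch (illustration only, not part of the class): applications
 * normally obtain a DFSInputStream indirectly via FileSystem#open rather
 * than constructing one directly. The path and values below are example
 * assumptions; the dfs.client.hedged.read.* keys are the hedged-read
 * settings introduced by HDFS-5776 and should be checked against the
 * release in use. Imports come from org.apache.hadoop.conf and
 * org.apache.hadoop.fs.
 *
 *   Configuration conf = new Configuration();
 *   // A thread pool size > 0 enables hedged reads for positional reads.
 *   conf.setInt("dfs.client.hedged.read.threadpool.size", 10);
 *   conf.setLong("dfs.client.hedged.read.threshold.millis", 500);
 *   FileSystem fs = FileSystem.get(conf);
 *   FSDataInputStream in = fs.open(new Path("/tmp/example.txt"));
 *   try {
 *     byte[] buf = new byte[4096];
 *     int n = in.read(0L, buf, 0, buf.length); // positional read, may hedge
 *     in.seek(0);
 *     int m = in.read(buf);                    // stateful read path
 *   } finally {
 *     in.close();
 *   }
 */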