/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ByteBufferUtil;
import org.apache.hadoop.fs.CanSetDropBehind;
import org.apache.hadoop.fs.CanSetReadahead;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.IdentityHashStore;

import com.google.common.annotations.VisibleForTesting;

/****************************************************************
 * DFSInputStream provides bytes from a named file. It handles
 * negotiation with the namenode and various datanodes as necessary.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream
    implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
        HasEnhancedByteBufferAccess {
  @VisibleForTesting
  static boolean tcpReadsDisabledForTesting = false;
  private final PeerCache peerCache;
  private final DFSClient dfsClient;
  private boolean closed = false;
  private final String src;
  private BlockReader blockReader = null;
  private final boolean verifyChecksum;
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private CachingStrategy cachingStrategy;
  private final ReadStatistics readStatistics = new ReadStatistics();

  /**
   * Track the ByteBuffers that we have handed out to readers.
   *
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
   */
  private final IdentityHashStore<ByteBuffer, Object>
      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);

  public static class ReadStatistics {
    public ReadStatistics() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
      this.totalZeroCopyBytesRead = 0;
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
    }

    /**
     * @return The total bytes read. This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read. This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of zero-copy bytes read.
     */
    public long getTotalZeroCopyBytesRead() {
      return totalZeroCopyBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    void addZeroCopyBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
      this.totalZeroCopyBytesRead += amt;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;

    private long totalZeroCopyBytesRead;
  }

  private final FileInputStreamCache fileInputStreamCache;

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation. That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically, this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;

  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through multiple threads) properly. */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
      new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
  private int buffersize = 1;

  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
                 ) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.buffersize = buffersize;
    this.src = src;
    this.peerCache = dfsClient.peerCache;
    this.fileInputStreamCache = new FileInputStreamCache(
        dfsClient.getConf().shortCircuitStreamsCacheSize,
        dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
    this.cachingStrategy =
        dfsClient.getDefaultReadCachingStrategy();
    openInfo();
  }

  /**
   * Grab the open-file info from the namenode.
   */
  synchronized void openInfo() throws IOException, UnresolvedLinkException {
    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
    while (retriesForLastBlockLength > 0) {
      // Getting the last block length as -1 is a special case. When the
      // cluster restarts, DNs may not report immediately, so partial block
      // locations will not be available from the NN for getting the length.
      // Retry a few times to get the length.
      if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
249 + " Will retry for " + retriesForLastBlockLength + " times"); 250 waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength); 251 lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength(); 252 } else { 253 break; 254 } 255 retriesForLastBlockLength--; 256 } 257 if (retriesForLastBlockLength == 0) { 258 throw new IOException("Could not obtain the last block locations."); 259 } 260 } 261 262 private void waitFor(int waitTime) throws IOException { 263 try { 264 Thread.sleep(waitTime); 265 } catch (InterruptedException e) { 266 throw new IOException( 267 "Interrupted while getting the last block length."); 268 } 269 } 270 271 private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException { 272 final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0); 273 if (DFSClient.LOG.isDebugEnabled()) { 274 DFSClient.LOG.debug("newInfo = " + newInfo); 275 } 276 if (newInfo == null) { 277 throw new IOException("Cannot open filename " + src); 278 } 279 280 if (locatedBlocks != null) { 281 Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator(); 282 Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator(); 283 while (oldIter.hasNext() && newIter.hasNext()) { 284 if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) { 285 throw new IOException("Blocklist for " + src + " has changed!"); 286 } 287 } 288 } 289 locatedBlocks = newInfo; 290 long lastBlockBeingWrittenLength = 0; 291 if (!locatedBlocks.isLastBlockComplete()) { 292 final LocatedBlock last = locatedBlocks.getLastLocatedBlock(); 293 if (last != null) { 294 if (last.getLocations().length == 0) { 295 if (last.getBlockSize() == 0) { 296 // if the length is zero, then no data has been written to 297 // datanode. So no need to wait for the locations. 298 return 0; 299 } 300 return -1; 301 } 302 final long len = readBlockLength(last); 303 last.getBlock().setNumBytes(len); 304 lastBlockBeingWrittenLength = len; 305 } 306 } 307 308 currentNode = null; 309 return lastBlockBeingWrittenLength; 310 } 311 312 /** Read the block length from one of the datanodes. */ 313 private long readBlockLength(LocatedBlock locatedblock) throws IOException { 314 assert locatedblock != null : "LocatedBlock cannot be null"; 315 int replicaNotFoundCount = locatedblock.getLocations().length; 316 317 for(DatanodeInfo datanode : locatedblock.getLocations()) { 318 ClientDatanodeProtocol cdp = null; 319 320 try { 321 cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode, 322 dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout, 323 dfsClient.getConf().connectToDnViaHostname, locatedblock); 324 325 final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock()); 326 327 if (n >= 0) { 328 return n; 329 } 330 } 331 catch(IOException ioe) { 332 if (ioe instanceof RemoteException && 333 (((RemoteException) ioe).unwrapRemoteException() instanceof 334 ReplicaNotFoundException)) { 335 // special case : replica might not be on the DN, treat as 0 length 336 replicaNotFoundCount--; 337 } 338 339 if (DFSClient.LOG.isDebugEnabled()) { 340 DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode " 341 + datanode + " for block " + locatedblock.getBlock(), ioe); 342 } 343 } finally { 344 if (cdp != null) { 345 RPC.stopProxy(cdp); 346 } 347 } 348 } 349 350 // Namenode told us about these locations, but none know about the replica 351 // means that we hit the race between pipeline creation start and end. 
352 // we require all 3 because some other exception could have happened 353 // on a DN that has it. we want to report that error 354 if (replicaNotFoundCount == 0) { 355 return 0; 356 } 357 358 throw new IOException("Cannot obtain block length for " + locatedblock); 359 } 360 361 public synchronized long getFileLength() { 362 return locatedBlocks == null? 0: 363 locatedBlocks.getFileLength() + lastBlockBeingWrittenLength; 364 } 365 366 // Short circuit local reads are forbidden for files that are 367 // under construction. See HDFS-2757. 368 synchronized boolean shortCircuitForbidden() { 369 return locatedBlocks.isUnderConstruction(); 370 } 371 372 /** 373 * Returns the datanode from which the stream is currently reading. 374 */ 375 public DatanodeInfo getCurrentDatanode() { 376 return currentNode; 377 } 378 379 /** 380 * Returns the block containing the target position. 381 */ 382 synchronized public ExtendedBlock getCurrentBlock() { 383 if (currentLocatedBlock == null){ 384 return null; 385 } 386 return currentLocatedBlock.getBlock(); 387 } 388 389 /** 390 * Return collection of blocks that has already been located. 391 */ 392 public synchronized List<LocatedBlock> getAllBlocks() throws IOException { 393 return getBlockRange(0, getFileLength()); 394 } 395 396 /** 397 * Get block at the specified position. 398 * Fetch it from the namenode if not cached. 399 * 400 * @param offset 401 * @param updatePosition whether to update current position 402 * @return located block 403 * @throws IOException 404 */ 405 private synchronized LocatedBlock getBlockAt(long offset, 406 boolean updatePosition) throws IOException { 407 assert (locatedBlocks != null) : "locatedBlocks is null"; 408 409 final LocatedBlock blk; 410 411 //check offset 412 if (offset < 0 || offset >= getFileLength()) { 413 throw new IOException("offset < 0 || offset >= getFileLength(), offset=" 414 + offset 415 + ", updatePosition=" + updatePosition 416 + ", locatedBlocks=" + locatedBlocks); 417 } 418 else if (offset >= locatedBlocks.getFileLength()) { 419 // offset to the portion of the last block, 420 // which is not known to the name-node yet; 421 // getting the last block 422 blk = locatedBlocks.getLastLocatedBlock(); 423 } 424 else { 425 // search cached blocks first 426 int targetBlockIdx = locatedBlocks.findBlock(offset); 427 if (targetBlockIdx < 0) { // block is not cached 428 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 429 // fetch more blocks 430 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 431 assert (newBlocks != null) : "Could not find target position " + offset; 432 locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks()); 433 } 434 blk = locatedBlocks.get(targetBlockIdx); 435 } 436 437 // update current position 438 if (updatePosition) { 439 pos = offset; 440 blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1; 441 currentLocatedBlock = blk; 442 } 443 return blk; 444 } 445 446 /** Fetch a block from namenode and cache it */ 447 private synchronized void fetchBlockAt(long offset) throws IOException { 448 int targetBlockIdx = locatedBlocks.findBlock(offset); 449 if (targetBlockIdx < 0) { // block is not cached 450 targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx); 451 } 452 // fetch blocks 453 final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset); 454 if (newBlocks == null) { 455 throw new IOException("Could not find target position " + offset); 456 } 457 locatedBlocks.insertRange(targetBlockIdx, 
newBlocks.getLocatedBlocks()); 458 } 459 460 /** 461 * Get blocks in the specified range. 462 * Fetch them from the namenode if not cached. This function 463 * will not get a read request beyond the EOF. 464 * @param offset 465 * @param length 466 * @return consequent segment of located blocks 467 * @throws IOException 468 */ 469 private synchronized List<LocatedBlock> getBlockRange(long offset, 470 long length) 471 throws IOException { 472 // getFileLength(): returns total file length 473 // locatedBlocks.getFileLength(): returns length of completed blocks 474 if (offset >= getFileLength()) { 475 throw new IOException("Offset: " + offset + 476 " exceeds file length: " + getFileLength()); 477 } 478 479 final List<LocatedBlock> blocks; 480 final long lengthOfCompleteBlk = locatedBlocks.getFileLength(); 481 final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk; 482 final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk; 483 484 if (readOffsetWithinCompleteBlk) { 485 //get the blocks of finalized (completed) block range 486 blocks = getFinalizedBlockRange(offset, 487 Math.min(length, lengthOfCompleteBlk - offset)); 488 } else { 489 blocks = new ArrayList<LocatedBlock>(1); 490 } 491 492 // get the blocks from incomplete block range 493 if (readLengthPastCompleteBlk) { 494 blocks.add(locatedBlocks.getLastLocatedBlock()); 495 } 496 497 return blocks; 498 } 499 500 /** 501 * Get blocks in the specified range. 502 * Includes only the complete blocks. 503 * Fetch them from the namenode if not cached. 504 */ 505 private synchronized List<LocatedBlock> getFinalizedBlockRange( 506 long offset, long length) throws IOException { 507 assert (locatedBlocks != null) : "locatedBlocks is null"; 508 List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>(); 509 // search cached blocks first 510 int blockIdx = locatedBlocks.findBlock(offset); 511 if (blockIdx < 0) { // block is not cached 512 blockIdx = LocatedBlocks.getInsertIndex(blockIdx); 513 } 514 long remaining = length; 515 long curOff = offset; 516 while(remaining > 0) { 517 LocatedBlock blk = null; 518 if(blockIdx < locatedBlocks.locatedBlockCount()) 519 blk = locatedBlocks.get(blockIdx); 520 if (blk == null || curOff < blk.getStartOffset()) { 521 LocatedBlocks newBlocks; 522 newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining); 523 locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks()); 524 continue; 525 } 526 assert curOff >= blk.getStartOffset() : "Block not found"; 527 blockRange.add(blk); 528 long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff; 529 remaining -= bytesRead; 530 curOff += bytesRead; 531 blockIdx++; 532 } 533 return blockRange; 534 } 535 536 /** 537 * Open a DataInputStream to a DataNode so that it can be read from. 538 * We get block ID and the IDs of the destinations at startup, from the namenode. 539 */ 540 private synchronized DatanodeInfo blockSeekTo(long target) throws IOException { 541 if (target >= getFileLength()) { 542 throw new IOException("Attempted to read past end of file"); 543 } 544 545 // Will be getting a new BlockReader. 
546 if (blockReader != null) { 547 blockReader.close(); 548 blockReader = null; 549 } 550 551 // 552 // Connect to best DataNode for desired Block, with potential offset 553 // 554 DatanodeInfo chosenNode = null; 555 int refetchToken = 1; // only need to get a new access token once 556 int refetchEncryptionKey = 1; // only need to get a new encryption key once 557 558 boolean connectFailedOnce = false; 559 560 while (true) { 561 // 562 // Compute desired block 563 // 564 LocatedBlock targetBlock = getBlockAt(target, true); 565 assert (target==pos) : "Wrong postion " + pos + " expect " + target; 566 long offsetIntoBlock = target - targetBlock.getStartOffset(); 567 568 DNAddrPair retval = chooseDataNode(targetBlock); 569 chosenNode = retval.info; 570 InetSocketAddress targetAddr = retval.addr; 571 572 try { 573 ExtendedBlock blk = targetBlock.getBlock(); 574 Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken(); 575 blockReader = getBlockReader(targetAddr, chosenNode, src, blk, 576 accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock, 577 buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy); 578 if(connectFailedOnce) { 579 DFSClient.LOG.info("Successfully connected to " + targetAddr + 580 " for " + blk); 581 } 582 return chosenNode; 583 } catch (AccessControlException ex) { 584 DFSClient.LOG.warn("Short circuit access failed " + ex); 585 dfsClient.disableLegacyBlockReaderLocal(); 586 continue; 587 } catch (IOException ex) { 588 if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { 589 DFSClient.LOG.info("Will fetch a new encryption key and retry, " 590 + "encryption key was invalid when connecting to " + targetAddr 591 + " : " + ex); 592 // The encryption key used is invalid. 593 refetchEncryptionKey--; 594 dfsClient.clearDataEncryptionKey(); 595 } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) { 596 refetchToken--; 597 fetchBlockAt(target); 598 } else { 599 connectFailedOnce = true; 600 DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block" 601 + ", add to deadNodes and continue. " + ex, ex); 602 // Put chosen node into dead list, continue 603 addToDeadNodes(chosenNode); 604 } 605 } 606 } 607 } 608 609 /** 610 * Close it down! 611 */ 612 @Override 613 public synchronized void close() throws IOException { 614 if (closed) { 615 return; 616 } 617 dfsClient.checkOpen(); 618 619 if (!extendedReadBuffers.isEmpty()) { 620 final StringBuilder builder = new StringBuilder(); 621 extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() { 622 private String prefix = ""; 623 @Override 624 public void accept(ByteBuffer k, Object v) { 625 builder.append(prefix).append(k); 626 prefix = ", "; 627 } 628 }); 629 DFSClient.LOG.warn("closing file " + src + ", but there are still " + 630 "unreleased ByteBuffers allocated by read(). " + 631 "Please release " + builder.toString() + "."); 632 } 633 if (blockReader != null) { 634 blockReader.close(); 635 blockReader = null; 636 } 637 super.close(); 638 fileInputStreamCache.close(); 639 closed = true; 640 } 641 642 @Override 643 public synchronized int read() throws IOException { 644 int ret = read( oneByteBuf, 0, 1 ); 645 return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff); 646 } 647 648 /** 649 * Wraps different possible read implementations so that readBuffer can be 650 * strategy-agnostic. 
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException;
  }

  private static void updateReadStatistics(ReadStatistics readStatistics,
      int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    if (blockReader.isShortCircuit()) {
      readStatistics.totalBytesRead += nRead;
      readStatistics.totalLocalBytesRead += nRead;
      readStatistics.totalShortCircuitBytesRead += nRead;
    } else if (blockReader.isLocal()) {
      readStatistics.totalBytesRead += nRead;
      readStatistics.totalLocalBytesRead += nRead;
    } else {
      readStatistics.totalBytesRead += nRead;
    }
  }

  /**
   * Used to read bytes into a byte[].
   */
  private static class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int nRead = blockReader.read(buf, off, len);
      updateReadStatistics(readStatistics, nRead, blockReader);
      return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer.
   */
  private static class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }

  /* This is used by the regular read() path and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or client: when the DataNode closes the
     * connection because the client has been idle. If there are other cases
     * of "non-errors" then a datanode might be retried by setting this to
     * true again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
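      // A ChecksumException below marks the current replica as corrupt and
      // forces a switch to a different datanode; any other IOException gets
      // one retry against the same node (via seekToBlockSource) before the
      // node is added to deadNodes and a new source is sought.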
740 try { 741 return reader.doRead(blockReader, off, len, readStatistics); 742 } catch ( ChecksumException ce ) { 743 DFSClient.LOG.warn("Found Checksum error for " 744 + getCurrentBlock() + " from " + currentNode 745 + " at " + ce.getPos()); 746 ioe = ce; 747 retryCurrentNode = false; 748 // we want to remember which block replicas we have tried 749 addIntoCorruptedBlockMap(getCurrentBlock(), currentNode, 750 corruptedBlockMap); 751 } catch ( IOException e ) { 752 if (!retryCurrentNode) { 753 DFSClient.LOG.warn("Exception while reading from " 754 + getCurrentBlock() + " of " + src + " from " 755 + currentNode, e); 756 } 757 ioe = e; 758 } 759 boolean sourceFound = false; 760 if (retryCurrentNode) { 761 /* possibly retry the same node so that transient errors don't 762 * result in application level failures (e.g. Datanode could have 763 * closed the connection because the client is idle for too long). 764 */ 765 sourceFound = seekToBlockSource(pos); 766 } else { 767 addToDeadNodes(currentNode); 768 sourceFound = seekToNewSource(pos); 769 } 770 if (!sourceFound) { 771 throw ioe; 772 } 773 retryCurrentNode = false; 774 } 775 } 776 777 private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException { 778 dfsClient.checkOpen(); 779 if (closed) { 780 throw new IOException("Stream closed"); 781 } 782 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 783 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 784 failures = 0; 785 if (pos < getFileLength()) { 786 int retries = 2; 787 while (retries > 0) { 788 try { 789 // currentNode can be left as null if previous read had a checksum 790 // error on the same block. See HDFS-3067 791 if (pos > blockEnd || currentNode == null) { 792 currentNode = blockSeekTo(pos); 793 } 794 int realLen = (int) Math.min(len, (blockEnd - pos + 1L)); 795 if (locatedBlocks.isLastBlockComplete()) { 796 realLen = (int) Math.min(realLen, locatedBlocks.getFileLength()); 797 } 798 int result = readBuffer(strategy, off, realLen, corruptedBlockMap); 799 800 if (result >= 0) { 801 pos += result; 802 } else { 803 // got a EOS from reader though we expect more data on it. 804 throw new IOException("Unexpected EOS from the reader"); 805 } 806 if (dfsClient.stats != null && result != -1) { 807 dfsClient.stats.incrementBytesRead(result); 808 } 809 return result; 810 } catch (ChecksumException ce) { 811 throw ce; 812 } catch (IOException e) { 813 if (retries == 1) { 814 DFSClient.LOG.warn("DFS Read", e); 815 } 816 blockEnd = -1; 817 if (currentNode != null) { addToDeadNodes(currentNode); } 818 if (--retries == 0) { 819 throw e; 820 } 821 } finally { 822 // Check if need to report block replicas corruption either read 823 // was successful or ChecksumException occured. 824 reportCheckSumFailure(corruptedBlockMap, 825 currentLocatedBlock.getLocations().length); 826 } 827 } 828 } 829 return -1; 830 } 831 832 /** 833 * Read the entire buffer. 834 */ 835 @Override 836 public synchronized int read(final byte buf[], int off, int len) throws IOException { 837 ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf); 838 839 return readWithStrategy(byteArrayReader, off, len); 840 } 841 842 @Override 843 public synchronized int read(final ByteBuffer buf) throws IOException { 844 ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf); 845 846 return readWithStrategy(byteBufferReader, 0, buf.remaining()); 847 } 848 849 850 /** 851 * Add corrupted block replica into map. 
852 * @param corruptedBlockMap 853 */ 854 private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 855 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) { 856 Set<DatanodeInfo> dnSet = null; 857 if((corruptedBlockMap.containsKey(blk))) { 858 dnSet = corruptedBlockMap.get(blk); 859 }else { 860 dnSet = new HashSet<DatanodeInfo>(); 861 } 862 if (!dnSet.contains(node)) { 863 dnSet.add(node); 864 corruptedBlockMap.put(blk, dnSet); 865 } 866 } 867 868 private DNAddrPair chooseDataNode(LocatedBlock block) 869 throws IOException { 870 while (true) { 871 DatanodeInfo[] nodes = block.getLocations(); 872 try { 873 DatanodeInfo chosenNode = bestNode(nodes, deadNodes); 874 final String dnAddr = 875 chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname); 876 if (DFSClient.LOG.isDebugEnabled()) { 877 DFSClient.LOG.debug("Connecting to datanode " + dnAddr); 878 } 879 InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr); 880 return new DNAddrPair(chosenNode, targetAddr); 881 } catch (IOException ie) { 882 String blockInfo = block.getBlock() + " file=" + src; 883 if (failures >= dfsClient.getMaxBlockAcquireFailures()) { 884 throw new BlockMissingException(src, "Could not obtain block: " + blockInfo, 885 block.getStartOffset()); 886 } 887 888 if (nodes == null || nodes.length == 0) { 889 DFSClient.LOG.info("No node available for " + blockInfo); 890 } 891 DFSClient.LOG.info("Could not obtain " + block.getBlock() 892 + " from any node: " + ie 893 + ". Will get new block locations from namenode and retry..."); 894 try { 895 // Introducing a random factor to the wait time before another retry. 896 // The wait time is dependent on # of failures and a random factor. 897 // At the first time of getting a BlockMissingException, the wait time 898 // is a random number between 0..3000 ms. If the first retry 899 // still fails, we will wait 3000 ms grace period before the 2nd retry. 900 // Also at the second retry, the waiting window is expanded to 6000 ms 901 // alleviating the request rate from the server. Similarly the 3rd retry 902 // will wait 6000ms grace period before retry and the waiting window is 903 // expanded to 9000ms. 904 final int timeWindow = dfsClient.getConf().timeWindow; 905 double waitTime = timeWindow * failures + // grace period for the last round of attempt 906 timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure 907 DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec."); 908 Thread.sleep((long)waitTime); 909 } catch (InterruptedException iex) { 910 } 911 deadNodes.clear(); //2nd option is to remove only nodes[blockId] 912 openInfo(); 913 block = getBlockAt(block.getStartOffset(), false); 914 failures++; 915 continue; 916 } 917 } 918 } 919 920 private void fetchBlockByteRange(LocatedBlock block, long start, long end, 921 byte[] buf, int offset, 922 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) 923 throws IOException { 924 // 925 // Connect to best DataNode for desired Block, with potential offset 926 // 927 int refetchToken = 1; // only need to get a new access token once 928 int refetchEncryptionKey = 1; // only need to get a new encryption key once 929 930 while (true) { 931 // cached block locations may have been updated by chooseDataNode() 932 // or fetchBlockAt(). Always get the latest list of locations at the 933 // start of the loop. 
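      // The caching strategy is also snapshotted under the same lock, so a
      // concurrent setReadahead() or setDropBehind() call cannot change it in
      // the middle of this read attempt.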
934 CachingStrategy curCachingStrategy; 935 synchronized (this) { 936 block = getBlockAt(block.getStartOffset(), false); 937 curCachingStrategy = cachingStrategy; 938 } 939 DNAddrPair retval = chooseDataNode(block); 940 DatanodeInfo chosenNode = retval.info; 941 InetSocketAddress targetAddr = retval.addr; 942 BlockReader reader = null; 943 944 try { 945 Token<BlockTokenIdentifier> blockToken = block.getBlockToken(); 946 947 int len = (int) (end - start + 1); 948 reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(), 949 blockToken, start, len, buffersize, verifyChecksum, 950 dfsClient.clientName, curCachingStrategy); 951 int nread = reader.readAll(buf, offset, len); 952 if (nread != len) { 953 throw new IOException("truncated return from reader.read(): " + 954 "excpected " + len + ", got " + nread); 955 } 956 return; 957 } catch (ChecksumException e) { 958 DFSClient.LOG.warn("fetchBlockByteRange(). Got a checksum exception for " + 959 src + " at " + block.getBlock() + ":" + 960 e.getPos() + " from " + chosenNode); 961 // we want to remember what we have tried 962 addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap); 963 } catch (AccessControlException ex) { 964 DFSClient.LOG.warn("Short circuit access failed " + ex); 965 dfsClient.disableLegacyBlockReaderLocal(); 966 continue; 967 } catch (IOException e) { 968 if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { 969 DFSClient.LOG.info("Will fetch a new encryption key and retry, " 970 + "encryption key was invalid when connecting to " + targetAddr 971 + " : " + e); 972 // The encryption key used is invalid. 973 refetchEncryptionKey--; 974 dfsClient.clearDataEncryptionKey(); 975 continue; 976 } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) { 977 refetchToken--; 978 fetchBlockAt(block.getStartOffset()); 979 continue; 980 } else { 981 DFSClient.LOG.warn("Failed to connect to " + targetAddr + 982 " for file " + src + " for block " + block.getBlock() + ":" + e); 983 if (DFSClient.LOG.isDebugEnabled()) { 984 DFSClient.LOG.debug("Connection failure ", e); 985 } 986 } 987 } finally { 988 if (reader != null) { 989 reader.close(); 990 } 991 } 992 // Put chosen node into dead list, continue 993 addToDeadNodes(chosenNode); 994 } 995 } 996 997 /** 998 * Should the block access token be refetched on an exception 999 * 1000 * @param ex Exception received 1001 * @param targetAddr Target datanode address from where exception was received 1002 * @return true if block access token has expired or invalid and it should be 1003 * refetched 1004 */ 1005 private static boolean tokenRefetchNeeded(IOException ex, 1006 InetSocketAddress targetAddr) { 1007 /* 1008 * Get a new access token and retry. Retry is needed in 2 cases. 1) 1009 * When both NN and DN re-started while DFSClient holding a cached 1010 * access token. 2) In the case that NN fails to update its 1011 * access key at pre-set interval (by a wide margin) and 1012 * subsequently restarts. In this case, DN re-registers itself with 1013 * NN and receives a new access key, but DN will delete the old 1014 * access key from its memory since it's considered expired based on 1015 * the estimated expiration date. 
1016 */ 1017 if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) { 1018 DFSClient.LOG.info("Access token was invalid when connecting to " 1019 + targetAddr + " : " + ex); 1020 return true; 1021 } 1022 return false; 1023 } 1024 1025 private Peer newTcpPeer(InetSocketAddress addr) throws IOException { 1026 Peer peer = null; 1027 boolean success = false; 1028 Socket sock = null; 1029 try { 1030 sock = dfsClient.socketFactory.createSocket(); 1031 NetUtils.connect(sock, addr, 1032 dfsClient.getRandomLocalInterfaceAddr(), 1033 dfsClient.getConf().socketTimeout); 1034 peer = TcpPeerServer.peerFromSocketAndKey(sock, 1035 dfsClient.getDataEncryptionKey()); 1036 success = true; 1037 return peer; 1038 } finally { 1039 if (!success) { 1040 IOUtils.closeQuietly(peer); 1041 IOUtils.closeQuietly(sock); 1042 } 1043 } 1044 } 1045 1046 /** 1047 * Retrieve a BlockReader suitable for reading. 1048 * This method will reuse the cached connection to the DN if appropriate. 1049 * Otherwise, it will create a new connection. 1050 * Throwing an IOException from this method is basically equivalent to 1051 * declaring the DataNode bad, so we try to connect a lot of different ways 1052 * before doing that. 1053 * 1054 * @param dnAddr Address of the datanode 1055 * @param chosenNode Chosen datanode information 1056 * @param file File location 1057 * @param block The Block object 1058 * @param blockToken The access token for security 1059 * @param startOffset The read offset, relative to block head 1060 * @param len The number of bytes to read 1061 * @param bufferSize The IO buffer size (not the client buffer size) 1062 * @param verifyChecksum Whether to verify checksum 1063 * @param clientName Client name 1064 * @param CachingStrategy caching strategy to use 1065 * @return New BlockReader instance 1066 */ 1067 protected BlockReader getBlockReader(InetSocketAddress dnAddr, 1068 DatanodeInfo chosenNode, 1069 String file, 1070 ExtendedBlock block, 1071 Token<BlockTokenIdentifier> blockToken, 1072 long startOffset, 1073 long len, 1074 int bufferSize, 1075 boolean verifyChecksum, 1076 String clientName, 1077 CachingStrategy curCachingStrategy) 1078 throws IOException { 1079 // Firstly, we check to see if we have cached any file descriptors for 1080 // local blocks. If so, we can just re-use those file descriptors. 1081 FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block); 1082 if (fis != null) { 1083 if (DFSClient.LOG.isDebugEnabled()) { 1084 DFSClient.LOG.debug("got FileInputStreams for " + block + " from " + 1085 "the FileInputStreamCache."); 1086 } 1087 return new BlockReaderLocal.Builder(dfsClient.getConf()). 1088 setFilename(file). 1089 setBlock(block). 1090 setStartOffset(startOffset). 1091 setStreams(fis). 1092 setDatanodeID(chosenNode). 1093 setVerifyChecksum(verifyChecksum). 1094 setBlockMetadataHeader(BlockMetadataHeader. 1095 preadHeader(fis[1].getChannel())). 1096 setFileInputStreamCache(fileInputStreamCache). 1097 setCachingStrategy(curCachingStrategy). 1098 build(); 1099 } 1100 1101 // If the legacy local block reader is enabled and we are reading a local 1102 // block, try to create a BlockReaderLocalLegacy. The legacy local block 1103 // reader implements local reads in the style first introduced by HDFS-2246. 
1104 if ((dfsClient.useLegacyBlockReaderLocal()) && 1105 DFSClient.isLocalAddress(dnAddr) && 1106 (!shortCircuitForbidden())) { 1107 try { 1108 return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient, 1109 clientName, block, blockToken, chosenNode, startOffset); 1110 } catch (IOException e) { 1111 DFSClient.LOG.warn("error creating legacy BlockReaderLocal. " + 1112 "Disabling legacy local reads.", e); 1113 dfsClient.disableLegacyBlockReaderLocal(); 1114 } 1115 } 1116 1117 // Look for cached domain peers. 1118 int cacheTries = 0; 1119 DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory(); 1120 BlockReader reader = null; 1121 final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry; 1122 for (; cacheTries < nCachedConnRetry; ++cacheTries) { 1123 Peer peer = peerCache.get(chosenNode, true); 1124 if (peer == null) break; 1125 try { 1126 boolean allowShortCircuitLocalReads = dfsClient.getConf(). 1127 shortCircuitLocalReads && (!shortCircuitForbidden()); 1128 reader = BlockReaderFactory.newBlockReader( 1129 dfsClient.getConf(), file, block, blockToken, startOffset, 1130 len, verifyChecksum, clientName, peer, chosenNode, 1131 dsFactory, peerCache, fileInputStreamCache, 1132 allowShortCircuitLocalReads, curCachingStrategy); 1133 return reader; 1134 } catch (IOException ex) { 1135 DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " + 1136 "Closing stale " + peer, ex); 1137 } finally { 1138 if (reader == null) { 1139 IOUtils.closeQuietly(peer); 1140 } 1141 } 1142 } 1143 1144 // Try to create a DomainPeer. 1145 DomainSocket domSock = dsFactory.create(dnAddr, this); 1146 if (domSock != null) { 1147 Peer peer = new DomainPeer(domSock); 1148 try { 1149 boolean allowShortCircuitLocalReads = dfsClient.getConf(). 1150 shortCircuitLocalReads && (!shortCircuitForbidden()); 1151 reader = BlockReaderFactory.newBlockReader( 1152 dfsClient.getConf(), file, block, blockToken, startOffset, 1153 len, verifyChecksum, clientName, peer, chosenNode, 1154 dsFactory, peerCache, fileInputStreamCache, 1155 allowShortCircuitLocalReads, curCachingStrategy); 1156 return reader; 1157 } catch (IOException e) { 1158 DFSClient.LOG.warn("failed to connect to " + domSock, e); 1159 } finally { 1160 if (reader == null) { 1161 // If the Peer that we got the error from was a DomainPeer, 1162 // mark the socket path as bad, so that newDataSocket will not try 1163 // to re-open this socket for a while. 1164 dsFactory.disableDomainSocketPath(domSock.getPath()); 1165 IOUtils.closeQuietly(peer); 1166 } 1167 } 1168 } 1169 1170 // Look for cached peers. 1171 for (; cacheTries < nCachedConnRetry; ++cacheTries) { 1172 Peer peer = peerCache.get(chosenNode, false); 1173 if (peer == null) break; 1174 try { 1175 reader = BlockReaderFactory.newBlockReader( 1176 dfsClient.getConf(), file, block, blockToken, startOffset, 1177 len, verifyChecksum, clientName, peer, chosenNode, 1178 dsFactory, peerCache, fileInputStreamCache, false, 1179 curCachingStrategy); 1180 return reader; 1181 } catch (IOException ex) { 1182 DFSClient.LOG.debug("Error making BlockReader. Closing stale " + 1183 peer, ex); 1184 } finally { 1185 if (reader == null) { 1186 IOUtils.closeQuietly(peer); 1187 } 1188 } 1189 } 1190 if (tcpReadsDisabledForTesting) { 1191 throw new IOException("TCP reads are disabled."); 1192 } 1193 // Try to create a new remote peer. 
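    // At this point every cached peer and domain-socket option has been
    // exhausted, so fall back to a fresh TCP connection; the peer is closed
    // in the finally block below if constructing the reader fails.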
1194 Peer peer = newTcpPeer(dnAddr); 1195 try { 1196 reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file, 1197 block, blockToken, startOffset, len, verifyChecksum, clientName, 1198 peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false, 1199 curCachingStrategy); 1200 return reader; 1201 } catch (IOException ex) { 1202 DFSClient.LOG.debug( 1203 "Exception while getting block reader, closing stale " + peer, ex); 1204 throw ex; 1205 } finally { 1206 if (reader == null) { 1207 IOUtils.closeQuietly(peer); 1208 } 1209 } 1210 } 1211 1212 1213 /** 1214 * Read bytes starting from the specified position. 1215 * 1216 * @param position start read from this position 1217 * @param buffer read buffer 1218 * @param offset offset into buffer 1219 * @param length number of bytes to read 1220 * 1221 * @return actual number of bytes read 1222 */ 1223 @Override 1224 public int read(long position, byte[] buffer, int offset, int length) 1225 throws IOException { 1226 // sanity checks 1227 dfsClient.checkOpen(); 1228 if (closed) { 1229 throw new IOException("Stream closed"); 1230 } 1231 failures = 0; 1232 long filelen = getFileLength(); 1233 if ((position < 0) || (position >= filelen)) { 1234 return -1; 1235 } 1236 int realLen = length; 1237 if ((position + length) > filelen) { 1238 realLen = (int)(filelen - position); 1239 } 1240 1241 // determine the block and byte range within the block 1242 // corresponding to position and realLen 1243 List<LocatedBlock> blockRange = getBlockRange(position, realLen); 1244 int remaining = realLen; 1245 Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 1246 = new HashMap<ExtendedBlock, Set<DatanodeInfo>>(); 1247 for (LocatedBlock blk : blockRange) { 1248 long targetStart = position - blk.getStartOffset(); 1249 long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart); 1250 try { 1251 fetchBlockByteRange(blk, targetStart, 1252 targetStart + bytesToRead - 1, buffer, offset, corruptedBlockMap); 1253 } finally { 1254 // Check and report if any block replicas are corrupted. 1255 // BlockMissingException may be caught if all block replicas are 1256 // corrupted. 1257 reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length); 1258 } 1259 1260 remaining -= bytesToRead; 1261 position += bytesToRead; 1262 offset += bytesToRead; 1263 } 1264 assert remaining == 0 : "Wrong number of bytes read."; 1265 if (dfsClient.stats != null) { 1266 dfsClient.stats.incrementBytesRead(realLen); 1267 } 1268 return realLen; 1269 } 1270 1271 /** 1272 * DFSInputStream reports checksum failure. 1273 * Case I : client has tried multiple data nodes and at least one of the 1274 * attempts has succeeded. We report the other failures as corrupted block to 1275 * namenode. 1276 * Case II: client has tried out all data nodes, but all failed. We 1277 * only report if the total number of replica is 1. We do not 1278 * report otherwise since this maybe due to the client is a handicapped client 1279 * (who can not read). 
1280 * @param corruptedBlockMap map of corrupted blocks 1281 * @param dataNodeCount number of data nodes who contains the block replicas 1282 */ 1283 private void reportCheckSumFailure( 1284 Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 1285 int dataNodeCount) { 1286 if (corruptedBlockMap.isEmpty()) { 1287 return; 1288 } 1289 Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap 1290 .entrySet().iterator(); 1291 Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next(); 1292 ExtendedBlock blk = entry.getKey(); 1293 Set<DatanodeInfo> dnSet = entry.getValue(); 1294 if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0)) 1295 || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) { 1296 DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()]; 1297 int i = 0; 1298 for (DatanodeInfo dn:dnSet) { 1299 locs[i++] = dn; 1300 } 1301 LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) }; 1302 dfsClient.reportChecksumFailure(src, lblocks); 1303 } 1304 corruptedBlockMap.clear(); 1305 } 1306 1307 @Override 1308 public long skip(long n) throws IOException { 1309 if ( n > 0 ) { 1310 long curPos = getPos(); 1311 long fileLen = getFileLength(); 1312 if( n+curPos > fileLen ) { 1313 n = fileLen - curPos; 1314 } 1315 seek(curPos+n); 1316 return n; 1317 } 1318 return n < 0 ? -1 : 0; 1319 } 1320 1321 /** 1322 * Seek to a new arbitrary location 1323 */ 1324 @Override 1325 public synchronized void seek(long targetPos) throws IOException { 1326 if (targetPos > getFileLength()) { 1327 throw new IOException("Cannot seek after EOF"); 1328 } 1329 if (targetPos < 0) { 1330 throw new IOException("Cannot seek to negative offset"); 1331 } 1332 if (closed) { 1333 throw new IOException("Stream is closed!"); 1334 } 1335 boolean done = false; 1336 if (pos <= targetPos && targetPos <= blockEnd) { 1337 // 1338 // If this seek is to a positive position in the current 1339 // block, and this piece of data might already be lying in 1340 // the TCP buffer, then just eat up the intervening data. 1341 // 1342 int diff = (int)(targetPos - pos); 1343 if (diff <= blockReader.available()) { 1344 try { 1345 pos += blockReader.skip(diff); 1346 if (pos == targetPos) { 1347 done = true; 1348 } 1349 } catch (IOException e) {//make following read to retry 1350 if(DFSClient.LOG.isDebugEnabled()) { 1351 DFSClient.LOG.debug("Exception while seek to " + targetPos 1352 + " from " + getCurrentBlock() + " of " + src + " from " 1353 + currentNode, e); 1354 } 1355 } 1356 } 1357 } 1358 if (!done) { 1359 pos = targetPos; 1360 blockEnd = -1; 1361 } 1362 } 1363 1364 /** 1365 * Same as {@link #seekToNewSource(long)} except that it does not exclude 1366 * the current datanode and might connect to the same node. 1367 */ 1368 private synchronized boolean seekToBlockSource(long targetPos) 1369 throws IOException { 1370 currentNode = blockSeekTo(targetPos); 1371 return true; 1372 } 1373 1374 /** 1375 * Seek to given position on a node other than the current node. If 1376 * a node other than the current node is found, then returns true. 1377 * If another node could not be found, then returns false. 1378 */ 1379 @Override 1380 public synchronized boolean seekToNewSource(long targetPos) throws IOException { 1381 boolean markedDead = deadNodes.containsKey(currentNode); 1382 addToDeadNodes(currentNode); 1383 DatanodeInfo oldNode = currentNode; 1384 DatanodeInfo newNode = blockSeekTo(targetPos); 1385 if (!markedDead) { 1386 /* remove it from deadNodes. blockSeekTo could have cleared 1387 * deadNodes and added currentNode again. 
Thats ok. */ 1388 deadNodes.remove(oldNode); 1389 } 1390 if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) { 1391 currentNode = newNode; 1392 return true; 1393 } else { 1394 return false; 1395 } 1396 } 1397 1398 /** 1399 */ 1400 @Override 1401 public synchronized long getPos() throws IOException { 1402 return pos; 1403 } 1404 1405 /** Return the size of the remaining available bytes 1406 * if the size is less than or equal to {@link Integer#MAX_VALUE}, 1407 * otherwise, return {@link Integer#MAX_VALUE}. 1408 */ 1409 @Override 1410 public synchronized int available() throws IOException { 1411 if (closed) { 1412 throw new IOException("Stream closed"); 1413 } 1414 1415 final long remaining = getFileLength() - pos; 1416 return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE; 1417 } 1418 1419 /** 1420 * We definitely don't support marks 1421 */ 1422 @Override 1423 public boolean markSupported() { 1424 return false; 1425 } 1426 @Override 1427 public void mark(int readLimit) { 1428 } 1429 @Override 1430 public void reset() throws IOException { 1431 throw new IOException("Mark/reset not supported"); 1432 } 1433 1434 /** 1435 * Pick the best node from which to stream the data. 1436 * Entries in <i>nodes</i> are already in the priority order 1437 */ 1438 static DatanodeInfo bestNode(DatanodeInfo nodes[], 1439 AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes) 1440 throws IOException { 1441 if (nodes != null) { 1442 for (int i = 0; i < nodes.length; i++) { 1443 if (!deadNodes.containsKey(nodes[i])) { 1444 return nodes[i]; 1445 } 1446 } 1447 } 1448 throw new IOException("No live nodes contain current block"); 1449 } 1450 1451 /** Utility class to encapsulate data node info and its address. */ 1452 static class DNAddrPair { 1453 DatanodeInfo info; 1454 InetSocketAddress addr; 1455 DNAddrPair(DatanodeInfo info, InetSocketAddress addr) { 1456 this.info = info; 1457 this.addr = addr; 1458 } 1459 } 1460 1461 /** 1462 * Get statistics about the reads which this DFSInputStream has done. 1463 */ 1464 public synchronized ReadStatistics getReadStatistics() { 1465 return new ReadStatistics(readStatistics); 1466 } 1467 1468 private synchronized void closeCurrentBlockReader() { 1469 if (blockReader == null) return; 1470 // Close the current block reader so that the new caching settings can 1471 // take effect immediately. 1472 try { 1473 blockReader.close(); 1474 } catch (IOException e) { 1475 DFSClient.LOG.error("error closing blockReader", e); 1476 } 1477 blockReader = null; 1478 } 1479 1480 @Override 1481 public synchronized void setReadahead(Long readahead) 1482 throws IOException { 1483 this.cachingStrategy = 1484 new CachingStrategy.Builder(this.cachingStrategy). 1485 setReadahead(readahead).build(); 1486 closeCurrentBlockReader(); 1487 } 1488 1489 @Override 1490 public synchronized void setDropBehind(Boolean dropBehind) 1491 throws IOException { 1492 this.cachingStrategy = 1493 new CachingStrategy.Builder(this.cachingStrategy). 
1494 setDropBehind(dropBehind).build(); 1495 closeCurrentBlockReader(); 1496 } 1497 1498 @Override 1499 public synchronized ByteBuffer read(ByteBufferPool bufferPool, 1500 int maxLength, EnumSet<ReadOption> opts) 1501 throws IOException, UnsupportedOperationException { 1502 assert(maxLength > 0); 1503 if (((blockReader == null) || (blockEnd == -1)) && 1504 (pos < getFileLength())) { 1505 /* 1506 * If we don't have a blockReader, or the one we have has no more bytes 1507 * left to read, we call seekToBlockSource to get a new blockReader and 1508 * recalculate blockEnd. Note that we assume we're not at EOF here 1509 * (we check this above). 1510 */ 1511 if ((!seekToBlockSource(pos)) || (blockReader == null)) { 1512 throw new IOException("failed to allocate new BlockReader " + 1513 "at position " + pos); 1514 } 1515 } 1516 ByteBuffer buffer = tryReadZeroCopy(maxLength, opts); 1517 if (buffer != null) { 1518 return buffer; 1519 } 1520 buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength); 1521 if (buffer != null) { 1522 extendedReadBuffers.put(buffer, bufferPool); 1523 } 1524 return buffer; 1525 } 1526 1527 private synchronized ByteBuffer tryReadZeroCopy(int maxLength, 1528 EnumSet<ReadOption> opts) throws IOException { 1529 // Java ByteBuffers can't be longer than 2 GB, because they use 1530 // 4-byte signed integers to represent capacity, etc. 1531 // So we can't mmap the parts of the block higher than the 2 GB offset. 1532 // FIXME: we could work around this with multiple memory maps. 1533 // See HDFS-5101. 1534 long blockEnd32 = Math.min(Integer.MAX_VALUE, blockEnd); 1535 long curPos = pos; 1536 long blockLeft = blockEnd32 - curPos + 1; 1537 if (blockLeft <= 0) { 1538 if (DFSClient.LOG.isDebugEnabled()) { 1539 DFSClient.LOG.debug("unable to perform a zero-copy read from offset " + 1540 curPos + " of " + src + "; blockLeft = " + blockLeft + 1541 "; blockEnd32 = " + blockEnd32 + ", blockEnd = " + blockEnd + 1542 "; maxLength = " + maxLength); 1543 } 1544 return null; 1545 } 1546 int length = Math.min((int)blockLeft, maxLength); 1547 long blockStartInFile = currentLocatedBlock.getStartOffset(); 1548 long blockPos = curPos - blockStartInFile; 1549 long limit = blockPos + length; 1550 ClientMmap clientMmap = 1551 blockReader.getClientMmap(opts, dfsClient.getMmapManager()); 1552 if (clientMmap == null) { 1553 if (DFSClient.LOG.isDebugEnabled()) { 1554 DFSClient.LOG.debug("unable to perform a zero-copy read from offset " + 1555 curPos + " of " + src + "; BlockReader#getClientMmap returned " + 1556 "null."); 1557 } 1558 return null; 1559 } 1560 seek(pos + length); 1561 ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer(); 1562 buffer.position((int)blockPos); 1563 buffer.limit((int)limit); 1564 clientMmap.ref(); 1565 extendedReadBuffers.put(buffer, clientMmap); 1566 readStatistics.addZeroCopyBytes(length); 1567 if (DFSClient.LOG.isDebugEnabled()) { 1568 DFSClient.LOG.debug("readZeroCopy read " + maxLength + " bytes from " + 1569 "offset " + curPos + " via the zero-copy read path. 
" + 1570 "blockEnd = " + blockEnd); 1571 } 1572 return buffer; 1573 } 1574 1575 @Override 1576 public synchronized void releaseBuffer(ByteBuffer buffer) { 1577 Object val = extendedReadBuffers.remove(buffer); 1578 if (val == null) { 1579 throw new IllegalArgumentException("tried to release a buffer " + 1580 "that was not created by this stream, " + buffer); 1581 } 1582 if (val instanceof ClientMmap) { 1583 ((ClientMmap)val).unref(); 1584 } else if (val instanceof ByteBufferPool) { 1585 ((ByteBufferPool)val).putBuffer(buffer); 1586 } 1587 } 1588 }