/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/** Dispatching block replica moves between datanodes. */
@InterfaceAudience.Private
public class Dispatcher {
  static final Log LOG = LogFactory.getLog(Dispatcher.class);

  /**
   * The period of time (in milliseconds) to delay further use of a DataNode
   * after hitting errors while migrating data through it.
   */
  private static long delayAfterErrors = 10 * 1000;

  private final NameNodeConnector nnc;
  private final SaslDataTransferClient saslClient;

  /** Set of datanodes to be excluded. */
  private final Set<String> excludedNodes;
  /** Restrict balancing to the following nodes. */
  private final Set<String> includedNodes;

  private final Collection<Source> sources = new HashSet<Source>();
  private final Collection<StorageGroup> targets = new HashSet<StorageGroup>();

  private final GlobalBlockMap globalBlocks = new GlobalBlockMap();
  private final MovedBlocks<StorageGroup> movedBlocks;

  /** Map (datanodeUuid,storageType -> StorageGroup) */
  private final StorageGroupMap<StorageGroup> storageGroupMap
      = new StorageGroupMap<StorageGroup>();

  private NetworkTopology cluster;

  private final ExecutorService dispatchExecutor;

  private final Allocator moverThreadAllocator;

  /** The maximum number of concurrent block moves at a datanode. */
  private final int maxConcurrentMovesPerNode;
  private final int maxMoverThreads;

  private final long getBlocksSize;
  private final long getBlocksMinBlockSize;
  private final long blockMoveTimeout;
  /**
   * If no block can be moved out of a {@link Source} after this configured
   * amount of time, the Source should give up choosing the next possible move.
   */
  private final int maxNoMoveInterval;

  static class Allocator {
    private final int max;
    private int count = 0;
    private int lotSize = 1;

    Allocator(int max) {
      this.max = max;
    }

    /** Allocate the specified number of items. */
    synchronized int allocate(int n) {
      final int remaining = max - count;
      if (remaining <= 0) {
        return 0;
      } else {
        final int allocated = remaining < n ? remaining : n;
        count += allocated;
        return allocated;
      }
    }

    /** Allocate a single lot of items. */
    synchronized int allocate() {
      return allocate(lotSize);
    }

    synchronized void reset() {
      count = 0;
    }

    /** Set the lot size. */
    synchronized void setLotSize(int lotSize) {
      this.lotSize = lotSize;
    }
  }
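  // Illustrative sketch (editorial comment, not part of the original class):
  // the Allocator above hands out mover threads in lots until the configured
  // maximum is reached; reset() reclaims the budget for the next balancer
  // iteration. The numbers below are made up.
  //
  //   Allocator threads = new Allocator(8); // at most 8 mover threads
  //   threads.setLotSize(3);
  //   int a = threads.allocate();           // 3 -- one full lot
  //   int b = threads.allocate(10);         // 5 -- only the remainder
  //   int c = threads.allocate();           // 0 -- exhausted until reset()
  //   threads.reset();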
  private static class GlobalBlockMap {
    private final Map<Block, DBlock> map = new HashMap<Block, DBlock>();

    /**
     * Get the block from the map;
     * if the block is not found, create a new block and put it in the map.
     */
    private DBlock get(Block b) {
      DBlock block = map.get(b);
      if (block == null) {
        block = new DBlock(b);
        map.put(b, block);
      }
      return block;
    }

    /** Remove all blocks except for the moved blocks. */
    private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) {
      for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) {
        if (!movedBlocks.contains(i.next())) {
          i.remove();
        }
      }
    }
  }

  public static class StorageGroupMap<G extends StorageGroup> {
    private static String toKey(String datanodeUuid, StorageType storageType) {
      return datanodeUuid + ":" + storageType;
    }

    private final Map<String, G> map = new HashMap<String, G>();

    public G get(String datanodeUuid, StorageType storageType) {
      return map.get(toKey(datanodeUuid, storageType));
    }

    public void put(G g) {
      final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(),
          g.storageType);
      final StorageGroup existing = map.put(key, g);
      Preconditions.checkState(existing == null);
    }

    int size() {
      return map.size();
    }

    void clear() {
      map.clear();
    }

    public Collection<G> values() {
      return map.values();
    }
  }
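  // Editorial note (not upstream code): StorageGroupMap keys are
  // "<datanodeUuid>:<storageType>", so a single datanode contributes a
  // separate group per storage type. For example, a node with hypothetical
  // uuid "dn-1" exposing DISK and SSD storages is tracked under the keys
  // "dn-1:DISK" and "dn-1:SSD":
  //
  //   StorageGroup g = storageGroupMap.get("dn-1", StorageType.DISK);
  //   // g == null if that datanode/storage-type pair is unknown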
  /** This class keeps track of a scheduled block move. */
  public class PendingMove {
    private DBlock block;
    private Source source;
    private DDatanode proxySource;
    private StorageGroup target;

    private PendingMove(Source source, StorageGroup target) {
      this.source = source;
      this.target = target;
    }

    @Override
    public String toString() {
      final Block b = block != null ? block.getBlock() : null;
      String bStr = b != null ? (b + " with size=" + b.getNumBytes() + " ")
          : " ";
      return bStr + "from " + source.getDisplayName() + " to "
          + target.getDisplayName() + " through "
          + (proxySource != null ? proxySource.datanode : "");
    }

    /**
     * Choose a block and a proxy source for this pendingMove whose source
     * and target have already been chosen.
     *
     * @return true if a block and its proxy are chosen; false otherwise
     */
    private boolean chooseBlockAndProxy() {
      // source and target must have the same storage type
      final StorageType t = source.getStorageType();
      // iterate over the source's blocks until a good one is found
      for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {
        if (markMovedIfGoodBlock(i.next(), t)) {
          i.remove();
          return true;
        }
      }
      return false;
    }

    /**
     * @return true if the given block is good for the tentative move.
     */
    private boolean markMovedIfGoodBlock(DBlock block,
        StorageType targetStorageType) {
      synchronized (block) {
        synchronized (movedBlocks) {
          if (isGoodBlockCandidate(source, target, targetStorageType, block)) {
            this.block = block;
            if (chooseProxySource()) {
              movedBlocks.put(block);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Decided to move " + this);
              }
              return true;
            }
          }
        }
      }
      return false;
    }

    /**
     * Choose a proxy source.
     *
     * @return true if a proxy is found; otherwise false
     */
    private boolean chooseProxySource() {
      final DatanodeInfo targetDN = target.getDatanodeInfo();
      // if source and target are the same node, no proxy is needed
      if (source.getDatanodeInfo().equals(targetDN) && addTo(source)) {
        return true;
      }
      // if the cluster is node-group aware, first try nodes in the same
      // node group as the target
      if (cluster.isNodeGroupAware()) {
        for (StorageGroup loc : block.getLocations()) {
          if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN)
              && addTo(loc)) {
            return true;
          }
        }
      }
      // check if there is a replica on the same rack as the target
      for (StorageGroup loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN)
            && addTo(loc)) {
          return true;
        }
      }
      // finally, find a non-busy replica
      for (StorageGroup loc : block.getLocations()) {
        if (addTo(loc)) {
          return true;
        }
      }
      return false;
    }

    /** Add this move to a proxy source's pending queue. */
    private boolean addTo(StorageGroup g) {
      final DDatanode dn = g.getDDatanode();
      if (dn.addPendingBlock(this)) {
        proxySource = dn;
        return true;
      }
      return false;
    }
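    // Editorial summary (not upstream code) of the proxy-source choice
    // above, in decreasing order of preference:
    //   1. the source itself, when source and target are the same node
    //      (an in-place storage-type move needs no proxy);
    //   2. a replica in the target's node group, if the topology has them;
    //   3. a replica on the target's rack;
    //   4. any replica that is not busy.
    // A candidate is accepted only if addTo() succeeds, i.e. the proxy
    // datanode is not delayed and still has room for another pending move.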
    /** Dispatch the move to the proxy source and wait for the response. */
    private void dispatch() {
      LOG.info("Start moving " + this);

      Socket sock = new Socket();
      DataOutputStream out = null;
      DataInputStream in = null;
      try {
        sock.connect(
            NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),
            HdfsServerConstants.READ_TIMEOUT);

        // Set a read timeout so that it doesn't hang forever against
        // unresponsive nodes. The datanode normally sends an IN_PROGRESS
        // response twice within the client read timeout period (every 30
        // seconds by default). Here, we make it give up after 5 minutes
        // of no response.
        sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT * 5);
        sock.setKeepAlive(true);

        OutputStream unbufOut = sock.getOutputStream();
        InputStream unbufIn = sock.getInputStream();
        ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(),
            block.getBlock());
        final KeyManager km = nnc.getKeyManager();
        Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb);
        IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut,
            unbufIn, km, accessToken, target.getDatanodeInfo());
        unbufOut = saslStreams.out;
        unbufIn = saslStreams.in;
        out = new DataOutputStream(new BufferedOutputStream(unbufOut,
            HdfsConstants.IO_FILE_BUFFER_SIZE));
        in = new DataInputStream(new BufferedInputStream(unbufIn,
            HdfsConstants.IO_FILE_BUFFER_SIZE));

        sendRequest(out, eb, accessToken);
        receiveResponse(in);
        nnc.getBytesMoved().addAndGet(block.getNumBytes());
        LOG.info("Successfully moved " + this);
      } catch (IOException e) {
        LOG.warn("Failed to move " + this + ": " + e.getMessage());
        target.getDDatanode().setHasFailure();
        // The proxy or the target may have an issue; delay before using
        // these nodes further in order to avoid a potential storm of
        // "threads quota exceeded" warnings when the dispatcher gets out
        // of sync with work going on in the datanodes.
        proxySource.activateDelay(delayAfterErrors);
        target.getDDatanode().activateDelay(delayAfterErrors);
      } finally {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(sock);

        proxySource.removePendingBlock(this);
        target.getDDatanode().removePendingBlock(this);

        synchronized (this) {
          reset();
        }
        synchronized (Dispatcher.this) {
          Dispatcher.this.notifyAll();
        }
      }
    }

    /** Send a block replace request to the output stream. */
    private void sendRequest(DataOutputStream out, ExtendedBlock eb,
        Token<BlockTokenIdentifier> accessToken) throws IOException {
      new Sender(out).replaceBlock(eb, target.storageType, accessToken,
          source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);
    }

    /** Check whether the wait for a response should be aborted. */
    private boolean stopWaitingForResponse(long startTime) {
      return source.isIterationOver() ||
          (blockMoveTimeout > 0 &&
          (Time.monotonicNow() - startTime > blockMoveTimeout));
    }

    /** Receive a block copy response from the input stream. */
    private void receiveResponse(DataInputStream in) throws IOException {
      long startTime = Time.monotonicNow();
      BlockOpResponseProto response =
          BlockOpResponseProto.parseFrom(vintPrefixed(in));
      while (response.getStatus() == Status.IN_PROGRESS) {
        // read intermediate responses
        response = BlockOpResponseProto.parseFrom(vintPrefixed(in));
        // Stop waiting for slow block moves. Even if this client stops
        // waiting, the actual move may continue.
        if (stopWaitingForResponse(startTime)) {
          throw new IOException("Block move timed out");
        }
      }
      String logInfo = "block move failed";
      DataTransferProtoUtil.checkBlockOpStatus(response, logInfo);
    }

    /** Reset the object. */
    private void reset() {
      block = null;
      source = null;
      proxySource = null;
      target = null;
    }
  }
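  // Editorial sketch (not upstream code): the wire exchange performed by
  // PendingMove#dispatch(), using only constants that appear above.
  //
  //   balancer --- replaceBlock(eb, token, sourceUuid, proxy) --> target DN
  //   balancer <-- IN_PROGRESS (keep-alive, ~every 30 s) -------- target DN
  //   balancer <-- SUCCESS | ERROR ------------------------------ target DN
  //
  // The socket read timeout is READ_TIMEOUT * 5 (about 5 minutes), so a
  // move fails if no keep-alive arrives within that window; blockMoveTimeout,
  // when positive, additionally bounds the total wait.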
  /** A class for keeping track of block locations in the dispatcher. */
  public static class DBlock extends MovedBlocks.Locations<StorageGroup> {
    public DBlock(Block block) {
      super(block);
    }
  }

  /** This class represents a desired move. */
  static class Task {
    private final StorageGroup target;
    private long size; // bytes scheduled to move

    Task(StorageGroup target, long size) {
      this.target = target;
      this.size = size;
    }

    long getSize() {
      return size;
    }
  }

  /** A class that keeps track of a datanode. */
  public static class DDatanode {

    /** A group of storages in a datanode with the same storage type. */
    public class StorageGroup {
      final StorageType storageType;
      final long maxSize2Move;
      private long scheduledSize = 0L;

      private StorageGroup(StorageType storageType, long maxSize2Move) {
        this.storageType = storageType;
        this.maxSize2Move = maxSize2Move;
      }

      public StorageType getStorageType() {
        return storageType;
      }

      private DDatanode getDDatanode() {
        return DDatanode.this;
      }

      public DatanodeInfo getDatanodeInfo() {
        return DDatanode.this.datanode;
      }

      /** Decide whether this storage group still has bytes to schedule. */
      boolean hasSpaceForScheduling() {
        return hasSpaceForScheduling(0L);
      }

      synchronized boolean hasSpaceForScheduling(long size) {
        return availableSizeToMove() > size;
      }

      /** @return the number of bytes that can still be scheduled to move */
      synchronized long availableSizeToMove() {
        return maxSize2Move - scheduledSize;
      }

      /** Increment the scheduled size. */
      public synchronized void incScheduledSize(long size) {
        scheduledSize += size;
      }

      /** @return scheduled size */
      synchronized long getScheduledSize() {
        return scheduledSize;
      }

      /** Reset scheduled size to zero. */
      synchronized void resetScheduledSize() {
        scheduledSize = 0L;
      }

      private PendingMove addPendingMove(DBlock block, final PendingMove pm) {
        if (getDDatanode().addPendingBlock(pm)) {
          if (pm.markMovedIfGoodBlock(block, getStorageType())) {
            incScheduledSize(pm.block.getNumBytes());
            return pm;
          } else {
            getDDatanode().removePendingBlock(pm);
          }
        }
        return null;
      }

      /** @return the name for display */
      String getDisplayName() {
        return datanode + ":" + storageType;
      }

      @Override
      public String toString() {
        return getDisplayName();
      }

      @Override
      public int hashCode() {
        return getStorageType().hashCode() ^ getDatanodeInfo().hashCode();
      }

      @Override
      public boolean equals(Object obj) {
        if (this == obj) {
          return true;
        } else if (!(obj instanceof StorageGroup)) {
          return false;
        } else {
          final StorageGroup that = (StorageGroup) obj;
          return this.getStorageType() == that.getStorageType()
              && this.getDatanodeInfo().equals(that.getDatanodeInfo());
        }
      }

    }
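    // Worked example (editorial, made-up numbers) of the scheduling
    // arithmetic above. With maxSize2Move = 10 GB and scheduledSize = 7 GB:
    //
    //   availableSizeToMove()       == 10 GB - 7 GB == 3 GB
    //   hasSpaceForScheduling()     == (3 GB > 0)    -> true
    //   hasSpaceForScheduling(4 GB) == (3 GB > 4 GB) -> false
    //
    // so a 4 GB block would not be scheduled to this storage group.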
    final DatanodeInfo datanode;
    private final EnumMap<StorageType, Source> sourceMap
        = new EnumMap<StorageType, Source>(StorageType.class);
    private final EnumMap<StorageType, StorageGroup> targetMap
        = new EnumMap<StorageType, StorageGroup>(StorageType.class);
    protected long delayUntil = 0L;
    /** blocks being moved but not confirmed yet */
    private final List<PendingMove> pendings;
    private volatile boolean hasFailure = false;
    private ExecutorService moveExecutor;

    @Override
    public String toString() {
      return getClass().getSimpleName() + ":" + datanode;
    }

    private DDatanode(DatanodeInfo datanode, int maxConcurrentMoves) {
      this.datanode = datanode;
      this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);
    }

    public DatanodeInfo getDatanodeInfo() {
      return datanode;
    }

    synchronized ExecutorService initMoveExecutor(int poolSize) {
      return moveExecutor = Executors.newFixedThreadPool(poolSize);
    }

    synchronized ExecutorService getMoveExecutor() {
      return moveExecutor;
    }

    synchronized void shutdownMoveExecutor() {
      if (moveExecutor != null) {
        moveExecutor.shutdown();
        moveExecutor = null;
      }
    }

    private static <G extends StorageGroup> void put(StorageType storageType,
        G g, EnumMap<StorageType, G> map) {
      final StorageGroup existing = map.put(storageType, g);
      Preconditions.checkState(existing == null);
    }

    public StorageGroup addTarget(StorageType storageType, long maxSize2Move) {
      final StorageGroup g = new StorageGroup(storageType, maxSize2Move);
      put(storageType, g, targetMap);
      return g;
    }

    public Source addSource(StorageType storageType, long maxSize2Move,
        Dispatcher d) {
      final Source s = d.new Source(storageType, maxSize2Move, this);
      put(storageType, s, sourceMap);
      return s;
    }

    private synchronized void activateDelay(long delta) {
      delayUntil = Time.monotonicNow() + delta;
      LOG.info(this + " activateDelay " + delta / 1000.0 + " seconds");
    }

    private synchronized boolean isDelayActive() {
      if (delayUntil == 0 || Time.monotonicNow() > delayUntil) {
        delayUntil = 0;
        return false;
      }
      return true;
    }

    /** Check if all the dispatched moves are done. */
    synchronized boolean isPendingQEmpty() {
      return pendings.isEmpty();
    }

    /** Add a scheduled block move to the node. */
    synchronized boolean addPendingBlock(PendingMove pendingBlock) {
      if (!isDelayActive()) {
        return pendings.add(pendingBlock);
      }
      return false;
    }

    /** Remove a scheduled block move from the node. */
    synchronized boolean removePendingBlock(PendingMove pendingBlock) {
      return pendings.remove(pendingBlock);
    }

    void setHasFailure() {
      this.hasFailure = true;
    }
  }
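  // Editorial sketch (not upstream code): how the error delay above plays
  // out. After a failed move, dispatch() calls
  //
  //   dn.activateDelay(delayAfterErrors);  // 10 seconds by default
  //
  // and while the delay is active, addPendingBlock() returns false, so the
  // dispatcher schedules no new work to that node. Once monotonic time
  // passes delayUntil, isDelayActive() resets the field and the node
  // accepts pending moves again.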
  /** A node that can be a source of block moves. */
  public class Source extends DDatanode.StorageGroup {

    private final List<Task> tasks = new ArrayList<Task>(2);
    private long blocksToReceive = 0L;
    private final long startTime = Time.monotonicNow();
    /**
     * Source blocks point to the objects in {@link Dispatcher#globalBlocks}
     * because we want to keep one copy of a block and be aware that the
     * locations are changing over time.
     */
    private final List<DBlock> srcBlocks = new ArrayList<DBlock>();

    private Source(StorageType storageType, long maxSize2Move, DDatanode dn) {
      dn.super(storageType, maxSize2Move);
    }

    /**
     * Check if the iteration is over.
     */
    public boolean isIterationOver() {
      return (Time.monotonicNow() - startTime > MAX_ITERATION_TIME);
    }

    /** Add a task. */
    void addTask(Task task) {
      Preconditions.checkState(task.target != this,
          "Source and target are the same storage group " + getDisplayName());
      incScheduledSize(task.size);
      tasks.add(task);
    }

    /** @return an iterator to this source's blocks */
    Iterator<DBlock> getBlockIterator() {
      return srcBlocks.iterator();
    }

    /**
     * Fetch new blocks of this source from the namenode and update this
     * source's block list and {@link Dispatcher#globalBlocks}.
     *
     * @return the total size of the received blocks in bytes.
     */
    private long getBlockList() throws IOException {
      final long size = Math.min(getBlocksSize, blocksToReceive);
      final BlocksWithLocations newBlocks =
          nnc.getBlocks(getDatanodeInfo(), size);

      if (LOG.isTraceEnabled()) {
        LOG.trace("getBlocks(" + getDatanodeInfo() + ", "
            + StringUtils.TraditionalBinaryPrefix.long2String(size, "B", 2)
            + ") returns " + newBlocks.getBlocks().length + " blocks.");
      }

      long bytesReceived = 0;
      for (BlockWithLocations blk : newBlocks.getBlocks()) {
        // Skip small blocks.
        if (blk.getBlock().getNumBytes() < getBlocksMinBlockSize) {
          continue;
        }

        bytesReceived += blk.getBlock().getNumBytes();
        synchronized (globalBlocks) {
          final DBlock block = globalBlocks.get(blk.getBlock());
          synchronized (block) {
            block.clearLocations();

            // update locations
            final String[] datanodeUuids = blk.getDatanodeUuids();
            final StorageType[] storageTypes = blk.getStorageTypes();
            for (int i = 0; i < datanodeUuids.length; i++) {
              final StorageGroup g = storageGroupMap.get(
                  datanodeUuids[i], storageTypes[i]);
              if (g != null) { // not unknown
                block.addLocation(g);
              }
            }
          }
          if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Add " + block + " to " + this);
            }
            srcBlocks.add(block);
          }
        }
      }
      return bytesReceived;
    }

    /** Decide if the given block is a good candidate to move. */
    private boolean isGoodBlockCandidate(DBlock block) {
      // source and target must have the same storage type
      final StorageType sourceStorageType = getStorageType();
      for (Task t : tasks) {
        if (Dispatcher.this.isGoodBlockCandidate(this, t.target,
            sourceStorageType, block)) {
          return true;
        }
      }
      return false;
    }
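    // Worked example (editorial, made-up numbers) for getBlockList() above:
    // with getBlocksSize = 2 GB and blocksToReceive = 1.5 GB, the namenode
    // is asked for min(2 GB, 1.5 GB) = 1.5 GB worth of blocks. Blocks
    // smaller than getBlocksMinBlockSize are skipped and excluded from the
    // returned byte count, so the caller's
    //
    //   blocksToReceive -= getBlockList();
    //
    // decreases only by the bytes of blocks that are actually usable.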
    /**
     * Choose a move for this source. The block's source, target, and proxy
     * are determined too. When choosing the proxy and the target, source
     * and target throttling is taken into account: they are chosen only if
     * they have the capacity to support this block move. The block should
     * be dispatched immediately after this method returns.
     *
     * @return a move that's good for the source to dispatch immediately.
     */
    private PendingMove chooseNextMove() {
      for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {
        final Task task = i.next();
        final DDatanode target = task.target.getDDatanode();
        final PendingMove pendingBlock = new PendingMove(this, task.target);
        if (target.addPendingBlock(pendingBlock)) {
          // target is not busy, so do a tentative block allocation
          if (pendingBlock.chooseBlockAndProxy()) {
            long blockSize = pendingBlock.block.getNumBytes();
            incScheduledSize(-blockSize);
            task.size -= blockSize;
            if (task.size <= 0) {
              i.remove();
            }
            return pendingBlock;
          } else {
            // cancel the tentative move
            target.removePendingBlock(pendingBlock);
          }
        }
      }
      return null;
    }

    /** Add a pending move. */
    public PendingMove addPendingMove(DBlock block, StorageGroup target) {
      return target.addPendingMove(block, new PendingMove(this, target));
    }

    /** Iterate over all of this source's blocks to remove the moved ones. */
    private void removeMovedBlocks() {
      for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) {
        if (movedBlocks.contains(i.next().getBlock())) {
          i.remove();
        }
      }
    }

    /** @return whether more blocks should be fetched from the namenode */
    private boolean shouldFetchMoreBlocks() {
      return blocksToReceive > 0;
    }

    private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins
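    // Editorial note (not upstream code): chooseNextMove() above follows a
    // reserve/commit/rollback pattern against the target's pending queue:
    //
    //   if (target.addPendingBlock(pendingBlock)) {        // reserve a slot
    //     if (pendingBlock.chooseBlockAndProxy()) { ... }  // commit
    //     else target.removePendingBlock(pendingBlock);    // roll back
    //   }
    //
    // so a busy target rejects the reservation cheaply, before any block
    // or proxy has been selected.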
    /**
     * This method iteratively does the following: it first selects a block
     * to move, then sends a request to the proxy source to start the block
     * move; when the source's block list falls below a threshold, it asks
     * the namenode for more blocks. It terminates when it has dispatched
     * enough block move tasks, has received enough blocks from the
     * namenode, or the elapsed time of the iteration has exceeded the max
     * time limit.
     *
     * @param delay - time to sleep before sending getBlocks. Intended to
     * disperse Balancer RPCs to the NameNode for large clusters. See
     * HDFS-11384.
     */
    private void dispatchBlocks(long delay) {
      this.blocksToReceive = 2 * getScheduledSize();
      long previousMoveTimestamp = Time.monotonicNow();
      while (getScheduledSize() > 0 && !isIterationOver()
          && (!srcBlocks.isEmpty() || blocksToReceive > 0)) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + " blocksToReceive=" + blocksToReceive
              + ", scheduledSize=" + getScheduledSize()
              + ", srcBlocks#=" + srcBlocks.size());
        }
        final PendingMove p = chooseNextMove();
        if (p != null) {
          // Reset the previous move timestamp
          previousMoveTimestamp = Time.monotonicNow();
          executePendingMove(p);
          continue;
        }

        // Since we cannot schedule any block to move,
        // remove any moved blocks from the source block list.
        removeMovedBlocks(); // filter already moved blocks
        // Check if we should fetch more blocks from the namenode.
        if (shouldFetchMoreBlocks()) {
          // fetch new blocks
          try {
            if (delay > 0) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Sleeping " + delay + " msec.");
              }
              Thread.sleep(delay);
            }
            blocksToReceive -= getBlockList();
            continue;
          } catch (InterruptedException ignored) {
            // nothing to do
          } catch (IOException e) {
            LOG.warn("Exception while getting block list", e);
            return;
          } finally {
            delay = 0L;
          }
        } else {
          // jump out of the while-loop after the configured timeout
          long noMoveInterval = Time.monotonicNow() - previousMoveTimestamp;
          if (noMoveInterval > maxNoMoveInterval) {
            LOG.info("Failed to find a pending move for " + noMoveInterval
                + " ms. Skipping " + this);
            resetScheduledSize();
          }
        }

        // Now we cannot schedule any block to move and there are
        // no new blocks added to the source block list, so we wait.
        try {
          synchronized (Dispatcher.this) {
            Dispatcher.this.wait(1000); // wait for targets/sources to be idle
          }
          // No possible move was found in this pass of the while loop;
          // add a small delay before choosing the next move again.
          Thread.sleep(100);
        } catch (InterruptedException ignored) {
        }
      }

      if (isIterationOver()) {
        LOG.info("The maximum iteration time (" + MAX_ITERATION_TIME / 1000
            + " seconds) has been reached. Stopping " + this);
      }
    }

    @Override
    public int hashCode() {
      return super.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
      return super.equals(obj);
    }
  }
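  // Editorial note (not upstream code) on the 2x budget above:
  //
  //   blocksToReceive = 2 * getScheduledSize();
  //
  // dispatchBlocks() budgets twice the scheduled bytes in namenode block
  // reports because some reported blocks turn out to be unusable (already
  // moved, bad candidates, or below the minimum size). With 10 GB
  // scheduled, the source stops asking the namenode for more blocks after
  // roughly 20 GB of usable block metadata has been received.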
  /** Constructor called by the Mover. */
  public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
      int dispatcherThreads, int maxConcurrentMovesPerNode,
      int maxNoMoveInterval, Configuration conf) {
    this(nnc, includedNodes, excludedNodes, movedWinWidth,
        moverThreads, dispatcherThreads, maxConcurrentMovesPerNode,
        0L, 0L, 0, maxNoMoveInterval, conf);
  }

  Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
      int dispatcherThreads, int maxConcurrentMovesPerNode,
      long getBlocksSize, long getBlocksMinBlockSize,
      int blockMoveTimeout, int maxNoMoveInterval, Configuration conf) {
    this.nnc = nnc;
    this.excludedNodes = excludedNodes;
    this.includedNodes = includedNodes;
    this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth);

    this.cluster = NetworkTopology.getInstance(conf);

    this.dispatchExecutor = dispatcherThreads == 0 ? null
        : Executors.newFixedThreadPool(dispatcherThreads);
    this.moverThreadAllocator = new Allocator(moverThreads);
    this.maxMoverThreads = moverThreads;
    this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;

    this.getBlocksSize = getBlocksSize;
    this.getBlocksMinBlockSize = getBlocksMinBlockSize;
    this.blockMoveTimeout = blockMoveTimeout;
    this.maxNoMoveInterval = maxNoMoveInterval;

    this.saslClient = new SaslDataTransferClient(conf,
        DataTransferSaslUtil.getSaslPropertiesResolver(conf),
        TrustedChannelResolver.getInstance(conf), nnc.fallbackToSimpleAuth);
  }

  public DistributedFileSystem getDistributedFileSystem() {
    return nnc.getDistributedFileSystem();
  }

  public StorageGroupMap<StorageGroup> getStorageGroupMap() {
    return storageGroupMap;
  }

  public NetworkTopology getCluster() {
    return cluster;
  }

  long getBytesMoved() {
    return nnc.getBytesMoved().get();
  }

  long bytesToMove() {
    Preconditions.checkState(
        storageGroupMap.size() >= sources.size() + targets.size(),
        "Mismatched number of storage groups (" + storageGroupMap.size()
            + " < " + sources.size() + " sources + " + targets.size()
            + " targets)");

    long b = 0L;
    for (Source src : sources) {
      b += src.getScheduledSize();
    }
    return b;
  }

  void add(Source source, StorageGroup target) {
    sources.add(source);
    targets.add(target);
  }

  private boolean shouldIgnore(DatanodeInfo dn) {
    // ignore decommissioned nodes
    final boolean decommissioned = dn.isDecommissioned();
    // ignore decommissioning nodes
    final boolean decommissioning = dn.isDecommissionInProgress();
    // ignore nodes in the exclude list
    final boolean excluded = Util.isExcluded(excludedNodes, dn);
    // ignore nodes not in the include list (if the include list is not empty)
    final boolean notIncluded = !Util.isIncluded(includedNodes, dn);

    if (decommissioned || decommissioning || excluded || notIncluded) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Excluding datanode " + dn
            + ": decommissioned=" + decommissioned
            + ", decommissioning=" + decommissioning
            + ", excluded=" + excluded + ", notIncluded=" + notIncluded);
      }
      return true;
    }
    return false;
  }
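  // Illustrative example (editorial; the hostname is hypothetical): with
  //
  //   excludedNodes = { "dn3.example.com" }  and  includedNodes = { }
  //
  // every live datanode except dn3.example.com participates, because an
  // empty include list means "include everything" (see Util.isIncluded),
  // while decommissioned and decommissioning nodes are always skipped.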
  /** Get live datanode storage reports and then build the network topology. */
  public List<DatanodeStorageReport> init() throws IOException {
    final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();
    final List<DatanodeStorageReport> trimmed =
        new ArrayList<DatanodeStorageReport>();
    // Create the network topology and classify the utilization collections:
    // over-utilized, above-average, below-average and under-utilized.
    for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) {
      final DatanodeInfo datanode = r.getDatanodeInfo();
      if (shouldIgnore(datanode)) {
        continue;
      }
      trimmed.add(r);
      cluster.add(datanode);
    }
    return trimmed;
  }

  public DDatanode newDatanode(DatanodeInfo datanode) {
    return new DDatanode(datanode, maxConcurrentMovesPerNode);
  }

  public void executePendingMove(final PendingMove p) {
    // move the block
    final DDatanode targetDn = p.target.getDDatanode();
    ExecutorService moveExecutor = targetDn.getMoveExecutor();
    if (moveExecutor == null) {
      final int nThreads = moverThreadAllocator.allocate();
      if (nThreads > 0) {
        moveExecutor = targetDn.initMoveExecutor(nThreads);
      }
    }
    if (moveExecutor == null) {
      LOG.warn("No mover threads available: skip moving " + p);
      targetDn.removePendingBlock(p);
      p.proxySource.removePendingBlock(p);
      return;
    }

    moveExecutor.execute(new Runnable() {
      @Override
      public void run() {
        p.dispatch();
      }
    });
  }
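  // Editorial sketch (not upstream code): each target datanode lazily gets
  // its own fixed-size mover pool, carved out of the shared Allocator
  // budget:
  //
  //   ExecutorService pool = targetDn.getMoveExecutor();
  //   if (pool == null) {
  //     int n = moverThreadAllocator.allocate(); // one lot; may be 0
  //     if (n > 0) {
  //       pool = targetDn.initMoveExecutor(n);
  //     }
  //   }
  //
  // When the budget is exhausted (n == 0), the move is skipped rather than
  // queued, and its pending-block reservations are released.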
  public boolean dispatchAndCheckContinue() throws InterruptedException {
    return nnc.shouldContinue(dispatchBlockMoves());
  }

  /**
   * The best-effort limit on the number of RPCs per second
   * the Balancer will send to the NameNode.
   */
  final static int BALANCER_NUM_RPC_PER_SEC = 20;

  /**
   * Dispatch block moves for each source. Each dispatcher thread selects
   * blocks to move and sends requests to proxy sources to initiate the
   * block moves. The process is flow controlled: block selection blocks
   * when there are too many unconfirmed block moves.
   *
   * @return the total number of bytes successfully moved in this iteration.
   */
  private long dispatchBlockMoves() throws InterruptedException {
    final long bytesLastMoved = getBytesMoved();
    final Future<?>[] futures = new Future<?>[sources.size()];

    int concurrentThreads = Math.min(sources.size(),
        ((ThreadPoolExecutor) dispatchExecutor).getCorePoolSize());
    assert concurrentThreads > 0 : "Number of concurrent threads is 0.";
    if (LOG.isDebugEnabled()) {
      LOG.debug("Balancer allowed RPCs per sec = " + BALANCER_NUM_RPC_PER_SEC);
      LOG.debug("Balancer concurrent threads = " + concurrentThreads);
      LOG.debug("Disperse Interval sec = "
          + concurrentThreads / BALANCER_NUM_RPC_PER_SEC);
    }

    // Determine the size of each mover thread pool per target
    int threadsPerTarget = maxMoverThreads / targets.size();
    if (threadsPerTarget == 0) {
      // Some scheduled moves will get ignored as some targets won't have
      // any threads allocated.
      moverThreadAllocator.setLotSize(1);
      LOG.warn(DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY + "="
          + maxMoverThreads + " is too small for moving blocks to "
          + targets.size() + " targets. Balancing may be slower.");
    } else {
      if (threadsPerTarget > maxConcurrentMovesPerNode) {
        threadsPerTarget = maxConcurrentMovesPerNode;
        LOG.info("Limiting threads per target to the specified max.");
      }
      moverThreadAllocator.setLotSize(threadsPerTarget);
      LOG.info("Allocating " + threadsPerTarget + " threads per target.");
    }

    long dSec = 0;
    final Iterator<Source> i = sources.iterator();
    for (int j = 0; j < futures.length; j++) {
      final Source s = i.next();
      final long delay = dSec * 1000;
      futures[j] = dispatchExecutor.submit(new Runnable() {
        @Override
        public void run() {
          s.dispatchBlocks(delay);
        }
      });
      // Calculate the delay in seconds for the next iteration
      if (j >= concurrentThreads) {
        dSec = 0;
      } else if ((j + 1) % BALANCER_NUM_RPC_PER_SEC == 0) {
        dSec++;
      }
    }

    // wait for all dispatcher threads to finish
    for (Future<?> future : futures) {
      try {
        future.get();
      } catch (ExecutionException e) {
        LOG.warn("Dispatcher thread failed", e.getCause());
      }
    }

    // wait for all block moving to be done
    waitForMoveCompletion(targets);

    return getBytesMoved() - bytesLastMoved;
  }
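  // Worked example (editorial, made-up numbers) of the RPC dispersal above
  // (HDFS-11384): with concurrentThreads = 50 dispatcher threads, 100
  // sources, and BALANCER_NUM_RPC_PER_SEC = 20,
  //
  //   sources  0..19 start with delay 0 s,
  //   sources 20..39 with delay 1 s,
  //   sources 40..50 with delay 2 s,
  //   sources 51..99 with delay 0 s (they queue behind earlier tasks in
  //                  the dispatch executor, so no extra delay is needed),
  //
  // keeping the initial getBlocks() calls near 20 RPCs/sec at the NameNode.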
  /**
   * Wait for all block move confirmations.
   * @return true if any move execution failed
   */
  public static boolean waitForMoveCompletion(
      Iterable<? extends StorageGroup> targets) {
    boolean hasFailure = false;
    for (;;) {
      boolean empty = true;
      for (StorageGroup t : targets) {
        if (!t.getDDatanode().isPendingQEmpty()) {
          empty = false;
          break;
        } else {
          hasFailure |= t.getDDatanode().hasFailure;
        }
      }
      if (empty) {
        return hasFailure; // all pending queues are empty
      }
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {
      }
    }
  }

  /**
   * Decide if the block is a good candidate to be moved from source to
   * target. A block is a good candidate if
   * 1. the block is not in the process of being moved and has not been moved;
   * 2. the block does not have a replica on the target;
   * 3. doing the move does not reduce the number of racks that the block has.
   */
  private boolean isGoodBlockCandidate(StorageGroup source, StorageGroup target,
      StorageType targetStorageType, DBlock block) {
    if (source.equals(target)) {
      return false;
    }
    if (target.storageType != targetStorageType) {
      return false;
    }
    // check if the block has been moved already
    if (movedBlocks.contains(block.getBlock())) {
      return false;
    }
    final DatanodeInfo targetDatanode = target.getDatanodeInfo();
    if (source.getDatanodeInfo().equals(targetDatanode)) {
      // the block is being moved within the same datanode
      return true;
    }

    // check if the block has a replica on the target node
    for (StorageGroup blockLocation : block.getLocations()) {
      if (blockLocation.getDatanodeInfo().equals(targetDatanode)) {
        return false;
      }
    }

    if (cluster.isNodeGroupAware()
        && isOnSameNodeGroupWithReplicas(source, target, block)) {
      return false;
    }
    if (reduceNumOfRacks(source, target, block)) {
      return false;
    }
    return true;
  }

  /**
   * Determine whether moving the given block replica from source to target
   * would reduce the number of racks holding replicas of the block.
   */
  private boolean reduceNumOfRacks(StorageGroup source, StorageGroup target,
      DBlock block) {
    final DatanodeInfo sourceDn = source.getDatanodeInfo();
    if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) {
      // source and target are on the same rack
      return false;
    }
    boolean notOnSameRack = true;
    synchronized (block) {
      for (StorageGroup loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanodeInfo(),
            target.getDatanodeInfo())) {
          notOnSameRack = false;
          break;
        }
      }
    }
    if (notOnSameRack) {
      // target is not on the same rack as any replica
      return false;
    }
    for (StorageGroup g : block.getLocations()) {
      if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) {
        // the source is on the same rack as another replica
        return false;
      }
    }
    return true;
  }

  /**
   * Check if there is any replica (other than the source) on the same node
   * group as the target. If so, the target is not a good candidate for
   * placing this replica, as we do not want two replicas under the same
   * node group.
   *
   * @return true if there is any replica (other than the source) on the
   *         same node group as the target
   */
  private boolean isOnSameNodeGroupWithReplicas(StorageGroup source,
      StorageGroup target, DBlock block) {
    final DatanodeInfo targetDn = target.getDatanodeInfo();
    for (StorageGroup g : block.getLocations()) {
      if (g != source
          && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) {
        return true;
      }
    }
    return false;
  }

  /** Reset all fields in order to prepare for the next iteration. */
  void reset(Configuration conf) {
    cluster = NetworkTopology.getInstance(conf);
    storageGroupMap.clear();
    sources.clear();

    moverThreadAllocator.reset();
    for (StorageGroup t : targets) {
      t.getDDatanode().shutdownMoveExecutor();
    }
    targets.clear();
    globalBlocks.removeAllButRetain(movedBlocks);
    movedBlocks.cleanup();
  }

  @VisibleForTesting
  public static void setDelayAfterErrors(long time) {
    delayAfterErrors = time;
  }

  /** Shut down the thread pools. */
  public void shutdownNow() {
    if (dispatchExecutor != null) {
      dispatchExecutor.shutdownNow();
    }
  }

  static class Util {
    /** @return true if the datanode is part of excludedNodes. */
    static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) {
      return isIn(excludedNodes, dn);
    }

    /**
     * @return true if includedNodes is empty or the datanode is part of
     *         includedNodes.
     */
    static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) {
      return (includedNodes.isEmpty() || isIn(includedNodes, dn));
    }

    /**
     * The match is checked against the host name and the IP address, each
     * with and without the port number.
     *
     * @return true if the datanode's transfer address matches the set of
     *         nodes.
     */
    private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) {
      return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort())
          || isIn(datanodes, dn.getIpAddr(), dn.getXferPort())
          || isIn(datanodes, dn.getHostName(), dn.getXferPort());
    }

    /** @return true if nodes contains host or host:port */
    private static boolean isIn(Set<String> nodes, String host, int port) {
      if (host == null) {
        return false;
      }
      return (nodes.contains(host) || nodes.contains(host + ":" + port));
    }
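    // Illustrative example (editorial; hostname and port are hypothetical):
    // a datanode with host name "dn1.example.com", IP "10.0.0.5", and
    // transfer port 50010 matches an include/exclude set containing any of
    //
    //   "dn1.example.com", "dn1.example.com:50010",
    //   "10.0.0.5",        "10.0.0.5:50010"
    //
    // because isIn() tries each name form both with and without the port.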
    /**
     * Parse a comma-separated string to obtain a set of host names.
     *
     * @return the set of host names
     */
    static Set<String> parseHostList(String string) {
      String[] addrs = StringUtils.getTrimmedStrings(string);
      return new HashSet<String>(Arrays.asList(addrs));
    }

    /**
     * Read a set of host names from a file.
     *
     * @return the set of host names
     */
    static Set<String> getHostListFromFile(String fileName, String type) {
      Set<String> nodes = new HashSet<String>();
      try {
        HostsFileReader.readFileToSet(type, fileName, nodes);
        return StringUtils.getTrimmedStrings(nodes);
      } catch (IOException e) {
        throw new IllegalArgumentException(
            "Failed to read host list from file: " + fileName);
      }
    }
  }
}