001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.balancer; 019 020import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed; 021 022import java.io.BufferedInputStream; 023import java.io.BufferedOutputStream; 024import java.io.DataInputStream; 025import java.io.DataOutputStream; 026import java.io.IOException; 027import java.io.InputStream; 028import java.io.OutputStream; 029import java.net.Socket; 030import java.util.ArrayList; 031import java.util.Arrays; 032import java.util.Collection; 033import java.util.EnumMap; 034import java.util.HashMap; 035import java.util.HashSet; 036import java.util.Iterator; 037import java.util.List; 038import java.util.Map; 039import java.util.Set; 040import java.util.concurrent.ExecutionException; 041import java.util.concurrent.ExecutorService; 042import java.util.concurrent.Executors; 043import java.util.concurrent.Future; 044 045import org.apache.commons.logging.Log; 046import org.apache.commons.logging.LogFactory; 047import org.apache.hadoop.classification.InterfaceAudience; 048import org.apache.hadoop.conf.Configuration; 049import org.apache.hadoop.fs.CommonConfigurationKeys; 050import 
org.apache.hadoop.fs.StorageType; 051import org.apache.hadoop.hdfs.DFSConfigKeys; 052import org.apache.hadoop.hdfs.DFSUtil; 053import org.apache.hadoop.hdfs.DistributedFileSystem; 054import org.apache.hadoop.hdfs.protocol.Block; 055import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 056import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 057import org.apache.hadoop.hdfs.protocol.HdfsConstants; 058import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil; 059import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair; 060import org.apache.hadoop.hdfs.protocol.datatransfer.Sender; 061import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver; 062import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil; 063import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient; 064import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto; 065import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status; 066import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; 067import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup; 068import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; 069import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; 070import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations; 071import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport; 072import org.apache.hadoop.io.IOUtils; 073import org.apache.hadoop.net.NetUtils; 074import org.apache.hadoop.net.NetworkTopology; 075import org.apache.hadoop.security.token.Token; 076import org.apache.hadoop.util.HostsFileReader; 077import org.apache.hadoop.util.StringUtils; 078import org.apache.hadoop.util.Time; 079 080import com.google.common.annotations.VisibleForTesting; 081import com.google.common.base.Preconditions; 082 083/** Dispatching block replica moves between datanodes. 
*/ 084@InterfaceAudience.Private 085public class Dispatcher { 086 static final Log LOG = LogFactory.getLog(Dispatcher.class); 087 088 private static final long GB = 1L << 30; // 1GB 089 private static final long MAX_BLOCKS_SIZE_TO_FETCH = 2 * GB; 090 091 private static final int MAX_NO_PENDING_MOVE_ITERATIONS = 5; 092 /** 093 * the period of time to delay the usage of a DataNode after hitting 094 * errors when using it for migrating data 095 */ 096 private static long delayAfterErrors = 10 * 1000; 097 098 private final NameNodeConnector nnc; 099 private final SaslDataTransferClient saslClient; 100 101 /** Set of datanodes to be excluded. */ 102 private final Set<String> excludedNodes; 103 /** Restrict to the following nodes. */ 104 private final Set<String> includedNodes; 105 106 private final Collection<Source> sources = new HashSet<Source>(); 107 private final Collection<StorageGroup> targets = new HashSet<StorageGroup>(); 108 109 private final GlobalBlockMap globalBlocks = new GlobalBlockMap(); 110 private final MovedBlocks<StorageGroup> movedBlocks; 111 112 /** Map (datanodeUuid,storageType -> StorageGroup) */ 113 private final StorageGroupMap<StorageGroup> storageGroupMap 114 = new StorageGroupMap<StorageGroup>(); 115 116 private NetworkTopology cluster; 117 118 private final ExecutorService moveExecutor; 119 private final ExecutorService dispatchExecutor; 120 121 /** The maximum number of concurrent blocks moves at a datanode */ 122 private final int maxConcurrentMovesPerNode; 123 124 private static class GlobalBlockMap { 125 private final Map<Block, DBlock> map = new HashMap<Block, DBlock>(); 126 127 /** 128 * Get the block from the map; 129 * if the block is not found, create a new block and put it in the map. 130 */ 131 private DBlock get(Block b) { 132 DBlock block = map.get(b); 133 if (block == null) { 134 block = new DBlock(b); 135 map.put(b, block); 136 } 137 return block; 138 } 139 140 /** Remove all blocks except for the moved blocks. 
*/ 141 private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) { 142 for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) { 143 if (!movedBlocks.contains(i.next())) { 144 i.remove(); 145 } 146 } 147 } 148 } 149 150 public static class StorageGroupMap<G extends StorageGroup> { 151 private static String toKey(String datanodeUuid, StorageType storageType) { 152 return datanodeUuid + ":" + storageType; 153 } 154 155 private final Map<String, G> map = new HashMap<String, G>(); 156 157 public G get(String datanodeUuid, StorageType storageType) { 158 return map.get(toKey(datanodeUuid, storageType)); 159 } 160 161 public void put(G g) { 162 final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType); 163 final StorageGroup existing = map.put(key, g); 164 Preconditions.checkState(existing == null); 165 } 166 167 int size() { 168 return map.size(); 169 } 170 171 void clear() { 172 map.clear(); 173 } 174 175 public Collection<G> values() { 176 return map.values(); 177 } 178 } 179 180 /** This class keeps track of a scheduled block move */ 181 public class PendingMove { 182 private DBlock block; 183 private Source source; 184 private DDatanode proxySource; 185 private StorageGroup target; 186 187 private PendingMove(Source source, StorageGroup target) { 188 this.source = source; 189 this.target = target; 190 } 191 192 @Override 193 public String toString() { 194 final Block b = block != null ? block.getBlock() : null; 195 String bStr = b != null ? (b + " with size=" + b.getNumBytes() + " ") 196 : " "; 197 return bStr + "from " + source.getDisplayName() + " to " + target 198 .getDisplayName() + " through " + (proxySource != null ? proxySource 199 .datanode : ""); 200 } 201 202 /** 203 * Choose a block & a proxy source for this pendingMove whose source & 204 * target have already been chosen. 
205 * 206 * @return true if a block and its proxy are chosen; false otherwise 207 */ 208 private boolean chooseBlockAndProxy() { 209 // source and target must have the same storage type 210 final StorageType t = source.getStorageType(); 211 // iterate all source's blocks until find a good one 212 for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) { 213 if (markMovedIfGoodBlock(i.next(), t)) { 214 i.remove(); 215 return true; 216 } 217 } 218 return false; 219 } 220 221 /** 222 * @return true if the given block is good for the tentative move. 223 */ 224 private boolean markMovedIfGoodBlock(DBlock block, StorageType targetStorageType) { 225 synchronized (block) { 226 synchronized (movedBlocks) { 227 if (isGoodBlockCandidate(source, target, targetStorageType, block)) { 228 this.block = block; 229 if (chooseProxySource()) { 230 movedBlocks.put(block); 231 if (LOG.isDebugEnabled()) { 232 LOG.debug("Decided to move " + this); 233 } 234 return true; 235 } 236 } 237 } 238 } 239 return false; 240 } 241 242 /** 243 * Choose a proxy source. 
244 * 245 * @return true if a proxy is found; otherwise false 246 */ 247 private boolean chooseProxySource() { 248 final DatanodeInfo targetDN = target.getDatanodeInfo(); 249 // if source and target are same nodes then no need of proxy 250 if (source.getDatanodeInfo().equals(targetDN) && addTo(source)) { 251 return true; 252 } 253 // if node group is supported, first try add nodes in the same node group 254 if (cluster.isNodeGroupAware()) { 255 for (StorageGroup loc : block.getLocations()) { 256 if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN) 257 && addTo(loc)) { 258 return true; 259 } 260 } 261 } 262 // check if there is replica which is on the same rack with the target 263 for (StorageGroup loc : block.getLocations()) { 264 if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN) && addTo(loc)) { 265 return true; 266 } 267 } 268 // find out a non-busy replica 269 for (StorageGroup loc : block.getLocations()) { 270 if (addTo(loc)) { 271 return true; 272 } 273 } 274 return false; 275 } 276 277 /** add to a proxy source for specific block movement */ 278 private boolean addTo(StorageGroup g) { 279 final DDatanode dn = g.getDDatanode(); 280 if (dn.addPendingBlock(this)) { 281 proxySource = dn; 282 return true; 283 } 284 return false; 285 } 286 287 /** Dispatch the move to the proxy source & wait for the response. 
*/ 288 private void dispatch() { 289 if (LOG.isDebugEnabled()) { 290 LOG.debug("Start moving " + this); 291 } 292 293 Socket sock = new Socket(); 294 DataOutputStream out = null; 295 DataInputStream in = null; 296 try { 297 sock.connect( 298 NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()), 299 HdfsServerConstants.READ_TIMEOUT); 300 301 sock.setKeepAlive(true); 302 303 OutputStream unbufOut = sock.getOutputStream(); 304 InputStream unbufIn = sock.getInputStream(); 305 ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(), 306 block.getBlock()); 307 final KeyManager km = nnc.getKeyManager(); 308 Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb); 309 IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut, 310 unbufIn, km, accessToken, target.getDatanodeInfo()); 311 unbufOut = saslStreams.out; 312 unbufIn = saslStreams.in; 313 out = new DataOutputStream(new BufferedOutputStream(unbufOut, 314 HdfsConstants.IO_FILE_BUFFER_SIZE)); 315 in = new DataInputStream(new BufferedInputStream(unbufIn, 316 HdfsConstants.IO_FILE_BUFFER_SIZE)); 317 318 sendRequest(out, eb, accessToken); 319 receiveResponse(in); 320 nnc.getBytesMoved().addAndGet(block.getNumBytes()); 321 LOG.info("Successfully moved " + this); 322 } catch (IOException e) { 323 LOG.warn("Failed to move " + this + ": " + e.getMessage()); 324 target.getDDatanode().setHasFailure(); 325 // Proxy or target may have some issues, delay before using these nodes 326 // further in order to avoid a potential storm of "threads quota 327 // exceeded" warnings when the dispatcher gets out of sync with work 328 // going on in datanodes. 
329 proxySource.activateDelay(delayAfterErrors); 330 target.getDDatanode().activateDelay(delayAfterErrors); 331 } finally { 332 IOUtils.closeStream(out); 333 IOUtils.closeStream(in); 334 IOUtils.closeSocket(sock); 335 336 proxySource.removePendingBlock(this); 337 target.getDDatanode().removePendingBlock(this); 338 339 synchronized (this) { 340 reset(); 341 } 342 synchronized (Dispatcher.this) { 343 Dispatcher.this.notifyAll(); 344 } 345 } 346 } 347 348 /** Send a block replace request to the output stream */ 349 private void sendRequest(DataOutputStream out, ExtendedBlock eb, 350 Token<BlockTokenIdentifier> accessToken) throws IOException { 351 new Sender(out).replaceBlock(eb, target.storageType, accessToken, 352 source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode); 353 } 354 355 /** Receive a block copy response from the input stream */ 356 private void receiveResponse(DataInputStream in) throws IOException { 357 BlockOpResponseProto response = 358 BlockOpResponseProto.parseFrom(vintPrefixed(in)); 359 while (response.getStatus() == Status.IN_PROGRESS) { 360 // read intermediate responses 361 response = BlockOpResponseProto.parseFrom(vintPrefixed(in)); 362 } 363 String logInfo = "block move is failed"; 364 DataTransferProtoUtil.checkBlockOpStatus(response, logInfo); 365 } 366 367 /** reset the object */ 368 private void reset() { 369 block = null; 370 source = null; 371 proxySource = null; 372 target = null; 373 } 374 } 375 376 /** A class for keeping track of block locations in the dispatcher. */ 377 public static class DBlock extends MovedBlocks.Locations<StorageGroup> { 378 public DBlock(Block block) { 379 super(block); 380 } 381 } 382 383 /** The class represents a desired move. 
*/ 384 static class Task { 385 private final StorageGroup target; 386 private long size; // bytes scheduled to move 387 388 Task(StorageGroup target, long size) { 389 this.target = target; 390 this.size = size; 391 } 392 393 long getSize() { 394 return size; 395 } 396 } 397 398 /** A class that keeps track of a datanode. */ 399 public static class DDatanode { 400 401 /** A group of storages in a datanode with the same storage type. */ 402 public class StorageGroup { 403 final StorageType storageType; 404 final long maxSize2Move; 405 private long scheduledSize = 0L; 406 407 private StorageGroup(StorageType storageType, long maxSize2Move) { 408 this.storageType = storageType; 409 this.maxSize2Move = maxSize2Move; 410 } 411 412 public StorageType getStorageType() { 413 return storageType; 414 } 415 416 private DDatanode getDDatanode() { 417 return DDatanode.this; 418 } 419 420 public DatanodeInfo getDatanodeInfo() { 421 return DDatanode.this.datanode; 422 } 423 424 /** Decide if still need to move more bytes */ 425 boolean hasSpaceForScheduling() { 426 return hasSpaceForScheduling(0L); 427 } 428 429 synchronized boolean hasSpaceForScheduling(long size) { 430 return availableSizeToMove() > size; 431 } 432 433 /** @return the total number of bytes that need to be moved */ 434 synchronized long availableSizeToMove() { 435 return maxSize2Move - scheduledSize; 436 } 437 438 /** increment scheduled size */ 439 public synchronized void incScheduledSize(long size) { 440 scheduledSize += size; 441 } 442 443 /** @return scheduled size */ 444 synchronized long getScheduledSize() { 445 return scheduledSize; 446 } 447 448 /** Reset scheduled size to zero. 
*/ 449 synchronized void resetScheduledSize() { 450 scheduledSize = 0L; 451 } 452 453 private PendingMove addPendingMove(DBlock block, final PendingMove pm) { 454 if (getDDatanode().addPendingBlock(pm)) { 455 if (pm.markMovedIfGoodBlock(block, getStorageType())) { 456 incScheduledSize(pm.block.getNumBytes()); 457 return pm; 458 } else { 459 getDDatanode().removePendingBlock(pm); 460 } 461 } 462 return null; 463 } 464 465 /** @return the name for display */ 466 String getDisplayName() { 467 return datanode + ":" + storageType; 468 } 469 470 @Override 471 public String toString() { 472 return getDisplayName(); 473 } 474 } 475 476 final DatanodeInfo datanode; 477 private final EnumMap<StorageType, Source> sourceMap 478 = new EnumMap<StorageType, Source>(StorageType.class); 479 private final EnumMap<StorageType, StorageGroup> targetMap 480 = new EnumMap<StorageType, StorageGroup>(StorageType.class); 481 protected long delayUntil = 0L; 482 /** blocks being moved but not confirmed yet */ 483 private final List<PendingMove> pendings; 484 private volatile boolean hasFailure = false; 485 private final int maxConcurrentMoves; 486 487 @Override 488 public String toString() { 489 return getClass().getSimpleName() + ":" + datanode; 490 } 491 492 private DDatanode(DatanodeInfo datanode, int maxConcurrentMoves) { 493 this.datanode = datanode; 494 this.maxConcurrentMoves = maxConcurrentMoves; 495 this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves); 496 } 497 498 public DatanodeInfo getDatanodeInfo() { 499 return datanode; 500 } 501 502 private static <G extends StorageGroup> void put(StorageType storageType, 503 G g, EnumMap<StorageType, G> map) { 504 final StorageGroup existing = map.put(storageType, g); 505 Preconditions.checkState(existing == null); 506 } 507 508 public StorageGroup addTarget(StorageType storageType, long maxSize2Move) { 509 final StorageGroup g = new StorageGroup(storageType, maxSize2Move); 510 put(storageType, g, targetMap); 511 return g; 512 } 513 
514 public Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) { 515 final Source s = d.new Source(storageType, maxSize2Move, this); 516 put(storageType, s, sourceMap); 517 return s; 518 } 519 520 synchronized private void activateDelay(long delta) { 521 delayUntil = Time.monotonicNow() + delta; 522 } 523 524 synchronized private boolean isDelayActive() { 525 if (delayUntil == 0 || Time.monotonicNow() > delayUntil) { 526 delayUntil = 0; 527 return false; 528 } 529 return true; 530 } 531 532 /** Check if the node can schedule more blocks to move */ 533 synchronized boolean isPendingQNotFull() { 534 return pendings.size() < maxConcurrentMoves; 535 } 536 537 /** Check if all the dispatched moves are done */ 538 synchronized boolean isPendingQEmpty() { 539 return pendings.isEmpty(); 540 } 541 542 /** Add a scheduled block move to the node */ 543 synchronized boolean addPendingBlock(PendingMove pendingBlock) { 544 if (!isDelayActive() && isPendingQNotFull()) { 545 return pendings.add(pendingBlock); 546 } 547 return false; 548 } 549 550 /** Remove a scheduled block move from the node */ 551 synchronized boolean removePendingBlock(PendingMove pendingBlock) { 552 return pendings.remove(pendingBlock); 553 } 554 555 void setHasFailure() { 556 this.hasFailure = true; 557 } 558 } 559 560 /** A node that can be the sources of a block move */ 561 public class Source extends DDatanode.StorageGroup { 562 563 private final List<Task> tasks = new ArrayList<Task>(2); 564 private long blocksToReceive = 0L; 565 /** 566 * Source blocks point to the objects in {@link Dispatcher#globalBlocks} 567 * because we want to keep one copy of a block and be aware that the 568 * locations are changing over time. 
569 */ 570 private final List<DBlock> srcBlocks = new ArrayList<DBlock>(); 571 572 private Source(StorageType storageType, long maxSize2Move, DDatanode dn) { 573 dn.super(storageType, maxSize2Move); 574 } 575 576 /** Add a task */ 577 void addTask(Task task) { 578 Preconditions.checkState(task.target != this, 579 "Source and target are the same storage group " + getDisplayName()); 580 incScheduledSize(task.size); 581 tasks.add(task); 582 } 583 584 /** @return an iterator to this source's blocks */ 585 Iterator<DBlock> getBlockIterator() { 586 return srcBlocks.iterator(); 587 } 588 589 /** 590 * Fetch new blocks of this source from namenode and update this source's 591 * block list & {@link Dispatcher#globalBlocks}. 592 * 593 * @return the total size of the received blocks in the number of bytes. 594 */ 595 private long getBlockList() throws IOException { 596 final long size = Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive); 597 final BlocksWithLocations newBlocks = nnc.getBlocks(getDatanodeInfo(), size); 598 599 long bytesReceived = 0; 600 for (BlockWithLocations blk : newBlocks.getBlocks()) { 601 bytesReceived += blk.getBlock().getNumBytes(); 602 synchronized (globalBlocks) { 603 final DBlock block = globalBlocks.get(blk.getBlock()); 604 synchronized (block) { 605 block.clearLocations(); 606 607 // update locations 608 final String[] datanodeUuids = blk.getDatanodeUuids(); 609 final StorageType[] storageTypes = blk.getStorageTypes(); 610 for (int i = 0; i < datanodeUuids.length; i++) { 611 final StorageGroup g = storageGroupMap.get( 612 datanodeUuids[i], storageTypes[i]); 613 if (g != null) { // not unknown 614 block.addLocation(g); 615 } 616 } 617 } 618 if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) { 619 // filter bad candidates 620 srcBlocks.add(block); 621 } 622 } 623 } 624 return bytesReceived; 625 } 626 627 /** Decide if the given block is a good candidate to move or not */ 628 private boolean isGoodBlockCandidate(DBlock block) { 629 // 
source and target must have the same storage type 630 final StorageType sourceStorageType = getStorageType(); 631 for (Task t : tasks) { 632 if (Dispatcher.this.isGoodBlockCandidate(this, t.target, 633 sourceStorageType, block)) { 634 return true; 635 } 636 } 637 return false; 638 } 639 640 /** 641 * Choose a move for the source. The block's source, target, and proxy 642 * are determined too. When choosing proxy and target, source & 643 * target throttling has been considered. They are chosen only when they 644 * have the capacity to support this block move. The block should be 645 * dispatched immediately after this method is returned. 646 * 647 * @return a move that's good for the source to dispatch immediately. 648 */ 649 private PendingMove chooseNextMove() { 650 for (Iterator<Task> i = tasks.iterator(); i.hasNext();) { 651 final Task task = i.next(); 652 final DDatanode target = task.target.getDDatanode(); 653 final PendingMove pendingBlock = new PendingMove(this, task.target); 654 if (target.addPendingBlock(pendingBlock)) { 655 // target is not busy, so do a tentative block allocation 656 if (pendingBlock.chooseBlockAndProxy()) { 657 long blockSize = pendingBlock.block.getNumBytes(); 658 incScheduledSize(-blockSize); 659 task.size -= blockSize; 660 if (task.size == 0) { 661 i.remove(); 662 } 663 return pendingBlock; 664 } else { 665 // cancel the tentative move 666 target.removePendingBlock(pendingBlock); 667 } 668 } 669 } 670 return null; 671 } 672 673 /** Add a pending move */ 674 public PendingMove addPendingMove(DBlock block, StorageGroup target) { 675 return target.addPendingMove(block, new PendingMove(this, target)); 676 } 677 678 /** Iterate all source's blocks to remove moved ones */ 679 private void removeMovedBlocks() { 680 for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) { 681 if (movedBlocks.contains(i.next().getBlock())) { 682 i.remove(); 683 } 684 } 685 } 686 687 private static final int SOURCE_BLOCKS_MIN_SIZE = 5; 688 689 /** @return 
if should fetch more blocks from namenode */ 690 private boolean shouldFetchMoreBlocks() { 691 return srcBlocks.size() < SOURCE_BLOCKS_MIN_SIZE && blocksToReceive > 0; 692 } 693 694 private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins 695 696 /** 697 * This method iteratively does the following: it first selects a block to 698 * move, then sends a request to the proxy source to start the block move 699 * when the source's block list falls below a threshold, it asks the 700 * namenode for more blocks. It terminates when it has dispatch enough block 701 * move tasks or it has received enough blocks from the namenode, or the 702 * elapsed time of the iteration has exceeded the max time limit. 703 */ 704 private void dispatchBlocks() { 705 final long startTime = Time.monotonicNow(); 706 this.blocksToReceive = 2 * getScheduledSize(); 707 boolean isTimeUp = false; 708 int noPendingMoveIteration = 0; 709 while (!isTimeUp && getScheduledSize() > 0 710 && (!srcBlocks.isEmpty() || blocksToReceive > 0)) { 711 final PendingMove p = chooseNextMove(); 712 if (p != null) { 713 // Reset no pending move counter 714 noPendingMoveIteration=0; 715 executePendingMove(p); 716 continue; 717 } 718 719 // Since we cannot schedule any block to move, 720 // remove any moved blocks from the source block list and 721 removeMovedBlocks(); // filter already moved blocks 722 // check if we should fetch more blocks from the namenode 723 if (shouldFetchMoreBlocks()) { 724 // fetch new blocks 725 try { 726 blocksToReceive -= getBlockList(); 727 continue; 728 } catch (IOException e) { 729 LOG.warn("Exception while getting block list", e); 730 return; 731 } 732 } else { 733 // source node cannot find a pending block to move, iteration +1 734 noPendingMoveIteration++; 735 // in case no blocks can be moved for source node's task, 736 // jump out of while-loop after 5 iterations. 
737 if (noPendingMoveIteration >= MAX_NO_PENDING_MOVE_ITERATIONS) { 738 resetScheduledSize(); 739 } 740 } 741 742 // check if time is up or not 743 if (Time.monotonicNow() - startTime > MAX_ITERATION_TIME) { 744 isTimeUp = true; 745 continue; 746 } 747 748 // Now we can not schedule any block to move and there are 749 // no new blocks added to the source block list, so we wait. 750 try { 751 synchronized (Dispatcher.this) { 752 Dispatcher.this.wait(1000); // wait for targets/sources to be idle 753 } 754 } catch (InterruptedException ignored) { 755 } 756 } 757 } 758 } 759 760 public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes, 761 Set<String> excludedNodes, long movedWinWidth, int moverThreads, 762 int dispatcherThreads, int maxConcurrentMovesPerNode, Configuration conf) { 763 this.nnc = nnc; 764 this.excludedNodes = excludedNodes; 765 this.includedNodes = includedNodes; 766 this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth); 767 768 this.cluster = NetworkTopology.getInstance(conf); 769 770 this.moveExecutor = Executors.newFixedThreadPool(moverThreads); 771 this.dispatchExecutor = dispatcherThreads == 0? 
null 772 : Executors.newFixedThreadPool(dispatcherThreads); 773 this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode; 774 775 this.saslClient = new SaslDataTransferClient(conf, 776 DataTransferSaslUtil.getSaslPropertiesResolver(conf), 777 TrustedChannelResolver.getInstance(conf), nnc.fallbackToSimpleAuth); 778 } 779 780 public DistributedFileSystem getDistributedFileSystem() { 781 return nnc.getDistributedFileSystem(); 782 } 783 784 public StorageGroupMap<StorageGroup> getStorageGroupMap() { 785 return storageGroupMap; 786 } 787 788 public NetworkTopology getCluster() { 789 return cluster; 790 } 791 792 long getBytesMoved() { 793 return nnc.getBytesMoved().get(); 794 } 795 796 long bytesToMove() { 797 Preconditions.checkState( 798 storageGroupMap.size() >= sources.size() + targets.size(), 799 "Mismatched number of storage groups (" + storageGroupMap.size() 800 + " < " + sources.size() + " sources + " + targets.size() 801 + " targets)"); 802 803 long b = 0L; 804 for (Source src : sources) { 805 b += src.getScheduledSize(); 806 } 807 return b; 808 } 809 810 void add(Source source, StorageGroup target) { 811 sources.add(source); 812 targets.add(target); 813 } 814 815 private boolean shouldIgnore(DatanodeInfo dn) { 816 // ignore decommissioned nodes 817 final boolean decommissioned = dn.isDecommissioned(); 818 // ignore decommissioning nodes 819 final boolean decommissioning = dn.isDecommissionInProgress(); 820 // ignore nodes in exclude list 821 final boolean excluded = Util.isExcluded(excludedNodes, dn); 822 // ignore nodes not in the include list (if include list is not empty) 823 final boolean notIncluded = !Util.isIncluded(includedNodes, dn); 824 825 if (decommissioned || decommissioning || excluded || notIncluded) { 826 if (LOG.isTraceEnabled()) { 827 LOG.trace("Excluding datanode " + dn + ": " + decommissioned + ", " 828 + decommissioning + ", " + excluded + ", " + notIncluded); 829 } 830 return true; 831 } 832 return false; 833 } 834 835 /** Get live 
datanode storage reports and then build the network topology. */ 836 public List<DatanodeStorageReport> init() throws IOException { 837 final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport(); 838 final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>(); 839 // create network topology and classify utilization collections: 840 // over-utilized, above-average, below-average and under-utilized. 841 for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) { 842 final DatanodeInfo datanode = r.getDatanodeInfo(); 843 if (shouldIgnore(datanode)) { 844 continue; 845 } 846 trimmed.add(r); 847 cluster.add(datanode); 848 } 849 return trimmed; 850 } 851 852 public DDatanode newDatanode(DatanodeInfo datanode) { 853 return new DDatanode(datanode, maxConcurrentMovesPerNode); 854 } 855 856 public void executePendingMove(final PendingMove p) { 857 // move the block 858 moveExecutor.execute(new Runnable() { 859 @Override 860 public void run() { 861 p.dispatch(); 862 } 863 }); 864 } 865 866 public boolean dispatchAndCheckContinue() throws InterruptedException { 867 return nnc.shouldContinue(dispatchBlockMoves()); 868 } 869 870 /** 871 * Dispatch block moves for each source. The thread selects blocks to move & 872 * sends request to proxy source to initiate block move. The process is flow 873 * controlled. Block selection is blocked if there are too many un-confirmed 874 * block moves. 875 * 876 * @return the total number of bytes successfully moved in this iteration. 
877 */ 878 private long dispatchBlockMoves() throws InterruptedException { 879 final long bytesLastMoved = getBytesMoved(); 880 final Future<?>[] futures = new Future<?>[sources.size()]; 881 882 final Iterator<Source> i = sources.iterator(); 883 for (int j = 0; j < futures.length; j++) { 884 final Source s = i.next(); 885 futures[j] = dispatchExecutor.submit(new Runnable() { 886 @Override 887 public void run() { 888 s.dispatchBlocks(); 889 } 890 }); 891 } 892 893 // wait for all dispatcher threads to finish 894 for (Future<?> future : futures) { 895 try { 896 future.get(); 897 } catch (ExecutionException e) { 898 LOG.warn("Dispatcher thread failed", e.getCause()); 899 } 900 } 901 902 // wait for all block moving to be done 903 waitForMoveCompletion(targets); 904 905 return getBytesMoved() - bytesLastMoved; 906 } 907 908 /** The sleeping period before checking if block move is completed again */ 909 static private long blockMoveWaitTime = 30000L; 910 911 /** 912 * Wait for all block move confirmations. 913 * @return true if there is failed move execution 914 */ 915 public static boolean waitForMoveCompletion( 916 Iterable<? extends StorageGroup> targets) { 917 boolean hasFailure = false; 918 for(;;) { 919 boolean empty = true; 920 for (StorageGroup t : targets) { 921 if (!t.getDDatanode().isPendingQEmpty()) { 922 empty = false; 923 break; 924 } else { 925 hasFailure |= t.getDDatanode().hasFailure; 926 } 927 } 928 if (empty) { 929 return hasFailure; // all pending queues are empty 930 } 931 try { 932 Thread.sleep(blockMoveWaitTime); 933 } catch (InterruptedException ignored) { 934 } 935 } 936 } 937 938 /** 939 * Decide if the block is a good candidate to be moved from source to target. 940 * A block is a good candidate if 941 * 1. the block is not in the process of being moved/has not been moved; 942 * 2. the block does not have a replica on the target; 943 * 3. 
doing the move does not reduce the number of racks that the block has 944 */ 945 private boolean isGoodBlockCandidate(StorageGroup source, StorageGroup target, 946 StorageType targetStorageType, DBlock block) { 947 if (target.storageType != targetStorageType) { 948 return false; 949 } 950 // check if the block is moved or not 951 if (movedBlocks.contains(block.getBlock())) { 952 return false; 953 } 954 if (block.isLocatedOn(target)) { 955 return false; 956 } 957 if (cluster.isNodeGroupAware() 958 && isOnSameNodeGroupWithReplicas(source, target, block)) { 959 return false; 960 } 961 if (reduceNumOfRacks(source, target, block)) { 962 return false; 963 } 964 return true; 965 } 966 967 /** 968 * Determine whether moving the given block replica from source to target 969 * would reduce the number of racks of the block replicas. 970 */ 971 private boolean reduceNumOfRacks(StorageGroup source, StorageGroup target, 972 DBlock block) { 973 final DatanodeInfo sourceDn = source.getDatanodeInfo(); 974 if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) { 975 // source and target are on the same rack 976 return false; 977 } 978 boolean notOnSameRack = true; 979 synchronized (block) { 980 for (StorageGroup loc : block.getLocations()) { 981 if (cluster.isOnSameRack(loc.getDatanodeInfo(), target.getDatanodeInfo())) { 982 notOnSameRack = false; 983 break; 984 } 985 } 986 } 987 if (notOnSameRack) { 988 // target is not on the same rack as any replica 989 return false; 990 } 991 for (StorageGroup g : block.getLocations()) { 992 if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) { 993 // source is on the same rack of another replica 994 return false; 995 } 996 } 997 return true; 998 } 999 1000 /** 1001 * Check if there are any replica (other than source) on the same node group 1002 * with target. If true, then target is not a good candidate for placing 1003 * specific replica as we don't want 2 replicas under the same nodegroup. 
1004 * 1005 * @return true if there are any replica (other than source) on the same node 1006 * group with target 1007 */ 1008 private boolean isOnSameNodeGroupWithReplicas(StorageGroup source, 1009 StorageGroup target, DBlock block) { 1010 final DatanodeInfo targetDn = target.getDatanodeInfo(); 1011 for (StorageGroup g : block.getLocations()) { 1012 if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) { 1013 return true; 1014 } 1015 } 1016 return false; 1017 } 1018 1019 /** Reset all fields in order to prepare for the next iteration */ 1020 void reset(Configuration conf) { 1021 cluster = NetworkTopology.getInstance(conf); 1022 storageGroupMap.clear(); 1023 sources.clear(); 1024 targets.clear(); 1025 globalBlocks.removeAllButRetain(movedBlocks); 1026 movedBlocks.cleanup(); 1027 } 1028 1029 /** set the sleeping period for block move completion check */ 1030 @VisibleForTesting 1031 public static void setBlockMoveWaitTime(long time) { 1032 blockMoveWaitTime = time; 1033 } 1034 1035 @VisibleForTesting 1036 public static void setDelayAfterErrors(long time) { 1037 delayAfterErrors = time; 1038 } 1039 1040 /** shutdown thread pools */ 1041 public void shutdownNow() { 1042 if (dispatchExecutor != null) { 1043 dispatchExecutor.shutdownNow(); 1044 } 1045 moveExecutor.shutdownNow(); 1046 } 1047 1048 static class Util { 1049 /** @return true if data node is part of the excludedNodes. */ 1050 static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) { 1051 return isIn(excludedNodes, dn); 1052 } 1053 1054 /** 1055 * @return true if includedNodes is empty or data node is part of the 1056 * includedNodes. 1057 */ 1058 static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) { 1059 return (includedNodes.isEmpty() || isIn(includedNodes, dn)); 1060 } 1061 1062 /** 1063 * Match is checked using host name , ip address with and without port 1064 * number. 
*
     * @return true if the datanode's transfer address matches the set of nodes.
     */
    private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) {
      // Try the peer host name, the IP address and the host name in turn,
      // each with and without the transfer port.
      return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort())
          || isIn(datanodes, dn.getIpAddr(), dn.getXferPort())
          || isIn(datanodes, dn.getHostName(), dn.getXferPort());
    }

    /**
     * @param nodes the set of node names to look in
     * @param host the host to look for; a null host never matches
     * @param port the port used for the {@code host:port} form
     * @return true if nodes contains host or host:port
     */
    private static boolean isIn(Set<String> nodes, String host, int port) {
      if (host == null) {
        return false;
      }
      return (nodes.contains(host) || nodes.contains(host + ":" + port));
    }

    /**
     * Parse a comma separated string to obtain set of host names
     *
     * @param string the text handed to {@code StringUtils.getTrimmedStrings}
     * @return set of host names
     */
    static Set<String> parseHostList(String string) {
      String[] addrs = StringUtils.getTrimmedStrings(string);
      return new HashSet<String>(Arrays.asList(addrs));
    }

    /**
     * Read set of host names from a file
     *
     * @param fileName the file to read
     * @param type passed through to {@code HostsFileReader.readFileToSet}
     * @return set of host names
     * @throws IllegalArgumentException if the file cannot be read; the
     *         underlying {@link IOException} is attached as the cause
     */
    static Set<String> getHostListFromFile(String fileName, String type) {
      Set<String> nodes = new HashSet<String>();
      try {
        HostsFileReader.readFileToSet(type, fileName, nodes);
        return StringUtils.getTrimmedStrings(nodes);
      } catch (IOException e) {
        // Chain the IOException so the real reason for the failure (missing
        // file, permissions, ...) is not lost.
        throw new IllegalArgumentException(
            "Failed to read host list from file: " + fileName, e);
      }
    }
  }
}