001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.balancer;
019
020import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;
021
022import java.io.BufferedInputStream;
023import java.io.BufferedOutputStream;
024import java.io.DataInputStream;
025import java.io.DataOutputStream;
026import java.io.IOException;
027import java.io.InputStream;
028import java.io.OutputStream;
029import java.net.Socket;
030import java.util.ArrayList;
031import java.util.Arrays;
032import java.util.Collection;
033import java.util.EnumMap;
034import java.util.HashMap;
035import java.util.HashSet;
036import java.util.Iterator;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040import java.util.concurrent.ExecutionException;
041import java.util.concurrent.ExecutorService;
042import java.util.concurrent.Executors;
043import java.util.concurrent.Future;
044import java.util.concurrent.ThreadPoolExecutor;
045
046import org.apache.commons.logging.Log;
047import org.apache.commons.logging.LogFactory;
048import org.apache.hadoop.classification.InterfaceAudience;
049import org.apache.hadoop.conf.Configuration;
050import org.apache.hadoop.fs.CommonConfigurationKeys;
051import org.apache.hadoop.fs.StorageType;
052import org.apache.hadoop.hdfs.DFSConfigKeys;
053import org.apache.hadoop.hdfs.DFSUtil;
054import org.apache.hadoop.hdfs.DistributedFileSystem;
055import org.apache.hadoop.hdfs.protocol.Block;
056import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
057import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
058import org.apache.hadoop.hdfs.protocol.HdfsConstants;
059import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil;
060import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
061import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
062import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver;
063import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil;
064import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient;
065import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
066import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
067import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
068import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
069import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
070import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
071import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
072import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
073import org.apache.hadoop.io.IOUtils;
074import org.apache.hadoop.net.NetUtils;
075import org.apache.hadoop.net.NetworkTopology;
076import org.apache.hadoop.security.token.Token;
077import org.apache.hadoop.util.HostsFileReader;
078import org.apache.hadoop.util.StringUtils;
079import org.apache.hadoop.util.Time;
080
081import com.google.common.annotations.VisibleForTesting;
082import com.google.common.base.Preconditions;
083
084/** Dispatching block replica moves between datanodes. */
085@InterfaceAudience.Private
086public class Dispatcher {
  /** Logger shared by the dispatcher and its nested classes. */
  static final Log LOG = LogFactory.getLog(Dispatcher.class);

  /**
   * The period of time, in milliseconds, to delay the usage of a DataNode
   * after hitting errors when using it for migrating data.
   */
  private static long delayAfterErrors = 10 * 1000;

  /**
   * Connection to the namenode; used to fetch block lists, the block pool
   * id and block access tokens, and to record the number of bytes moved.
   */
  private final NameNodeConnector nnc;
  /** Wraps the socket streams of a block move before the request is sent. */
  private final SaslDataTransferClient saslClient;

  /** Set of datanodes to be excluded. */
  private final Set<String> excludedNodes;
  /** Restrict to the following nodes. */
  private final Set<String> includedNodes;

  /** Storage groups that blocks are moved out of. */
  private final Collection<Source> sources = new HashSet<Source>();
  /** Storage groups that blocks are moved into. */
  private final Collection<StorageGroup> targets = new HashSet<StorageGroup>();

  /** Holds the single shared {@link DBlock} object for each block. */
  private final GlobalBlockMap globalBlocks = new GlobalBlockMap();
  /** Blocks that have already been chosen for a move. */
  private final MovedBlocks<StorageGroup> movedBlocks;

  /** Map (datanodeUuid,storageType -> StorageGroup) */
  private final StorageGroupMap<StorageGroup> storageGroupMap
      = new StorageGroupMap<StorageGroup>();

  /** Network topology consulted when choosing a proxy source. */
  private NetworkTopology cluster;

  // NOTE(review): presumably runs the per-source dispatch loops; its use is
  // outside the visible part of this file -- confirm.
  private final ExecutorService dispatchExecutor;

  // NOTE(review): presumably hands out mover threads to per-datanode move
  // executors; its use is outside the visible part of this file -- confirm.
  private final Allocator moverThreadAllocator;

  /** The maximum number of concurrent blocks moves at a datanode */
  private final int maxConcurrentMovesPerNode;
  // NOTE(review): presumably the total mover thread budget -- confirm.
  private final int maxMoverThreads;

  /** Upper bound on the size requested from the namenode per getBlocks call. */
  private final long getBlocksSize;
  /** Blocks smaller than this number of bytes are skipped when fetched. */
  private final long getBlocksMinBlockSize;
  /**
   * Time in milliseconds after which waiting for a block move response is
   * abandoned; the timeout is only applied when the value is positive.
   */
  private final long blockMoveTimeout;
  /**
   * If no block can be moved out of a {@link Source} after this configured
   * amount of time, the Source should give up choosing the next possible move.
   */
  private final int maxNoMoveInterval;
131
132  static class Allocator {
133    private final int max;
134    private int count = 0;
135    private int lotSize = 1;
136
137    Allocator(int max) {
138      this.max = max;
139    }
140
141    /** Allocate specified number of items */
142    synchronized int allocate(int n) {
143      final int remaining = max - count;
144      if (remaining <= 0) {
145        return 0;
146      } else {
147        final int allocated = remaining < n? remaining: n;
148        count += allocated;
149        return allocated;
150      }
151    }
152
153    /** Aloocate a single lot of items */
154    int allocate() {
155      return allocate(lotSize);
156    }
157
158    synchronized void reset() {
159      count = 0;
160    }
161
162    /** Set the lot size */
163    synchronized void setLotSize(int lotSize) {
164      this.lotSize = lotSize;
165    }
166  }
167
168  private static class GlobalBlockMap {
169    private final Map<Block, DBlock> map = new HashMap<Block, DBlock>();
170
171    /**
172     * Get the block from the map;
173     * if the block is not found, create a new block and put it in the map.
174     */
175    private DBlock get(Block b) {
176      DBlock block = map.get(b);
177      if (block == null) {
178        block = new DBlock(b);
179        map.put(b, block);
180      }
181      return block;
182    }
183    
184    /** Remove all blocks except for the moved blocks. */
185    private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) {
186      for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) {
187        if (!movedBlocks.contains(i.next())) {
188          i.remove();
189        }
190      }
191    }
192  }
193
194  public static class StorageGroupMap<G extends StorageGroup> {
195    private static String toKey(String datanodeUuid, StorageType storageType) {
196      return datanodeUuid + ":" + storageType;
197    }
198
199    private final Map<String, G> map = new HashMap<String, G>();
200
201    public G get(String datanodeUuid, StorageType storageType) {
202      return map.get(toKey(datanodeUuid, storageType));
203    }
204
205    public void put(G g) {
206      final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType);
207      final StorageGroup existing = map.put(key, g);
208      Preconditions.checkState(existing == null);
209    }
210
211    int size() {
212      return map.size();
213    }
214
215    void clear() {
216      map.clear();
217    }
218
219    public Collection<G> values() {
220      return map.values();
221    }
222  }
223
224  /** This class keeps track of a scheduled block move */
225  public class PendingMove {
226    private DBlock block;
227    private Source source;
228    private DDatanode proxySource;
229    private StorageGroup target;
230
231    private PendingMove(Source source, StorageGroup target) {
232      this.source = source;
233      this.target = target;
234    }
235
236    @Override
237    public String toString() {
238      final Block b = block != null ? block.getBlock() : null;
239      String bStr = b != null ? (b + " with size=" + b.getNumBytes() + " ")
240          : " ";
241      return bStr + "from " + source.getDisplayName() + " to " + target
242          .getDisplayName() + " through " + (proxySource != null ? proxySource
243          .datanode : "");
244    }
245
246    /**
247     * Choose a block & a proxy source for this pendingMove whose source &
248     * target have already been chosen.
249     * 
250     * @return true if a block and its proxy are chosen; false otherwise
251     */
252    private boolean chooseBlockAndProxy() {
253      // source and target must have the same storage type
254      final StorageType t = source.getStorageType();
255      // iterate all source's blocks until find a good one
256      for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {
257        if (markMovedIfGoodBlock(i.next(), t)) {
258          i.remove();
259          return true;
260        }
261      }
262      return false;
263    }
264
265    /**
266     * @return true if the given block is good for the tentative move.
267     */
268    private boolean markMovedIfGoodBlock(DBlock block, StorageType targetStorageType) {
269      synchronized (block) {
270        synchronized (movedBlocks) {
271          if (isGoodBlockCandidate(source, target, targetStorageType, block)) {
272            this.block = block;
273            if (chooseProxySource()) {
274              movedBlocks.put(block);
275              if (LOG.isDebugEnabled()) {
276                LOG.debug("Decided to move " + this);
277              }
278              return true;
279            }
280          }
281        }
282      }
283      return false;
284    }
285
286    /**
287     * Choose a proxy source.
288     * 
289     * @return true if a proxy is found; otherwise false
290     */
291    private boolean chooseProxySource() {
292      final DatanodeInfo targetDN = target.getDatanodeInfo();
293      // if source and target are same nodes then no need of proxy
294      if (source.getDatanodeInfo().equals(targetDN) && addTo(source)) {
295        return true;
296      }
297      // if node group is supported, first try add nodes in the same node group
298      if (cluster.isNodeGroupAware()) {
299        for (StorageGroup loc : block.getLocations()) {
300          if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN)
301              && addTo(loc)) {
302            return true;
303          }
304        }
305      }
306      // check if there is replica which is on the same rack with the target
307      for (StorageGroup loc : block.getLocations()) {
308        if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN) && addTo(loc)) {
309          return true;
310        }
311      }
312      // find out a non-busy replica
313      for (StorageGroup loc : block.getLocations()) {
314        if (addTo(loc)) {
315          return true;
316        }
317      }
318      return false;
319    }
320
321    /** add to a proxy source for specific block movement */
322    private boolean addTo(StorageGroup g) {
323      final DDatanode dn = g.getDDatanode();
324      if (dn.addPendingBlock(this)) {
325        proxySource = dn;
326        return true;
327      }
328      return false;
329    }
330
331    /** Dispatch the move to the proxy source & wait for the response. */
332    private void dispatch() {
333      LOG.info("Start moving " + this);
334
335      Socket sock = new Socket();
336      DataOutputStream out = null;
337      DataInputStream in = null;
338      try {
339        sock.connect(
340            NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),
341            HdfsServerConstants.READ_TIMEOUT);
342
343        // Set read timeout so that it doesn't hang forever against
344        // unresponsive nodes. Datanode normally sends IN_PROGRESS response
345        // twice within the client read timeout period (every 30 seconds by
346        // default). Here, we make it give up after 5 minutes of no response.
347        sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT * 5);
348        sock.setKeepAlive(true);
349
350        OutputStream unbufOut = sock.getOutputStream();
351        InputStream unbufIn = sock.getInputStream();
352        ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(),
353            block.getBlock());
354        final KeyManager km = nnc.getKeyManager(); 
355        Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb);
356        IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut,
357            unbufIn, km, accessToken, target.getDatanodeInfo());
358        unbufOut = saslStreams.out;
359        unbufIn = saslStreams.in;
360        out = new DataOutputStream(new BufferedOutputStream(unbufOut,
361            HdfsConstants.IO_FILE_BUFFER_SIZE));
362        in = new DataInputStream(new BufferedInputStream(unbufIn,
363            HdfsConstants.IO_FILE_BUFFER_SIZE));
364
365        sendRequest(out, eb, accessToken);
366        receiveResponse(in);
367        nnc.getBytesMoved().addAndGet(block.getNumBytes());
368        LOG.info("Successfully moved " + this);
369      } catch (IOException e) {
370        LOG.warn("Failed to move " + this + ": " + e.getMessage());
371        target.getDDatanode().setHasFailure();
372        // Proxy or target may have some issues, delay before using these nodes
373        // further in order to avoid a potential storm of "threads quota
374        // exceeded" warnings when the dispatcher gets out of sync with work
375        // going on in datanodes.
376        proxySource.activateDelay(delayAfterErrors);
377        target.getDDatanode().activateDelay(delayAfterErrors);
378      } finally {
379        IOUtils.closeStream(out);
380        IOUtils.closeStream(in);
381        IOUtils.closeSocket(sock);
382
383        proxySource.removePendingBlock(this);
384        target.getDDatanode().removePendingBlock(this);
385
386        synchronized (this) {
387          reset();
388        }
389        synchronized (Dispatcher.this) {
390          Dispatcher.this.notifyAll();
391        }
392      }
393    }
394
395    /** Send a block replace request to the output stream */
396    private void sendRequest(DataOutputStream out, ExtendedBlock eb,
397        Token<BlockTokenIdentifier> accessToken) throws IOException {
398      new Sender(out).replaceBlock(eb, target.storageType, accessToken,
399          source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);
400    }
401
402    /** Check whether to continue waiting for response */
403    private boolean stopWaitingForResponse(long startTime) {
404      return source.isIterationOver() ||
405          (blockMoveTimeout > 0 &&
406          (Time.monotonicNow() - startTime > blockMoveTimeout));
407    }
408
409    /** Receive a reportedBlock copy response from the input stream */
410    private void receiveResponse(DataInputStream in) throws IOException {
411      long startTime = Time.monotonicNow();
412      BlockOpResponseProto response =
413          BlockOpResponseProto.parseFrom(vintPrefixed(in));
414      while (response.getStatus() == Status.IN_PROGRESS) {
415        // read intermediate responses
416        response = BlockOpResponseProto.parseFrom(vintPrefixed(in));
417        // Stop waiting for slow block moves. Even if it stops waiting,
418        // the actual move may continue.
419        if (stopWaitingForResponse(startTime)) {
420          throw new IOException("Block move timed out");
421        }
422      }
423      String logInfo = "block move is failed";
424      DataTransferProtoUtil.checkBlockOpStatus(response, logInfo);
425    }
426
427    /** reset the object */
428    private void reset() {
429      block = null;
430      source = null;
431      proxySource = null;
432      target = null;
433    }
434  }
435
  /**
   * A class for keeping track of block locations in the dispatcher.
   * It adds nothing to {@link MovedBlocks.Locations} beyond fixing the
   * location type to {@link StorageGroup}.
   */
  public static class DBlock extends MovedBlocks.Locations<StorageGroup> {
    /** @param block the block whose locations are tracked */
    public DBlock(Block block) {
      super(block);
    }
  }
442
  /**
   * The class represents a desired move: a number of bytes that should be
   * moved to a given target storage group.
   */
  static class Task {
    /** The storage group the bytes should be moved to. */
    private final StorageGroup target;
    private long size; // bytes scheduled to move

    /**
     * @param target the storage group to move to
     * @param size the number of bytes scheduled to move
     */
    Task(StorageGroup target, long size) {
      this.target = target;
      this.size = size;
    }

    /** @return the number of bytes still scheduled to move */
    long getSize() {
      return size;
    }
  }
457
458  /** A class that keeps track of a datanode. */
459  public static class DDatanode {
460
461    /** A group of storages in a datanode with the same storage type. */
462    public class StorageGroup {
463      final StorageType storageType;
464      final long maxSize2Move;
465      private long scheduledSize = 0L;
466
467      private StorageGroup(StorageType storageType, long maxSize2Move) {
468        this.storageType = storageType;
469        this.maxSize2Move = maxSize2Move;
470      }
471      
472      public StorageType getStorageType() {
473        return storageType;
474      }
475
476      private DDatanode getDDatanode() {
477        return DDatanode.this;
478      }
479
480      public DatanodeInfo getDatanodeInfo() {
481        return DDatanode.this.datanode;
482      }
483
484      /** Decide if still need to move more bytes */
485      boolean hasSpaceForScheduling() {
486        return hasSpaceForScheduling(0L);
487      }
488
489      synchronized boolean hasSpaceForScheduling(long size) {
490        return availableSizeToMove() > size;
491      }
492
493      /** @return the total number of bytes that need to be moved */
494      synchronized long availableSizeToMove() {
495        return maxSize2Move - scheduledSize;
496      }
497
498      /** increment scheduled size */
499      public synchronized void incScheduledSize(long size) {
500        scheduledSize += size;
501      }
502
503      /** @return scheduled size */
504      synchronized long getScheduledSize() {
505        return scheduledSize;
506      }
507
508      /** Reset scheduled size to zero. */
509      synchronized void resetScheduledSize() {
510        scheduledSize = 0L;
511      }
512
513      private PendingMove addPendingMove(DBlock block, final PendingMove pm) {
514        if (getDDatanode().addPendingBlock(pm)) {
515          if (pm.markMovedIfGoodBlock(block, getStorageType())) {
516            incScheduledSize(pm.block.getNumBytes());
517            return pm;
518          } else {
519            getDDatanode().removePendingBlock(pm);
520          }
521        }
522        return null;
523      }
524
525      /** @return the name for display */
526      String getDisplayName() {
527        return datanode + ":" + storageType;
528      }
529
530      @Override
531      public String toString() {
532        return getDisplayName();
533      }
534
535      @Override
536      public int hashCode() {
537        return getStorageType().hashCode() ^ getDatanodeInfo().hashCode();
538      }
539
540      @Override
541      public boolean equals(Object obj) {
542        if (this == obj) {
543          return true;
544        } else if (obj == null || !(obj instanceof StorageGroup)) {
545          return false;
546        } else {
547          final StorageGroup that = (StorageGroup) obj;
548          return this.getStorageType() == that.getStorageType()
549              && this.getDatanodeInfo().equals(that.getDatanodeInfo());
550        }
551      }
552
553    }
554
555    final DatanodeInfo datanode;
556    private final EnumMap<StorageType, Source> sourceMap
557        = new EnumMap<StorageType, Source>(StorageType.class);
558    private final EnumMap<StorageType, StorageGroup> targetMap
559        = new EnumMap<StorageType, StorageGroup>(StorageType.class);
560    protected long delayUntil = 0L;
561    /** blocks being moved but not confirmed yet */
562    private final List<PendingMove> pendings;
563    private volatile boolean hasFailure = false;
564    private ExecutorService moveExecutor;
565
566    @Override
567    public String toString() {
568      return getClass().getSimpleName() + ":" + datanode;
569    }
570
    /**
     * @param datanode the datanode to keep track of
     * @param maxConcurrentMoves used only as the initial capacity of the
     *        list of pending moves
     */
    private DDatanode(DatanodeInfo datanode, int maxConcurrentMoves) {
      this.datanode = datanode;
      this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);
    }
575
    /** @return the datanode tracked by this object */
    public DatanodeInfo getDatanodeInfo() {
      return datanode;
    }
579
580    synchronized ExecutorService initMoveExecutor(int poolSize) {
581      return moveExecutor = Executors.newFixedThreadPool(poolSize);
582    }
583
    /**
     * @return the current move executor; null if none has been initialized
     *         or after it has been shut down
     */
    synchronized ExecutorService getMoveExecutor() {
      return moveExecutor;
    }
587
588    synchronized void shutdownMoveExecutor() {
589      if (moveExecutor != null) {
590        moveExecutor.shutdown();
591        moveExecutor = null;
592      }
593    }
594
595    private static <G extends StorageGroup> void put(StorageType storageType,
596        G g, EnumMap<StorageType, G> map) {
597      final StorageGroup existing = map.put(storageType, g);
598      Preconditions.checkState(existing == null);
599    }
600
601    public StorageGroup addTarget(StorageType storageType, long maxSize2Move) {
602      final StorageGroup g = new StorageGroup(storageType, maxSize2Move);
603      put(storageType, g, targetMap);
604      return g;
605    }
606
607    public Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {
608      final Source s = d.new Source(storageType, maxSize2Move, this);
609      put(storageType, s, sourceMap);
610      return s;
611    }
612
613    synchronized private void activateDelay(long delta) {
614      delayUntil = Time.monotonicNow() + delta;
615      LOG.info(this + " activateDelay " + delta/1000.0 + " seconds");
616    }
617
618    synchronized private boolean isDelayActive() {
619      if (delayUntil == 0 || Time.monotonicNow() > delayUntil) {
620        delayUntil = 0;
621        return false;
622      }
623      return true;
624    }
625
    /**
     * Check if all the dispatched moves are done.
     *
     * @return true if no move is currently pending on this datanode
     */
    synchronized boolean isPendingQEmpty() {
      return pendings.isEmpty();
    }
630
631    /** Add a scheduled block move to the node */
632    synchronized boolean addPendingBlock(PendingMove pendingBlock) {
633      if (!isDelayActive()) {
634        return pendings.add(pendingBlock);
635      }
636      return false;
637    }
638
    /**
     * Remove a scheduled block move from the node.
     *
     * @return true if the move was pending on this node
     */
    synchronized boolean removePendingBlock(PendingMove pendingBlock) {
      return pendings.remove(pendingBlock);
    }
643
    /** Record that a move involving this datanode has failed. */
    void setHasFailure() {
      this.hasFailure = true;
    }
647  }
648
649  /** A node that can be the sources of a block move */
650  public class Source extends DDatanode.StorageGroup {
651
652    private final List<Task> tasks = new ArrayList<Task>(2);
653    private long blocksToReceive = 0L;
654    private final long startTime = Time.monotonicNow();
655    /**
656     * Source blocks point to the objects in {@link Dispatcher#globalBlocks}
657     * because we want to keep one copy of a block and be aware that the
658     * locations are changing over time.
659     */
660    private final List<DBlock> srcBlocks = new ArrayList<DBlock>();
661
    /**
     * @param storageType the storage type of this source
     * @param maxSize2Move the maximum number of bytes to move out
     * @param dn the datanode that becomes the enclosing instance of this
     *        storage group
     */
    private Source(StorageType storageType, long maxSize2Move, DDatanode dn) {
      dn.super(storageType, maxSize2Move);
    }
665
666    /**
667     * Check if the iteration is over
668     */
669    public boolean isIterationOver() {
670      return (Time.monotonicNow()-startTime > MAX_ITERATION_TIME);
671    }
672
673    /** Add a task */
674    void addTask(Task task) {
675      Preconditions.checkState(task.target != this,
676          "Source and target are the same storage group " + getDisplayName());
677      incScheduledSize(task.size);
678      tasks.add(task);
679    }
680
    /**
     * @return an iterator to this source's blocks; removing through the
     *         iterator removes the block from this source's block list
     */
    Iterator<DBlock> getBlockIterator() {
      return srcBlocks.iterator();
    }
685
686    /**
687     * Fetch new blocks of this source from namenode and update this source's
688     * block list & {@link Dispatcher#globalBlocks}.
689     * 
690     * @return the total size of the received blocks in the number of bytes.
691     */
692    private long getBlockList() throws IOException {
693      final long size = Math.min(getBlocksSize, blocksToReceive);
694      final BlocksWithLocations newBlocks = nnc.getBlocks(getDatanodeInfo(), size);
695
696      if (LOG.isTraceEnabled()) {
697        LOG.trace("getBlocks(" + getDatanodeInfo() + ", "
698            + StringUtils.TraditionalBinaryPrefix.long2String(size, "B", 2)
699            + ") returns " + newBlocks.getBlocks().length + " blocks.");
700      }
701
702      long bytesReceived = 0;
703      for (BlockWithLocations blk : newBlocks.getBlocks()) {
704        // Skip small blocks.
705        if (blk.getBlock().getNumBytes() < getBlocksMinBlockSize) {
706          continue;
707        }
708
709        bytesReceived += blk.getBlock().getNumBytes();
710        synchronized (globalBlocks) {
711          final DBlock block = globalBlocks.get(blk.getBlock());
712          synchronized (block) {
713            block.clearLocations();
714
715            // update locations
716            final String[] datanodeUuids = blk.getDatanodeUuids();
717            final StorageType[] storageTypes = blk.getStorageTypes();
718            for (int i = 0; i < datanodeUuids.length; i++) {
719              final StorageGroup g = storageGroupMap.get(
720                  datanodeUuids[i], storageTypes[i]);
721              if (g != null) { // not unknown
722                block.addLocation(g);
723              }
724            }
725          }
726          if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) {
727            if (LOG.isTraceEnabled()) {
728              LOG.trace("Add " + block + " to " + this);
729            }
730            srcBlocks.add(block);
731          }
732        }
733      }
734      return bytesReceived;
735    }
736
737    /** Decide if the given block is a good candidate to move or not */
738    private boolean isGoodBlockCandidate(DBlock block) {
739      // source and target must have the same storage type
740      final StorageType sourceStorageType = getStorageType();
741      for (Task t : tasks) {
742        if (Dispatcher.this.isGoodBlockCandidate(this, t.target,
743            sourceStorageType, block)) {
744          return true;
745        }
746      }
747      return false;
748    }
749
750    /**
751     * Choose a move for the source. The block's source, target, and proxy
752     * are determined too. When choosing proxy and target, source &
753     * target throttling has been considered. They are chosen only when they
754     * have the capacity to support this block move. The block should be
755     * dispatched immediately after this method is returned.
756     * 
757     * @return a move that's good for the source to dispatch immediately.
758     */
759    private PendingMove chooseNextMove() {
760      for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {
761        final Task task = i.next();
762        final DDatanode target = task.target.getDDatanode();
763        final PendingMove pendingBlock = new PendingMove(this, task.target);
764        if (target.addPendingBlock(pendingBlock)) {
765          // target is not busy, so do a tentative block allocation
766          if (pendingBlock.chooseBlockAndProxy()) {
767            long blockSize = pendingBlock.block.getNumBytes();
768            incScheduledSize(-blockSize);
769            task.size -= blockSize;
770            if (task.size <= 0) {
771              i.remove();
772            }
773            return pendingBlock;
774          } else {
775            // cancel the tentative move
776            target.removePendingBlock(pendingBlock);
777          }
778        }
779      }
780      return null;
781    }
782    
783    /** Add a pending move */
784    public PendingMove addPendingMove(DBlock block, StorageGroup target) {
785      return target.addPendingMove(block, new PendingMove(this, target));
786    }
787
788    /** Iterate all source's blocks to remove moved ones */
789    private void removeMovedBlocks() {
790      for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) {
791        if (movedBlocks.contains(i.next().getBlock())) {
792          i.remove();
793        }
794      }
795    }
796
    /**
     * @return if should fetch more blocks from namenode, i.e. whether the
     *         number of bytes still to receive is positive
     */
    private boolean shouldFetchMoreBlocks() {
      return blocksToReceive > 0;
    }
801
802    private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins
803
    /**
     * This method iteratively does the following: it first selects a block to
     * move, then sends a request to the proxy source to start the block move.
     * When no block can be selected, it removes the already moved blocks from
     * the source's block list and, while there are still bytes to receive,
     * asks the namenode for more blocks. It terminates when the scheduled
     * size has dropped to zero, when the block list is empty and no more
     * bytes are to be received, when fetching the block list fails, or when
     * the elapsed time of the iteration has exceeded the max time limit.
     *
     * @param delay - time to sleep before sending getBlocks. Intended to
     * disperse Balancer RPCs to NameNode for large clusters. See HDFS-11384.
     * The delay is applied to the first fetch attempt only.
     */
    private void dispatchBlocks(long delay) {
      // Plan to receive twice the scheduled number of bytes from the namenode.
      this.blocksToReceive = 2 * getScheduledSize();
      long previousMoveTimestamp = Time.monotonicNow();
      while (getScheduledSize() > 0 && !isIterationOver()
          && (!srcBlocks.isEmpty() || blocksToReceive > 0)) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + " blocksToReceive=" + blocksToReceive
              + ", scheduledSize=" + getScheduledSize()
              + ", srcBlocks#=" + srcBlocks.size());
        }
        final PendingMove p = chooseNextMove();
        if (p != null) {
          // Reset previous move timestamp
          previousMoveTimestamp = Time.monotonicNow();
          executePendingMove(p);
          continue;
        }

        // Since we cannot schedule any block to move,
        // remove any moved blocks from the source block list and
        // then decide whether to fetch more blocks.
        removeMovedBlocks(); // filter already moved blocks
        // check if we should fetch more blocks from the namenode
        if (shouldFetchMoreBlocks()) {
          // fetch new blocks
          try {
            if(delay > 0) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Sleeping " + delay + "  msec.");
              }
              Thread.sleep(delay);
            }
            blocksToReceive -= getBlockList();
            continue;
          } catch (InterruptedException ignored) {
            // nothing to do: the interrupt is dropped and control falls
            // through to the wait at the bottom of the loop
          } catch (IOException e) {
            LOG.warn("Exception while getting block list", e);
            return;
          } finally {
            // the initial delay is used at most once
            delay = 0L;
          }
        } else {
          // jump out of while-loop after the configured timeout.
          long noMoveInterval = Time.monotonicNow() - previousMoveTimestamp;
          if (noMoveInterval > maxNoMoveInterval) {
            LOG.info("Failed to find a pending move for "  + noMoveInterval
                + " ms.  Skipping " + this);
            // A scheduled size of zero makes the loop condition false at
            // the next check.
            resetScheduledSize();
          }
        }

        // Now we can not schedule any block to move and there are
        // no new blocks added to the source block list, so we wait.
        try {
          synchronized (Dispatcher.this) {
            Dispatcher.this.wait(1000); // wait for targets/sources to be idle
          }
          // Didn't find a possible move in this iteration of the while loop,
          // adding a small delay before choosing next move again.
          Thread.sleep(100);
        } catch (InterruptedException ignored) {
          // the interrupt is dropped; the loop condition is checked again
        }
      }

      if (isIterationOver()) {
        LOG.info("The maximum iteration time (" + MAX_ITERATION_TIME/1000
            + " seconds) has been reached. Stopping " + this);
      }
    }
884
    /**
     * Delegates to the superclass implementation; nothing declared in this
     * class contributes to the hash.
     * NOTE(review): presumably overridden explicitly only to pair with
     * {@link #equals(Object)} -- confirm.
     */
    @Override
    public int hashCode() {
      return super.hashCode();
    }
889
    /**
     * Delegates to the superclass implementation; nothing declared in this
     * class takes part in the comparison.
     *
     * @param obj the object to compare with
     * @return whatever the superclass {@code equals} returns for {@code obj}
     */
    @Override
    public boolean equals(Object obj) {
      return super.equals(obj);
    }
894  }
895
  /**
   * Constructor called by Mover. Delegates to the full constructor, passing
   * 0 for getBlocksSize, getBlocksMinBlockSize and blockMoveTimeout and
   * forwarding every other argument unchanged.
   */
  public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
      int dispatcherThreads, int maxConcurrentMovesPerNode,
      int maxNoMoveInterval, Configuration conf) {
    this(nnc, includedNodes, excludedNodes, movedWinWidth,
        moverThreads, dispatcherThreads, maxConcurrentMovesPerNode,
        0L, 0L, 0, maxNoMoveInterval, conf);
  }
905
906  Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
907      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
908      int dispatcherThreads, int maxConcurrentMovesPerNode,
909      long getBlocksSize, long getBlocksMinBlockSize,
910      int blockMoveTimeout, int maxNoMoveInterval, Configuration conf) {
911    this.nnc = nnc;
912    this.excludedNodes = excludedNodes;
913    this.includedNodes = includedNodes;
914    this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth);
915
916    this.cluster = NetworkTopology.getInstance(conf);
917
918    this.dispatchExecutor = dispatcherThreads == 0? null
919        : Executors.newFixedThreadPool(dispatcherThreads);
920    this.moverThreadAllocator = new Allocator(moverThreads);
921    this.maxMoverThreads = moverThreads;
922    this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;
923
924    this.getBlocksSize = getBlocksSize;
925    this.getBlocksMinBlockSize = getBlocksMinBlockSize;
926    this.blockMoveTimeout = blockMoveTimeout;
927    this.maxNoMoveInterval = maxNoMoveInterval;
928
929    this.saslClient = new SaslDataTransferClient(conf,
930        DataTransferSaslUtil.getSaslPropertiesResolver(conf),
931        TrustedChannelResolver.getInstance(conf), nnc.fallbackToSimpleAuth);
932  }
933
  /** @return the file system obtained from the NameNode connector. */
  public DistributedFileSystem getDistributedFileSystem() {
    return nnc.getDistributedFileSystem();
  }
937
  /** @return the storage group map held by this dispatcher (not a copy). */
  public StorageGroupMap<StorageGroup> getStorageGroupMap() {
    return storageGroupMap;
  }
941
  /** @return the network topology currently used by this dispatcher. */
  public NetworkTopology getCluster() {
    return cluster;
  }
945  
  /** @return the current value of the connector's bytes-moved counter. */
  long getBytesMoved() {
    return nnc.getBytesMoved().get();
  }
949
950  long bytesToMove() {
951    Preconditions.checkState(
952        storageGroupMap.size() >= sources.size() + targets.size(),
953        "Mismatched number of storage groups (" + storageGroupMap.size()
954            + " < " + sources.size() + " sources + " + targets.size()
955            + " targets)");
956
957    long b = 0L;
958    for (Source src : sources) {
959      b += src.getScheduledSize();
960    }
961    return b;
962  }
963
  /**
   * Register a source/target pair by adding the source to the sources
   * collection and the target to the targets collection.
   */
  void add(Source source, StorageGroup target) {
    sources.add(source);
    targets.add(target);
  }
968
969  private boolean shouldIgnore(DatanodeInfo dn) {
970    // ignore decommissioned nodes
971    final boolean decommissioned = dn.isDecommissioned();
972    // ignore decommissioning nodes
973    final boolean decommissioning = dn.isDecommissionInProgress();
974    // ignore nodes in exclude list
975    final boolean excluded = Util.isExcluded(excludedNodes, dn);
976    // ignore nodes not in the include list (if include list is not empty)
977    final boolean notIncluded = !Util.isIncluded(includedNodes, dn);
978
979    if (decommissioned || decommissioning || excluded || notIncluded) {
980      if (LOG.isTraceEnabled()) {
981        LOG.trace("Excluding datanode " + dn + ": " + decommissioned + ", "
982            + decommissioning + ", " + excluded + ", " + notIncluded);
983      }
984      return true;
985    }
986    return false;
987  }
988
989  /** Get live datanode storage reports and then build the network topology. */
990  public List<DatanodeStorageReport> init() throws IOException {
991    final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();
992    final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>(); 
993    // create network topology and classify utilization collections:
994    // over-utilized, above-average, below-average and under-utilized.
995    for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) {
996      final DatanodeInfo datanode = r.getDatanodeInfo();
997      if (shouldIgnore(datanode)) {
998        continue;
999      }
1000      trimmed.add(r);
1001      cluster.add(datanode);
1002    }
1003    return trimmed;
1004  }
1005
  /**
   * Create a {@link DDatanode} for the given datanode, configured with this
   * dispatcher's maxConcurrentMovesPerNode setting.
   */
  public DDatanode newDatanode(DatanodeInfo datanode) {
    return new DDatanode(datanode, maxConcurrentMovesPerNode);
  }
1009
1010
1011  public void executePendingMove(final PendingMove p) {
1012    // move the block
1013    final DDatanode targetDn = p.target.getDDatanode();
1014    ExecutorService moveExecutor = targetDn.getMoveExecutor();
1015    if (moveExecutor == null) {
1016      final int nThreads = moverThreadAllocator.allocate();
1017      if (nThreads > 0) {
1018        moveExecutor = targetDn.initMoveExecutor(nThreads);
1019      }
1020    }
1021    if (moveExecutor == null) {
1022      LOG.warn("No mover threads available: skip moving " + p);
1023      targetDn.removePendingBlock(p);
1024      p.proxySource.removePendingBlock(p);
1025      return;
1026    }
1027
1028    moveExecutor.execute(new Runnable() {
1029      @Override
1030      public void run() {
1031        p.dispatch();
1032      }
1033    });
1034  }
1035
1036  public boolean dispatchAndCheckContinue() throws InterruptedException {
1037    return nnc.shouldContinue(dispatchBlockMoves());
1038  }
1039
  /**
   * The best-effort limit on the number of RPCs per second
   * the Balancer will send to the NameNode. Used when staggering the start
   * delays of the per-source dispatch tasks.
   */
  final static int BALANCER_NUM_RPC_PER_SEC = 20;
1045
1046  /**
1047   * Dispatch block moves for each source. The thread selects blocks to move &
1048   * sends request to proxy source to initiate block move. The process is flow
1049   * controlled. Block selection is blocked if there are too many un-confirmed
1050   * block moves.
1051   * 
1052   * @return the total number of bytes successfully moved in this iteration.
1053   */
1054  private long dispatchBlockMoves() throws InterruptedException {
1055    final long bytesLastMoved = getBytesMoved();
1056    final Future<?>[] futures = new Future<?>[sources.size()];
1057
1058    int concurrentThreads = Math.min(sources.size(),
1059        ((ThreadPoolExecutor)dispatchExecutor).getCorePoolSize());
1060    assert concurrentThreads > 0 : "Number of concurrent threads is 0.";
1061    if (LOG.isDebugEnabled()) {
1062      LOG.debug("Balancer allowed RPCs per sec = " + BALANCER_NUM_RPC_PER_SEC);
1063      LOG.debug("Balancer concurrent threads = " + concurrentThreads);
1064      LOG.debug("Disperse Interval sec = " +
1065          concurrentThreads / BALANCER_NUM_RPC_PER_SEC);
1066    }
1067
1068    // Determine the size of each mover thread pool per target
1069    int threadsPerTarget = maxMoverThreads/targets.size();
1070    if (threadsPerTarget == 0) {
1071      // Some scheduled moves will get ignored as some targets won't have
1072      // any threads allocated.
1073      moverThreadAllocator.setLotSize(1);
1074      LOG.warn(DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY + "=" +
1075          maxMoverThreads + " is too small for moving blocks to " +
1076          targets.size() + " targets. Balancing may be slower.");
1077    } else {
1078      if  (threadsPerTarget > maxConcurrentMovesPerNode) {
1079        threadsPerTarget = maxConcurrentMovesPerNode;
1080        LOG.info("Limiting threads per target to the specified max.");
1081      }
1082      moverThreadAllocator.setLotSize(threadsPerTarget);
1083      LOG.info("Allocating " + threadsPerTarget + " threads per target.");
1084    }
1085
1086    long dSec = 0;
1087    final Iterator<Source> i = sources.iterator();
1088    for (int j = 0; j < futures.length; j++) {
1089      final Source s = i.next();
1090      final long delay = dSec * 1000;
1091      futures[j] = dispatchExecutor.submit(new Runnable() {
1092        @Override
1093        public void run() {
1094          s.dispatchBlocks(delay);
1095        }
1096      });
1097      // Calculate delay in seconds for the next iteration
1098      if(j >= concurrentThreads) {
1099        dSec = 0;
1100      } else if((j + 1) % BALANCER_NUM_RPC_PER_SEC == 0) {
1101        dSec++;
1102      }
1103    }
1104
1105    // wait for all dispatcher threads to finish
1106    for (Future<?> future : futures) {
1107      try {
1108        future.get();
1109      } catch (ExecutionException e) {
1110        LOG.warn("Dispatcher thread failed", e.getCause());
1111      }
1112    }
1113
1114    // wait for all block moving to be done
1115    waitForMoveCompletion(targets);
1116
1117    return getBytesMoved() - bytesLastMoved;
1118  }
1119
1120  /**
1121   * Wait for all block move confirmations.
1122   * @return true if there is failed move execution
1123   */
1124  public static boolean waitForMoveCompletion(
1125      Iterable<? extends StorageGroup> targets) {
1126    boolean hasFailure = false;
1127    for(;;) {
1128      boolean empty = true;
1129      for (StorageGroup t : targets) {
1130        if (!t.getDDatanode().isPendingQEmpty()) {
1131          empty = false;
1132          break;
1133        } else {
1134          hasFailure |= t.getDDatanode().hasFailure;
1135        }
1136      }
1137      if (empty) {
1138        return hasFailure; // all pending queues are empty
1139      }
1140      try {
1141        Thread.sleep(1000);
1142      } catch (InterruptedException ignored) {
1143      }
1144    }
1145  }
1146
1147  /**
1148   * Decide if the block is a good candidate to be moved from source to target.
1149   * A block is a good candidate if
1150   * 1. the block is not in the process of being moved/has not been moved;
1151   * 2. the block does not have a replica on the target;
1152   * 3. doing the move does not reduce the number of racks that the block has
1153   */
1154  private boolean isGoodBlockCandidate(StorageGroup source, StorageGroup target,
1155      StorageType targetStorageType, DBlock block) {
1156    if (source.equals(target)) {
1157      return false;
1158    }
1159    if (target.storageType != targetStorageType) {
1160      return false;
1161    }
1162    // check if the block is moved or not
1163    if (movedBlocks.contains(block.getBlock())) {
1164      return false;
1165    }
1166    final DatanodeInfo targetDatanode = target.getDatanodeInfo();
1167    if (source.getDatanodeInfo().equals(targetDatanode)) {
1168      // the block is moved inside same DN
1169      return true;
1170    }
1171
1172    // check if block has replica in target node
1173    for (StorageGroup blockLocation : block.getLocations()) {
1174      if (blockLocation.getDatanodeInfo().equals(targetDatanode)) {
1175        return false;
1176      }
1177    }
1178
1179    if (cluster.isNodeGroupAware()
1180        && isOnSameNodeGroupWithReplicas(source, target, block)) {
1181      return false;
1182    }
1183    if (reduceNumOfRacks(source, target, block)) {
1184      return false;
1185    }
1186    return true;
1187  }
1188
1189  /**
1190   * Determine whether moving the given block replica from source to target
1191   * would reduce the number of racks of the block replicas.
1192   */
1193  private boolean reduceNumOfRacks(StorageGroup source, StorageGroup target,
1194      DBlock block) {
1195    final DatanodeInfo sourceDn = source.getDatanodeInfo();
1196    if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) {
1197      // source and target are on the same rack
1198      return false;
1199    }
1200    boolean notOnSameRack = true;
1201    synchronized (block) {
1202      for (StorageGroup loc : block.getLocations()) {
1203        if (cluster.isOnSameRack(loc.getDatanodeInfo(), target.getDatanodeInfo())) {
1204          notOnSameRack = false;
1205          break;
1206        }
1207      }
1208    }
1209    if (notOnSameRack) {
1210      // target is not on the same rack as any replica
1211      return false;
1212    }
1213    for (StorageGroup g : block.getLocations()) {
1214      if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) {
1215        // source is on the same rack of another replica
1216        return false;
1217      }
1218    }
1219    return true;
1220  }
1221
1222  /**
1223   * Check if there are any replica (other than source) on the same node group
1224   * with target. If true, then target is not a good candidate for placing
1225   * specific replica as we don't want 2 replicas under the same nodegroup.
1226   *
1227   * @return true if there are any replica (other than source) on the same node
1228   *         group with target
1229   */
1230  private boolean isOnSameNodeGroupWithReplicas(StorageGroup source,
1231      StorageGroup target, DBlock block) {
1232    final DatanodeInfo targetDn = target.getDatanodeInfo();
1233    for (StorageGroup g : block.getLocations()) {
1234      if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) {
1235        return true;
1236      }
1237    }
1238    return false;
1239  }
1240
  /**
   * Reset all fields in order to prepare for the next iteration.
   *
   * @param conf configuration used to obtain the network topology instance
   */
  void reset(Configuration conf) {
    // Replace the topology and forget the storage groups and sources.
    cluster = NetworkTopology.getInstance(conf);
    storageGroupMap.clear();
    sources.clear();

    // Reset the mover-thread allocator and shut down each target's move
    // executor before the targets themselves are forgotten.
    moverThreadAllocator.reset();
    for(StorageGroup t : targets) {
      t.getDDatanode().shutdownMoveExecutor();
    }
    targets.clear();
    // NOTE(review): presumably drops every global block except those still
    // tracked in movedBlocks, then lets movedBlocks discard stale entries --
    // confirm against removeAllButRetain() and cleanup().
    globalBlocks.removeAllButRetain(movedBlocks);
    movedBlocks.cleanup();
  }
1255
  /**
   * Test hook: overwrite the static delayAfterErrors value.
   *
   * @param time the new value (presumably milliseconds -- TODO confirm)
   */
  @VisibleForTesting
  public static void setDelayAfterErrors(long time) {
    delayAfterErrors = time;
  }
1260
1261  /** shutdown thread pools */
1262  public void shutdownNow() {
1263    if (dispatchExecutor != null) {
1264      dispatchExecutor.shutdownNow();
1265    }
1266  }
1267
1268  static class Util {
1269    /** @return true if data node is part of the excludedNodes. */
1270    static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) {
1271      return isIn(excludedNodes, dn);
1272    }
1273
1274    /**
1275     * @return true if includedNodes is empty or data node is part of the
1276     *         includedNodes.
1277     */
1278    static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) {
1279      return (includedNodes.isEmpty() || isIn(includedNodes, dn));
1280    }
1281
1282    /**
1283     * Match is checked using host name , ip address with and without port
1284     * number.
1285     * 
1286     * @return true if the datanode's transfer address matches the set of nodes.
1287     */
1288    private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) {
1289      return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort())
1290          || isIn(datanodes, dn.getIpAddr(), dn.getXferPort())
1291          || isIn(datanodes, dn.getHostName(), dn.getXferPort());
1292    }
1293
1294    /** @return true if nodes contains host or host:port */
1295    private static boolean isIn(Set<String> nodes, String host, int port) {
1296      if (host == null) {
1297        return false;
1298      }
1299      return (nodes.contains(host) || nodes.contains(host + ":" + port));
1300    }
1301
1302    /**
1303     * Parse a comma separated string to obtain set of host names
1304     * 
1305     * @return set of host names
1306     */
1307    static Set<String> parseHostList(String string) {
1308      String[] addrs = StringUtils.getTrimmedStrings(string);
1309      return new HashSet<String>(Arrays.asList(addrs));
1310    }
1311
1312    /**
1313     * Read set of host names from a file
1314     * 
1315     * @return set of host names
1316     */
1317    static Set<String> getHostListFromFile(String fileName, String type) {
1318      Set<String> nodes = new HashSet<String>();
1319      try {
1320        HostsFileReader.readFileToSet(type, fileName, nodes);
1321        return StringUtils.getTrimmedStrings(nodes);
1322      } catch (IOException e) {
1323        throw new IllegalArgumentException(
1324            "Failed to read host list from file: " + fileName);
1325      }
1326    }
1327  }
1328}