/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/** Dispatching block replica moves between datanodes. */
@InterfaceAudience.Private
public class Dispatcher {
  static final Log LOG = LogFactory.getLog(Dispatcher.class);

  private static final long GB = 1L << 30; // 1GB
  private static final long MAX_BLOCKS_SIZE_TO_FETCH = 2 * GB;

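  /**
   * Give up dispatching for a source after this many consecutive
   * iterations in which no pending move could be scheduled.
   */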
  private static final int MAX_NO_PENDING_MOVE_ITERATIONS = 5;
  /**
   * The period of time to delay the usage of a DataNode after hitting
   * errors when using it for migrating data.
   */
  private static long delayAfterErrors = 10 * 1000;

  private final NameNodeConnector nnc;
  private final SaslDataTransferClient saslClient;

  /** Set of datanodes to be excluded. */
  private final Set<String> excludedNodes;
  /** Restrict to the following nodes. */
  private final Set<String> includedNodes;

  private final Collection<Source> sources = new HashSet<Source>();
  private final Collection<StorageGroup> targets = new HashSet<StorageGroup>();

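  /** One shared DBlock per block, so that all sources see a single,
   *  continuously updated list of the block's locations. */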
  private final GlobalBlockMap globalBlocks = new GlobalBlockMap();
  private final MovedBlocks<StorageGroup> movedBlocks;

  /** Map (datanodeUuid,storageType -> StorageGroup) */
  private final StorageGroupMap<StorageGroup> storageGroupMap
      = new StorageGroupMap<StorageGroup>();

  private NetworkTopology cluster;

  private final ExecutorService moveExecutor;
  private final ExecutorService dispatchExecutor;

  /** The maximum number of concurrent block moves at a datanode. */
  private final int maxConcurrentMovesPerNode;

  private static class GlobalBlockMap {
    private final Map<Block, DBlock> map = new HashMap<Block, DBlock>();

    /**
     * Get the block from the map;
     * if the block is not found, create a new block and put it in the map.
     */
    private DBlock get(Block b) {
      DBlock block = map.get(b);
      if (block == null) {
        block = new DBlock(b);
        map.put(b, block);
      }
      return block;
    }

    /** Remove all blocks except for the moved blocks. */
    private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) {
      for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) {
        if (!movedBlocks.contains(i.next())) {
          i.remove();
        }
      }
    }
  }

  public static class StorageGroupMap<G extends StorageGroup> {
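    // Keys are "datanodeUuid:storageType", i.e. one group per storage type
    // per datanode.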
    private static String toKey(String datanodeUuid, StorageType storageType) {
      return datanodeUuid + ":" + storageType;
    }

    private final Map<String, G> map = new HashMap<String, G>();

    public G get(String datanodeUuid, StorageType storageType) {
      return map.get(toKey(datanodeUuid, storageType));
    }

    public void put(G g) {
      final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType);
      final StorageGroup existing = map.put(key, g);
      Preconditions.checkState(existing == null);
    }

    int size() {
      return map.size();
    }

    void clear() {
      map.clear();
    }

    public Collection<G> values() {
      return map.values();
    }
  }

  /** This class keeps track of a scheduled block move */
  public class PendingMove {
    private DBlock block;
    private Source source;
    private DDatanode proxySource;
    private StorageGroup target;

    private PendingMove(Source source, StorageGroup target) {
      this.source = source;
      this.target = target;
    }

    @Override
    public String toString() {
      final Block b = block != null ? block.getBlock() : null;
      String bStr = b != null ? (b + " with size=" + b.getNumBytes() + " ")
          : " ";
      return bStr + "from " + source.getDisplayName() + " to " + target
          .getDisplayName() + " through " + (proxySource != null ? proxySource
          .datanode : "");
    }

    /**
     * Choose a block & a proxy source for this pendingMove whose source &
     * target have already been chosen.
     *
     * @return true if a block and its proxy are chosen; false otherwise
     */
    private boolean chooseBlockAndProxy() {
      // source and target must have the same storage type
      final StorageType t = source.getStorageType();
      // iterate over the source's blocks until a good one is found
      for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {
        if (markMovedIfGoodBlock(i.next(), t)) {
          i.remove();
          return true;
        }
      }
      return false;
    }

    /**
     * @return true if the given block is good for the tentative move.
     */
    private boolean markMovedIfGoodBlock(DBlock block, StorageType targetStorageType) {
      synchronized (block) {
        synchronized (movedBlocks) {
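          // Both locks are held here: the block's location list may be
          // updated concurrently by getBlockList(), and movedBlocks guards
          // the time-windowed set of already-moved blocks.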
          if (isGoodBlockCandidate(source, target, targetStorageType, block)) {
            this.block = block;
            if (chooseProxySource()) {
              movedBlocks.put(block);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Decided to move " + this);
              }
              return true;
            }
          }
        }
      }
      return false;
    }

    /**
     * Choose a proxy source.
     *
     * @return true if a proxy is found; otherwise false
     */
    private boolean chooseProxySource() {
      final DatanodeInfo targetDN = target.getDatanodeInfo();
      // if the cluster is node-group aware, first try replicas in the same
      // node group as the target
      if (cluster.isNodeGroupAware()) {
        for (StorageGroup loc : block.getLocations()) {
          if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN)
              && addTo(loc)) {
            return true;
          }
        }
      }
      // then check if there is a replica on the same rack as the target
      for (StorageGroup loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN) && addTo(loc)) {
          return true;
        }
      }
      // otherwise, find any non-busy replica
      for (StorageGroup loc : block.getLocations()) {
        if (addTo(loc)) {
          return true;
        }
      }
      return false;
    }

    /** Try to use the given storage group as the proxy source for this move. */
    private boolean addTo(StorageGroup g) {
      final DDatanode dn = g.getDDatanode();
      if (dn.addPendingBlock(this)) {
        proxySource = dn;
        return true;
      }
      return false;
    }

    /** Dispatch the move to the proxy source & wait for the response. */
    private void dispatch() {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Start moving " + this);
      }

      Socket sock = new Socket();
      DataOutputStream out = null;
      DataInputStream in = null;
      try {
        sock.connect(
            NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),
            HdfsServerConstants.READ_TIMEOUT);

        sock.setKeepAlive(true);

        OutputStream unbufOut = sock.getOutputStream();
        InputStream unbufIn = sock.getInputStream();
        ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(),
            block.getBlock());
        final KeyManager km = nnc.getKeyManager();
        Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb);
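        // Negotiate SASL on the raw streams; on secured clusters this may
        // wrap them for authentication and, optionally, encryption.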
        IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut,
            unbufIn, km, accessToken, target.getDatanodeInfo());
        unbufOut = saslStreams.out;
        unbufIn = saslStreams.in;
        out = new DataOutputStream(new BufferedOutputStream(unbufOut,
            HdfsConstants.IO_FILE_BUFFER_SIZE));
        in = new DataInputStream(new BufferedInputStream(unbufIn,
            HdfsConstants.IO_FILE_BUFFER_SIZE));

        sendRequest(out, eb, accessToken);
        receiveResponse(in);
        nnc.getBytesMoved().addAndGet(block.getNumBytes());
        LOG.info("Successfully moved " + this);
      } catch (IOException e) {
        LOG.warn("Failed to move " + this + ": " + e.getMessage());
        target.getDDatanode().setHasFailure();
        // Proxy or target may have some issues, delay before using these nodes
        // further in order to avoid a potential storm of "threads quota
        // exceeded" warnings when the dispatcher gets out of sync with work
        // going on in datanodes.
        proxySource.activateDelay(delayAfterErrors);
        target.getDDatanode().activateDelay(delayAfterErrors);
      } finally {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(sock);

        proxySource.removePendingBlock(this);
        target.getDDatanode().removePendingBlock(this);

        synchronized (this) {
          reset();
        }
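        // Wake up dispatcher threads waiting for a pending-queue slot.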
        synchronized (Dispatcher.this) {
          Dispatcher.this.notifyAll();
        }
      }
    }

    /** Send a block replace request to the output stream */
    private void sendRequest(DataOutputStream out, ExtendedBlock eb,
        Token<BlockTokenIdentifier> accessToken) throws IOException {
      new Sender(out).replaceBlock(eb, target.storageType, accessToken,
          source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);
    }

    /** Receive a block copy response from the input stream */
    private void receiveResponse(DataInputStream in) throws IOException {
      BlockOpResponseProto response =
          BlockOpResponseProto.parseFrom(vintPrefixed(in));
      while (response.getStatus() == Status.IN_PROGRESS) {
        // read intermediate responses
        response = BlockOpResponseProto.parseFrom(vintPrefixed(in));
      }
      if (response.getStatus() != Status.SUCCESS) {
        if (response.getStatus() == Status.ERROR_ACCESS_TOKEN) {
          throw new IOException("block move failed due to access token error");
        }
        throw new IOException("block move failed: " + response.getMessage());
      }
    }

    /** reset the object */
    private void reset() {
      block = null;
      source = null;
      proxySource = null;
      target = null;
    }
  }

  /** A class for keeping track of block locations in the dispatcher. */
  public static class DBlock extends MovedBlocks.Locations<StorageGroup> {
    public DBlock(Block block) {
      super(block);
    }

    @Override
    public synchronized boolean isLocatedOn(StorageGroup loc) {
      // currently we only check if replicas are located on the same DataNodes
      // since we do not have the capability to store two replicas in the same
      // DataNode even though they are on two different storage types
      for (StorageGroup existing : locations) {
        if (existing.getDatanodeInfo().equals(loc.getDatanodeInfo())) {
          return true;
        }
      }
      return false;
    }
  }

  /** The class represents a desired move. */
  static class Task {
    private final StorageGroup target;
    private long size; // bytes scheduled to move

    Task(StorageGroup target, long size) {
      this.target = target;
      this.size = size;
    }

    long getSize() {
      return size;
    }
  }

  /** A class that keeps track of a datanode. */
  public static class DDatanode {

    /** A group of storages in a datanode with the same storage type. */
    public class StorageGroup {
      final StorageType storageType;
      final long maxSize2Move;
      private long scheduledSize = 0L;

      private StorageGroup(StorageType storageType, long maxSize2Move) {
        this.storageType = storageType;
        this.maxSize2Move = maxSize2Move;
      }

      public StorageType getStorageType() {
        return storageType;
      }

      private DDatanode getDDatanode() {
        return DDatanode.this;
      }

      public DatanodeInfo getDatanodeInfo() {
        return DDatanode.this.datanode;
      }

      /** Decide whether this storage group still has bytes to schedule. */
      boolean hasSpaceForScheduling() {
        return hasSpaceForScheduling(0L);
      }

      synchronized boolean hasSpaceForScheduling(long size) {
        return availableSizeToMove() > size;
      }

      /** @return the number of bytes that can still be scheduled to move */
      synchronized long availableSizeToMove() {
        return maxSize2Move - scheduledSize;
      }

      /** increment scheduled size */
      public synchronized void incScheduledSize(long size) {
        scheduledSize += size;
      }

      /** @return scheduled size */
      synchronized long getScheduledSize() {
        return scheduledSize;
      }

      /** Reset scheduled size to zero. */
      synchronized void resetScheduledSize() {
        scheduledSize = 0L;
      }

      private PendingMove addPendingMove(DBlock block, final PendingMove pm) {
        if (getDDatanode().addPendingBlock(pm)) {
          if (pm.markMovedIfGoodBlock(block, getStorageType())) {
            incScheduledSize(pm.block.getNumBytes());
            return pm;
          } else {
            getDDatanode().removePendingBlock(pm);
          }
        }
        return null;
      }

      /** @return the name for display */
      String getDisplayName() {
        return datanode + ":" + storageType;
      }

      @Override
      public String toString() {
        return getDisplayName();
      }
    }

    final DatanodeInfo datanode;
    private final EnumMap<StorageType, Source> sourceMap
        = new EnumMap<StorageType, Source>(StorageType.class);
    private final EnumMap<StorageType, StorageGroup> targetMap
        = new EnumMap<StorageType, StorageGroup>(StorageType.class);
    protected long delayUntil = 0L;
    /** blocks being moved but not confirmed yet */
    private final List<PendingMove> pendings;
    private volatile boolean hasFailure = false;
    private final int maxConcurrentMoves;

    @Override
    public String toString() {
      return getClass().getSimpleName() + ":" + datanode;
    }

    private DDatanode(DatanodeInfo datanode, int maxConcurrentMoves) {
      this.datanode = datanode;
      this.maxConcurrentMoves = maxConcurrentMoves;
      this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);
    }

    public DatanodeInfo getDatanodeInfo() {
      return datanode;
    }

    private static <G extends StorageGroup> void put(StorageType storageType,
        G g, EnumMap<StorageType, G> map) {
      final StorageGroup existing = map.put(storageType, g);
      Preconditions.checkState(existing == null);
    }

    public StorageGroup addTarget(StorageType storageType, long maxSize2Move) {
      final StorageGroup g = new StorageGroup(storageType, maxSize2Move);
      put(storageType, g, targetMap);
      return g;
    }

    public Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {
      final Source s = d.new Source(storageType, maxSize2Move, this);
      put(storageType, s, sourceMap);
      return s;
    }

    synchronized private void activateDelay(long delta) {
      delayUntil = Time.monotonicNow() + delta;
    }

    synchronized private boolean isDelayActive() {
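      // delayUntil == 0 means no delay; once the deadline passes, clear it
      // so the node becomes usable again.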
      if (delayUntil == 0 || Time.monotonicNow() > delayUntil) {
        delayUntil = 0;
        return false;
      }
      return true;
    }

    /** Check if the node can schedule more blocks to move */
    synchronized boolean isPendingQNotFull() {
      return pendings.size() < maxConcurrentMoves;
    }

    /** Check if all the dispatched moves are done */
    synchronized boolean isPendingQEmpty() {
      return pendings.isEmpty();
    }

    /** Add a scheduled block move to the node */
    synchronized boolean addPendingBlock(PendingMove pendingBlock) {
      if (!isDelayActive() && isPendingQNotFull()) {
        return pendings.add(pendingBlock);
      }
      return false;
    }

    /** Remove a scheduled block move from the node */
    synchronized boolean removePendingBlock(PendingMove pendingBlock) {
      return pendings.remove(pendingBlock);
    }

    void setHasFailure() {
      this.hasFailure = true;
    }
  }

  /** A node that can be the source of a block move. */
  public class Source extends DDatanode.StorageGroup {

    private final List<Task> tasks = new ArrayList<Task>(2);
    private long blocksToReceive = 0L;
    /**
     * Source blocks point to the objects in {@link Dispatcher#globalBlocks}
     * because we want to keep one copy of a block and be aware that the
     * locations are changing over time.
     */
    private final List<DBlock> srcBlocks = new ArrayList<DBlock>();

    private Source(StorageType storageType, long maxSize2Move, DDatanode dn) {
      dn.super(storageType, maxSize2Move);
    }

    /** Add a task */
    void addTask(Task task) {
      Preconditions.checkState(task.target != this,
          "Source and target are the same storage group " + getDisplayName());
      incScheduledSize(task.size);
      tasks.add(task);
    }

    /** @return an iterator over this source's blocks */
    Iterator<DBlock> getBlockIterator() {
      return srcBlocks.iterator();
    }

    /**
     * Fetch new blocks of this source from namenode and update this source's
     * block list & {@link Dispatcher#globalBlocks}.
     *
     * @return the total size of the received blocks in bytes.
     */
    private long getBlockList() throws IOException {
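      // cap each getBlocks RPC at MAX_BLOCKS_SIZE_TO_FETCH to bound the
      // amount of work the namenode does per call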
      final long size = Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive);
      final BlocksWithLocations newBlocks = nnc.getBlocks(getDatanodeInfo(), size);

      long bytesReceived = 0;
      for (BlockWithLocations blk : newBlocks.getBlocks()) {
        bytesReceived += blk.getBlock().getNumBytes();
        synchronized (globalBlocks) {
          final DBlock block = globalBlocks.get(blk.getBlock());
          synchronized (block) {
            block.clearLocations();

            // update locations
            final String[] datanodeUuids = blk.getDatanodeUuids();
            final StorageType[] storageTypes = blk.getStorageTypes();
            for (int i = 0; i < datanodeUuids.length; i++) {
              final StorageGroup g = storageGroupMap.get(
                  datanodeUuids[i], storageTypes[i]);
              if (g != null) { // not unknown
                block.addLocation(g);
              }
            }
          }
          if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) {
            // filter bad candidates
            srcBlocks.add(block);
          }
        }
      }
      return bytesReceived;
    }

    /** Decide whether the given block is a good candidate to move. */
    private boolean isGoodBlockCandidate(DBlock block) {
      // source and target must have the same storage type
      final StorageType sourceStorageType = getStorageType();
      for (Task t : tasks) {
        if (Dispatcher.this.isGoodBlockCandidate(this, t.target,
            sourceStorageType, block)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Choose a move for the source. The block, the proxy source, and the
     * target are all determined. When choosing the proxy and the target,
     * source & target throttling is taken into account: they are chosen
     * only when they have the capacity to support this block move. The
     * block should be dispatched immediately after this method returns.
     *
     * @return a move that's good for the source to dispatch immediately.
     */
    private PendingMove chooseNextMove() {
      for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {
        final Task task = i.next();
        final DDatanode target = task.target.getDDatanode();
        final PendingMove pendingBlock = new PendingMove(this, task.target);
        if (target.addPendingBlock(pendingBlock)) {
          // target is not busy, so do a tentative block allocation
          if (pendingBlock.chooseBlockAndProxy()) {
            long blockSize = pendingBlock.block.getNumBytes();
            incScheduledSize(-blockSize);
            task.size -= blockSize;
            if (task.size == 0) {
              i.remove();
            }
            return pendingBlock;
          } else {
            // cancel the tentative move
            target.removePendingBlock(pendingBlock);
          }
        }
      }
      return null;
    }

    /** Add a pending move */
    public PendingMove addPendingMove(DBlock block, StorageGroup target) {
      return target.addPendingMove(block, new PendingMove(this, target));
    }

    /** Iterate over this source's blocks and remove the ones already moved. */
    private void removeMovedBlocks() {
      for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) {
        if (movedBlocks.contains(i.next().getBlock())) {
          i.remove();
        }
      }
    }

    private static final int SOURCE_BLOCKS_MIN_SIZE = 5;

    /** @return true if more blocks should be fetched from the namenode */
    private boolean shouldFetchMoreBlocks() {
      return srcBlocks.size() < SOURCE_BLOCKS_MIN_SIZE && blocksToReceive > 0;
    }

    private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins

    /**
     * This method iteratively does the following: it first selects a block to
     * move, then sends a request to the proxy source to start the block move.
     * When the source's block list falls below a threshold, it asks the
     * namenode for more blocks. It terminates when it has dispatched enough
     * block move tasks, has received enough blocks from the namenode, or the
     * elapsed time of the iteration has exceeded the max time limit.
     */
    private void dispatchBlocks() {
      final long startTime = Time.monotonicNow();
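      // ask for roughly twice the scheduled bytes, leaving headroom for
      // fetched blocks that turn out to be unmovable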
      this.blocksToReceive = 2 * getScheduledSize();
      boolean isTimeUp = false;
      int noPendingMoveIteration = 0;
      while (!isTimeUp && getScheduledSize() > 0
          && (!srcBlocks.isEmpty() || blocksToReceive > 0)) {
        final PendingMove p = chooseNextMove();
        if (p != null) {
          // Reset no pending move counter
          noPendingMoveIteration = 0;
          executePendingMove(p);
          continue;
        }

        // Since we cannot schedule any block to move,
        // remove any moved blocks from the source block list.
        removeMovedBlocks();
        // check if we should fetch more blocks from the namenode
        if (shouldFetchMoreBlocks()) {
          // fetch new blocks
          try {
            blocksToReceive -= getBlockList();
            continue;
          } catch (IOException e) {
            LOG.warn("Exception while getting block list", e);
            return;
          }
        } else {
          // this source could not find a block to move in this iteration
          noPendingMoveIteration++;
          // if no block can be moved after MAX_NO_PENDING_MOVE_ITERATIONS
          // consecutive iterations, clear the scheduled size to end the loop.
          if (noPendingMoveIteration >= MAX_NO_PENDING_MOVE_ITERATIONS) {
            resetScheduledSize();
          }
        }

        // check if time is up or not
        if (Time.monotonicNow() - startTime > MAX_ITERATION_TIME) {
          isTimeUp = true;
          continue;
        }

        // Now we cannot schedule any block to move and there are
        // no new blocks added to the source block list, so we wait.
        try {
          synchronized (Dispatcher.this) {
            Dispatcher.this.wait(1000); // wait for targets/sources to be idle
          }
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
      int dispatcherThreads, int maxConcurrentMovesPerNode, Configuration conf) {
    this.nnc = nnc;
    this.excludedNodes = excludedNodes;
    this.includedNodes = includedNodes;
    this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth);

    this.cluster = NetworkTopology.getInstance(conf);

    this.moveExecutor = Executors.newFixedThreadPool(moverThreads);
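    // a caller may pass dispatcherThreads == 0 to skip creating the dispatch
    // executor and drive block moves itself; dispatchExecutor is null then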
    this.dispatchExecutor = dispatcherThreads == 0 ? null
        : Executors.newFixedThreadPool(dispatcherThreads);
    this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;

    this.saslClient = new SaslDataTransferClient(conf,
        DataTransferSaslUtil.getSaslPropertiesResolver(conf),
        TrustedChannelResolver.getInstance(conf), nnc.fallbackToSimpleAuth);
  }

  public DistributedFileSystem getDistributedFileSystem() {
    return nnc.getDistributedFileSystem();
  }

  public StorageGroupMap<StorageGroup> getStorageGroupMap() {
    return storageGroupMap;
  }

  public NetworkTopology getCluster() {
    return cluster;
  }

  long getBytesMoved() {
    return nnc.getBytesMoved().get();
  }

  long bytesToMove() {
    Preconditions.checkState(
        storageGroupMap.size() >= sources.size() + targets.size(),
        "Mismatched number of storage groups (" + storageGroupMap.size()
            + " < " + sources.size() + " sources + " + targets.size()
            + " targets)");

    long b = 0L;
    for (Source src : sources) {
      b += src.getScheduledSize();
    }
    return b;
  }

  void add(Source source, StorageGroup target) {
    sources.add(source);
    targets.add(target);
  }

  private boolean shouldIgnore(DatanodeInfo dn) {
    // ignore decommissioned nodes
    final boolean decommissioned = dn.isDecommissioned();
    // ignore decommissioning nodes
    final boolean decommissioning = dn.isDecommissionInProgress();
    // ignore nodes in exclude list
    final boolean excluded = Util.isExcluded(excludedNodes, dn);
    // ignore nodes not in the include list (if include list is not empty)
    final boolean notIncluded = !Util.isIncluded(includedNodes, dn);

    if (decommissioned || decommissioning || excluded || notIncluded) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Excluding datanode " + dn
            + ": decommissioned=" + decommissioned
            + ", decommissioning=" + decommissioning
            + ", excluded=" + excluded + ", notIncluded=" + notIncluded);
      }
      return true;
    }
    return false;
  }

  /** Get live datanode storage reports and then build the network topology. */
  public List<DatanodeStorageReport> init() throws IOException {
    final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();
    final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>();
    // shuffle the reports and add the datanodes that should not be ignored to
    // the network topology; the caller later classifies the trimmed reports
    // into over-utilized, above-average, below-average and under-utilized
    // collections.
    for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) {
      final DatanodeInfo datanode = r.getDatanodeInfo();
      if (shouldIgnore(datanode)) {
        continue;
      }
      trimmed.add(r);
      cluster.add(datanode);
    }
    return trimmed;
  }

  public DDatanode newDatanode(DatanodeInfo datanode) {
    return new DDatanode(datanode, maxConcurrentMovesPerNode);
  }

  public void executePendingMove(final PendingMove p) {
    // move the block
    moveExecutor.execute(new Runnable() {
      @Override
      public void run() {
        p.dispatch();
      }
    });
  }

  public boolean dispatchAndCheckContinue() throws InterruptedException {
    return nnc.shouldContinue(dispatchBlockMoves());
  }

  /**
   * Dispatch block moves for each source. The thread selects blocks to move
   * and sends requests to the proxy sources to initiate the block moves. The
   * process is flow controlled: block selection is blocked if there are too
   * many unconfirmed block moves.
   *
   * @return the total number of bytes successfully moved in this iteration.
   */
  private long dispatchBlockMoves() throws InterruptedException {
    final long bytesLastMoved = getBytesMoved();
    final Future<?>[] futures = new Future<?>[sources.size()];

    final Iterator<Source> i = sources.iterator();
    for (int j = 0; j < futures.length; j++) {
      final Source s = i.next();
      futures[j] = dispatchExecutor.submit(new Runnable() {
        @Override
        public void run() {
          s.dispatchBlocks();
        }
      });
    }

    // wait for all dispatcher threads to finish
    for (Future<?> future : futures) {
      try {
        future.get();
      } catch (ExecutionException e) {
        LOG.warn("Dispatcher thread failed", e.getCause());
      }
    }

    // wait for all block moving to be done
    waitForMoveCompletion(targets);

    return getBytesMoved() - bytesLastMoved;
  }

  /** The sleep period between checks for block move completion. */
  static private long blockMoveWaitTime = 30000L;

  /**
   * Wait for all block move confirmations.
   * @return true if any move execution failed
   */
  public static boolean waitForMoveCompletion(
      Iterable<? extends StorageGroup> targets) {
    boolean hasFailure = false;
    for(;;) {
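      // poll until every target's pending queue has drained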
      boolean empty = true;
      for (StorageGroup t : targets) {
        if (!t.getDDatanode().isPendingQEmpty()) {
          empty = false;
          break;
        } else {
          hasFailure |= t.getDDatanode().hasFailure;
        }
      }
      if (empty) {
        return hasFailure; // all pending queues are empty
      }
      try {
        Thread.sleep(blockMoveWaitTime);
      } catch (InterruptedException ignored) {
      }
    }
  }

  /**
   * Decide if the block is a good candidate to be moved from source to target.
   * A block is a good candidate if
   * 1. the block is not in the process of being moved and has not already
   *    been moved;
   * 2. the block does not have a replica on the target;
   * 3. doing the move does not reduce the number of racks that the block has.
   */
  private boolean isGoodBlockCandidate(StorageGroup source, StorageGroup target,
      StorageType targetStorageType, DBlock block) {
    if (target.storageType != targetStorageType) {
      return false;
    }
    // check if the block is moved or not
    if (movedBlocks.contains(block.getBlock())) {
      return false;
    }
    if (block.isLocatedOn(target)) {
      return false;
    }
    if (cluster.isNodeGroupAware()
        && isOnSameNodeGroupWithReplicas(source, target, block)) {
      return false;
    }
    if (reduceNumOfRacks(source, target, block)) {
      return false;
    }
    return true;
  }

  /**
   * Determine whether moving the given block replica from source to target
   * would reduce the number of racks of the block replicas.
   */
  private boolean reduceNumOfRacks(StorageGroup source, StorageGroup target,
      DBlock block) {
    final DatanodeInfo sourceDn = source.getDatanodeInfo();
    if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) {
      // source and target are on the same rack
      return false;
    }
    boolean notOnSameRack = true;
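    // hold the block's lock while scanning its locations, since they may be
    // updated concurrently by getBlockList()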
    synchronized (block) {
      for (StorageGroup loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanodeInfo(), target.getDatanodeInfo())) {
          notOnSameRack = false;
          break;
        }
      }
    }
    if (notOnSameRack) {
      // target is not on the same rack as any replica
      return false;
    }
    for (StorageGroup g : block.getLocations()) {
      if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) {
        // source is on the same rack as another replica
        return false;
      }
    }
    return true;
  }

  /**
   * Check if any replica (other than source) is on the same node group as the
   * target. If so, the target is not a good candidate for placing this
   * replica, since we do not want two replicas in the same node group.
   *
   * @return true if any replica (other than source) is on the same node
   *         group as the target
   */
  private boolean isOnSameNodeGroupWithReplicas(StorageGroup source,
      StorageGroup target, DBlock block) {
    final DatanodeInfo targetDn = target.getDatanodeInfo();
    for (StorageGroup g : block.getLocations()) {
      if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) {
        return true;
      }
    }
    return false;
  }

  /** Reset all fields in order to prepare for the next iteration */
  void reset(Configuration conf) {
    cluster = NetworkTopology.getInstance(conf);
    storageGroupMap.clear();
    sources.clear();
    targets.clear();
    globalBlocks.removeAllButRetain(movedBlocks);
    movedBlocks.cleanup();
  }

  /** set the sleeping period for block move completion check */
  @VisibleForTesting
  public static void setBlockMoveWaitTime(long time) {
    blockMoveWaitTime = time;
  }

  @VisibleForTesting
  public static void setDelayAfterErrors(long time) {
    delayAfterErrors = time;
  }

  /** shutdown thread pools */
  public void shutdownNow() {
    if (dispatchExecutor != null) {
      dispatchExecutor.shutdownNow();
    }
    moveExecutor.shutdownNow();
  }

  static class Util {
    /** @return true if the datanode is in the set of excludedNodes. */
    static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) {
      return isIn(excludedNodes, dn);
    }

    /**
     * @return true if includedNodes is empty or the datanode is in the set of
     *         includedNodes.
     */
    static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) {
      return (includedNodes.isEmpty() || isIn(includedNodes, dn));
    }

    /**
     * Matching is checked against the host name and the IP address, each
     * with and without the port number.
     *
     * @return true if the datanode's transfer address matches the set of nodes.
     */
    private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) {
      return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort())
          || isIn(datanodes, dn.getIpAddr(), dn.getXferPort())
          || isIn(datanodes, dn.getHostName(), dn.getXferPort());
    }

    /** @return true if nodes contains host or host:port */
    private static boolean isIn(Set<String> nodes, String host, int port) {
      if (host == null) {
        return false;
      }
      return (nodes.contains(host) || nodes.contains(host + ":" + port));
    }

    /**
     * Parse a comma-separated string to obtain a set of host names.
     *
     * @return the set of host names
     */
    static Set<String> parseHostList(String string) {
      String[] addrs = StringUtils.getTrimmedStrings(string);
      return new HashSet<String>(Arrays.asList(addrs));
    }

    /**
     * Read a set of host names from a file.
     *
     * @return the set of host names
     */
    static Set<String> getHostListFromFile(String fileName, String type) {
      Set<String> nodes = new HashSet<String>();
      try {
        HostsFileReader.readFileToSet(type, fileName, nodes);
        return StringUtils.getTrimmedStrings(nodes);
      } catch (IOException e) {
        throw new IllegalArgumentException(
            "Failed to read host list from file: " + fileName);
      }
    }
  }
}