001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.List;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.List;
025import java.util.Random;
026import java.util.TreeMap;
027import java.util.concurrent.locks.ReadWriteLock;
028import java.util.concurrent.locks.ReentrantReadWriteLock;
029
030import com.google.common.annotations.VisibleForTesting;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033import org.apache.hadoop.classification.InterfaceAudience;
034import org.apache.hadoop.classification.InterfaceStability;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
037import org.apache.hadoop.util.ReflectionUtils;
038
039import com.google.common.base.Preconditions;
040import com.google.common.collect.Lists;
041
/** The class represents a cluster of computers with a tree-shaped,
 * hierarchical network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 *
 */
051@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
052@InterfaceStability.Unstable
053public class NetworkTopology {
054  public final static String DEFAULT_RACK = "/default-rack";
055  public final static int DEFAULT_HOST_LEVEL = 2;
056  public static final Log LOG =
057    LogFactory.getLog(NetworkTopology.class);
058
059  public static class InvalidTopologyException extends RuntimeException {
060    private static final long serialVersionUID = 1L;
061    public InvalidTopologyException(String msg) {
062      super(msg);
063    }
064  }
065  
066  /**
067   * Get an instance of NetworkTopology based on the value of the configuration
068   * parameter net.topology.impl.
069   * 
070   * @param conf the configuration to be used
071   * @return an instance of NetworkTopology
072   */
073  public static NetworkTopology getInstance(Configuration conf){
074    return ReflectionUtils.newInstance(
075        conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
076        NetworkTopology.class, NetworkTopology.class), conf);
077  }
078
079  /** InnerNode represents a switch/router of a data center or rack.
080   * Different from a leaf node, it has non-null children.
081   */
082  static class InnerNode extends NodeBase {
083    protected List<Node> children=new ArrayList<Node>();
084    private int numOfLeaves;
085        
086    /** Construct an InnerNode from a path-like string */
087    InnerNode(String path) {
088      super(path);
089    }
090        
091    /** Construct an InnerNode from its name and its network location */
092    InnerNode(String name, String location) {
093      super(name, location);
094    }
095        
096    /** Construct an InnerNode
097     * from its name, its network location, its parent, and its level */
098    InnerNode(String name, String location, InnerNode parent, int level) {
099      super(name, location, parent, level);
100    }
101        
102    /** @return its children */
103    List<Node> getChildren() {return children;}
104        
105    /** @return the number of children this node has */
106    int getNumOfChildren() {
107      return children.size();
108    }
109        
110    /** Judge if this node represents a rack 
111     * @return true if it has no child or its children are not InnerNodes
112     */ 
113    boolean isRack() {
114      if (children.isEmpty()) {
115        return true;
116      }
117            
118      Node firstChild = children.get(0);
119      if (firstChild instanceof InnerNode) {
120        return false;
121      }
122            
123      return true;
124    }
125        
126    /** Judge if this node is an ancestor of node <i>n</i>
127     * 
128     * @param n a node
129     * @return true if this node is an ancestor of <i>n</i>
130     */
131    boolean isAncestor(Node n) {
132      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
133        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
134        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
135    }
136        
137    /** Judge if this node is the parent of node <i>n</i>
138     * 
139     * @param n a node
140     * @return true if this node is the parent of <i>n</i>
141     */
142    boolean isParent(Node n) {
143      return n.getNetworkLocation().equals(getPath(this));
144    }
145        
146    /* Return a child name of this node who is an ancestor of node <i>n</i> */
147    private String getNextAncestorName(Node n) {
148      if (!isAncestor(n)) {
149        throw new IllegalArgumentException(
150                                           this + "is not an ancestor of " + n);
151      }
152      String name = n.getNetworkLocation().substring(getPath(this).length());
153      if (name.charAt(0) == PATH_SEPARATOR) {
154        name = name.substring(1);
155      }
156      int index=name.indexOf(PATH_SEPARATOR);
157      if (index !=-1)
158        name = name.substring(0, index);
159      return name;
160    }
161        
162    /** Add node <i>n</i> to the subtree of this node 
163     * @param n node to be added
164     * @return true if the node is added; false otherwise
165     */
166    boolean add(Node n) {
167      if (!isAncestor(n))
168        throw new IllegalArgumentException(n.getName()+", which is located at "
169                +n.getNetworkLocation()+", is not a decendent of "
170                +getPath(this));
171      if (isParent(n)) {
172        // this node is the parent of n; add n directly
173        n.setParent(this);
174        n.setLevel(this.level+1);
175        for(int i=0; i<children.size(); i++) {
176          if (children.get(i).getName().equals(n.getName())) {
177            children.set(i, n);
178            return false;
179          }
180        }
181        children.add(n);
182        numOfLeaves++;
183        return true;
184      } else {
185        // find the next ancestor node
186        String parentName = getNextAncestorName(n);
187        InnerNode parentNode = null;
188        for(int i=0; i<children.size(); i++) {
189          if (children.get(i).getName().equals(parentName)) {
190            parentNode = (InnerNode)children.get(i);
191            break;
192          }
193        }
194        if (parentNode == null) {
195          // create a new InnerNode
196          parentNode = createParentNode(parentName);
197          children.add(parentNode);
198        }
199        // add n to the subtree of the next ancestor node
200        if (parentNode.add(n)) {
201          numOfLeaves++;
202          return true;
203        } else {
204          return false;
205        }
206      }
207    }
208
209    /**
210     * Creates a parent node to be added to the list of children.  
211     * Creates a node using the InnerNode four argument constructor specifying 
212     * the name, location, parent, and level of this node.
213     * 
214     * <p>To be overridden in subclasses for specific InnerNode implementations,
215     * as alternative to overriding the full {@link #add(Node)} method.
216     * 
217     * @param parentName The name of the parent node
218     * @return A new inner node
219     * @see InnerNode#InnerNode(String, String, InnerNode, int)
220     */
221    protected InnerNode createParentNode(String parentName) {
222      return new InnerNode(parentName, getPath(this), this, this.getLevel()+1);
223    }
224
225    /** Remove node <i>n</i> from the subtree of this node
226     * @param n node to be deleted 
227     * @return true if the node is deleted; false otherwise
228     */
229    boolean remove(Node n) {
230      String parent = n.getNetworkLocation();
231      String currentPath = getPath(this);
232      if (!isAncestor(n))
233        throw new IllegalArgumentException(n.getName()
234                                           +", which is located at "
235                                           +parent+", is not a descendent of "+currentPath);
236      if (isParent(n)) {
237        // this node is the parent of n; remove n directly
238        for(int i=0; i<children.size(); i++) {
239          if (children.get(i).getName().equals(n.getName())) {
240            children.remove(i);
241            numOfLeaves--;
242            n.setParent(null);
243            return true;
244          }
245        }
246        return false;
247      } else {
248        // find the next ancestor node: the parent node
249        String parentName = getNextAncestorName(n);
250        InnerNode parentNode = null;
251        int i;
252        for(i=0; i<children.size(); i++) {
253          if (children.get(i).getName().equals(parentName)) {
254            parentNode = (InnerNode)children.get(i);
255            break;
256          }
257        }
258        if (parentNode==null) {
259          return false;
260        }
261        // remove n from the parent node
262        boolean isRemoved = parentNode.remove(n);
263        // if the parent node has no children, remove the parent node too
264        if (isRemoved) {
265          if (parentNode.getNumOfChildren() == 0) {
266            children.remove(i);
267          }
268          numOfLeaves--;
269        }
270        return isRemoved;
271      }
272    } // end of remove
273        
274    /** Given a node's string representation, return a reference to the node
275     * @param loc string location of the form /rack/node
276     * @return null if the node is not found or the childnode is there but
277     * not an instance of {@link InnerNode}
278     */
279    private Node getLoc(String loc) {
280      if (loc == null || loc.length() == 0) return this;
281            
282      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
283      Node childnode = null;
284      for(int i=0; i<children.size(); i++) {
285        if (children.get(i).getName().equals(path[0])) {
286          childnode = children.get(i);
287        }
288      }
289      if (childnode == null) return null; // non-existing node
290      if (path.length == 1) return childnode;
291      if (childnode instanceof InnerNode) {
292        return ((InnerNode)childnode).getLoc(path[1]);
293      } else {
294        return null;
295      }
296    }
297        
298    /** get <i>leafIndex</i> leaf of this subtree 
299     * if it is not in the <i>excludedNode</i>
300     *
301     * @param leafIndex an indexed leaf of the node
302     * @param excludedNode an excluded node (can be null)
303     * @return
304     */
305    Node getLeaf(int leafIndex, Node excludedNode) {
306      int count=0;
307      // check if the excluded node a leaf
308      boolean isLeaf =
309        excludedNode == null || !(excludedNode instanceof InnerNode);
310      // calculate the total number of excluded leaf nodes
311      int numOfExcludedLeaves =
312        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
313      if (isLeafParent()) { // children are leaves
314        if (isLeaf) { // excluded node is a leaf node
315          int excludedIndex = children.indexOf(excludedNode);
316          if (excludedIndex != -1 && leafIndex >= 0) {
317            // excluded node is one of the children so adjust the leaf index
318            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
319          }
320        }
321        // range check
322        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
323          return null;
324        }
325        return children.get(leafIndex);
326      } else {
327        for(int i=0; i<children.size(); i++) {
328          InnerNode child = (InnerNode)children.get(i);
329          if (excludedNode == null || excludedNode != child) {
330            // not the excludedNode
331            int numOfLeaves = child.getNumOfLeaves();
332            if (excludedNode != null && child.isAncestor(excludedNode)) {
333              numOfLeaves -= numOfExcludedLeaves;
334            }
335            if (count+numOfLeaves > leafIndex) {
336              // the leaf is in the child subtree
337              return child.getLeaf(leafIndex-count, excludedNode);
338            } else {
339              // go to the next child
340              count = count+numOfLeaves;
341            }
342          } else { // it is the excluededNode
343            // skip it and set the excludedNode to be null
344            excludedNode = null;
345          }
346        }
347        return null;
348      }
349    }
350    
351    protected boolean isLeafParent() {
352      return isRack();
353    }
354
355    /**
356      * Determine if children a leaves, default implementation calls {@link #isRack()}
357      * <p>To be overridden in subclasses for specific InnerNode implementations,
358      * as alternative to overriding the full {@link #getLeaf(int, Node)} method.
359      * 
360      * @return true if children are leaves, false otherwise
361      */
362    protected boolean areChildrenLeaves() {
363      return isRack();
364    }
365
366    /**
367     * Get number of leaves.
368     */
369    int getNumOfLeaves() {
370      return numOfLeaves;
371    }
372  } // end of InnerNode
373
374  /**
375   * the root cluster map
376   */
377  InnerNode clusterMap;
378  /** Depth of all leaf nodes */
379  private int depthOfAllLeaves = -1;
380  /** rack counter */
381  protected int numOfRacks = 0;
382
383  /**
384   * Whether or not this cluster has ever consisted of more than 1 rack,
385   * according to the NetworkTopology.
386   */
387  private boolean clusterEverBeenMultiRack = false;
388
389  /** the lock used to manage access */
390  protected ReadWriteLock netlock = new ReentrantReadWriteLock();
391
392  public NetworkTopology() {
393    clusterMap = new InnerNode(InnerNode.ROOT);
394  }
395
396  /** Add a leaf node
397   * Update node counter & rack counter if necessary
398   * @param node node to be added; can be null
399   * @exception IllegalArgumentException if add a node to a leave 
400                                         or node to be added is not a leaf
401   */
402  public void add(Node node) {
403    if (node==null) return;
404    int newDepth = NodeBase.locationToDepth(node.getNetworkLocation()) + 1;
405    netlock.writeLock().lock();
406    try {
407      if( node instanceof InnerNode ) {
408        throw new IllegalArgumentException(
409          "Not allow to add an inner node: "+NodeBase.getPath(node));
410      }
411      if ((depthOfAllLeaves != -1) && (depthOfAllLeaves != newDepth)) {
412        LOG.error("Error: can't add leaf node " + NodeBase.getPath(node) +
413            " at depth " + newDepth + " to topology:\n" + this.toString());
414        throw new InvalidTopologyException("Failed to add " + NodeBase.getPath(node) +
415            ": You cannot have a rack and a non-rack node at the same " +
416            "level of the network topology.");
417      }
418      Node rack = getNodeForNetworkLocation(node);
419      if (rack != null && !(rack instanceof InnerNode)) {
420        throw new IllegalArgumentException("Unexpected data node " 
421                                           + node.toString() 
422                                           + " at an illegal network location");
423      }
424      if (clusterMap.add(node)) {
425        LOG.info("Adding a new node: "+NodeBase.getPath(node));
426        if (rack == null) {
427          incrementRacks();
428        }
429        if (!(node instanceof InnerNode)) {
430          if (depthOfAllLeaves == -1) {
431            depthOfAllLeaves = node.getLevel();
432          }
433        }
434      }
435      if(LOG.isDebugEnabled()) {
436        LOG.debug("NetworkTopology became:\n" + this.toString());
437      }
438    } finally {
439      netlock.writeLock().unlock();
440    }
441  }
442
443  protected void incrementRacks() {
444    numOfRacks++;
445    if (!clusterEverBeenMultiRack && numOfRacks > 1) {
446      clusterEverBeenMultiRack = true;
447    }
448  }
449
450  /**
451   * Return a reference to the node given its string representation.
452   * Default implementation delegates to {@link #getNode(String)}.
453   * 
454   * <p>To be overridden in subclasses for specific NetworkTopology 
455   * implementations, as alternative to overriding the full {@link #add(Node)}
456   *  method.
457   * 
458   * @param node The string representation of this node's network location is
459   * used to retrieve a Node object. 
460   * @return a reference to the node; null if the node is not in the tree
461   * 
462   * @see #add(Node)
463   * @see #getNode(String)
464   */
465  protected Node getNodeForNetworkLocation(Node node) {
466    return getNode(node.getNetworkLocation());
467  }
468  
469  /**
470   * Given a string representation of a rack, return its children
471   * @param loc a path-like string representation of a rack
472   * @return a newly allocated list with all the node's children
473   */
474  public List<Node> getDatanodesInRack(String loc) {
475    netlock.readLock().lock();
476    try {
477      loc = NodeBase.normalize(loc);
478      if (!NodeBase.ROOT.equals(loc)) {
479        loc = loc.substring(1);
480      }
481      InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
482      if (rack == null) {
483        return null;
484      }
485      return new ArrayList<Node>(rack.getChildren());
486    } finally {
487      netlock.readLock().unlock();
488    }
489  }
490
491  /** Remove a node
492   * Update node counter and rack counter if necessary
493   * @param node node to be removed; can be null
494   */ 
495  public void remove(Node node) {
496    if (node==null) return;
497    if( node instanceof InnerNode ) {
498      throw new IllegalArgumentException(
499        "Not allow to remove an inner node: "+NodeBase.getPath(node));
500    }
501    LOG.info("Removing a node: "+NodeBase.getPath(node));
502    netlock.writeLock().lock();
503    try {
504      if (clusterMap.remove(node)) {
505        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
506        if (rack == null) {
507          numOfRacks--;
508        }
509      }
510      if(LOG.isDebugEnabled()) {
511        LOG.debug("NetworkTopology became:\n" + this.toString());
512      }
513    } finally {
514      netlock.writeLock().unlock();
515    }
516  }
517
518  /** Check if the tree contains node <i>node</i>
519   * 
520   * @param node a node
521   * @return true if <i>node</i> is already in the tree; false otherwise
522   */
523  public boolean contains(Node node) {
524    if (node == null) return false;
525    netlock.readLock().lock();
526    try {
527      Node parent = node.getParent();
528      for (int level = node.getLevel(); parent != null && level > 0;
529           parent = parent.getParent(), level--) {
530        if (parent == clusterMap) {
531          return true;
532        }
533      }
534    } finally {
535      netlock.readLock().unlock();
536    }
537    return false; 
538  }
539    
540  /** Given a string representation of a node, return its reference
541   * 
542   * @param loc
543   *          a path-like string representation of a node
544   * @return a reference to the node; null if the node is not in the tree
545   */
546  public Node getNode(String loc) {
547    netlock.readLock().lock();
548    try {
549      loc = NodeBase.normalize(loc);
550      if (!NodeBase.ROOT.equals(loc))
551        loc = loc.substring(1);
552      return clusterMap.getLoc(loc);
553    } finally {
554      netlock.readLock().unlock();
555    }
556  }
557
558  /**
559   * @return true if this cluster has ever consisted of multiple racks, even if
560   *         it is not now a multi-rack cluster.
561   */
562  public boolean hasClusterEverBeenMultiRack() {
563    return clusterEverBeenMultiRack;
564  }
565
566  /** Given a string representation of a rack for a specific network
567   *  location
568   *
569   * To be overridden in subclasses for specific NetworkTopology 
570   * implementations, as alternative to overriding the full 
571   * {@link #getRack(String)} method.
572   * @param loc
573   *          a path-like string representation of a network location
574   * @return a rack string
575   */
576  public String getRack(String loc) {
577    return loc;
578  }
579  
580  /** @return the total number of racks */
581  public int getNumOfRacks() {
582    netlock.readLock().lock();
583    try {
584      return numOfRacks;
585    } finally {
586      netlock.readLock().unlock();
587    }
588  }
589
590  /** @return the total number of leaf nodes */
591  public int getNumOfLeaves() {
592    netlock.readLock().lock();
593    try {
594      return clusterMap.getNumOfLeaves();
595    } finally {
596      netlock.readLock().unlock();
597    }
598  }
599
600  /** Return the distance between two nodes
601   * It is assumed that the distance from one node to its parent is 1
602   * The distance between two nodes is calculated by summing up their distances
603   * to their closest common ancestor.
604   * @param node1 one node
605   * @param node2 another node
606   * @return the distance between node1 and node2 which is zero if they are the same
607   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
608   */
609  public int getDistance(Node node1, Node node2) {
610    if (node1 == node2) {
611      return 0;
612    }
613    Node n1=node1, n2=node2;
614    int dis = 0;
615    netlock.readLock().lock();
616    try {
617      int level1=node1.getLevel(), level2=node2.getLevel();
618      while(n1!=null && level1>level2) {
619        n1 = n1.getParent();
620        level1--;
621        dis++;
622      }
623      while(n2!=null && level2>level1) {
624        n2 = n2.getParent();
625        level2--;
626        dis++;
627      }
628      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
629        n1=n1.getParent();
630        n2=n2.getParent();
631        dis+=2;
632      }
633    } finally {
634      netlock.readLock().unlock();
635    }
636    if (n1==null) {
637      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
638      return Integer.MAX_VALUE;
639    }
640    if (n2==null) {
641      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
642      return Integer.MAX_VALUE;
643    }
644    return dis+2;
645  }
646
647  /** Check if two nodes are on the same rack
648   * @param node1 one node (can be null)
649   * @param node2 another node (can be null)
650   * @return true if node1 and node2 are on the same rack; false otherwise
651   * @exception IllegalArgumentException when either node1 or node2 is null, or
652   * node1 or node2 do not belong to the cluster
653   */
654  public boolean isOnSameRack( Node node1,  Node node2) {
655    if (node1 == null || node2 == null) {
656      return false;
657    }
658      
659    netlock.readLock().lock();
660    try {
661      return isSameParents(node1, node2);
662    } finally {
663      netlock.readLock().unlock();
664    }
665  }
666  
667  /**
668   * Check if network topology is aware of NodeGroup
669   */
670  public boolean isNodeGroupAware() {
671    return false;
672  }
673  
674  /** 
675   * Return false directly as not aware of NodeGroup, to be override in sub-class
676   */
677  public boolean isOnSameNodeGroup(Node node1, Node node2) {
678    return false;
679  }
680
681  /**
682   * Compare the parents of each node for equality
683   * 
684   * <p>To be overridden in subclasses for specific NetworkTopology 
685   * implementations, as alternative to overriding the full 
686   * {@link #isOnSameRack(Node, Node)} method.
687   * 
688   * @param node1 the first node to compare
689   * @param node2 the second node to compare
690   * @return true if their parents are equal, false otherwise
691   * 
692   * @see #isOnSameRack(Node, Node)
693   */
694  protected boolean isSameParents(Node node1, Node node2) {
695    return node1.getParent()==node2.getParent();
696  }
697
698  private static final Random r = new Random();
699
700  @VisibleForTesting
701  void setRandomSeed(long seed) {
702    r.setSeed(seed);
703  }
704
705  /** randomly choose one node from <i>scope</i>
706   * if scope starts with ~, choose one from the all nodes except for the
707   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
708   * @param scope range of nodes from which a node will be chosen
709   * @return the chosen node
710   */
711  public Node chooseRandom(String scope) {
712    netlock.readLock().lock();
713    try {
714      if (scope.startsWith("~")) {
715        return chooseRandom(NodeBase.ROOT, scope.substring(1));
716      } else {
717        return chooseRandom(scope, null);
718      }
719    } finally {
720      netlock.readLock().unlock();
721    }
722  }
723
724  private Node chooseRandom(String scope, String excludedScope){
725    if (excludedScope != null) {
726      if (scope.startsWith(excludedScope)) {
727        return null;
728      }
729      if (!excludedScope.startsWith(scope)) {
730        excludedScope = null;
731      }
732    }
733    Node node = getNode(scope);
734    if (!(node instanceof InnerNode)) {
735      return node;
736    }
737    InnerNode innerNode = (InnerNode)node;
738    int numOfDatanodes = innerNode.getNumOfLeaves();
739    if (excludedScope == null) {
740      node = null;
741    } else {
742      node = getNode(excludedScope);
743      if (!(node instanceof InnerNode)) {
744        numOfDatanodes -= 1;
745      } else {
746        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
747      }
748    }
749    if (numOfDatanodes == 0) {
750      throw new InvalidTopologyException(
751          "Failed to find datanode (scope=\"" + String.valueOf(scope) +
752          "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
753    }
754    int leaveIndex = r.nextInt(numOfDatanodes);
755    return innerNode.getLeaf(leaveIndex, node);
756  }
757
758  /** return leaves in <i>scope</i>
759   * @param scope a path string
760   * @return leaves nodes under specific scope
761   */
762  public List<Node> getLeaves(String scope) {
763    Node node = getNode(scope);
764    List<Node> leafNodes = new ArrayList<Node>();
765    if (!(node instanceof InnerNode)) {
766      leafNodes.add(node);
767    } else {
768      InnerNode innerNode = (InnerNode) node;
769      for (int i=0;i<innerNode.getNumOfLeaves();i++) {
770        leafNodes.add(innerNode.getLeaf(i, null));
771      }
772    }
773    return leafNodes;
774  }
775
776  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
777   * if scope starts with ~, return the number of nodes that are not
778   * in <i>scope</i> and <i>excludedNodes</i>; 
779   * @param scope a path string that may start with ~
780   * @param excludedNodes a list of nodes
781   * @return number of available nodes
782   */
783  public int countNumOfAvailableNodes(String scope,
784                                      Collection<Node> excludedNodes) {
785    boolean isExcluded=false;
786    if (scope.startsWith("~")) {
787      isExcluded=true;
788      scope=scope.substring(1);
789    }
790    scope = NodeBase.normalize(scope);
791    int excludedCountInScope = 0; // the number of nodes in both scope & excludedNodes
792    int excludedCountOffScope = 0; // the number of nodes outside scope & excludedNodes
793    netlock.readLock().lock();
794    try {
795      for (Node node : excludedNodes) {
796        node = getNode(NodeBase.getPath(node));
797        if (node == null) {
798          continue;
799        }
800        if ((NodeBase.getPath(node) + NodeBase.PATH_SEPARATOR_STR)
801            .startsWith(scope + NodeBase.PATH_SEPARATOR_STR)) {
802          excludedCountInScope++;
803        } else {
804          excludedCountOffScope++;
805        }
806      }
807      Node n = getNode(scope);
808      int scopeNodeCount = 0;
809      if (n != null) {
810        scopeNodeCount++;
811      }
812      if (n instanceof InnerNode) {
813        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
814      }
815      if (isExcluded) {
816        return clusterMap.getNumOfLeaves() - scopeNodeCount
817            - excludedCountOffScope;
818      } else {
819        return scopeNodeCount - excludedCountInScope;
820      }
821    } finally {
822      netlock.readLock().unlock();
823    }
824  }
825
826  /** convert a network tree to a string */
827  @Override
828  public String toString() {
829    // print the number of racks
830    StringBuilder tree = new StringBuilder();
831    tree.append("Number of racks: ");
832    tree.append(numOfRacks);
833    tree.append("\n");
834    // print the number of leaves
835    int numOfLeaves = getNumOfLeaves();
836    tree.append("Expected number of leaves:");
837    tree.append(numOfLeaves);
838    tree.append("\n");
839    // print nodes
840    for(int i=0; i<numOfLeaves; i++) {
841      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
842      tree.append("\n");
843    }
844    return tree.toString();
845  }
846  
847  /**
848   * Divide networklocation string into two parts by last separator, and get 
849   * the first part here.
850   * 
851   * @param networkLocation
852   * @return
853   */
854  public static String getFirstHalf(String networkLocation) {
855    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
856    return networkLocation.substring(0, index);
857  }
858
859  /**
860   * Divide networklocation string into two parts by last separator, and get 
861   * the second part here.
862   * 
863   * @param networkLocation
864   * @return
865   */
866  public static String getLastHalf(String networkLocation) {
867    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
868    return networkLocation.substring(index);
869  }
870
871  /**
872   * Returns an integer weight which specifies how far away {node} is away from
873   * {reader}. A lower value signifies that a node is closer.
874   * 
875   * @param reader Node where data will be read
876   * @param node Replica of data
877   * @return weight
878   */
879  protected int getWeight(Node reader, Node node) {
880    // 0 is local, 1 is same rack, 2 is off rack
881    // Start off by initializing to off rack
882    int weight = 2;
883    if (reader != null) {
884      if (reader.equals(node)) {
885        weight = 0;
886      } else if (isOnSameRack(reader, node)) {
887        weight = 1;
888      }
889    }
890    return weight;
891  }
892
893  /**
894   * Sort nodes array by network distance to <i>reader</i>.
895   * <p/>
896   * In a three-level topology, a node can be either local, on the same rack,
897   * or on a different rack from the reader. Sorting the nodes based on network
898   * distance from the reader reduces network traffic and improves
899   * performance.
900   * <p/>
901   * As an additional twist, we also randomize the nodes at each network
902   * distance. This helps with load balancing when there is data skew.
903   *
904   * @param reader    Node where data will be read
905   * @param nodes     Available replicas with the requested data
906   * @param activeLen Number of active nodes at the front of the array
907   */
908  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
909    /** Sort weights for the nodes array */
910    int[] weights = new int[activeLen];
911    for (int i=0; i<activeLen; i++) {
912      weights[i] = getWeight(reader, nodes[i]);
913    }
914    // Add weight/node pairs to a TreeMap to sort
915    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
916    for (int i=0; i<activeLen; i++) {
917      int weight = weights[i];
918      Node node = nodes[i];
919      List<Node> list = tree.get(weight);
920      if (list == null) {
921        list = Lists.newArrayListWithExpectedSize(1);
922        tree.put(weight, list);
923      }
924      list.add(node);
925    }
926
927    int idx = 0;
928    for (List<Node> list: tree.values()) {
929      if (list != null) {
930        Collections.shuffle(list, r);
931        for (Node n: list) {
932          nodes[idx] = n;
933          idx++;
934        }
935      }
936    }
937    Preconditions.checkState(idx == activeLen,
938        "Sorted the wrong number of nodes!");
939  }
940}