001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.net;
019
020import java.util.ArrayList;
021import java.util.List;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.List;
025import java.util.Random;
026import java.util.TreeMap;
027import java.util.concurrent.locks.ReadWriteLock;
028import java.util.concurrent.locks.ReentrantReadWriteLock;
029
030import com.google.common.annotations.VisibleForTesting;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033import org.apache.hadoop.classification.InterfaceAudience;
034import org.apache.hadoop.classification.InterfaceStability;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
037import org.apache.hadoop.util.ReflectionUtils;
038
039import com.google.common.base.Preconditions;
040import com.google.common.collect.Lists;
041
042/** The class represents a cluster of computer with a tree hierarchical
043 * network topology.
044 * For example, a cluster may be consists of many data centers filled 
045 * with racks of computers.
046 * In a network topology, leaves represent data nodes (computers) and inner
047 * nodes represent switches/routers that manage traffic in/out of data centers
048 * or racks.  
049 * 
050 */
051@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
052@InterfaceStability.Unstable
053public class NetworkTopology {
054  public final static String DEFAULT_RACK = "/default-rack";
055  public final static int DEFAULT_HOST_LEVEL = 2;
056  public static final Log LOG = 
057    LogFactory.getLog(NetworkTopology.class);
058    
059  public static class InvalidTopologyException extends RuntimeException {
060    private static final long serialVersionUID = 1L;
061    public InvalidTopologyException(String msg) {
062      super(msg);
063    }
064  }
065  
066  /**
067   * Get an instance of NetworkTopology based on the value of the configuration
068   * parameter net.topology.impl.
069   * 
070   * @param conf the configuration to be used
071   * @return an instance of NetworkTopology
072   */
073  public static NetworkTopology getInstance(Configuration conf){
074    return ReflectionUtils.newInstance(
075        conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
076        NetworkTopology.class, NetworkTopology.class), conf);
077  }
078
079  /** InnerNode represents a switch/router of a data center or rack.
080   * Different from a leaf node, it has non-null children.
081   */
082  static class InnerNode extends NodeBase {
083    protected List<Node> children=new ArrayList<Node>();
084    private int numOfLeaves;
085        
086    /** Construct an InnerNode from a path-like string */
087    InnerNode(String path) {
088      super(path);
089    }
090        
091    /** Construct an InnerNode from its name and its network location */
092    InnerNode(String name, String location) {
093      super(name, location);
094    }
095        
096    /** Construct an InnerNode
097     * from its name, its network location, its parent, and its level */
098    InnerNode(String name, String location, InnerNode parent, int level) {
099      super(name, location, parent, level);
100    }
101        
102    /** @return its children */
103    List<Node> getChildren() {return children;}
104        
105    /** @return the number of children this node has */
106    int getNumOfChildren() {
107      return children.size();
108    }
109        
110    /** Judge if this node represents a rack 
111     * @return true if it has no child or its children are not InnerNodes
112     */ 
113    boolean isRack() {
114      if (children.isEmpty()) {
115        return true;
116      }
117            
118      Node firstChild = children.get(0);
119      if (firstChild instanceof InnerNode) {
120        return false;
121      }
122            
123      return true;
124    }
125        
126    /** Judge if this node is an ancestor of node <i>n</i>
127     * 
128     * @param n a node
129     * @return true if this node is an ancestor of <i>n</i>
130     */
131    boolean isAncestor(Node n) {
132      return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
133        (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
134        startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
135    }
136        
137    /** Judge if this node is the parent of node <i>n</i>
138     * 
139     * @param n a node
140     * @return true if this node is the parent of <i>n</i>
141     */
142    boolean isParent(Node n) {
143      return n.getNetworkLocation().equals(getPath(this));
144    }
145        
146    /* Return a child name of this node who is an ancestor of node <i>n</i> */
147    private String getNextAncestorName(Node n) {
148      if (!isAncestor(n)) {
149        throw new IllegalArgumentException(
150                                           this + "is not an ancestor of " + n);
151      }
152      String name = n.getNetworkLocation().substring(getPath(this).length());
153      if (name.charAt(0) == PATH_SEPARATOR) {
154        name = name.substring(1);
155      }
156      int index=name.indexOf(PATH_SEPARATOR);
157      if (index !=-1)
158        name = name.substring(0, index);
159      return name;
160    }
161        
162    /** Add node <i>n</i> to the subtree of this node 
163     * @param n node to be added
164     * @return true if the node is added; false otherwise
165     */
166    boolean add(Node n) {
167      if (!isAncestor(n))
168        throw new IllegalArgumentException(n.getName()+", which is located at "
169                +n.getNetworkLocation()+", is not a decendent of "
170                +getPath(this));
171      if (isParent(n)) {
172        // this node is the parent of n; add n directly
173        n.setParent(this);
174        n.setLevel(this.level+1);
175        for(int i=0; i<children.size(); i++) {
176          if (children.get(i).getName().equals(n.getName())) {
177            children.set(i, n);
178            return false;
179          }
180        }
181        children.add(n);
182        numOfLeaves++;
183        return true;
184      } else {
185        // find the next ancestor node
186        String parentName = getNextAncestorName(n);
187        InnerNode parentNode = null;
188        for(int i=0; i<children.size(); i++) {
189          if (children.get(i).getName().equals(parentName)) {
190            parentNode = (InnerNode)children.get(i);
191            break;
192          }
193        }
194        if (parentNode == null) {
195          // create a new InnerNode
196          parentNode = createParentNode(parentName);
197          children.add(parentNode);
198        }
199        // add n to the subtree of the next ancestor node
200        if (parentNode.add(n)) {
201          numOfLeaves++;
202          return true;
203        } else {
204          return false;
205        }
206      }
207    }
208
209    /**
210     * Creates a parent node to be added to the list of children.  
211     * Creates a node using the InnerNode four argument constructor specifying 
212     * the name, location, parent, and level of this node.
213     * 
214     * <p>To be overridden in subclasses for specific InnerNode implementations,
215     * as alternative to overriding the full {@link #add(Node)} method.
216     * 
217     * @param parentName The name of the parent node
218     * @return A new inner node
219     * @see InnerNode#InnerNode(String, String, InnerNode, int)
220     */
221    protected InnerNode createParentNode(String parentName) {
222      return new InnerNode(parentName, getPath(this), this, this.getLevel()+1);
223    }
224
225    /** Remove node <i>n</i> from the subtree of this node
226     * @param n node to be deleted 
227     * @return true if the node is deleted; false otherwise
228     */
229    boolean remove(Node n) {
230      String parent = n.getNetworkLocation();
231      String currentPath = getPath(this);
232      if (!isAncestor(n))
233        throw new IllegalArgumentException(n.getName()
234                                           +", which is located at "
235                                           +parent+", is not a descendent of "+currentPath);
236      if (isParent(n)) {
237        // this node is the parent of n; remove n directly
238        for(int i=0; i<children.size(); i++) {
239          if (children.get(i).getName().equals(n.getName())) {
240            children.remove(i);
241            numOfLeaves--;
242            n.setParent(null);
243            return true;
244          }
245        }
246        return false;
247      } else {
248        // find the next ancestor node: the parent node
249        String parentName = getNextAncestorName(n);
250        InnerNode parentNode = null;
251        int i;
252        for(i=0; i<children.size(); i++) {
253          if (children.get(i).getName().equals(parentName)) {
254            parentNode = (InnerNode)children.get(i);
255            break;
256          }
257        }
258        if (parentNode==null) {
259          return false;
260        }
261        // remove n from the parent node
262        boolean isRemoved = parentNode.remove(n);
263        // if the parent node has no children, remove the parent node too
264        if (isRemoved) {
265          if (parentNode.getNumOfChildren() == 0) {
266            children.remove(i);
267          }
268          numOfLeaves--;
269        }
270        return isRemoved;
271      }
272    } // end of remove
273        
274    /** Given a node's string representation, return a reference to the node
275     * @param loc string location of the form /rack/node
276     * @return null if the node is not found or the childnode is there but
277     * not an instance of {@link InnerNode}
278     */
279    private Node getLoc(String loc) {
280      if (loc == null || loc.length() == 0) return this;
281            
282      String[] path = loc.split(PATH_SEPARATOR_STR, 2);
283      Node childnode = null;
284      for(int i=0; i<children.size(); i++) {
285        if (children.get(i).getName().equals(path[0])) {
286          childnode = children.get(i);
287        }
288      }
289      if (childnode == null) return null; // non-existing node
290      if (path.length == 1) return childnode;
291      if (childnode instanceof InnerNode) {
292        return ((InnerNode)childnode).getLoc(path[1]);
293      } else {
294        return null;
295      }
296    }
297        
298    /** get <i>leafIndex</i> leaf of this subtree 
299     * if it is not in the <i>excludedNode</i>
300     *
301     * @param leafIndex an indexed leaf of the node
302     * @param excludedNode an excluded node (can be null)
303     * @return
304     */
305    Node getLeaf(int leafIndex, Node excludedNode) {
306      int count=0;
307      // check if the excluded node a leaf
308      boolean isLeaf =
309        excludedNode == null || !(excludedNode instanceof InnerNode);
310      // calculate the total number of excluded leaf nodes
311      int numOfExcludedLeaves =
312        isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
313      if (isLeafParent()) { // children are leaves
314        if (isLeaf) { // excluded node is a leaf node
315          int excludedIndex = children.indexOf(excludedNode);
316          if (excludedIndex != -1 && leafIndex >= 0) {
317            // excluded node is one of the children so adjust the leaf index
318            leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
319          }
320        }
321        // range check
322        if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
323          return null;
324        }
325        return children.get(leafIndex);
326      } else {
327        for(int i=0; i<children.size(); i++) {
328          InnerNode child = (InnerNode)children.get(i);
329          if (excludedNode == null || excludedNode != child) {
330            // not the excludedNode
331            int numOfLeaves = child.getNumOfLeaves();
332            if (excludedNode != null && child.isAncestor(excludedNode)) {
333              numOfLeaves -= numOfExcludedLeaves;
334            }
335            if (count+numOfLeaves > leafIndex) {
336              // the leaf is in the child subtree
337              return child.getLeaf(leafIndex-count, excludedNode);
338            } else {
339              // go to the next child
340              count = count+numOfLeaves;
341            }
342          } else { // it is the excluededNode
343            // skip it and set the excludedNode to be null
344            excludedNode = null;
345          }
346        }
347        return null;
348      }
349    }
350    
351    protected boolean isLeafParent() {
352      return isRack();
353    }
354
355    /**
356      * Determine if children a leaves, default implementation calls {@link #isRack()}
357      * <p>To be overridden in subclasses for specific InnerNode implementations,
358      * as alternative to overriding the full {@link #getLeaf(int, Node)} method.
359      * 
360      * @return true if children are leaves, false otherwise
361      */
362    protected boolean areChildrenLeaves() {
363      return isRack();
364    }
365
366    /**
367     * Get number of leaves.
368     */
369    int getNumOfLeaves() {
370      return numOfLeaves;
371    }
372  } // end of InnerNode
373
374  /**
375   * the root cluster map
376   */
377  InnerNode clusterMap;
378  /** Depth of all leaf nodes */
379  private int depthOfAllLeaves = -1;
380  /** rack counter */
381  protected int numOfRacks = 0;
382  /** the lock used to manage access */
383  protected ReadWriteLock netlock = new ReentrantReadWriteLock();
384
385  public NetworkTopology() {
386    clusterMap = new InnerNode(InnerNode.ROOT);
387  }
388
389  /** Add a leaf node
390   * Update node counter & rack counter if necessary
391   * @param node node to be added; can be null
392   * @exception IllegalArgumentException if add a node to a leave 
393                                         or node to be added is not a leaf
394   */
395  public void add(Node node) {
396    if (node==null) return;
397    String oldTopoStr = this.toString();
398    if( node instanceof InnerNode ) {
399      throw new IllegalArgumentException(
400        "Not allow to add an inner node: "+NodeBase.getPath(node));
401    }
402    int newDepth = NodeBase.locationToDepth(node.getNetworkLocation()) + 1;
403    netlock.writeLock().lock();
404    try {
405      if ((depthOfAllLeaves != -1) && (depthOfAllLeaves != newDepth)) {
406        LOG.error("Error: can't add leaf node " + NodeBase.getPath(node) +
407            " at depth " + newDepth + " to topology:\n" + oldTopoStr);
408        throw new InvalidTopologyException("Failed to add " + NodeBase.getPath(node) +
409            ": You cannot have a rack and a non-rack node at the same " +
410            "level of the network topology.");
411      }
412      Node rack = getNodeForNetworkLocation(node);
413      if (rack != null && !(rack instanceof InnerNode)) {
414        throw new IllegalArgumentException("Unexpected data node " 
415                                           + node.toString() 
416                                           + " at an illegal network location");
417      }
418      if (clusterMap.add(node)) {
419        LOG.info("Adding a new node: "+NodeBase.getPath(node));
420        if (rack == null) {
421          numOfRacks++;
422        }
423        if (!(node instanceof InnerNode)) {
424          if (depthOfAllLeaves == -1) {
425            depthOfAllLeaves = node.getLevel();
426          }
427        }
428      }
429      if(LOG.isDebugEnabled()) {
430        LOG.debug("NetworkTopology became:\n" + this.toString());
431      }
432    } finally {
433      netlock.writeLock().unlock();
434    }
435  }
436  
437  /**
438   * Return a reference to the node given its string representation.
439   * Default implementation delegates to {@link #getNode(String)}.
440   * 
441   * <p>To be overridden in subclasses for specific NetworkTopology 
442   * implementations, as alternative to overriding the full {@link #add(Node)}
443   *  method.
444   * 
445   * @param node The string representation of this node's network location is
446   * used to retrieve a Node object. 
447   * @return a reference to the node; null if the node is not in the tree
448   * 
449   * @see #add(Node)
450   * @see #getNode(String)
451   */
452  protected Node getNodeForNetworkLocation(Node node) {
453    return getNode(node.getNetworkLocation());
454  }
455  
456  /**
457   * Given a string representation of a rack, return its children
458   * @param loc a path-like string representation of a rack
459   * @return a newly allocated list with all the node's children
460   */
461  public List<Node> getDatanodesInRack(String loc) {
462    netlock.readLock().lock();
463    try {
464      loc = NodeBase.normalize(loc);
465      if (!NodeBase.ROOT.equals(loc)) {
466        loc = loc.substring(1);
467      }
468      InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
469      if (rack == null) {
470        return null;
471      }
472      return new ArrayList<Node>(rack.getChildren());
473    } finally {
474      netlock.readLock().unlock();
475    }
476  }
477
478  /** Remove a node
479   * Update node counter and rack counter if necessary
480   * @param node node to be removed; can be null
481   */ 
482  public void remove(Node node) {
483    if (node==null) return;
484    if( node instanceof InnerNode ) {
485      throw new IllegalArgumentException(
486        "Not allow to remove an inner node: "+NodeBase.getPath(node));
487    }
488    LOG.info("Removing a node: "+NodeBase.getPath(node));
489    netlock.writeLock().lock();
490    try {
491      if (clusterMap.remove(node)) {
492        InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
493        if (rack == null) {
494          numOfRacks--;
495        }
496      }
497      if(LOG.isDebugEnabled()) {
498        LOG.debug("NetworkTopology became:\n" + this.toString());
499      }
500    } finally {
501      netlock.writeLock().unlock();
502    }
503  }
504
505  /** Check if the tree contains node <i>node</i>
506   * 
507   * @param node a node
508   * @return true if <i>node</i> is already in the tree; false otherwise
509   */
510  public boolean contains(Node node) {
511    if (node == null) return false;
512    netlock.readLock().lock();
513    try {
514      Node parent = node.getParent();
515      for (int level = node.getLevel(); parent != null && level > 0;
516           parent = parent.getParent(), level--) {
517        if (parent == clusterMap) {
518          return true;
519        }
520      }
521    } finally {
522      netlock.readLock().unlock();
523    }
524    return false; 
525  }
526    
527  /** Given a string representation of a node, return its reference
528   * 
529   * @param loc
530   *          a path-like string representation of a node
531   * @return a reference to the node; null if the node is not in the tree
532   */
533  public Node getNode(String loc) {
534    netlock.readLock().lock();
535    try {
536      loc = NodeBase.normalize(loc);
537      if (!NodeBase.ROOT.equals(loc))
538        loc = loc.substring(1);
539      return clusterMap.getLoc(loc);
540    } finally {
541      netlock.readLock().unlock();
542    }
543  }
544  
545  /** Given a string representation of a rack for a specific network
546   *  location
547   * 
548   * To be overridden in subclasses for specific NetworkTopology 
549   * implementations, as alternative to overriding the full 
550   * {@link #getRack(String)} method.
551   * @param loc
552   *          a path-like string representation of a network location
553   * @return a rack string
554   */
555  public String getRack(String loc) {
556    return loc;
557  }
558  
559  /** @return the total number of racks */
560  public int getNumOfRacks() {
561    netlock.readLock().lock();
562    try {
563      return numOfRacks;
564    } finally {
565      netlock.readLock().unlock();
566    }
567  }
568
569  /** @return the total number of leaf nodes */
570  public int getNumOfLeaves() {
571    netlock.readLock().lock();
572    try {
573      return clusterMap.getNumOfLeaves();
574    } finally {
575      netlock.readLock().unlock();
576    }
577  }
578
579  /** Return the distance between two nodes
580   * It is assumed that the distance from one node to its parent is 1
581   * The distance between two nodes is calculated by summing up their distances
582   * to their closest common ancestor.
583   * @param node1 one node
584   * @param node2 another node
585   * @return the distance between node1 and node2 which is zero if they are the same
586   *  or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
587   */
588  public int getDistance(Node node1, Node node2) {
589    if (node1 == node2) {
590      return 0;
591    }
592    Node n1=node1, n2=node2;
593    int dis = 0;
594    netlock.readLock().lock();
595    try {
596      int level1=node1.getLevel(), level2=node2.getLevel();
597      while(n1!=null && level1>level2) {
598        n1 = n1.getParent();
599        level1--;
600        dis++;
601      }
602      while(n2!=null && level2>level1) {
603        n2 = n2.getParent();
604        level2--;
605        dis++;
606      }
607      while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
608        n1=n1.getParent();
609        n2=n2.getParent();
610        dis+=2;
611      }
612    } finally {
613      netlock.readLock().unlock();
614    }
615    if (n1==null) {
616      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
617      return Integer.MAX_VALUE;
618    }
619    if (n2==null) {
620      LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
621      return Integer.MAX_VALUE;
622    }
623    return dis+2;
624  }
625
626  /** Check if two nodes are on the same rack
627   * @param node1 one node (can be null)
628   * @param node2 another node (can be null)
629   * @return true if node1 and node2 are on the same rack; false otherwise
630   * @exception IllegalArgumentException when either node1 or node2 is null, or
631   * node1 or node2 do not belong to the cluster
632   */
633  public boolean isOnSameRack( Node node1,  Node node2) {
634    if (node1 == null || node2 == null) {
635      return false;
636    }
637      
638    netlock.readLock().lock();
639    try {
640      return isSameParents(node1, node2);
641    } finally {
642      netlock.readLock().unlock();
643    }
644  }
645  
646  /**
647   * Check if network topology is aware of NodeGroup
648   */
649  public boolean isNodeGroupAware() {
650    return false;
651  }
652  
653  /** 
654   * Return false directly as not aware of NodeGroup, to be override in sub-class
655   */
656  public boolean isOnSameNodeGroup(Node node1, Node node2) {
657    return false;
658  }
659
660  /**
661   * Compare the parents of each node for equality
662   * 
663   * <p>To be overridden in subclasses for specific NetworkTopology 
664   * implementations, as alternative to overriding the full 
665   * {@link #isOnSameRack(Node, Node)} method.
666   * 
667   * @param node1 the first node to compare
668   * @param node2 the second node to compare
669   * @return true if their parents are equal, false otherwise
670   * 
671   * @see #isOnSameRack(Node, Node)
672   */
673  protected boolean isSameParents(Node node1, Node node2) {
674    return node1.getParent()==node2.getParent();
675  }
676
677  private static final Random r = new Random();
678
679  @VisibleForTesting
680  void setRandomSeed(long seed) {
681    r.setSeed(seed);
682  }
683
684  /** randomly choose one node from <i>scope</i>
685   * if scope starts with ~, choose one from the all nodes except for the
686   * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
687   * @param scope range of nodes from which a node will be chosen
688   * @return the chosen node
689   */
690  public Node chooseRandom(String scope) {
691    netlock.readLock().lock();
692    try {
693      if (scope.startsWith("~")) {
694        return chooseRandom(NodeBase.ROOT, scope.substring(1));
695      } else {
696        return chooseRandom(scope, null);
697      }
698    } finally {
699      netlock.readLock().unlock();
700    }
701  }
702
703  private Node chooseRandom(String scope, String excludedScope){
704    if (excludedScope != null) {
705      if (scope.startsWith(excludedScope)) {
706        return null;
707      }
708      if (!excludedScope.startsWith(scope)) {
709        excludedScope = null;
710      }
711    }
712    Node node = getNode(scope);
713    if (!(node instanceof InnerNode)) {
714      return node;
715    }
716    InnerNode innerNode = (InnerNode)node;
717    int numOfDatanodes = innerNode.getNumOfLeaves();
718    if (excludedScope == null) {
719      node = null;
720    } else {
721      node = getNode(excludedScope);
722      if (!(node instanceof InnerNode)) {
723        numOfDatanodes -= 1;
724      } else {
725        numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
726      }
727    }
728    if (numOfDatanodes == 0) {
729      throw new InvalidTopologyException(
730          "Failed to find datanode (scope=\"" + String.valueOf(scope) +
731          "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
732    }
733    int leaveIndex = r.nextInt(numOfDatanodes);
734    return innerNode.getLeaf(leaveIndex, node);
735  }
736
737  /** return leaves in <i>scope</i>
738   * @param scope a path string
739   * @return leaves nodes under specific scope
740   */
741  public List<Node> getLeaves(String scope) {
742    Node node = getNode(scope);
743    List<Node> leafNodes = new ArrayList<Node>();
744    if (!(node instanceof InnerNode)) {
745      leafNodes.add(node);
746    } else {
747      InnerNode innerNode = (InnerNode) node;
748      for (int i=0;i<innerNode.getNumOfLeaves();i++) {
749        leafNodes.add(innerNode.getLeaf(i, null));
750      }
751    }
752    return leafNodes;
753  }
754
755  /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
756   * if scope starts with ~, return the number of nodes that are not
757   * in <i>scope</i> and <i>excludedNodes</i>; 
758   * @param scope a path string that may start with ~
759   * @param excludedNodes a list of nodes
760   * @return number of available nodes
761   */
762  public int countNumOfAvailableNodes(String scope,
763                                      Collection<Node> excludedNodes) {
764    boolean isExcluded=false;
765    if (scope.startsWith("~")) {
766      isExcluded=true;
767      scope=scope.substring(1);
768    }
769    scope = NodeBase.normalize(scope);
770    int excludedCountInScope = 0; // the number of nodes in both scope & excludedNodes
771    int excludedCountOffScope = 0; // the number of nodes outside scope & excludedNodes
772    netlock.readLock().lock();
773    try {
774      for (Node node : excludedNodes) {
775        node = getNode(NodeBase.getPath(node));
776        if (node == null) {
777          continue;
778        }
779        if ((NodeBase.getPath(node) + NodeBase.PATH_SEPARATOR_STR)
780            .startsWith(scope + NodeBase.PATH_SEPARATOR_STR)) {
781          excludedCountInScope++;
782        } else {
783          excludedCountOffScope++;
784        }
785      }
786      Node n = getNode(scope);
787      int scopeNodeCount = 0;
788      if (n != null) {
789        scopeNodeCount++;
790      }
791      if (n instanceof InnerNode) {
792        scopeNodeCount=((InnerNode)n).getNumOfLeaves();
793      }
794      if (isExcluded) {
795        return clusterMap.getNumOfLeaves() - scopeNodeCount
796            - excludedCountOffScope;
797      } else {
798        return scopeNodeCount - excludedCountInScope;
799      }
800    } finally {
801      netlock.readLock().unlock();
802    }
803  }
804
805  /** convert a network tree to a string */
806  @Override
807  public String toString() {
808    // print the number of racks
809    StringBuilder tree = new StringBuilder();
810    tree.append("Number of racks: ");
811    tree.append(numOfRacks);
812    tree.append("\n");
813    // print the number of leaves
814    int numOfLeaves = getNumOfLeaves();
815    tree.append("Expected number of leaves:");
816    tree.append(numOfLeaves);
817    tree.append("\n");
818    // print nodes
819    for(int i=0; i<numOfLeaves; i++) {
820      tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
821      tree.append("\n");
822    }
823    return tree.toString();
824  }
825  
826  /**
827   * Divide networklocation string into two parts by last separator, and get 
828   * the first part here.
829   * 
830   * @param networkLocation
831   * @return
832   */
833  public static String getFirstHalf(String networkLocation) {
834    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
835    return networkLocation.substring(0, index);
836  }
837
838  /**
839   * Divide networklocation string into two parts by last separator, and get 
840   * the second part here.
841   * 
842   * @param networkLocation
843   * @return
844   */
845  public static String getLastHalf(String networkLocation) {
846    int index = networkLocation.lastIndexOf(NodeBase.PATH_SEPARATOR_STR);
847    return networkLocation.substring(index);
848  }
849
850  /**
851   * Returns an integer weight which specifies how far away {node} is away from
852   * {reader}. A lower value signifies that a node is closer.
853   * 
854   * @param reader Node where data will be read
855   * @param node Replica of data
856   * @return weight
857   */
858  protected int getWeight(Node reader, Node node) {
859    // 0 is local, 1 is same rack, 2 is off rack
860    // Start off by initializing to off rack
861    int weight = 2;
862    if (reader != null) {
863      if (reader == node) {
864        weight = 0;
865      } else if (isOnSameRack(reader, node)) {
866        weight = 1;
867      }
868    }
869    return weight;
870  }
871
872  /**
873   * Sort nodes array by network distance to <i>reader</i>.
874   * <p/>
875   * In a three-level topology, a node can be either local, on the same rack,
876   * or on a different rack from the reader. Sorting the nodes based on network
877   * distance from the reader reduces network traffic and improves
878   * performance.
879   * <p/>
880   * As an additional twist, we also randomize the nodes at each network
881   * distance. This helps with load balancing when there is data skew.
882   *
883   * @param reader    Node where data will be read
884   * @param nodes     Available replicas with the requested data
885   * @param activeLen Number of active nodes at the front of the array
886   */
887  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
888    /** Sort weights for the nodes array */
889    int[] weights = new int[activeLen];
890    for (int i=0; i<activeLen; i++) {
891      weights[i] = getWeight(reader, nodes[i]);
892    }
893    // Add weight/node pairs to a TreeMap to sort
894    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
895    for (int i=0; i<activeLen; i++) {
896      int weight = weights[i];
897      Node node = nodes[i];
898      List<Node> list = tree.get(weight);
899      if (list == null) {
900        list = Lists.newArrayListWithExpectedSize(1);
901        tree.put(weight, list);
902      }
903      list.add(node);
904    }
905
906    int idx = 0;
907    for (List<Node> list: tree.values()) {
908      if (list != null) {
909        Collections.shuffle(list, r);
910        for (Node n: list) {
911          nodes[idx] = n;
912          idx++;
913        }
914      }
915    }
916    Preconditions.checkState(idx == activeLen,
917        "Sorted the wrong number of nodes!");
918  }
919}