001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hdfs.util;
020
021import org.apache.hadoop.classification.InterfaceAudience;
022import org.apache.hadoop.classification.InterfaceStability;
023import org.xml.sax.ContentHandler;
024import org.xml.sax.SAXException;
025import org.xml.sax.helpers.AttributesImpl;
026
027import java.util.LinkedList;
028import java.util.List;
029import java.util.Map;
030import java.util.TreeMap;
031
032/**
033 * General xml utilities.
034 *   
035 */
036@InterfaceAudience.Private
037@InterfaceStability.Unstable
public class XMLUtils {
  /**
   * Exception that reflects an invalid XML document.
   */
  public static class InvalidXmlException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param s       description of what is wrong with the document
     */
    public InvalidXmlException(String s) {
      super(s);
    }
  }

  /**
   * Exception that reflects a string that cannot be unmangled.
   */
  public static class UnmanglingError extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param str     description of the problem
     * @param e       the underlying cause
     */
    public UnmanglingError(String str, Exception e) {
      super(str, e);
    }

    /**
     * @param str     description of the problem
     */
    public UnmanglingError(String str) {
      super(str);
    }
  }

  /** Number of hexadecimal digits that follow the backslash in an escape. */
  private static final int NUM_SLASH_POSITIONS = 4;

  /** Format of a mangled code point: backslash, zero-padded hex, semicolon. */
  private static final String MANGLE_FORMAT =
      "\\%0" + NUM_SLASH_POSITIONS + "x;";

  /**
   * Given a code point, determine if it should be mangled before being
   * represented in an XML document.
   *
   * Any code point that isn't valid in XML must be mangled.
   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
   * quick reference, or the w3 standard for the authoritative reference.
   *
   * @param cp      The code point
   * @return        True if the code point should be mangled
   */
  private static boolean codePointMustBeMangled(int cp) {
    if (cp < 0x20) {
      // Of the control characters, only tab, line feed and carriage return
      // may appear in XML.
      return (cp != 0x9) && (cp != 0xa) && (cp != 0xd);
    }
    if ((cp >= 0xd800) && (cp <= 0xdfff)) {
      // Surrogate values; these only reach us as code points when unpaired.
      return true;
    }
    if ((cp == 0xfffe) || (cp == 0xffff)) {
      return true;
    }
    // We mangle backslash to simplify decoding... it's easier if
    // backslashes always begin mangled sequences.
    return cp == '\\';
  }

  /**
   * Produce the escape sequence for a single code point.
   *
   * @param cp      The code point to escape
   * @return        The escape sequence, e.g. "\005c;" for a backslash
   */
  private static String mangleCodePoint(int cp) {
    return String.format(MANGLE_FORMAT, cp);
  }

  /**
   * Mangle a string so that it can be represented in an XML document.
   *
   * There are three kinds of code points in XML:
   * - Those that can be represented normally,
   * - Those that have to be escaped (for example, {@code &} must be
   *     represented as {@code &amp;})
   * - Those that cannot be represented at all in XML.
   *
   * The built-in SAX functions will handle the first two types for us just
   * fine.  However, sometimes we come across a code point of the third type.
   * In this case, we have to mangle the string in order to represent it at
   * all.  We also mangle backslash to avoid confusing a backslash in the
   * string with part of our escape sequence.
   *
   * The encoding used here is as follows: an illegal code point is
   * represented as '\ABCD;', where ABCD is the hexadecimal value of
   * the code point.
   *
   * @param str     The input string.
   *
   * @return        The mangled string.
   */
  public static String mangleXmlString(String str) {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int offset = 0;
    while (offset < length) {
      // Walk by code point so that surrogate pairs are kept together.
      final int cp = str.codePointAt(offset);
      if (codePointMustBeMangled(cp)) {
        bld.append(mangleCodePoint(cp));
      } else {
        bld.appendCodePoint(cp);
      }
      offset += Character.charCount(cp);
    }
    return bld.toString();
  }

  /**
   * Demangle a string from an XML document.
   * See {@link #mangleXmlString(String)} for a description of the mangling
   * format.
   *
   * Every backslash must be followed by exactly four hexadecimal digits
   * and a semicolon; anything else is reported as malformed.
   *
   * @param str    The string to be demangled.
   *
   * @return       The unmangled string
   * @throws       UnmanglingError if the input is malformed.
   */
  public static String unmangleXmlString(String str)
        throws UnmanglingError {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int pos = 0;
    while (pos < length) {
      final char ch = str.charAt(pos);
      if (ch != '\\') {
        bld.append(ch);
        pos++;
        continue;
      }
      final int digitsStart = pos + 1;
      final int semicolonAt = digitsStart + NUM_SLASH_POSITIONS;
      if (semicolonAt >= length) {
        throw new UnmanglingError("unterminated code point escape: string " +
            "broke off in the middle");
      }
      // The terminator is checked before the digits are examined.
      if (str.charAt(semicolonAt) != ';') {
        throw new UnmanglingError("unterminated code point escape: " +
            "expected semicolon at end.");
      }
      bld.appendCodePoint(parseEscapedCodePoint(str, digitsStart));
      pos = semicolonAt + 1;
    }
    return bld.toString();
  }

  /**
   * Parse the hexadecimal digits of one escape sequence.
   *
   * Each character is checked individually rather than handing the text to
   * Integer.parseInt, because parseInt accepts a leading sign: "-001" would
   * yield a negative value that is not a code point, and "+041" would be
   * silently accepted even though the mangler never produces it.
   *
   * @param str     The mangled string
   * @param start   Index of the first digit of the escape
   * @return        The decoded code point
   * @throws        UnmanglingError if any of the characters is not a
   *                hexadecimal digit
   */
  private static int parseEscapedCodePoint(String str, int start) {
    int cp = 0;
    for (int k = 0; k < NUM_SLASH_POSITIONS; k++) {
      final int digit = Character.digit(str.charAt(start + k), 16);
      if (digit < 0) {
        throw new UnmanglingError("error parsing unmangling escape code");
      }
      cp = (cp << 4) | digit;
    }
    return cp;
  }

  /**
   * Add a SAX tag with a string inside.
   *
   * Emits a start-element event with no attributes, the mangled form of
   * the string as character data, and the matching end-element event.
   *
   * @param contentHandler     the SAX content handler
   * @param tag                the element tag to use
   * @param val                the string to put inside the tag
   * @throws SAXException      if the content handler reports an error
   */
  public static void addSaxString(ContentHandler contentHandler,
      String tag, String val) throws SAXException {
    final char[] mangled = mangleXmlString(val).toCharArray();
    contentHandler.startElement("", "", tag, new AttributesImpl());
    contentHandler.characters(mangled, 0, mangled.length);
    contentHandler.endElement("", "", tag);
  }

  /**
   * Represents a bag of key-value pairs encountered during parsing an XML
   * file.
   */
  public static class Stanza {
    /** Child stanzas grouped by name; names iterate in sorted order. */
    private final TreeMap<String, LinkedList<Stanza>> subtrees;

    /** The unmangled value of this stanza. */
    private String value;

    /** Create a stanza with no children and an empty value. */
    public Stanza() {
      subtrees = new TreeMap<String, LinkedList<Stanza>>();
      value = "";
    }

    /**
     * Set the value of this stanza.
     *
     * @param value       the new value
     */
    public void setValue(String value) {
      this.value = value;
    }

    /**
     * @return            the value of this stanza
     */
    public String getValue() {
      return this.value;
    }

    /**
     * Discover if a stanza has a given entry.
     *
     * @param name        entry to look for
     *
     * @return            true if the entry was found
     */
    public boolean hasChildren(String name) {
      return subtrees.containsKey(name);
    }

    /**
     * Pull an entry from a stanza.
     *
     * The returned list is the stanza's own list, not a copy.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException  if there is no entry with that name
     */
    public List<Stanza> getChildren(String name) throws InvalidXmlException {
      final LinkedList<Stanza> children = subtrees.get(name);
      if (children == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return children;
    }

    /**
     * Pull a string entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException  if no value is available for that name,
     *                              or if the name has more than one entry
     */
    public String getValue(String name) throws InvalidXmlException {
      final String ret = getValueOrNull(name);
      if (ret == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return ret;
    }

    /**
     * Pull a string entry from a stanza, or null.
     *
     * @param name        entry to look for
     *
     * @return            the entry, or null if it was not found.
     * @throws InvalidXmlException  if the name does not map to exactly
     *                              one entry
     */
    public String getValueOrNull(String name) throws InvalidXmlException {
      final LinkedList<Stanza> matches = subtrees.get(name);
      if (matches == null) {
        return null;
      }
      if (matches.size() != 1) {
        throw new InvalidXmlException("More than one value found for " + name);
      }
      return matches.get(0).getValue();
    }

    /**
     * Add an entry to a stanza.
     *
     * Entries that share a name are kept in the order they were added.
     *
     * @param name        name of the entry to add
     * @param child       the entry to add
     */
    public void addChild(String name, Stanza child) {
      LinkedList<Stanza> siblings = subtrees.get(name);
      if (siblings == null) {
        siblings = new LinkedList<Stanza>();
        subtrees.put(name, siblings);
      }
      siblings.add(child);
    }

    /**
     * Convert a stanza to a human-readable string.
     *
     * The result is wrapped in braces: the quoted value comes first when
     * it is non-empty, followed by each child as its name in angle
     * brackets and its own string form, with children separated by ", ".
     */
    @Override
    public String toString() {
      final StringBuilder bld = new StringBuilder();
      bld.append("{");
      if (!value.equals("")) {
        bld.append("\"").append(value).append("\"");
      }
      String prefix = "";
      for (Map.Entry<String, LinkedList<Stanza>> entry :
          subtrees.entrySet()) {
        final String key = entry.getKey();
        for (Stanza child : entry.getValue()) {
          bld.append(prefix);
          bld.append("<").append(key).append(">");
          bld.append(child.toString());
          prefix = ", ";
        }
      }
      bld.append("}");
      return bld.toString();
    }
  }
}