001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.hdfs.util;
020    
021    import org.apache.hadoop.classification.InterfaceAudience;
022    import org.apache.hadoop.classification.InterfaceStability;
023    import org.xml.sax.ContentHandler;
024    import org.xml.sax.SAXException;
025    import org.xml.sax.helpers.AttributesImpl;
026    
027    import java.util.LinkedList;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.TreeMap;
031    
032    /**
033     * General xml utilities.
034     *   
035     */
036    @InterfaceAudience.Private
037    @InterfaceStability.Unstable
public class XMLUtils {
  /**
   * Exception that reflects an invalid XML document.
   */
  public static class InvalidXmlException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    public InvalidXmlException(String s) {
      super(s);
    }
  }

  /**
   * Exception that reflects a string that cannot be unmangled.
   */
  public static class UnmanglingError extends RuntimeException {
    private static final long serialVersionUID = 1L;

    public UnmanglingError(String str, Exception e) {
      super(str, e);
    }

    public UnmanglingError(String str) {
      super(str);
    }
  }

  /** Number of hexadecimal digits that follow the backslash in an escape. */
  private static final int NUM_SLASH_POSITIONS = 4;

  /**
   * Given a code point, determine if it should be mangled before being
   * represented in an XML document.
   *
   * Any code point that isn't valid in XML must be mangled.
   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
   * quick reference, or the w3 standard for the authoritative reference.
   *
   * @param cp      The code point
   * @return        True if the code point should be mangled
   */
  private static boolean codePointMustBeMangled(int cp) {
    if (cp < 0x20) {
      // Of the values below 0x20, only tab, line feed and carriage return
      // are left unmangled.
      return (cp != 0x9) && (cp != 0xa) && (cp != 0xd);
    }
    if ((0xd7ff < cp) && (cp < 0xe000)) {
      // Surrogate range; reached when the string contains an unpaired
      // surrogate, since codePointAt() then returns the surrogate itself.
      return true;
    }
    if ((cp == 0xfffe) || (cp == 0xffff)) {
      return true;
    }
    // We mangle backslash to simplify decoding... it's easier if
    // backslashes always begin mangled sequences.
    return cp == 0x5c;
  }

  /**
   * Render a code point as a mangled escape sequence: a backslash, the
   * code point as zero-padded lower-case hex, and a semicolon.
   *
   * @param cp      The code point to mangle
   * @return        The escape sequence for the code point
   */
  private static String mangleCodePoint(int cp) {
    return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
  }

  /**
   * Mangle a string so that it can be represented in an XML document.
   *
   * There are three kinds of code points in XML:
   * - Those that can be represented normally,
   * - Those that have to be escaped (for example, &amp; must be represented
   *     as &amp;amp;)
   * - Those that cannot be represented at all in XML.
   *
   * The built-in SAX functions will handle the first two types for us just
   * fine.  However, sometimes we come across a code point of the third type.
   * In this case, we have to mangle the string in order to represent it at
   * all.  We also mangle backslash to avoid confusing a backslash in the
   * string with part of our escape sequence.
   *
   * The encoding used here is as follows: an illegal code point is
   * represented as '\ABCD;', where ABCD is the hexadecimal value of
   * the code point.
   *
   * @param str     The input string.
   *
   * @return        The mangled string.
   */
  public static String mangleXmlString(String str) {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int offset = 0;
    while (offset < length) {
      final int cp = str.codePointAt(offset);
      if (codePointMustBeMangled(cp)) {
        bld.append(mangleCodePoint(cp));
      } else {
        // Copies one or two chars, depending on whether cp is supplementary.
        bld.appendCodePoint(cp);
      }
      offset += Character.charCount(cp);
    }
    return bld.toString();
  }

  /**
   * Demangle a string from an XML document.
   * See {@link #mangleXmlString(String)} for a description of the mangling
   * format.
   *
   * Every backslash must be followed by exactly four hexadecimal digits and
   * a semicolon.  Sign characters are not accepted as part of the digits.
   *
   * @param str    The string to be demangled.
   *
   * @return       The unmangled string
   * @throws       UnmanglingError if the input is malformed.
   */
  public static String unmangleXmlString(String str)
        throws UnmanglingError {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int pos = 0;
    while (pos < length) {
      final char ch = str.charAt(pos);
      if (ch != '\\') {
        bld.append(ch);
        pos++;
        continue;
      }
      // Index at which the terminating semicolon of this escape must sit.
      final int semicolonPos = pos + 1 + NUM_SLASH_POSITIONS;
      if (semicolonPos >= length) {
        throw new UnmanglingError("unterminated code point escape: string " +
            "broke off in the middle");
      }
      if (str.charAt(semicolonPos) != ';') {
        throw new UnmanglingError("unterminated code point escape: " +
            "expected semicolon at end.");
      }
      bld.appendCodePoint(
          parseEscapedCodePoint(str.substring(pos + 1, semicolonPos)));
      pos = semicolonPos + 1;
    }
    return bld.toString();
  }

  /**
   * Parse the digits of a mangled escape sequence.
   *
   * Each character is checked individually rather than handing the text to
   * Integer.parseInt, because parseInt accepts a leading sign; a negative
   * result would make StringBuilder.appendCodePoint throw
   * IllegalArgumentException instead of the UnmanglingError callers expect.
   *
   * @param hex     The characters between the backslash and the semicolon
   * @return        The decoded value
   * @throws        UnmanglingError if any character is not a hex digit
   */
  private static int parseEscapedCodePoint(String hex) {
    int cp = 0;
    for (int i = 0; i < hex.length(); i++) {
      final int digit = Character.digit(hex.charAt(i), 16);
      if (digit < 0) {
        throw new UnmanglingError("error parsing unmangling escape code",
            new NumberFormatException("For input string: \"" + hex + "\""));
      }
      cp = (cp << 4) | digit;
    }
    return cp;
  }

  /**
   * Add a SAX tag with a string inside.
   *
   * The string is passed through {@link #mangleXmlString(String)} before
   * being handed to the content handler.
   *
   * @param contentHandler     the SAX content handler
   * @param tag                the element tag to use
   * @param val                the string to put inside the tag
   * @throws SAXException      if the content handler reports an error
   */
  public static void addSaxString(ContentHandler contentHandler,
      String tag, String val) throws SAXException {
    contentHandler.startElement("", "", tag, new AttributesImpl());
    final char[] c = mangleXmlString(val).toCharArray();
    contentHandler.characters(c, 0, c.length);
    contentHandler.endElement("", "", tag);
  }

  /**
   * Represents a bag of key-value pairs encountered during parsing an XML
   * file.
   */
  public static class Stanza {
    /** Child stanzas grouped by name; each list keeps insertion order. */
    private final TreeMap<String, LinkedList<Stanza>> subtrees;

    /** The unmangled value of this stanza. */
    private String value;

    public Stanza() {
      subtrees = new TreeMap<String, LinkedList<Stanza>>();
      value = "";
    }

    public void setValue(String value) {
      this.value = value;
    }

    public String getValue() {
      return this.value;
    }

    /**
     * Discover if a stanza has a given entry.
     *
     * @param name        entry to look for
     *
     * @return            true if the entry was found
     */
    public boolean hasChildren(String name) {
      return subtrees.containsKey(name);
    }

    /**
     * Pull an entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the list of children stored under that name
     * @throws InvalidXmlException if there is no entry with that name
     */
    public List<Stanza> getChildren(String name) throws InvalidXmlException {
      final LinkedList<Stanza> children = subtrees.get(name);
      if (children == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return children;
    }

    /**
     * Pull a string entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException if there is no entry with that name, or
     *                             if there is more than one
     */
    public String getValue(String name) throws InvalidXmlException {
      final String ret = getValueOrNull(name);
      if (ret == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return ret;
    }

    /**
     * Pull a string entry from a stanza, or null.
     *
     * @param name        entry to look for
     *
     * @return            the entry, or null if it was not found.
     * @throws InvalidXmlException if the name does not map to exactly one
     *                             child
     */
    public String getValueOrNull(String name) throws InvalidXmlException {
      final LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        return null;
      }
      if (l.size() != 1) {
        throw new InvalidXmlException("More than one value found for " + name);
      }
      return l.get(0).getValue();
    }

    /**
     * Add an entry to a stanza.
     *
     * @param name        name of the entry to add
     * @param child       the entry to add
     */
    public void addChild(String name, Stanza child) {
      LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        // First child with this name: create its list.
        l = new LinkedList<Stanza>();
        subtrees.put(name, l);
      }
      l.add(child);
    }

    /**
     * Convert a stanza to a human-readable string.
     *
     * The output is the quoted value (if non-empty) followed by each child
     * rendered as {@code <name>} plus the child's own string form, with
     * children separated by ", ", all wrapped in braces.
     */
    @Override
    public String toString() {
      final StringBuilder bld = new StringBuilder();
      bld.append("{");
      if (!value.equals("")) {
        bld.append("\"").append(value).append("\"");
      }
      String prefix = "";
      for (Map.Entry<String, LinkedList<Stanza>> entry :
          subtrees.entrySet()) {
        final String key = entry.getKey();
        for (Stanza child : entry.getValue()) {
          bld.append(prefix);
          bld.append("<").append(key).append(">");
          bld.append(child.toString());
          prefix = ", ";
        }
      }
      bld.append("}");
      return bld.toString();
    }
  }
}