001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.hdfs.util;
020    
021    import org.apache.hadoop.classification.InterfaceAudience;
022    import org.apache.hadoop.classification.InterfaceStability;
023    import org.xml.sax.ContentHandler;
024    import org.xml.sax.SAXException;
025    import org.xml.sax.helpers.AttributesImpl;
026    
027    import java.util.LinkedList;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.TreeMap;
031    
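/**
 * General XML utilities: helpers for mangling and unmangling strings so
 * that they can be stored in an XML document, for emitting a string element
 * through SAX, and a simple container for parsed XML stanzas.
 */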
036    @InterfaceAudience.Private
037    @InterfaceStability.Unstable
public class XMLUtils {
  /**
   * Exception that reflects an invalid XML document.
   */
  public static class InvalidXmlException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param s       Description of what is wrong with the document.
     */
    public InvalidXmlException(String s) {
      super(s);
    }
  }

  /**
   * Exception that reflects a string that cannot be unmangled.
   */
  public static class UnmanglingError extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param str     Description of the problem.
     * @param e       The underlying cause.
     */
    public UnmanglingError(String str, Exception e) {
      super(str, e);
    }

    /**
     * @param str     Description of the problem.
     */
    public UnmanglingError(String str) {
      super(str);
    }
  }

  /**
   * Number of hexadecimal digits that follow the backslash in a mangled
   * code point (the "ABCD" in '\ABCD;').
   */
  private static final int NUM_SLASH_POSITIONS = 4;

  /**
   * Given a code point, determine if it should be mangled before being
   * represented in an XML document.
   *
   * Any code point that isn't valid in XML must be mangled.
   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
   * quick reference, or the w3 standard for the authoritative reference.
   *
   * @param cp      The code point
   * @return        True if the code point should be mangled
   */
  private static boolean codePointMustBeMangled(int cp) {
    if (cp < 0x20) {
      // Of the control characters, only tab, line feed and carriage return
      // are allowed.
      return (cp != 0x9) && (cp != 0xa) && (cp != 0xd);
    }
    if ((0xd7ff < cp) && (cp < 0xe000)) {
      // Surrogate range: only reached for unpaired surrogates, because
      // callers iterate by code point.
      return true;
    }
    if ((cp == 0xfffe) || (cp == 0xffff)) {
      return true;
    }
    // We mangle backslash to simplify decoding... it's easier if
    // backslashes always begin mangled sequences.
    return cp == 0x5c;
  }

  /**
   * Encode a code point as a backslash, NUM_SLASH_POSITIONS zero-padded
   * lower-case hexadecimal digits, and a terminating semicolon.
   *
   * @param cp      The code point to encode.
   * @return        The mangled representation of the code point.
   */
  private static String mangleCodePoint(int cp) {
    return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
  }

  /**
   * Look up the predefined XML entity reference for a code point.
   *
   * @param cp      The code point.
   * @return        The entity reference, or null if the code point has none.
   */
  private static String codePointToEntityRef(int cp) {
    switch (cp) {
      case '&':
        return "&amp;";
      case '\"':
        return "&quot;";
      case '\'':
        return "&apos;";
      case '<':
        return "&lt;";
      case '>':
        return "&gt;";
      default:
        return null;
    }
  }

  /**
   * Mangle a string so that it can be represented in an XML document.
   *
   * There are three kinds of code points in XML:
   * - Those that can be represented normally,
   * - Those that have to be escaped (for example, & must be represented
   *     as &amp;)
   * - Those that cannot be represented at all in XML.
   *
   * The built-in SAX functions will handle the first two types for us just
   * fine.  However, sometimes we come across a code point of the third type.
   * In this case, we have to mangle the string in order to represent it at
   * all.  We also mangle backslash to avoid confusing a backslash in the
   * string with part of our escape sequence.
   *
   * The encoding used here is as follows: an illegal code point is
   * represented as '\ABCD;', where ABCD is the hexadecimal value of
   * the code point.
   *
   * @param str               The input string.
   * @param createEntityRefs  If true, the five characters that have a
   *                          predefined XML entity are replaced by their
   *                          entity reference; if false they are copied
   *                          through unchanged.
   *
   * @return        The mangled string.
   */
  public static String mangleXmlString(String str, boolean createEntityRefs) {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int offset = 0;
    while (offset < length) {
      final int cp = str.codePointAt(offset);
      offset += Character.charCount(cp);
      if (codePointMustBeMangled(cp)) {
        bld.append(mangleCodePoint(cp));
        continue;
      }
      final String entityRef =
          createEntityRefs ? codePointToEntityRef(cp) : null;
      if (entityRef != null) {
        bld.append(entityRef);
      } else {
        // Unpaired surrogates were mangled above, so cp is always a code
        // point that can be appended as-is here.
        bld.appendCodePoint(cp);
      }
    }
    return bld.toString();
  }

  /**
   * Demangle a string from an XML document.
   * See {@link #mangleXmlString(String, boolean)} for a description of the
   * mangling format.
   *
   * @param str               The string to be demangled.
   * @param decodeEntityRefs  If true, the five predefined XML entity
   *                          references are decoded and any other
   *                          '&...;' sequence is rejected; if false,
   *                          ampersands are copied through unchanged.
   *
   * @return       The unmangled string
   * @throws       UnmanglingError if the input is malformed.
   */
  public static String unmangleXmlString(String str, boolean decodeEntityRefs)
        throws UnmanglingError {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int pos = 0;
    while (pos < length) {
      final char ch = str.charAt(pos);
      if (ch == '\\') {
        pos = unmangleEscape(str, pos, bld);
      } else if (decodeEntityRefs && (ch == '&')) {
        pos = decodeEntityRef(str, pos, bld);
      } else {
        bld.append(ch);
        pos++;
      }
    }
    return bld.toString();
  }

  /**
   * Decode one mangled code point that starts at the given backslash and
   * append the result to the builder.
   *
   * @param str         The string being unmangled.
   * @param slashIndex  Index of the backslash that starts the escape.
   * @param bld         Receives the decoded code point.
   * @return            Index of the first character after the escape.
   * @throws            UnmanglingError if the escape is truncated, is not
   *                    terminated by a semicolon, or does not consist of
   *                    hexadecimal digits.
   */
  private static int unmangleEscape(String str, int slashIndex,
      StringBuilder bld) {
    final int digitsStart = slashIndex + 1;
    final int semicolonIndex = digitsStart + NUM_SLASH_POSITIONS;
    if (semicolonIndex >= str.length()) {
      throw new UnmanglingError("unterminated code point escape: string " +
          "broke off in the middle");
    }
    if (str.charAt(semicolonIndex) != ';') {
      throw new UnmanglingError("unterminated code point escape: " +
          "expected semicolon at end.");
    }
    // Validate digit by digit rather than using Integer.parseInt, which
    // would accept a leading '+' or '-' and could hand a negative value to
    // appendCodePoint.
    int cp = 0;
    for (int i = digitsStart; i < semicolonIndex; i++) {
      final int digit = Character.digit(str.charAt(i), 16);
      if (digit < 0) {
        throw new UnmanglingError("error parsing unmangling escape code",
            new NumberFormatException("For input string: \"" +
                str.substring(digitsStart, semicolonIndex) + "\""));
      }
      cp = (cp << 4) | digit;
    }
    bld.appendCodePoint(cp);
    return semicolonIndex + 1;
  }

  /**
   * Decode one entity reference that starts at the given ampersand and
   * append the character it stands for to the builder.
   *
   * @param str       The string being unmangled.
   * @param ampIndex  Index of the ampersand that starts the reference.
   * @param bld       Receives the decoded character.
   * @return          Index of the first character after the reference.
   * @throws          UnmanglingError if the reference is not terminated by
   *                  a semicolon or is not one of the five known entities.
   */
  private static int decodeEntityRef(String str, int ampIndex,
      StringBuilder bld) {
    final int semicolonIndex = str.indexOf(';', ampIndex + 1);
    if (semicolonIndex < 0) {
      throw new UnmanglingError("unterminated entity ref starting with " +
          str.substring(ampIndex));
    }
    final String ref = str.substring(ampIndex, semicolonIndex + 1);
    if (ref.equals("&quot;")) {
      bld.append('\"');
    } else if (ref.equals("&apos;")) {
      bld.append('\'');
    } else if (ref.equals("&amp;")) {
      bld.append('&');
    } else if (ref.equals("&lt;")) {
      bld.append('<');
    } else if (ref.equals("&gt;")) {
      bld.append('>');
    } else {
      throw new UnmanglingError("Unknown entity ref " + ref);
    }
    return semicolonIndex + 1;
  }

  /**
   * Add a SAX tag with a string inside.
   *
   * The string is mangled (without creating entity references) before it
   * is passed to the content handler.
   *
   * @param contentHandler     the SAX content handler
   * @param tag                the element tag to use
   * @param val                the string to put inside the tag
   * @throws SAXException      if the content handler reports an error
   */
  public static void addSaxString(ContentHandler contentHandler,
      String tag, String val) throws SAXException {
    contentHandler.startElement("", "", tag, new AttributesImpl());
    final char[] chars = mangleXmlString(val, false).toCharArray();
    contentHandler.characters(chars, 0, chars.length);
    contentHandler.endElement("", "", tag);
  }

  /**
   * Represents a bag of key-value pairs encountered during parsing an XML
   * file.
   */
  public static class Stanza {
    /** Child stanzas grouped by name, each group in insertion order. */
    private final TreeMap<String, LinkedList<Stanza>> subtrees;

    /** The unmangled value of this stanza. */
    private String value;

    /** Create a stanza with no children and an empty value. */
    public Stanza() {
      subtrees = new TreeMap<String, LinkedList<Stanza>>();
      value = "";
    }

    public void setValue(String value) {
      this.value = value;
    }

    public String getValue() {
      return this.value;
    }

    /**
     * Discover if a stanza has a given entry.
     *
     * @param name        entry to look for
     *
     * @return            true if the entry was found
     */
    public boolean hasChildren(String name) {
      return subtrees.containsKey(name);
    }

    /**
     * Pull an entry from a stanza.
     *
     * The returned list is the stanza's own list, not a copy.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException   if there is no entry with that name
     */
    public List<Stanza> getChildren(String name) throws InvalidXmlException {
      final LinkedList<Stanza> children = subtrees.get(name);
      if (children == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return children;
    }

    /**
     * Pull a string entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException   if the entry is missing, its value is
     *                               null, or it occurs more than once
     */
    public String getValue(String name) throws InvalidXmlException {
      final String ret = getValueOrNull(name);
      if (ret == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return ret;
    }

    /**
     * Pull a string entry from a stanza, or null.
     *
     * @param name        entry to look for
     *
     * @return            the entry, or null if it was not found.
     * @throws InvalidXmlException   if the entry occurs more than once
     */
    public String getValueOrNull(String name) throws InvalidXmlException {
      final LinkedList<Stanza> entries = subtrees.get(name);
      if (entries == null) {
        return null;
      }
      if (entries.size() != 1) {
        throw new InvalidXmlException("More than one value found for " + name);
      }
      return entries.get(0).getValue();
    }

    /**
     * Add an entry to a stanza.
     *
     * @param name        name of the entry to add
     * @param child       the entry to add
     */
    public void addChild(String name, Stanza child) {
      LinkedList<Stanza> entries = subtrees.get(name);
      if (entries == null) {
        entries = new LinkedList<Stanza>();
        subtrees.put(name, entries);
      }
      entries.add(child);
    }

    /**
     * Convert a stanza to a human-readable string.
     *
     * The value, if it is neither null nor empty, is printed in double
     * quotes, followed by every child prefixed with its name in angle
     * brackets.
     */
    @Override
    public String toString() {
      final StringBuilder bld = new StringBuilder();
      bld.append("{");
      // A null value (possible via setValue) is rendered like an empty one.
      if ((value != null) && !value.isEmpty()) {
        bld.append("\"").append(value).append("\"");
      }
      String prefix = "";
      for (Map.Entry<String, LinkedList<Stanza>> entry :
          subtrees.entrySet()) {
        final String key = entry.getKey();
        for (Stanza child : entry.getValue()) {
          bld.append(prefix);
          bld.append("<").append(key).append(">");
          bld.append(child.toString());
          prefix = ", ";
        }
      }
      bld.append("}");
      return bld.toString();
    }
  }
}