/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.util.io;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.wicket.util.lang.Args;
import org.apache.wicket.util.string.Strings;


/**
 * This is a simple XmlReader. Its only purpose is to read the xml decl string from the input and
 * apply proper character encoding to all subsequent characters. The xml decl string itself is
 * removed from the output.
 *
 * @author Juergen Donnerstag
 */
public final class XmlReader extends Reader
{
    /** Regex to recognize a complete <?xml ... ?> declaration, optionally preceded by whitespace */
    private static final Pattern xmlDecl = Pattern.compile("[\\s\\n\\r]*<\\?xml(\\s+.*)?\\?>");

    /** Regex to extract the (quoted or unquoted) value of the encoding attribute of the xml decl */
    private static final Pattern encodingPattern = Pattern.compile("\\s+encoding\\s*=\\s*([\"\'](.*?)[\"\']|(\\S*)).*\\?>");

    /**
     * Null, if JVM default. Else the encoding named by <?xml encoding="">, or the default
     * encoding passed to the constructor when the declaration names none.
     */
    private String encoding;

    /** The input stream to read the data from */
    private final InputStream inputStream;

    /** The reader which does the character encoding */
    private Reader reader;

    /**
     * Construct.
     *
     * @param inputStream
     *            The InputStream to read the xml data from
     * @param defaultEncoding
     *            Default character encoding to use when not specified in XML declaration, specify
     *            null to use JVM default
     * @throws IOException
     *             In case something went wrong while reading the data
     */
    public XmlReader(final InputStream inputStream, final String defaultEncoding)
        throws IOException
    {
        Args.notNull(inputStream, "inputStream");

        if (!inputStream.markSupported())
        {
            // init() relies on mark/reset, so add buffering when the source cannot provide it
            this.inputStream = new BufferedInputStream(new BOMInputStream(inputStream));
        }
        else
        {
            this.inputStream = new BOMInputStream(inputStream);
        }
        encoding = defaultEncoding;

        init();
    }

    /**
     * Return the encoding used while reading the markup file.
     *
     * @return if null, then JVM default
     */
    public final String getEncoding()
    {
        return encoding;
    }

    /**
     * Reads ahead in the input to detect (and consume) a leading xml declaration, then creates the
     * reader that decodes the remaining bytes with the resulting encoding.
     * <p>
     * An encoding named by the xml declaration takes precedence. If there is no declaration, or
     * the declaration does not name an encoding, the default encoding given to the constructor is
     * kept.
     *
     * @throws IOException
     *             In case something went wrong while reading the data
     */
    public void init() throws IOException
    {
        // read ahead buffer required for the first line of the markup (encoding)
        final int readAheadSize = 80;
        inputStream.mark(readAheadSize);

        // read-ahead the input stream and check if it starts with <?xml..?>.
        final String xmlDeclaration = getXmlDeclaration(inputStream, readAheadSize);
        if (!Strings.isEmpty(xmlDeclaration))
        {
            // The declaration has been consumed. Only let it override the default if it actually
            // names an encoding; e.g. <?xml version="1.0"?> must not discard the default.
            final String declaredEncoding = determineEncoding(xmlDeclaration);
            if (!Strings.isEmpty(declaredEncoding))
            {
                encoding = declaredEncoding;
            }
        }
        else
        {
            // If not, reset the input stream to the beginning of the file
            inputStream.reset();
        }

        if (encoding == null)
        {
            // Use JVM default
            reader = new InputStreamReader(inputStream);
        }
        else
        {
            // Use the encoding provided
            reader = new InputStreamReader(inputStream, encoding);
        }
    }

    /**
     * Determine the encoding from the xml decl.
     *
     * @param string
     *            The xmlDecl string
     * @return The trimmed encoding value. Null, if the declaration has no encoding attribute or
     *         its quoted value is empty
     */
    private String determineEncoding(final CharSequence string)
    {
        // Does the string contain an encoding attribute?
        final Matcher matcher = encodingPattern.matcher(string);
        if (!matcher.find())
        {
            // No
            return null;
        }

        // Group 2 is the quoted value, group 3 the unquoted alternative
        String encoding = matcher.group(2);
        if ((encoding == null) || (encoding.length() == 0))
        {
            encoding = matcher.group(3);
        }

        if (encoding != null)
        {
            encoding = encoding.trim();
        }

        return encoding;
    }

    /**
     * Read-ahead the input stream (markup file). If the first line contains <?xml...?>, then
     * return the xml decl so that the encoding can be determined from it.
     * <p>
     * The xml decl will not be forwarded to the user.
     *
     * @param in
     *            The markup file
     * @param readAheadSize
     *            The read ahead buffer available to read the xml encoding information
     * @return the trimmed xml declaration, or null if the input does not start with one
     * @throws IOException
     *             In case something went wrong while reading the data
     */
    private String getXmlDeclaration(final InputStream in, final int readAheadSize)
        throws IOException
    {
        // Max one line
        final StringBuilder pushBack = new StringBuilder(readAheadSize);

        // The current byte from the markup file
        int value;
        while ((value = in.read()) != -1)
        {
            // bytes are taken as characters, which is sufficient for the ASCII-only xml decl
            pushBack.append((char)value);

            // Stop at the end of the first tag or end of line. If it is HTML
            // without newlines, stop after X bytes (= characters).
            // NOTE: a line break in front of the declaration ends the read-ahead as well, so only
            // leading blanks/tabs are tolerated in practice.
            if ((value == '>') || (value == '\n') || (value == '\r') ||
                (pushBack.length() >= (readAheadSize - 1)))
            {
                break;
            }
        }

        // Does the string match the <?xml .. ?> pattern
        final Matcher matcher = xmlDecl.matcher(pushBack);
        if (!matcher.matches())
        {
            // No
            return null;
        }

        // Hand back the whole <?xml ..> string
        return pushBack.toString().trim();
    }

    /**
     * Closes the decoding reader and the underlying input stream.
     *
     * @see java.io.Reader#close()
     */
    @Override
    public void close() throws IOException
    {
        try
        {
            reader.close();
        }
        finally
        {
            // make sure the stream gets closed even if closing the reader failed
            inputStream.close();
        }
    }

    /**
     * Delegates to the decoding reader.
     *
     * @param buf
     *            Destination buffer
     * @param from
     *            Offset in the buffer at which to start storing characters
     * @param to
     *            Maximum number of characters to read (passed on as the length argument)
     * @return The number of characters read, or -1 if the end of the stream has been reached
     * @see java.io.Reader#read(char[], int, int)
     */
    @Override
    public int read(final char[] buf, final int from, final int to) throws IOException
    {
        return reader.read(buf, from, to);
    }

    /**
     * @return The string form of the underlying input stream followed by the encoding in
     *         parentheses
     */
    @Override
    public String toString()
    {
        return inputStream.toString() + " (" + encoding + ")";
    }
}