001package org.jsoup; 002 003import org.jsoup.helper.DataUtil; 004import org.jsoup.helper.HttpConnection; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.parser.Parser; 008import org.jsoup.safety.Cleaner; 009import org.jsoup.safety.Safelist; 010import org.jspecify.annotations.Nullable; 011 012import java.io.File; 013import java.io.IOException; 014import java.io.InputStream; 015import java.net.URL; 016 017/** 018 The core public access point to the jsoup functionality. 019 020 @author Jonathan Hedley */ 021 022public class Jsoup { 023 private Jsoup() {} 024 025 /** 026 Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. 027 028 @param html HTML to parse 029 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 030 before the HTML declares a {@code <base href>} tag. 031 @return sane HTML 032 */ 033 public static Document parse(String html, String baseUri) { 034 return Parser.parse(html, baseUri); 035 } 036 037 /** 038 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 039 (non-HTML) parser. 040 041 @param html HTML to parse 042 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 043 before the HTML declares a {@code <base href>} tag. 044 @param parser alternate {@link Parser#xmlParser() parser} to use. 045 @return sane HTML 046 */ 047 public static Document parse(String html, String baseUri, Parser parser) { 048 return parser.parseInput(html, baseUri); 049 } 050 051 /** 052 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 053 (non-HTML) parser. As no base URI is specified, absolute URL resolution, if required, relies on the HTML including 054 a {@code <base href>} tag. 055 056 @param html HTML to parse 057 before the HTML declares a {@code <base href>} tag. 058 @param parser alternate {@link Parser#xmlParser() parser} to use. 059 @return sane HTML 060 */ 061 public static Document parse(String html, Parser parser) { 062 return parser.parseInput(html, ""); 063 } 064 065 /** 066 Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML 067 including a {@code <base href>} tag. 068 069 @param html HTML to parse 070 @return sane HTML 071 072 @see #parse(String, String) 073 */ 074 public static Document parse(String html) { 075 return Parser.parse(html, ""); 076 } 077 078 /** 079 * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page. 080 * <p> 081 * Use examples: 082 * <ul> 083 * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> 084 * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> 085 * </ul> 086 * @param url URL to connect to. The protocol must be {@code http} or {@code https}. 087 * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. 088 * @see #newSession() 089 * @see Connection#newRequest() 090 */ 091 public static Connection connect(String url) { 092 return HttpConnection.connect(url); 093 } 094 095 /** 096 Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and 097 cookies will be maintained for the session. Use examples: 098<pre><code> 099Connection session = Jsoup.newSession() 100 .timeout(20 * 1000) 101 .userAgent("FooBar 2000"); 102 103Document doc1 = session.newRequest() 104 .url("https://jsoup.org/").data("ref", "example") 105 .get(); 106Document doc2 = session.newRequest() 107 .url("https://en.wikipedia.org/wiki/Main_Page") 108 .get(); 109Connection con3 = session.newRequest(); 110</code></pre> 111 112 <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call {@link 113 Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p> 114 115 @return a connection 116 @since 1.14.1 117 */ 118 public static Connection newSession() { 119 return new HttpConnection(); 120 } 121 122 /** 123 Parse the contents of a file as HTML. 124 125 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 126 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 127 present, or fall back to {@code UTF-8} (which is often safe to do). 128 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 129 @return sane HTML 130 131 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 132 */ 133 public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException { 134 return DataUtil.load(file, charsetName, baseUri); 135 } 136 137 /** 138 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 139 140 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 141 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 142 present, or fall back to {@code UTF-8} (which is often safe to do). 143 @return sane HTML 144 145 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 146 @see #parse(File, String, String) parse(file, charset, baseUri) 147 */ 148 public static Document parse(File file, @Nullable String charsetName) throws IOException { 149 return DataUtil.load(file, charsetName, file.getAbsolutePath()); 150 } 151 152 /** 153 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 154 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, 155 or if neither is present, will be {@code UTF-8}. 156 157 <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p> 158 159 @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz). 160 @return sane HTML 161 @throws IOException if the file could not be found or read. 162 @see #parse(File, String, String) parse(file, charset, baseUri) 163 @since 1.15.1 164 */ 165 public static Document parse(File file) throws IOException { 166 return DataUtil.load(file, null, file.getAbsolutePath()); 167 } 168 169 /** 170 Parse the contents of a file as HTML. 171 172 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 173 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 174 present, or fall back to {@code UTF-8} (which is often safe to do). 175 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 176 @param parser alternate {@link Parser#xmlParser() parser} to use. 177 @return sane HTML 178 179 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 180 @since 1.14.2 181 */ 182 public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 183 return DataUtil.load(file, charsetName, baseUri, parser); 184 } 185 186 /** 187 Read an input stream, and parse it to a Document. 188 189 @param in input stream to read. The stream will be closed after reading. 190 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 191 present, or fall back to {@code UTF-8} (which is often safe to do). 192 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 193 @return sane HTML 194 195 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 196 */ 197 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 198 return DataUtil.load(in, charsetName, baseUri); 199 } 200 201 /** 202 Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML 203 (non-HTML) parser. 204 205 @param in input stream to read. Make sure to close it after parsing. 206 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 207 present, or fall back to {@code UTF-8} (which is often safe to do). 208 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 209 @param parser alternate {@link Parser#xmlParser() parser} to use. 210 @return sane HTML 211 212 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 213 */ 214 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 215 return DataUtil.load(in, charsetName, baseUri, parser); 216 } 217 218 /** 219 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 220 221 @param bodyHtml body HTML fragment 222 @param baseUri URL to resolve relative URLs against. 223 @return sane HTML document 224 225 @see Document#body() 226 */ 227 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 228 return Parser.parseBodyFragment(bodyHtml, baseUri); 229 } 230 231 /** 232 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 233 234 @param bodyHtml body HTML fragment 235 @return sane HTML document 236 237 @see Document#body() 238 */ 239 public static Document parseBodyFragment(String bodyHtml) { 240 return Parser.parseBodyFragment(bodyHtml, ""); 241 } 242 243 /** 244 Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. 245 <p> 246 The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. 247 248 @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. 249 @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. 250 @return The parsed HTML. 251 252 @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed 253 @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored 254 @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored 255 @throws java.net.SocketTimeoutException if the connection times out 256 @throws IOException if a connection or read error occurs 257 258 @see #connect(String) 259 */ 260 public static Document parse(URL url, int timeoutMillis) throws IOException { 261 Connection con = HttpConnection.connect(url); 262 con.timeout(timeoutMillis); 263 return con.get(); 264 } 265 266 /** 267 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe 268 tags and attributes. 269 270 @param bodyHtml input untrusted HTML (body fragment) 271 @param baseUri URL to resolve relative URLs against 272 @param safelist list of permitted HTML elements 273 @return safe HTML (body fragment) 274 275 @see Cleaner#clean(Document) 276 */ 277 public static String clean(String bodyHtml, String baseUri, Safelist safelist) { 278 Document dirty = parseBodyFragment(bodyHtml, baseUri); 279 Cleaner cleaner = new Cleaner(safelist); 280 Document clean = cleaner.clean(dirty); 281 return clean.body().html(); 282 } 283 284 /** 285 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted 286 tags and attributes. 287 288 <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those 289 URLs will be removed, unless the input HTML contains a {@code <base href> tag}. If you wish to preserve those, use 290 the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable 291 {@link Safelist#preserveRelativeLinks(boolean)}.</p> 292 293 <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only 294 {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped. 295 If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after 296 cleaning the document.</p> 297 <p>Example:</p> 298 <pre>{@code 299 String sourceBodyHtml = "<p>5 is < 6.</p>"; 300 String html = Jsoup.clean(sourceBodyHtml, Safelist.none()); 301 302 Cleaner cleaner = new Cleaner(Safelist.none()); 303 String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text(); 304 305 // html is: 5 is < 6. 306 // text is: 5 is < 6. 307 }</pre> 308 309 @param bodyHtml input untrusted HTML (body fragment) 310 @param safelist list of permitted HTML elements 311 @return safe HTML (body fragment) 312 @see Cleaner#clean(Document) 313 */ 314 public static String clean(String bodyHtml, Safelist safelist) { 315 return clean(bodyHtml, "", safelist); 316 } 317 318 /** 319 * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of 320 * permitted tags and attributes. 321 * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an 322 * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add 323 * structural tags (<code>html, head, body</code> etc) to the safelist. 324 * 325 * @param bodyHtml input untrusted HTML (body fragment) 326 * @param baseUri URL to resolve relative URLs against 327 * @param safelist list of permitted HTML elements 328 * @param outputSettings document output settings; use to control pretty-printing and entity escape modes 329 * @return safe HTML (body fragment) 330 * @see Cleaner#clean(Document) 331 */ 332 public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) { 333 Document dirty = parseBodyFragment(bodyHtml, baseUri); 334 Cleaner cleaner = new Cleaner(safelist); 335 Document clean = cleaner.clean(dirty); 336 clean.outputSettings(outputSettings); 337 return clean.body().html(); 338 } 339 340 /** 341 Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation. 342 <p> 343 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 344 output of this method, the input document <b>must always</b> be normalized using a method such as 345 {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document 346 before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and 347 that any differences between how a given browser and how jsoup parses the input HTML are normalized. 348 </p> 349 <p>Example:</p> 350 <pre>{@code 351 Safelist safelist = Safelist.relaxed(); 352 boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist); 353 String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist); 354 }</pre> 355 <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) 356 @param bodyHtml HTML to test 357 @param safelist safelist to test against 358 @return true if no tags or attributes were removed; false otherwise 359 @see #clean(String, Safelist) 360 */ 361 public static boolean isValid(String bodyHtml, Safelist safelist) { 362 return new Cleaner(safelist).isValidBodyHtml(bodyHtml); 363 } 364}