001package org.jsoup; 002 003import org.jsoup.helper.DataUtil; 004import org.jsoup.helper.HttpConnection; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.parser.Parser; 008import org.jsoup.safety.Cleaner; 009import org.jsoup.safety.Safelist; 010import org.jspecify.annotations.Nullable; 011 012import java.io.File; 013import java.io.IOException; 014import java.io.InputStream; 015import java.net.URL; 016import java.nio.file.Path; 017 018/** 019 The core public access point to the jsoup functionality. 020 021 @author Jonathan Hedley */ 022 023public class Jsoup { 024 private Jsoup() {} 025 026 /** 027 Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. 028 029 @param html HTML to parse 030 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 031 before the HTML declares a {@code <base href>} tag. 032 @return sane HTML 033 */ 034 public static Document parse(String html, String baseUri) { 035 return Parser.parse(html, baseUri); 036 } 037 038 /** 039 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 040 (non-HTML) parser. 041 042 @param html HTML to parse 043 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 044 before the HTML declares a {@code <base href>} tag. 045 @param parser alternate {@link Parser#xmlParser() parser} to use. 046 @return sane HTML 047 */ 048 public static Document parse(String html, String baseUri, Parser parser) { 049 return parser.parseInput(html, baseUri); 050 } 051 052 /** 053 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 054 (non-HTML) parser. As no base URI is specified, absolute URL resolution, if required, relies on the HTML including 055 a {@code <base href>} tag. 056 057 @param html HTML to parse 058 before the HTML declares a {@code <base href>} tag. 059 @param parser alternate {@link Parser#xmlParser() parser} to use. 060 @return sane HTML 061 */ 062 public static Document parse(String html, Parser parser) { 063 return parser.parseInput(html, ""); 064 } 065 066 /** 067 Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML 068 including a {@code <base href>} tag. 069 070 @param html HTML to parse 071 @return sane HTML 072 073 @see #parse(String, String) 074 */ 075 public static Document parse(String html) { 076 return Parser.parse(html, ""); 077 } 078 079 /** 080 * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page. 081 * <p> 082 * Use examples: 083 * <ul> 084 * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> 085 * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> 086 * </ul> 087 * @param url URL to connect to. The protocol must be {@code http} or {@code https}. 088 * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. 089 * @see #newSession() 090 * @see Connection#newRequest() 091 */ 092 public static Connection connect(String url) { 093 return HttpConnection.connect(url); 094 } 095 096 /** 097 Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and 098 cookies will be maintained for the session. Use examples: 099<pre><code> 100Connection session = Jsoup.newSession() 101 .timeout(20 * 1000) 102 .userAgent("FooBar 2000"); 103 104Document doc1 = session.newRequest() 105 .url("https://jsoup.org/").data("ref", "example") 106 .get(); 107Document doc2 = session.newRequest() 108 .url("https://en.wikipedia.org/wiki/Main_Page") 109 .get(); 110Connection con3 = session.newRequest(); 111</code></pre> 112 113 <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call {@link 114 Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p> 115 116 @return a connection 117 @since 1.14.1 118 */ 119 public static Connection newSession() { 120 return new HttpConnection(); 121 } 122 123 /** 124 Parse the contents of a file as HTML. 125 126 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 127 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 128 present, or fall back to {@code UTF-8} (which is often safe to do). 129 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 130 @return sane HTML 131 132 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 133 */ 134 public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException { 135 return DataUtil.load(file, charsetName, baseUri); 136 } 137 138 /** 139 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 140 141 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 142 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 143 present, or fall back to {@code UTF-8} (which is often safe to do). 144 @return sane HTML 145 146 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 147 @see #parse(File, String, String) parse(file, charset, baseUri) 148 */ 149 public static Document parse(File file, @Nullable String charsetName) throws IOException { 150 return DataUtil.load(file, charsetName, file.getAbsolutePath()); 151 } 152 153 /** 154 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 155 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, 156 or if neither is present, will be {@code UTF-8}. 157 158 <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p> 159 160 @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz). 161 @return sane HTML 162 @throws IOException if the file could not be found or read. 163 @see #parse(File, String, String) parse(file, charset, baseUri) 164 @since 1.15.1 165 */ 166 public static Document parse(File file) throws IOException { 167 return DataUtil.load(file, null, file.getAbsolutePath()); 168 } 169 170 /** 171 Parse the contents of a file as HTML. 172 173 @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). 174 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 175 present, or fall back to {@code UTF-8} (which is often safe to do). 176 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 177 @param parser alternate {@link Parser#xmlParser() parser} to use. 178 @return sane HTML 179 180 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 181 @since 1.14.2 182 */ 183 public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 184 return DataUtil.load(file, charsetName, baseUri, parser); 185 } 186 187 /** 188 Parse the contents of a file as HTML. 189 190 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 191 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 192 present, or fall back to {@code UTF-8} (which is often safe to do). 193 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 194 @return sane HTML 195 196 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 197 @since 1.18.1 198 */ 199 public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException { 200 return DataUtil.load(path, charsetName, baseUri); 201 } 202 203 /** 204 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 205 206 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 207 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 208 present, or fall back to {@code UTF-8} (which is often safe to do). 209 @return sane HTML 210 211 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 212 @see #parse(File, String, String) parse(file, charset, baseUri) 213 @since 1.18.1 214 */ 215 public static Document parse(Path path, @Nullable String charsetName) throws IOException { 216 return DataUtil.load(path, charsetName, path.toAbsolutePath().toString()); 217 } 218 219 /** 220 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 221 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, 222 or if neither is present, will be {@code UTF-8}. 223 224 <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p> 225 226 @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz). 227 @return sane HTML 228 @throws IOException if the file could not be found or read. 229 @see #parse(Path, String, String) parse(file, charset, baseUri) 230 @since 1.18.1 231 */ 232 public static Document parse(Path path) throws IOException { 233 return DataUtil.load(path, null, path.toAbsolutePath().toString()); 234 } 235 236 /** 237 Parse the contents of a file as HTML. 238 239 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). 240 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 241 present, or fall back to {@code UTF-8} (which is often safe to do). 242 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 243 @param parser alternate {@link Parser#xmlParser() parser} to use. 244 @return sane HTML 245 246 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 247 @since 1.18.1 248 */ 249 public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 250 return DataUtil.load(path, charsetName, baseUri, parser); 251 } 252 253 /** 254 Read an input stream, and parse it to a Document. 255 256 @param in input stream to read. The stream will be closed after reading. 257 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 258 present, or fall back to {@code UTF-8} (which is often safe to do). 259 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 260 @return sane HTML 261 262 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 263 */ 264 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 265 return DataUtil.load(in, charsetName, baseUri); 266 } 267 268 /** 269 Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML 270 (non-HTML) parser. 271 272 @param in input stream to read. Make sure to close it after parsing. 273 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 274 present, or fall back to {@code UTF-8} (which is often safe to do). 275 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 276 @param parser alternate {@link Parser#xmlParser() parser} to use. 277 @return sane HTML 278 279 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 280 */ 281 public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 282 return DataUtil.load(in, charsetName, baseUri, parser); 283 } 284 285 /** 286 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 287 288 @param bodyHtml body HTML fragment 289 @param baseUri URL to resolve relative URLs against. 290 @return sane HTML document 291 292 @see Document#body() 293 */ 294 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 295 return Parser.parseBodyFragment(bodyHtml, baseUri); 296 } 297 298 /** 299 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 300 301 @param bodyHtml body HTML fragment 302 @return sane HTML document 303 304 @see Document#body() 305 */ 306 public static Document parseBodyFragment(String bodyHtml) { 307 return Parser.parseBodyFragment(bodyHtml, ""); 308 } 309 310 /** 311 Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. 312 <p> 313 The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. 314 315 @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. 316 @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. 317 @return The parsed HTML. 318 319 @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed 320 @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored 321 @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored 322 @throws java.net.SocketTimeoutException if the connection times out 323 @throws IOException if a connection or read error occurs 324 325 @see #connect(String) 326 */ 327 public static Document parse(URL url, int timeoutMillis) throws IOException { 328 Connection con = HttpConnection.connect(url); 329 con.timeout(timeoutMillis); 330 return con.get(); 331 } 332 333 /** 334 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe 335 tags and attributes. 336 337 @param bodyHtml input untrusted HTML (body fragment) 338 @param baseUri URL to resolve relative URLs against 339 @param safelist list of permitted HTML elements 340 @return safe HTML (body fragment) 341 342 @see Cleaner#clean(Document) 343 */ 344 public static String clean(String bodyHtml, String baseUri, Safelist safelist) { 345 Document dirty = parseBodyFragment(bodyHtml, baseUri); 346 Cleaner cleaner = new Cleaner(safelist); 347 Document clean = cleaner.clean(dirty); 348 return clean.body().html(); 349 } 350 351 /** 352 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted 353 tags and attributes. 354 355 <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those 356 URLs will be removed, unless the input HTML contains a {@code <base href> tag}. If you wish to preserve those, use 357 the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable 358 {@link Safelist#preserveRelativeLinks(boolean)}.</p> 359 360 <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only 361 {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped. 362 If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after 363 cleaning the document.</p> 364 <p>Example:</p> 365 <pre>{@code 366 String sourceBodyHtml = "<p>5 is < 6.</p>"; 367 String html = Jsoup.clean(sourceBodyHtml, Safelist.none()); 368 369 Cleaner cleaner = new Cleaner(Safelist.none()); 370 String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text(); 371 372 // html is: 5 is < 6. 373 // text is: 5 is < 6. 374 }</pre> 375 376 @param bodyHtml input untrusted HTML (body fragment) 377 @param safelist list of permitted HTML elements 378 @return safe HTML (body fragment) 379 @see Cleaner#clean(Document) 380 */ 381 public static String clean(String bodyHtml, Safelist safelist) { 382 return clean(bodyHtml, "", safelist); 383 } 384 385 /** 386 * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of 387 * permitted tags and attributes. 388 * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an 389 * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add 390 * structural tags (<code>html, head, body</code> etc) to the safelist. 391 * 392 * @param bodyHtml input untrusted HTML (body fragment) 393 * @param baseUri URL to resolve relative URLs against 394 * @param safelist list of permitted HTML elements 395 * @param outputSettings document output settings; use to control pretty-printing and entity escape modes 396 * @return safe HTML (body fragment) 397 * @see Cleaner#clean(Document) 398 */ 399 public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) { 400 Document dirty = parseBodyFragment(bodyHtml, baseUri); 401 Cleaner cleaner = new Cleaner(safelist); 402 Document clean = cleaner.clean(dirty); 403 clean.outputSettings(outputSettings); 404 return clean.body().html(); 405 } 406 407 /** 408 Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation. 409 <p> 410 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 411 output of this method, the input document <b>must always</b> be normalized using a method such as 412 {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document 413 before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and 414 that any differences between how a given browser and how jsoup parses the input HTML are normalized. 415 </p> 416 <p>Example:</p> 417 <pre>{@code 418 Safelist safelist = Safelist.relaxed(); 419 boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist); 420 String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist); 421 }</pre> 422 <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) 423 @param bodyHtml HTML to test 424 @param safelist safelist to test against 425 @return true if no tags or attributes were removed; false otherwise 426 @see #clean(String, Safelist) 427 */ 428 public static boolean isValid(String bodyHtml, Safelist safelist) { 429 return new Cleaner(safelist).isValidBodyHtml(bodyHtml); 430 } 431}