Source code

001package org.jsoup;
002
003import org.jsoup.helper.DataUtil;
004import org.jsoup.helper.HttpConnection;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.parser.Parser;
008import org.jsoup.safety.Cleaner;
009import org.jsoup.safety.Safelist;
010import org.jspecify.annotations.Nullable;
011
012import java.io.File;
013import java.io.IOException;
014import java.io.InputStream;
015import java.net.URL;
016import java.nio.file.Path;
017
018/**
019 The core public access point to the jsoup functionality. Every method is static, and the class cannot be instantiated.
020
021 @author Jonathan Hedley
022 */
023public class Jsoup {
024    private Jsoup() {}
025
026    /**
027     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
028
029     @param html    HTML to parse
030     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
031     before the HTML declares a {@code <base href>} tag.
032     @return sane HTML
033     */
034    public static Document parse(String html, String baseUri) {
035        return Parser.parse(html, baseUri);
036    }
037
038    /**
039     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
040     (non-HTML) parser.
041
042     @param html    HTML to parse
043     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
044     before the HTML declares a {@code <base href>} tag.
045     @param parser  alternate {@link Parser#xmlParser() parser} to use.
046     @return sane HTML
047     */
048    public static Document parse(String html, String baseUri, Parser parser) {
049        return parser.parseInput(html, baseUri);
050    }
051
052    /**
053     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
054     (non-HTML) parser.  As no base URI is specified, absolute URL resolution, if required, relies on the HTML including
055     a {@code <base href>} tag.
056
057     @param html   HTML to parse
058     @param parser alternate {@link Parser#xmlParser() parser} to use.
059     @return sane HTML
060
061     @see #parse(String, String, Parser)
062     */
063    public static Document parse(String html, Parser parser) {
064        return parse(html, "", parser);
065    }
066
067    /**
068     Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML
069     including a {@code <base href>} tag.
070
071     @param html HTML to parse
072     @return sane HTML
073
074     @see #parse(String, String)
075     */
076    public static Document parse(String html) {
077        return parse(html, "");
078    }
079
080    /**
081     * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page.
082     * <p>
083     * Use examples:
084     * <ul>
085     *  <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
086     *  <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
087     * </ul>
088     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
089     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
090     * @see #newSession()
091     * @see Connection#newRequest()
092     */
093    public static Connection connect(String url) {
094        return HttpConnection.connect(url);
095    }
096
097    /**
098     Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and
099     cookies will be maintained for the session. Use examples:
100<pre><code>
101Connection session = Jsoup.newSession()
102     .timeout(20 * 1000)
103     .userAgent("FooBar 2000");
104
105Document doc1 = session.newRequest()
106     .url("https://jsoup.org/").data("ref", "example")
107     .get();
108Document doc2 = session.newRequest()
109     .url("https://en.wikipedia.org/wiki/Main_Page")
110     .get();
111Connection con3 = session.newRequest();
112</code></pre>
113
114     <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call
115     {@link Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p>
116
117     @return a connection
118     @since 1.14.1
119     */
120    public static Connection newSession() {
121        return new HttpConnection();
122    }
123
124    /**
125     Parse the contents of a file as HTML.
126
127     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
128     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
129     present, or fall back to {@code UTF-8} (which is often safe to do).
130     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
131     @return sane HTML
132
133     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
134     */
135    public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
136        return DataUtil.load(file, charsetName, baseUri);
137    }
138
139    /**
140     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
141
142     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
143     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
144     present, or fall back to {@code UTF-8} (which is often safe to do).
145     @return sane HTML
146
147     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
148     @see #parse(File, String, String) parse(file, charset, baseUri)
149     */
150    public static Document parse(File file, @Nullable String charsetName) throws IOException {
151        return parse(file, charsetName, file.getAbsolutePath());
152    }
153
154    /**
155     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
156     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
157     or if neither is present, will be {@code UTF-8}.
158
159     <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p>
160
161     @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz).
162     @return sane HTML
163     @throws IOException if the file could not be found or read.
164     @see #parse(File, String, String) parse(file, charset, baseUri)
165     @since 1.15.1
166     */
167    public static Document parse(File file) throws IOException {
168        return parse(file, null);
169    }
170
171    /**
172     Parse the contents of a file as HTML, using the provided Parser.
173
174     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
175     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
176     present, or fall back to {@code UTF-8} (which is often safe to do).
177     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
178     @param parser      alternate {@link Parser#xmlParser() parser} to use.
179     @return sane HTML
180
181     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
182     @since 1.14.2
183     */
184    public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
185        return DataUtil.load(file, charsetName, baseUri, parser);
186    }
187
188    /**
189     Parse the contents of a file as HTML.
190
191     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
192     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
193     present, or fall back to {@code UTF-8} (which is often safe to do).
194     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
195     @return sane HTML
196
197     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
198     @since 1.18.1
199     */
200    public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
201        return DataUtil.load(path, charsetName, baseUri);
202    }
203
204    /**
205     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
206
207     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
208     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
209     present, or fall back to {@code UTF-8} (which is often safe to do).
210     @return sane HTML
211
212     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
213     @see #parse(Path, String, String) parse(path, charset, baseUri)
214     @since 1.18.1
215     */
216    public static Document parse(Path path, @Nullable String charsetName) throws IOException {
217        return parse(path, charsetName, path.toAbsolutePath().toString());
218    }
219
220    /**
221     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
222     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
223     or if neither is present, will be {@code UTF-8}.
224
225     <p>This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}</p>
226
227     @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
228     @return sane HTML
229     @throws IOException if the file could not be found or read.
230     @see #parse(Path, String, String) parse(path, charset, baseUri)
231     @since 1.18.1
232     */
233    public static Document parse(Path path) throws IOException {
234        return parse(path, null);
235    }
236
237    /**
238     Parse the contents of a file as HTML, using the provided Parser.
239
240     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
241     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
242     present, or fall back to {@code UTF-8} (which is often safe to do).
243     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
244     @param parser      alternate {@link Parser#xmlParser() parser} to use.
245     @return sane HTML
246
247     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
248     @since 1.18.1
249     */
250    public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
251        return DataUtil.load(path, charsetName, baseUri, parser);
252    }
253
254    /**
255     Read an input stream, and parse it to a Document.
256
257     @param in          input stream to read. The stream will be closed after reading.
258     @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
259     present, or fall back to {@code UTF-8} (which is often safe to do).
260     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
261     @return sane HTML
262
263     @throws IOException if the stream could not be read, or if the charsetName is invalid.
264     */
265    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
266        return DataUtil.load(in, charsetName, baseUri);
267    }
268
269    /**
270     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
271     (non-HTML) parser.
272
273     @param in          input stream to read. Make sure to close it after parsing.
274     @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
275     present, or fall back to {@code UTF-8} (which is often safe to do).
276     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
277     @param parser      alternate {@link Parser#xmlParser() parser} to use.
278     @return sane HTML
279
280     @throws IOException if the stream could not be read, or if the charsetName is invalid.
281     */
282    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
283        // NOTE(review): the three-argument InputStream overload documents that the stream is closed after reading, while
284        // this one asks the caller to close it. Both delegate to DataUtil.load -- confirm which statement is accurate.
285        return DataUtil.load(in, charsetName, baseUri, parser);
286    }
287
288    /**
289     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
290
291     @param bodyHtml body HTML fragment
292     @param baseUri  URL to resolve relative URLs against.
293     @return sane HTML document
294
295     @see Document#body()
296     */
297    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
298        return Parser.parseBodyFragment(bodyHtml, baseUri);
299    }
300
301    /**
302     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. An empty base URI is used.
303
304     @param bodyHtml body HTML fragment
305     @return sane HTML document
306
307     @see Document#body()
308     @see #parseBodyFragment(String, String)
309     */
310    public static Document parseBodyFragment(String bodyHtml) {
311        return parseBodyFragment(bodyHtml, "");
312    }
313
314    /**
315     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
316     <p>
317     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
318
319     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
320     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
321     @return The parsed HTML.
322
323     @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
324     @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
325     @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
326     @throws java.net.SocketTimeoutException if the connection times out
327     @throws IOException if a connection or read error occurs
328
329     @see #connect(String)
330     */
331    public static Document parse(URL url, int timeoutMillis) throws IOException {
332        Connection con = HttpConnection.connect(url);
333        con.timeout(timeoutMillis);
334        return con.get();
335    }
336
337    /**
338     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe
339     tags and attributes.
340
341     @param bodyHtml  input untrusted HTML (body fragment)
342     @param baseUri   URL to resolve relative URLs against
343     @param safelist  list of permitted HTML elements
344     @return safe HTML (body fragment)
345
346     @see Cleaner#clean(Document)
347     */
348    public static String clean(String bodyHtml, String baseUri, Safelist safelist) {
349        return cleanBodyFragment(bodyHtml, baseUri, safelist).body().html();
350    }
351
352    /**
353     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted
354     tags and attributes.
355
356     <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those
357     URLs will be removed, unless the input HTML contains a {@code <base href>} tag. If you wish to preserve those, use
358     the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable
359     {@link Safelist#preserveRelativeLinks(boolean)}.</p>
360
361     <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only
362     {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped.
363     If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after
364     cleaning the document.</p>
365     <p>Example:</p>
366     <pre>{@code
367     String sourceBodyHtml = "<p>5 is &lt; 6.</p>";
368     String html = Jsoup.clean(sourceBodyHtml, Safelist.none());
369
370     Cleaner cleaner = new Cleaner(Safelist.none());
371     String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text();
372
373     // html is: 5 is &lt; 6.
374     // text is: 5 is < 6.
375     }</pre>
376
377     @param bodyHtml input untrusted HTML (body fragment)
378     @param safelist list of permitted HTML elements
379     @return safe HTML (body fragment)
380     @see Cleaner#clean(Document)
381     */
382    public static String clean(String bodyHtml, Safelist safelist) {
383        return clean(bodyHtml, "", safelist);
384    }
385
386    /**
387     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of
388     * permitted tags and attributes.
389     * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an
390     * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add
391     * structural tags (<code>html, head, body</code> etc) to the safelist.
392     *
393     * @param bodyHtml input untrusted HTML (body fragment)
394     * @param baseUri URL to resolve relative URLs against
395     * @param safelist list of permitted HTML elements
396     * @param outputSettings document output settings; use to control pretty-printing and entity escape modes
397     * @return safe HTML (body fragment)
398     * @see Cleaner#clean(Document)
399     */
400    public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) {
401        Document clean = cleanBodyFragment(bodyHtml, baseUri, safelist);
402        // The output settings are applied to the cleaned document before its body is serialized.
403        clean.outputSettings(outputSettings);
404        return clean.body().html();
405    }
406
407    /**
408     Parses the input as a body fragment, then filters the result through a {@link Cleaner} built from the safelist.
409     Shared by the {@code clean} overloads that take a base URI.
410
411     @param bodyHtml input untrusted HTML (body fragment)
412     @param baseUri  URL to resolve relative URLs against
413     @param safelist list of permitted HTML elements
414     @return the Document returned by {@link Cleaner#clean(Document)}
415     */
416    private static Document cleanBodyFragment(String bodyHtml, String baseUri, Safelist safelist) {
417        Document dirty = parseBodyFragment(bodyHtml, baseUri);
418        return new Cleaner(safelist).clean(dirty);
419    }
420
421    /**
422     Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation.
423     <p>
424     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
425     output of this method, the input document <b>must always</b> be normalized using a method such as
426     {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document
427     before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and
428     that any differences between how a given browser and how jsoup parses the input HTML are normalized.
429     </p>
430     <p>Example:</p>
431     <pre>{@code
432     Safelist safelist = Safelist.relaxed();
433     boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist);
434     String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist);
435     }</pre>
436     <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.)</p>
437
438     @param bodyHtml HTML to test
439     @param safelist safelist to test against
440     @return true if no tags or attributes were removed; false otherwise
441     @see #clean(String, Safelist)
442     */
443    public static boolean isValid(String bodyHtml, Safelist safelist) {
444        return new Cleaner(safelist).isValidBodyHtml(bodyHtml);
445    }
446}