Source code

001package org.jsoup;
002
003import org.jsoup.helper.DataUtil;
004import org.jsoup.helper.HttpConnection;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.parser.Parser;
008import org.jsoup.safety.Cleaner;
009import org.jsoup.safety.Safelist;
010import org.jspecify.annotations.Nullable;
011
012import java.io.File;
013import java.io.IOException;
014import java.io.InputStream;
015import java.net.URL;
016
017/**
018 The core public access point to the jsoup functionality.
019
020 @author Jonathan Hedley */
021
022public class Jsoup {
023    private Jsoup() {}
024
025    /**
026     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
027
028     @param html    HTML to parse
029     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
030     before the HTML declares a {@code <base href>} tag.
031     @return sane HTML
032     */
033    public static Document parse(String html, String baseUri) {
034        return Parser.parse(html, baseUri);
035    }
036
037    /**
038     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
039     (non-HTML) parser.
040
041     @param html    HTML to parse
042     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
043     before the HTML declares a {@code <base href>} tag.
044     @param parser alternate {@link Parser#xmlParser() parser} to use.
045     @return sane HTML
046     */
047    public static Document parse(String html, String baseUri, Parser parser) {
048        return parser.parseInput(html, baseUri);
049    }
050
051    /**
052     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
053     (non-HTML) parser.  As no base URI is specified, absolute URL resolution, if required, relies on the HTML including
054     a {@code <base href>} tag.
055
056     @param html    HTML to parse
057     before the HTML declares a {@code <base href>} tag.
058     @param parser alternate {@link Parser#xmlParser() parser} to use.
059     @return sane HTML
060     */
061    public static Document parse(String html, Parser parser) {
062        return parser.parseInput(html, "");
063    }
064
065    /**
066     Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML
067     including a {@code <base href>} tag.
068
069     @param html HTML to parse
070     @return sane HTML
071
072     @see #parse(String, String)
073     */
074    public static Document parse(String html) {
075        return Parser.parse(html, "");
076    }
077
078    /**
079     * Creates a new {@link Connection} (session), with the defined request URL. Use to fetch and parse a HTML page.
080     * <p>
081     * Use examples:
082     * <ul>
083     *  <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
084     *  <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
085     * </ul>
086     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
087     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
088     * @see #newSession()
089     * @see Connection#newRequest()
090     */
091    public static Connection connect(String url) {
092        return HttpConnection.connect(url);
093    }
094
095    /**
096     Creates a new {@link Connection} to use as a session. Connection settings (user-agent, timeouts, URL, etc), and
097     cookies will be maintained for the session. Use examples:
098<pre><code>
099Connection session = Jsoup.newSession()
100     .timeout(20 * 1000)
101     .userAgent("FooBar 2000");
102
103Document doc1 = session.newRequest()
104     .url("https://jsoup.org/").data("ref", "example")
105     .get();
106Document doc2 = session.newRequest()
107     .url("https://en.wikipedia.org/wiki/Main_Page")
108     .get();
109Connection con3 = session.newRequest();
110</code></pre>
111
112     <p>For multi-threaded requests, it is safe to use this session between threads, but take care to call {@link
113    Connection#newRequest()} per request and not share that instance between threads when executing or parsing.</p>
114
115     @return a connection
116     @since 1.14.1
117     */
118    public static Connection newSession() {
119        return new HttpConnection();
120    }
121
122    /**
123     Parse the contents of a file as HTML.
124
125     @param file          file to load HTML from. Supports gzipped files (ending in .z or .gz).
126     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
127     present, or fall back to {@code UTF-8} (which is often safe to do).
128     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
129     @return sane HTML
130
131     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
132     */
133    public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
134        return DataUtil.load(file, charsetName, baseUri);
135    }
136
137    /**
138     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
139
140     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
141     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
142     present, or fall back to {@code UTF-8} (which is often safe to do).
143     @return sane HTML
144
145     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
146     @see #parse(File, String, String) parse(file, charset, baseUri)
147     */
148    public static Document parse(File file, @Nullable String charsetName) throws IOException {
149        return DataUtil.load(file, charsetName, file.getAbsolutePath());
150    }
151
152    /**
153     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
154     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
155     or if neither is present, will be {@code UTF-8}.
156
157     <p>This is the equivalent of calling {@link #parse(File, String) parse(file, null)}</p>
158
159     @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz).
160     @return sane HTML
161     @throws IOException if the file could not be found or read.
162     @see #parse(File, String, String) parse(file, charset, baseUri)
163     @since 1.15.1
164     */
165    public static Document parse(File file) throws IOException {
166        return DataUtil.load(file, null, file.getAbsolutePath());
167    }
168
169    /**
170     Parse the contents of a file as HTML.
171
172     @param file          file to load HTML from. Supports gzipped files (ending in .z or .gz).
173     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
174     present, or fall back to {@code UTF-8} (which is often safe to do).
175     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
176     @param parser alternate {@link Parser#xmlParser() parser} to use.
177     @return sane HTML
178
179     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
180     @since 1.14.2
181     */
182    public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
183        return DataUtil.load(file, charsetName, baseUri, parser);
184    }
185
186     /**
187     Read an input stream, and parse it to a Document.
188
189     @param in          input stream to read. The stream will be closed after reading.
190     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
191     present, or fall back to {@code UTF-8} (which is often safe to do).
192     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
193     @return sane HTML
194
195     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
196     */
197    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
198        return DataUtil.load(in, charsetName, baseUri);
199    }
200
201    /**
202     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
203     (non-HTML) parser.
204
205     @param in          input stream to read. Make sure to close it after parsing.
206     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
207     present, or fall back to {@code UTF-8} (which is often safe to do).
208     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
209     @param parser alternate {@link Parser#xmlParser() parser} to use.
210     @return sane HTML
211
212     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
213     */
214    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
215        return DataUtil.load(in, charsetName, baseUri, parser);
216    }
217
218    /**
219     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
220
221     @param bodyHtml body HTML fragment
222     @param baseUri  URL to resolve relative URLs against.
223     @return sane HTML document
224
225     @see Document#body()
226     */
227    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
228        return Parser.parseBodyFragment(bodyHtml, baseUri);
229    }
230
231    /**
232     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
233
234     @param bodyHtml body HTML fragment
235     @return sane HTML document
236
237     @see Document#body()
238     */
239    public static Document parseBodyFragment(String bodyHtml) {
240        return Parser.parseBodyFragment(bodyHtml, "");
241    }
242
243    /**
244     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
245     <p>
246     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
247
248     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
249     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
250     @return The parsed HTML.
251
252     @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
253     @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
254     @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
255     @throws java.net.SocketTimeoutException if the connection times out
256     @throws IOException if a connection or read error occurs
257
258     @see #connect(String)
259     */
260    public static Document parse(URL url, int timeoutMillis) throws IOException {
261        Connection con = HttpConnection.connect(url);
262        con.timeout(timeoutMillis);
263        return con.get();
264    }
265
266    /**
267     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe
268     tags and attributes.
269
270     @param bodyHtml  input untrusted HTML (body fragment)
271     @param baseUri   URL to resolve relative URLs against
272     @param safelist  list of permitted HTML elements
273     @return safe HTML (body fragment)
274
275     @see Cleaner#clean(Document)
276     */
277    public static String clean(String bodyHtml, String baseUri, Safelist safelist) {
278        Document dirty = parseBodyFragment(bodyHtml, baseUri);
279        Cleaner cleaner = new Cleaner(safelist);
280        Document clean = cleaner.clean(dirty);
281        return clean.body().html();
282    }
283
284    /**
285     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted
286     tags and attributes.
287
288     <p>Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those
289     URLs will be removed, unless the input HTML contains a {@code <base href> tag}. If you wish to preserve those, use
290     the {@link Jsoup#clean(String html, String baseHref, Safelist)} method instead, and enable
291     {@link Safelist#preserveRelativeLinks(boolean)}.</p>
292
293     <p>Note that the output of this method is still <b>HTML</b> even when using the TextNode only
294     {@link Safelist#none()}, and so any HTML entities in the output will be appropriately escaped.
295     If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after
296     cleaning the document.</p>
297     <p>Example:</p>
298     <pre>{@code
299     String sourceBodyHtml = "<p>5 is &lt; 6.</p>";
300     String html = Jsoup.clean(sourceBodyHtml, Safelist.none());
301
302     Cleaner cleaner = new Cleaner(Safelist.none());
303     String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text();
304
305     // html is: 5 is &lt; 6.
306     // text is: 5 is < 6.
307     }</pre>
308
309     @param bodyHtml input untrusted HTML (body fragment)
310     @param safelist list of permitted HTML elements
311     @return safe HTML (body fragment)
312     @see Cleaner#clean(Document)
313     */
314    public static String clean(String bodyHtml, Safelist safelist) {
315        return clean(bodyHtml, "", safelist);
316    }
317
318    /**
319     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of
320     * permitted tags and attributes.
321     * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an
322     * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add
323     * structural tags (<code>html, head, body</code> etc) to the safelist.
324     *
325     * @param bodyHtml input untrusted HTML (body fragment)
326     * @param baseUri URL to resolve relative URLs against
327     * @param safelist list of permitted HTML elements
328     * @param outputSettings document output settings; use to control pretty-printing and entity escape modes
329     * @return safe HTML (body fragment)
330     * @see Cleaner#clean(Document)
331     */
332    public static String clean(String bodyHtml, String baseUri, Safelist safelist, Document.OutputSettings outputSettings) {
333        Document dirty = parseBodyFragment(bodyHtml, baseUri);
334        Cleaner cleaner = new Cleaner(safelist);
335        Document clean = cleaner.clean(dirty);
336        clean.outputSettings(outputSettings);
337        return clean.body().html();
338    }
339
340    /**
341     Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation.
342     <p>
343     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
344     output of this method, the input document <b>must always</b> be normalized using a method such as
345     {@link #clean(String, String, Safelist)}, and the result of that method used to store or serialize the document
346     before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and
347     that any differences between how a given browser and how jsoup parses the input HTML are normalized.
348     </p>
349     <p>Example:</p>
350     <pre>{@code
351     Safelist safelist = Safelist.relaxed();
352     boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist);
353     String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist);
354     }</pre>
355     <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.)
356     @param bodyHtml HTML to test
357     @param safelist safelist to test against
358     @return true if no tags or attributes were removed; false otherwise
359     @see #clean(String, Safelist)
360     */
361    public static boolean isValid(String bodyHtml, Safelist safelist) {
362        return new Cleaner(safelist).isValidBodyHtml(bodyHtml);
363    }
364}