Extract links from an HTML page

1. Using javax.swing.text.html.HTMLEditorKit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
 
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.MutableAttributeSet;
 
/**
 * Utility methods for extracting link targets from HTML with the Swing HTML parser
 * ({@link ParserDelegator}).
 */
public class HTMLUtils {
  private HTMLUtils() {}

  /**
   * Parses the HTML read from {@code reader} and returns the {@code href} value of every
   * {@code <a>} start tag, in the order the parser reports them.
   *
   * <p>Anchors that carry no {@code href} attribute (for example {@code <a name="top">}) are
   * skipped, so the returned list never contains {@code null}. Values are returned as written
   * in the document; relative URLs are not resolved.
   *
   * <p>This method does not close {@code reader}; the caller remains responsible for it.
   *
   * @param reader source of the HTML text
   * @return a mutable list of href values, possibly empty
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractLinks(Reader reader) throws IOException {
    final List<String> list = new ArrayList<String>();

    // ParserCallback already provides empty implementations of every handler,
    // so only the start-tag hook needs to be overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag != Tag.A) {
          return;
        }
        Object address = attribute.getAttribute(Attribute.HREF);
        // getAttribute returns null when the anchor has no href; do not record it.
        if (address != null) {
          list.add(address.toString());
        }
      }
    };

    // Third argument is ignoreCharSet; kept false as in the original listing.
    new ParserDelegator().parse(reader, parserCallback, false);
    return list;
  }

  /**
   * Demo entry point: prints every link found in the file {@code java-new.html}
   * located in the current working directory.
   */
  public final static void main(String[] args) throws Exception {
    FileReader reader = new FileReader("java-new.html");
    try {
      List<String> links = HTMLUtils.extractLinks(reader);
      for (String link : links) {
        System.out.println(link);
      }
    } finally {
      // Release the file handle even if parsing throws.
      reader.close();
    }
  }
}

2. Using an HTML parser

In this how-to, I will use the open-source library jsoup.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
public class HTMLUtils {
  private HTMLUtils() {}
 
  public static List<String>extractLinks(String url) throws IOException {
    final ArrayList<String> result = new ArrayList<String>();
 
    Document doc = Jsoup.connect(url).get();
 
    Elements links = doc.select("a[href]");
    Elements media = doc.select("[src]");
    Elements imports = doc.select("link[href]");
 
    // href ...
    for (Element link : links) {
      result.add(link.attr("abs:href"));
    }
 
    // img ...
    for (Element src : media) {
      result.add(src.attr("abs:src"));
    }
 
    // js, css, ...
    for (Element link : imports) {
      result.add(link.attr("abs:href"));
    }
    return result;
  }
 
 
  public final static void main(String[] args) throws Exception{
    String site = "http://www.rgagnon.com/topics/java-language.html";
    List<String> links = HTMLUtils.extractLinks(site);
    for (String link : links) {
      System.out.println(link);
    }
  }
}

Done! Happy Coding!

Related posts: