获取HTML文件中的文字 : HTML解析器 « 网络协议


  



import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;



/**
 * Fetches a web page and prints every run of text found in its HTML to standard output.
 *
 * <p>The text is extracted by giving Swing's HTML parser a callback that reacts only to
 * text events; all other parser events fall through to the no-op defaults.
 */
public class Main {

  /** Charset used when the response does not name a usable one. */
  private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /** Parameter prefix looked for inside a {@code Content-Type} header value. */
  private static final String CHARSET_PREFIX = "charset=";

  /**
   * Downloads {@code http://www.google.com} and prints each text run on its own line.
   *
   * @param argv ignored
   * @throws Exception if the URL is invalid, the download fails, or parsing fails
   */
  public static void main(String[] argv) throws Exception {
    HTMLDocument doc = new HTMLDocument() {
      // The editor kit asks the document for the parser callback to use; supplying
      // our own means parsed text is printed rather than inserted into the document.
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            // println(char[]) prints the characters themselves, then a line break.
            System.out.println(data);
          }
        };
      }
    };
    // The stream is already decoded before the parser sees it, so a charset declared
    // in a <meta> tag cannot be honoured. Without this flag the parser aborts with
    // ChangedCharSetException when it meets such a tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

    URL url = new URI("http://www.google.com").toURL();
    URLConnection conn = url.openConnection();
    EditorKit kit = new HTMLEditorKit();

    // try-with-resources closes the reader, and with it the connection's stream,
    // even when parsing throws.
    try (Reader rd = new InputStreamReader(
        conn.getInputStream(), charsetFromContentType(conn.getContentType()))) {
      kit.read(rd, doc, 0);
    }
  }

  /**
   * Picks the charset named by a {@code Content-Type} header value.
   *
   * @param contentType header value such as {@code "text/html; charset=ISO-8859-1"};
   *     may be {@code null}
   * @return the charset from the first {@code charset=} parameter that names a charset
   *     this JVM supports, or UTF-8 when there is none
   */
  static Charset charsetFromContentType(String contentType) {
    if (contentType == null) {
      return DEFAULT_CHARSET;
    }
    for (String rawParam : contentType.split(";")) {
      String param = rawParam.trim();
      // Parameter names are matched case-insensitively ("Charset=" is accepted too).
      if (!param.regionMatches(true, 0, CHARSET_PREFIX, 0, CHARSET_PREFIX.length())) {
        continue;
      }
      // The value may be written as a quoted string; drop the quotes.
      String name = param.substring(CHARSET_PREFIX.length()).replace("\"", "").trim();
      try {
        return Charset.forName(name);
      } catch (IllegalArgumentException ignored) {
        // Malformed or unsupported charset name: keep looking, then fall back.
      }
    }
    return DEFAULT_CHARSET;
  }
}

获取HTML文件中的文字 : HTML解析器 « 网络协议 « Java