import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
/**
* 標題:利用htmlparser提取網頁純文本的例子
*/
@SuppressWarnings("serial")
public class TestHTMLParser {
public static void testHtml() {
try {
String sCurrentLine;
String sTotalString;
sCurrentLine = "";
sTotalString = "";
java.io.InputStream l_urlStream;
java.net.URL l_url = new java.net.URL(
"http://www.alexgaoyh.com/html/c7471ca9d5.html");
java.net.HttpURLConnection l_connection = (java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream = l_connection.getInputStream();
java.io.BufferedReader l_reader = new java.io.BufferedReader(
new java.io.InputStreamReader(l_urlStream));
while ((sCurrentLine = l_reader.readLine()) != null) {
sTotalString += sCurrentLine;
}
String testText = extractText(sTotalString);
System.out.println(testText);
} catch (Exception e) {
e.printStackTrace();
}
}
public static String extractText(String inputHtml) throws Exception {
StringBuffer text = new StringBuffer();
Parser parser = Parser.createParser(new String(inputHtml.getBytes(),
"UTF-8"), "UTF-8");
// 遍歷所有的節點
NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
public boolean accept(Node node) {
return true;
}
});
// System.out.println(nodes.size()); //打印節點的數量
for (int i = 0; i < nodes.size(); i++) {
Node nodet = nodes.elementAt(i);
// System.out.println(nodet.getText());
text.append(new String(nodet.toPlainTextString().getBytes("GBK")));
}
return text.toString();
}
public static void main(String[] args) throws Exception {
testHtml();
}
}
利用htmlparser提取網頁純文本的例子
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.