1.Html基礎
1.html文檔結構
<html> <head><title>計算機學院</title></head> <body> <div id="Notice"> <a href="Article.aspx?t=5&id=9297" > <span >關於公佈2013-2014學年第2學期轉專業學生名單及做好相關工作的通 </span></a> <a href="Article.aspx?t=5&id=9296" > <span >教材科關於2013-2014學年第2學期領取教材有關事宜的通知 </span></a> </div> </body> </html> |
注: 1.Element: <head>.<body>.<div>標籤 2.Node:<a>.<span>標籤 3.SiblingElements:<head>和<body>標籤互爲兄弟Element 4.SiblingNode:<div>標籤內的第一個<a>和第二個<a>互爲兄弟Node |
該html的Tree如下
|
2.加載html文檔
1.從字符串中提取html文檔 String html = "<html><head><title>你好</title></head><body><p>我是誰</p></body></html>"; Document doc = Jsoup.parse(html); |
2.從URL直接加載html文檔 try { Document doc = (Document) Jsoup.connect(BASIC_URL).get(); } catch (IOException e) { // TODO Auto-generated catch block Log.e(tag, e.toString()); } try { Document doc = (Document) Jsoup.connect(BASIC_URL) .data("query","Java")//請求參數 .userAgent("Mozilla")//設置USER-AGENT .cookie("auth", "token")//設置cookie .timeout(60*1000)//超時 .post(); } catch (IOException e) { // TODO Auto-generated catch block Log.e(tag, e.toString()); }//post方式請求 |
3.文件加載 File file = new File(filePath); Document doc = (Document) Jsoup.parse(file, "UTF-8", "www.suse.edu.cn"); |
2.1 設置Http頭信息
con1.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); con1.header("Accept-Encoding","gzip,deflate,sdch"); con1.header("Referer","http://www.suse.edu.cn/"); con1.header("Accept-Language","zh-CN,zh;q=0.8,en;q=0.6"); con1.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36"); con1.header("(Request-Line)", "POST /cgi-bin/login?lang=zh_CN HTTP/1.1"); con1.header("Cache-Control", "no-cache"); con1.header("Connection", "Keep-Alive"); con1.header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); con1.header("Host", "http://www.suse.edu.cn/"); Response re = con1.ignoreContentType(true).method(Method.GET).execute(); |
3.修改數據
doc.select("div.comments a").attr("rel", "nofollow");//爲所有連接增加rel = nofollow屬性 doc.select("div.comments a").addClass("mylinkclass");//爲所有連接增加class = mylinkclass屬性 doc.select("img").removeAttr("onclick");//刪除所有圖片的onclick屬性 doc.select("input[type=text]").val("");//清空文本輸入框中的文本
注:修改完後直接調用Document(或Elements)的html()方法就可以獲取修改完的html文檔 |
4.文檔清理--清除文檔內容
String safe = Jsoup.clean(unsafe, Whitelist.basic()); |
||||||||||
Jsoup使用Whitelist類來對html文檔進行過濾,其常用的幾個方法如下
|
5.提取html內容
5.1Dom解析
5.1.1常用方法
body() |
|
createElement(String tagName) |
|
head() |
|
nodeName() |
|
normalise() |
|
text(String text) |
|
title() |
|
void |
child(int index) |
|
children() |
|
data() |
|
dataNodes() |
|
dataset() |
|
elementSiblingIndex() |
|
empty() |
|
firstElementSibling() |
獲取Elements
getAllElements() |
|
getElementById(String id) |
|
getElementsByAttribute(String key) |
|
getElementsByAttributeStarting(String keyPrefix) |
|
getElementsByAttributeValue(String key, String value) |
|
getElementsByAttributeValueContaining(String key, String match) |
|
getElementsByAttributeValueEnding(String key, String valueSuffix) |
|
getElementsByAttributeValueMatching(String key, Pattern pattern) |
|
getElementsByAttributeValueMatching(String key, String regex) |
|
getElementsByAttributeValueNot(String key, String value) |
|
getElementsByAttributeValueStarting(String key, String valuePrefix) |
|
getElementsByClass(String className) |
|
getElementsByIndexEquals(int index) |
|
getElementsByIndexGreaterThan(int index) |
|
getElementsByIndexLessThan(int index) |
|
getElementsByTag(String tagName) |
|
getElementsContainingOwnText(String searchText) |
|
getElementsContainingText(String searchText) |
|
getElementsMatchingOwnText(Pattern pattern) |
|
getElementsMatchingOwnText(String regex) |
|
getElementsMatchingText(Pattern pattern) |
|
getElementsMatchingText(String regex) |
|
boolean |
hasClass(String className) |
int |
hashCode() |
boolean |
hasText() |
html() |
|
id() |
|
insertChildren(int index, Collection<? extends Node> children) |
|
boolean |
isBlock() |
lastElementSibling() |
|
nextElementSibling() |
|
nodeName() |
|
parent() |
|
parents() |
|
prependChild(Node child) |
|
prependElement(String tagName) |
|
prependText(String text) |
|
previousElementSibling() |
|
removeClass(String className) |
|
select(String cssQuery) |
|
siblingElements() |
|
tag() |
|
tagName() |
|
text() |
|
textNodes() |
|
val() |
|
val(String value) |
|
wrap(String html) |
選擇器
Jsoup連接網絡,錯誤處理
1.超時
修改url
如:訪問的url:http://rwxy.suse.edu.cn/images/Maincn.asp/
修改爲:http://rwxy.suse.edu.cn/images/Maincn.asp
注:去掉了末尾的“/”斜槓符號