有關爬蟲,自己半年前寫了一個,寫過就忘記了,今天纔看到
1 import java.io.IOException; 2 import java.net.URISyntaxException; 3 import java.nio.charset.StandardCharsets; 4 import java.util.ArrayList; 5 import java.util.Date; 6 import java.util.HashMap; 7 import java.util.List; 8 import java.util.Map; 9 10 import org.apache.commons.lang.time.DateFormatUtils; 11 import org.apache.http.HttpEntity; 12 import org.apache.http.ParseException; 13 import org.apache.http.client.ClientProtocolException; 14 import org.apache.http.client.methods.CloseableHttpResponse; 15 import org.apache.http.client.methods.HttpPost; 16 import org.apache.http.client.utils.URIBuilder; 17 import org.apache.http.entity.StringEntity; 18 import org.apache.http.impl.client.CloseableHttpClient; 19 import org.apache.http.impl.client.HttpClientBuilder; 20 import org.apache.http.util.EntityUtils; 21 import org.apache.xerces.util.URI; 22 import org.jsoup.Jsoup; 23 import org.jsoup.nodes.Document; 24 import org.jsoup.nodes.Element; 25 import org.jsoup.select.Elements; 26 import org.quartz.Job; 27 import org.quartz.JobExecutionContext; 28 import org.quartz.JobExecutionException; 29 30 /* 31 * @說明 32 * 由於針對網頁取數據,1頁有100條數據,二期數據一直是變化的; 33 * 目前設計是每5分鐘抓取一次,所以抓一次,存一次,之前的數據仍舊保留,但是隻抓第一頁數據。 34 * 其他排名靠後的數據就不抓了 35 * 默認排序爲 totalvolpct 總成交佔比 36 * 37 * 38 * <th>名稱</th> 39 <th><a href="#" onclick="return window['sortTable']('symbol', ' ');">代碼</a> </th> 40 <th><a href="#" onclick="return window['sortTable']('totalvol', ' ');">總成交量(萬股)</a> </th> 41 <td><a href="#" onclick="return window['sortTable']('totalvolpct', ' ');">總成交量佔比</a> </td> ----百分比,入庫時是去掉百分號入庫的 42 <td><a href="#" onclick="return window['sortTable']('totalamt', ' ');">總成交額(萬元)</a> </td> 43 <td><a href="#" onclick="return window['sortTable']('totalamtpct', ' ');">總成交額佔比</a> </td>----百分比,入庫時是去掉百分號入庫的 44 <td><a href="#" onclick="return window['sortTable']('avgprice', ' ');">平均成交價(元)</a> </td> 45 <td><a href="#" onclick="return window['sortTable']('kuvolume', '↓');">主買量(萬股)</a>↓</td> 46 <td><a href="#" onclick="return window['sortTable']('kevolume', ' ');">中性量(萬股)</a> </td> 47 <td><a href="#" onclick="return window['sortTable']('kdvolume', ' ');">主賣量(萬股)</a> </td> 48 <th>詳情 </th> 49 50 51 建數據庫表時,字段順序一定要按照上面的順序來建,否則會有問題 52 53 * 54 * 55 */ 56 57 58 public class Crawler implements Job{ 59 60 61 private String url="http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1"; 62 private String encode="UTF-8"; 63 64 public String getUrlData() { 65 66 String out=new String(); 67 68 //---大單分析--- 69 //---http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=2 70 71 // 獲得Http客戶端(可以理解爲:你得先有一個瀏覽器;注意:實際上HttpClient與瀏覽器是不一樣的) 72 CloseableHttpClient httpClient = HttpClientBuilder.create().build(); 73 74 // // 創建Post請求 75 76 HttpPost httpPost = new HttpPost(url); 77 78 //---下面這句話暫時沒有起作用,不知道原因;其實參數是可以不用放在上面的httpPost對象中的 79 StringEntity entity = new StringEntity("num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1", encode); 80 81 // post請求是將參數放在請求體裏面傳過去的;這裏將entity放入post請求體中 82 httpPost.setEntity(entity); 83 84 httpPost.setHeader("Content-Type", "text/html;charset=utf8"); 85 86 // 響應模型 87 CloseableHttpResponse response = null; 88 try { 89 // 由客戶端執行(發送)Post請求 90 response = httpClient.execute(httpPost); 91 // 從響應模型中獲取響應實體 92 HttpEntity responseEntity = response.getEntity(); 93 94 System.out.println("響應狀態爲:" + response.getStatusLine()); 95 if (responseEntity != null) { 96 System.out.println("響應內容長度爲:" + responseEntity.getContentLength()); 97 //System.out.println("響應內容爲:" + EntityUtils.toString(responseEntity,"GBK")); 98 out=EntityUtils.toString(responseEntity,"GBK"); 99 } 100 } catch (ClientProtocolException e) { 101 e.printStackTrace(); 102 } catch (ParseException e) { 103 e.printStackTrace(); 104 } catch (IOException e) { 105 e.printStackTrace(); 106 } finally { 107 try { 108 // 釋放資源 109 if (httpClient != null) { 110 httpClient.close(); 111 } 112 if (response != null) { 113 response.close(); 114 } 115 } catch (IOException e) { 116 e.printStackTrace(); 117 } 118 } 119 120 121 122 return out; 123 124 } 125 126 127 //---想直接跳過字符串處理,暫時沒成功---// 128 public void DealUrlString(String inStr,org.springframework.jdbc.core.JdbcTemplate db) { 129 130 String out=new String(); 131 132 Document doc = Jsoup.parseBodyFragment(inStr); 133 134 Element et = doc.getElementById("divListTemplate"); 135 Elements et_tab = et.getElementsByTag("table"); 136 137 Elements trs = et_tab.first().getElementsByTag("tr"); 138 139 140 System.out.println("====size===="+trs.size()); 141 142 143 int n=0; 144 145 146 147 for (Element element : trs) { 148 if(n==0) { 149 n++; 150 } 151 else { 152 StringBuffer insert_sql=new StringBuffer(); 153 154 insert_sql.append("insert into stock_bigdeal_analyse (cn_name,symbol,totalvol,totalvolpct,totalamt,totalamtpct,avgprice,kuvolume,kevolume,kdvolume,input_time) values ( "); 155 156 Elements ele_ths= element.getElementsByTag("th"); 157 insert_sql.append( "'" + ele_ths.get(0).text().trim()+"', "); 158 insert_sql.append( "'" + ele_ths.get(1).text().replaceAll(" ", "")+"', ");//--看不見的特殊符號, 159 160 161 Elements ele_tds= element.getElementsByTag("td"); 162 163 insert_sql.append( "'" + ele_tds.get(0).text().trim().replaceAll(" ", "").replaceAll(",", "")+"', "); 164 insert_sql.append( "'" + ele_tds.get(1).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',"); 165 insert_sql.append( "'" + ele_tds.get(2).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 166 insert_sql.append( "'" + ele_tds.get(3).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',"); 167 insert_sql.append( "'" + ele_tds.get(4).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 168 insert_sql.append( "'" + ele_tds.get(5).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 169 insert_sql.append( "'" + ele_tds.get(6).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 170 insert_sql.append( "'" + ele_tds.get(7).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 171 172 insert_sql.append( "'" +DateFormatUtils.format(new Date(),"yyyyMMddHHmmssSSS")+"') "); 173 174 // System.out.println(insert_sql); 175 176 db.execute(insert_sql.toString()); 177 178 } 179 } 180 181 182 183 } 184 185 186 187 public static void main(String args[]) { 188 189 Crawler cr= new Crawler(); 190 191 SpringDb sd = new SpringDb(); 192 org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc(); 193 194 String data_in = cr.getUrlData(); 195 cr.DealUrlString(data_in,db); 196 197 198 } 199 200 201 @Override 202 public void execute(JobExecutionContext arg0) throws JobExecutionException { 203 204 Crawler cr= new Crawler(); 205 206 SpringDb sd = new SpringDb(); 207 org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc(); 208 209 String data_in = cr.getUrlData(); 210 cr.DealUrlString(data_in,db); 211 212 } 213 214 215 216 217 218 }