/// <summary>
/// Downloads a fixed web page and extracts selected cell values from the HTML tables it contains.
/// </summary>
/// <remarks>
/// The page is fetched with a blocking <see cref="WebRequest"/> and decoded as UTF-8.
/// Every table row is matched with a regular expression, then every <c>td</c> cell in the row is inspected:
/// cells containing "本期" are collected with their first 6 characters removed, and once the
/// route cell "秦皇島-廣州(6-7萬DWT)" is seen, that cell and all remaining cells of the same row are collected.
/// The collected values are currently kept only in a local list; persisting them is left to the caller to add.
/// </remarks>
public void GetDataFromNet()
{
    // Address of the page to scrape.
    string url = "http://www.sse.net.cn/index/singleIndex?indexType=cbcfi";

    // Number of leading characters dropped from cells that contain "本期".
    // NOTE(review): presumably the length of a label prefix on the page — confirm against the live markup.
    const int periodPrefixLength = 6;

    WebRequest request = WebRequest.Create(url);
    // For a POST request, uncomment the next line.
    //request.Method = "POST";

    // Read the whole response body up front; the using statements guarantee that the
    // response, stream and reader are released even if reading throws.
    string strHTML;
    using (WebResponse response = request.GetResponse())
    using (Stream dataStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(dataStream, Encoding.UTF8))
    {
        strHTML = reader.ReadToEnd();
    }

    // If the endpoint returns JSON instead of HTML, the text could be deserialized instead, e.g.:
    //dynamic Datas = JsonConvert.DeserializeObject(strHTML);

    var list = new List<string>();

    // Matches every <tr>...</tr> that is preceded by an opening <table> tag.
    Regex rowRegex = new Regex("(?is)(?<=<table[^>]*?[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    // Matches each <td> of a row and captures its inner text as "value".
    // Built once here because the pattern does not depend on the row being processed.
    // NOTE(review): unlike rowRegex this pattern has no Singleline option, so a cell whose
    // content spans several lines is not matched — confirm whether that is intended.
    Regex cellRegex = new Regex(@"<td.*?>(?<value>.*?)</td>");

    foreach (Match row in rowRegex.Matches(strHTML))
    {
        // row.Value is the HTML of one table row.
        // Becomes true once the target route cell is found; reset for every row.
        bool routeFound = false;

        foreach (Match cell in cellRegex.Matches(row.Value))
        {
            string val = cell.Groups["value"].Value;

            // Ordinal search: this is a fixed marker, not linguistic text.
            // The length guard avoids an ArgumentOutOfRangeException on unexpectedly short cells.
            if (val.IndexOf("本期", StringComparison.Ordinal) != -1 && val.Length >= periodPrefixLength)
            {
                list.Add(val.Substring(periodPrefixLength));
            }

            if (val == "秦皇島-廣州(6-7萬DWT)")
            {
                routeFound = true;
            }

            if (routeFound)
            {
                list.Add(val);
            }

            // Follow-up processing (e.g. saving to a database) goes here.
        }
    }
}