爬一個網頁,輸出全部 <table>(C# Console)
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace LoginkCreditCenter.WebSpiderConsole
{
/// <summary>
/// Console tool that downloads a web page and prints every
/// <c>&lt;table ...&gt;...&lt;/table&gt;</c> fragment found in its HTML.
/// </summary>
class Program
{
    // Opening tag is matched without the closing '>' so that tables with attributes are found.
    private const string TableOpenTag = "<table";
    private const string TableCloseTag = "</table>";

    /// <summary>
    /// Prompts for a URL, downloads the page and writes each table fragment to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("\r\n-------Please Enter URl--------\r\n");
        string url = Console.ReadLine();
        string strResponse = GetPageData(url, "");

        // Scan with a moving cursor and stop as soon as either tag is missing.
        // The page may contain no table at all (or be an error message), so the
        // indexes must be checked before calling Substring.
        // Tags are matched case-insensitively because HTML tag names are.
        // NOTE: nested tables are not balanced; each fragment ends at the first
        // closing tag that follows its opening tag.
        int cursor = 0;
        while (cursor < strResponse.Length)
        {
            int start = strResponse.IndexOf(TableOpenTag, cursor, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                break;
            }

            // Search for the closing tag only after the opening tag, so a stray
            // "</table>" earlier in the text can never produce a negative length.
            int close = strResponse.IndexOf(TableCloseTag, start, StringComparison.OrdinalIgnoreCase);
            if (close < 0)
            {
                break;
            }

            int end = close + TableCloseTag.Length;
            Console.WriteLine("\r\n-------The IS A Table--------\r\n");
            Console.WriteLine(strResponse.Substring(start, end - start));
            cursor = end;
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the response body to text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">
    /// Encoding name to decode with. When null or empty, the name is taken from the
    /// page's <c>&lt;meta ... charset=...&gt;</c> declaration, defaulting to "utf-8".
    /// </param>
    /// <returns>
    /// The decoded page text; an empty string when <paramref name="url"/> is null or blank;
    /// or the exception message when the download or decoding fails.
    /// </returns>
    private static string GetPageData(string url, string charSet)
    {
        try
        {
            // The original test used '||', which dereferenced a null url and let
            // an empty url through to the download call.
            if (string.IsNullOrWhiteSpace(url))
            {
                return string.Empty;
            }

            byte[] dataBuffer;
            using (WebClient wc = new WebClient())
            {
                // Send the current user's default credentials with the request.
                wc.Credentials = CredentialCache.DefaultCredentials;
                dataBuffer = wc.DownloadData(url);
            }

            // First pass with the platform default encoding, used to read the meta tag.
            string strWebData = Encoding.Default.GetString(dataBuffer);

            if (string.IsNullOrEmpty(charSet))
            {
                // Capture only the charset token itself. This handles both
                //   <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
                //   <meta charset="utf-8">
                // without picking up quotes or trailing attributes.
                Match charSetMatch = Regex.Match(
                    strWebData,
                    "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:\\-]+)",
                    RegexOptions.IgnoreCase);
                charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "utf-8";
            }

            Encoding pageEncoding;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: fall back to UTF-8 rather
                // than discarding the whole page.
                pageEncoding = Encoding.UTF8;
            }

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(dataBuffer);
            }

            return strWebData;
        }
        catch (Exception ex)
        {
            // Existing contract: failures are reported to the caller as the message text.
            return ex.Message;
        }
    }
}
}
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace LoginkCreditCenter.WebSpiderConsole
{
/// <summary>
/// Console tool that downloads a web page and prints every
/// <c>&lt;table ...&gt;...&lt;/table&gt;</c> fragment found in its HTML.
/// </summary>
class Program
{
    // Opening tag is matched without the closing '>' so that tables with attributes are found.
    private const string TableOpenTag = "<table";
    private const string TableCloseTag = "</table>";

    /// <summary>
    /// Prompts for a URL, downloads the page and writes each table fragment to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("\r\n-------Please Enter URl--------\r\n");
        string url = Console.ReadLine();
        string strResponse = GetPageData(url, "");

        // Scan with a moving cursor and stop as soon as either tag is missing.
        // The page may contain no table at all (or be an error message), so the
        // indexes must be checked before calling Substring.
        // Tags are matched case-insensitively because HTML tag names are.
        // NOTE: nested tables are not balanced; each fragment ends at the first
        // closing tag that follows its opening tag.
        int cursor = 0;
        while (cursor < strResponse.Length)
        {
            int start = strResponse.IndexOf(TableOpenTag, cursor, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                break;
            }

            // Search for the closing tag only after the opening tag, so a stray
            // "</table>" earlier in the text can never produce a negative length.
            int close = strResponse.IndexOf(TableCloseTag, start, StringComparison.OrdinalIgnoreCase);
            if (close < 0)
            {
                break;
            }

            int end = close + TableCloseTag.Length;
            Console.WriteLine("\r\n-------The IS A Table--------\r\n");
            Console.WriteLine(strResponse.Substring(start, end - start));
            cursor = end;
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the response body to text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="charSet">
    /// Encoding name to decode with. When null or empty, the name is taken from the
    /// page's <c>&lt;meta ... charset=...&gt;</c> declaration, defaulting to "utf-8".
    /// </param>
    /// <returns>
    /// The decoded page text; an empty string when <paramref name="url"/> is null or blank;
    /// or the exception message when the download or decoding fails.
    /// </returns>
    private static string GetPageData(string url, string charSet)
    {
        try
        {
            // The original test used '||', which dereferenced a null url and let
            // an empty url through to the download call.
            if (string.IsNullOrWhiteSpace(url))
            {
                return string.Empty;
            }

            byte[] dataBuffer;
            using (WebClient wc = new WebClient())
            {
                // Send the current user's default credentials with the request.
                wc.Credentials = CredentialCache.DefaultCredentials;
                dataBuffer = wc.DownloadData(url);
            }

            // First pass with the platform default encoding, used to read the meta tag.
            string strWebData = Encoding.Default.GetString(dataBuffer);

            if (string.IsNullOrEmpty(charSet))
            {
                // Capture only the charset token itself. This handles both
                //   <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
                //   <meta charset="utf-8">
                // without picking up quotes or trailing attributes.
                Match charSetMatch = Regex.Match(
                    strWebData,
                    "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:\\-]+)",
                    RegexOptions.IgnoreCase);
                charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "utf-8";
            }

            Encoding pageEncoding;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: fall back to UTF-8 rather
                // than discarding the whole page.
                pageEncoding = Encoding.UTF8;
            }

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(dataBuffer);
            }

            return strWebData;
        }
        catch (Exception ex)
        {
            // Existing contract: failures are reported to the caller as the message text.
            return ex.Message;
        }
    }
}
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.