思路,根據請求返回的響應頭的Content-Type類型中的charset編碼類型去編碼抓取的內容,達到解決亂碼的目的
public static string GetHtml(string url)
{
string htmlCode;
HttpWebRequest webRequest = (System.NET.HttpWebRequest)System.net.WebRequest.Create(url);
webRequest.Timeout = 30000;
webRequest.Method = "GET";
webRequest.UserAgent = "Mozilla/4.0";
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse();
//獲取目標網站的編碼格式
string contentype = webResponse.Headers["Content-Type"];
Regex regex = new Regex("charset\\s*=\\s*[\\W]?\\s*([\\w-]+)", RegexOptions.IgnoreCase);
if (webResponse.ContentEncoding.ToLower() == "gzip")//如果使用了GZip則先解壓
{
using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
{
using (var zipStream = new System.IO.Compression.GZipStream(streamReceive, System.IO.Compression.CompressionMode.Decompress))
{
//匹配編碼格式
if (regex.IsMatch(contentype))
{
Encoding ending = Encoding.GetEncoding(regex.Match(contentype).Groups[1].Value.Trim());
using (StreamReader sr = new System.IO.StreamReader(zipStream, ending))
{
htmlCode = sr.ReadToEnd();
}
}
else
{
using (StreamReader sr = new System.IO.StreamReader(zipStream, Encoding.UTF8))
{
htmlCode = sr.ReadToEnd();
}
}
}
}
}
else
{
using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
{
using (System.IO.StreamReader sr = new System.IO.StreamReader(streamReceive, Encoding.Default))
{
htmlCode = sr.ReadToEnd();
}
}
}
return htmlCode;
}
原帖地址:http://blog.csdn.net/zhuyu19911016520/article/details/46647001C#獲取網頁內容,解決大部分亂碼問題
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.