在用 .NET Core 寫爬蟲的時候,發現默認不再支持 GB2312 編碼了。
解決方案如下:
1,引入 System.Text.Encoding.CodePages 這個 NuGet 包;
2,在需要的地方調用 Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) 註冊 EncodingProvider;
3,調用 Encoding.GetEncoding("GB2312").GetString(pageSource);
/// <summary>
/// Downloads the page at <paramref name="allCityUrl"/>, decodes it as GB2312, extracts each
/// city name together with its <c>*.esf.fang.com</c> host, and writes one
/// <c>name:https://host</c> line per match to the file "房天下城市列表.txt".
/// </summary>
/// <param name="allCityUrl">URL of the page that lists all cities.</param>
public void CityCrawler(string allCityUrl)
{
    // GB2312 is not resolvable on .NET Core until the code-pages provider is registered.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    byte[] pageSource;
    // Disposing the client also disposes the handler it owns, releasing the connection.
    using (var client = new HttpClient(new HttpClientHandler { AutomaticDecompression = DecompressionMethods.GZip }))
    {
        // Blocking here keeps the method synchronous so existing callers are unaffected.
        pageSource = client.GetByteArrayAsync(allCityUrl).Result;
    }

    var result = Encoding.GetEncoding("GB2312").GetString(pageSource);

    // Group 1: city name (2-5 CJK characters). Group 2: host name.
    // The dots in the host are escaped so they match a literal '.' rather than any character.
    var cities = Regex.Matches(result,
        "([\u4e00-\u9fa5]{2,5})\", \"spell\": \"[A-Za-z]+\", \"url\": \"//([A-Za-z]{2,}\\.esf\\.fang\\.com)");

    var cityList = new List<string>(cities.Count);
    foreach (Match match in cities)
    {
        cityList.Add(match.Groups[1].Value + ":" + "https://" + match.Groups[2].Value);
    }

    File.WriteAllLines("房天下城市列表.txt", cityList);
}