C#簡單爬蟲實現

一、環境

.NET 6.0

Visual Studio 2022 控制檯應用程序

NuGet 引入:

AngleSharp 1.1.0 用於HTML解析

Downloader 3.0.6 用於下載文件

 ShellProgressBar 5.2.0 用於進度條顯示

二、效果

 

 

三、相關代碼

1.Program.cs

using ShellProgressBar;
using Spider;
using System.Collections;

// Entry point: fetch one article page, collect the images inside its content
// container, and download them one by one while rendering nested progress bars.
var url = "https://blog.csdn.net/u011127019/article/details/124248757";
var pageUri = new Uri(url);
var data = await HttpHelper.GetHtmlDocument(url);
var downloadHandler = new DownloadHandler();

// Gather the image addresses. The selector already yields the <img> elements
// themselves, so the src attribute is read directly from each match.
var imageUrls = new List<string>();
foreach (var img in data.QuerySelectorAll("#article_content img"))
{
    var src = img.GetAttribute("src");
    if (string.IsNullOrWhiteSpace(src))
    {
        continue;
    }

    // Resolve relative and protocol-relative addresses against the page URL and
    // keep only addresses that can actually be fetched over HTTP(S).
    if (Uri.TryCreate(pageUri, src, out var absolute)
        && (absolute.Scheme == Uri.UriSchemeHttp || absolute.Scheme == Uri.UriSchemeHttps))
    {
        imageUrls.Add(absolute.AbsoluteUri);
    }
}

// Album list to download; this sample only ever builds a single album.
var list = new List<ImageList>
{
    new ImageList
    {
        Name = "圖片目錄",
        Images = imageUrls,
        ImageCount = imageUrls.Count
    }
};

ProgressBarOptions barOptions = new()
{
    ProgressCharacter = '─',
    ProgressBarOnBottom = true,
    ForegroundColor = ConsoleColor.Yellow,
    ForegroundColorDone = ConsoleColor.DarkGreen,
    BackgroundColor = ConsoleColor.DarkGray,
    BackgroundCharacter = '\u2593'
};

ProgressBarOptions childBarOptions = new()
{
    ForegroundColor = ConsoleColor.Green,
    BackgroundColor = ConsoleColor.DarkGreen,
    ProgressCharacter = '─'
};

// Build the target folder with Path.Combine so the path is valid on every OS,
// and make sure it exists before the first file is written.
var imageDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Images");
Directory.CreateDirectory(imageDirectory);

using var bar = new ProgressBar(list.Count, "正在下載所有圖片", barOptions);

// The file counter lives outside the album loop so that a second album cannot
// overwrite the files written for the first one.
var fileIndex = 1;
foreach (var album in list)
{
    bar.Message = $"圖集:{album.Name}";
    var images = album.Images ?? new List<string>();

    if (images.Count > 0)
    {
        // One child bar per album; it advances by one tick per finished image.
        using (var childBar = bar.Spawn(images.Count, $"圖集:{album.Name}", childBarOptions))
        {
            foreach (var imgUrl in images)
            {
                childBar.Message = $"圖片:{imgUrl}";
                var target = Path.Combine(imageDirectory, fileIndex + GetImageExtension(imgUrl));

                await downloadHandler.Download(childBar, imgUrl, target);

                // Tick only after the download returned, so the bar reflects completed work.
                childBar.Tick();
                fileIndex++;
            }
        }
    }

    bar.Tick();
}

// Picks the file extension for an image URL. The extension of the URL path is
// preferred (query string ignored); when the path has none, the original
// substring heuristic for ".png" / ".jpg" is used; otherwise no extension.
static string GetImageExtension(string imageUrl)
{
    var path = Uri.TryCreate(imageUrl, UriKind.Absolute, out var uri) ? uri.AbsolutePath : imageUrl;
    var extension = Path.GetExtension(path);
    if (!string.IsNullOrEmpty(extension))
    {
        return extension.ToLowerInvariant();
    }

    if (imageUrl.Contains(".jpg"))
    {
        return ".jpg";
    }

    return imageUrl.Contains(".png") ? ".png" : string.Empty;
}

  2.HttpHelper.cs

using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using Downloader;
using System.Net;
using System.Text;

namespace Spider
{

    /// <summary>
    /// Shared HTTP plumbing for the spider: one <see cref="HttpClient"/> for fetching
    /// pages, one download service for fetching files, and helpers that parse a
    /// fetched page into an AngleSharp document.
    /// </summary>
    public static class HttpHelper
    {
        /// <summary>User-Agent value sent with page requests and with file downloads.</summary>
        public const string UserAgent =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

        /// <summary>Download service created once in the static constructor from <see cref="DownloadConf"/>.</summary>
        public static IDownloadService Downloader { get; }

        /// <summary>
        /// Download configuration. Note that every read of this property builds a new
        /// configuration object; the static constructor reads it exactly once.
        /// </summary>
        public static DownloadConfiguration DownloadConf => new()
        {
            BufferBlockSize = 10240, // usually hosts support at most 8000 bytes; the default is 8000
            ChunkCount = 8, // number of chunks the file is split into; the default is 1
                            // MaximumBytesPerSecond = 1024 * 50, // download speed limit; the default is zero (unlimited)
            MaxTryAgainOnFailover = 5, // maximum number of retries on failure
            ParallelDownload = true, // whether chunks are downloaded in parallel; the default is false
            Timeout = 1000, // timeout of each stream reader in milliseconds; the default is 1000
            RequestConfiguration = {
                Accept = "*/*",
                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                CookieContainer = new CookieContainer(), // Add your cookies
                Headers = new WebHeaderCollection(), // Add your custom headers
                KeepAlive = true,
                ProtocolVersion = HttpVersion.Version11, // Default value is HTTP 1.1
                UseDefaultCredentials = false,
                UserAgent = UserAgent
            }
        };

        /// <summary>Message handler backing <see cref="Client"/>.</summary>
        public static HttpClientHandler Handler { get; }

        /// <summary>Single shared client used for all page requests.</summary>
        public static HttpClient Client { get; }

        static HttpHelper()
        {
            // Legacy code pages (for example GBK / GB2312) are not available on
            // .NET Core / .NET 5+ until this provider is registered; without it
            // Encoding.GetEncoding(charset) throws for such charsets.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            Handler = new HttpClientHandler();
            Client = new HttpClient(Handler);
            Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
            Downloader = new DownloadService(DownloadConf);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> as a string and parses it as HTML.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>The parsed HTML document.</returns>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url)
        {
            var html = await Client.GetStringAsync(url);
            return new HtmlParser().ParseDocument(html);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> as raw bytes, decodes them with the named
        /// <paramref name="charset"/> and parses the result as HTML. Use this overload
        /// when the page is served in an encoding that must be chosen explicitly.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="charset">Encoding name understood by <see cref="Encoding.GetEncoding(string)"/>, e.g. "gbk".</param>
        /// <returns>The parsed HTML document.</returns>
        /// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
        /// <exception cref="ArgumentException"><paramref name="charset"/> is not a supported encoding name.</exception>
        public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset)
        {
            // Dispose the response so its connection and buffers are released promptly.
            using var res = await Client.GetAsync(url);

            // Fail on error responses instead of parsing an error page, matching the
            // behavior of the GetStringAsync-based overload.
            res.EnsureSuccessStatusCode();

            var resBytes = await res.Content.ReadAsByteArrayAsync();
            var resStr = Encoding.GetEncoding(charset).GetString(resBytes);
            return new HtmlParser().ParseDocument(resStr);
        }

    }
}

  3.DownloadHandler.cs

using Downloader;
using ShellProgressBar;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;

namespace Spider
{
    /// <summary>
    /// Downloads a single file through the shared <c>HttpHelper.Downloader</c> while
    /// showing its progress as a child of an existing progress bar.
    /// </summary>
    public class DownloadHandler
    {

        /// <summary>
        /// Downloads <paramref name="url"/> to <paramref name="filepath"/> and reports the
        /// progress on a bar spawned under <paramref name="bar"/>.
        /// </summary>
        /// <param name="bar">Parent progress bar under which the per-file bar is spawned.</param>
        /// <param name="url">Address of the file to download.</param>
        /// <param name="filepath">Full path of the file to write.</param>
        public async Task Download(IProgressBar bar, string url, string filepath)
        {
            var barOptions = new ProgressBarOptions
            {
                ForegroundColor = ConsoleColor.Yellow,
                BackgroundColor = ConsoleColor.DarkYellow,
                ForegroundColorError = ConsoleColor.Red,
                ForegroundColorDone = ConsoleColor.Green,
                BackgroundCharacter = '\u2593',
                ProgressBarOnBottom = true,
                EnableTaskBarProgress = RuntimeInformation.IsOSPlatform(OSPlatform.Windows),
                DisplayTimeInRealTime = false,
                ShowEstimatedDuration = false
            };

            // The using declaration disposes the per-file bar exactly once when this
            // method exits, whether or not the completed event was raised.
            using var percentageBar = bar.Spawn(100, $"正在下載:{Path.GetFileName(url)}", barOptions);
            var progress = percentageBar.AsProgress<double>();

            var downloader = HttpHelper.Downloader;
            downloader.DownloadStarted += DownloadStarted;
            downloader.DownloadFileCompleted += DownloadFileCompleted;
            downloader.DownloadProgressChanged += DownloadProgressChanged;

            try
            {
                await downloader.DownloadFileTaskAsync(url, filepath);
            }
            finally
            {
                // The downloader is a shared static instance. Without removing the
                // handlers they pile up across calls, so every later download would
                // also invoke the handlers (and disposed bars) of all earlier ones.
                downloader.DownloadStarted -= DownloadStarted;
                downloader.DownloadFileCompleted -= DownloadFileCompleted;
                downloader.DownloadProgressChanged -= DownloadProgressChanged;
            }

            void DownloadStarted(object? sender, DownloadStartedEventArgs e)
            {
                Trace.WriteLine(
                    $"圖片, FileName:{Path.GetFileName(e.FileName)}, TotalBytesToReceive:{e.TotalBytesToReceive}");
            }

            void DownloadFileCompleted(object? sender, AsyncCompletedEventArgs e)
            {
                Trace.WriteLine($"下載完成, filepath:{filepath}");
            }

            void DownloadProgressChanged(object? sender, DownloadProgressChangedEventArgs e)
            {
                // NOTE(review): ShellProgressBar's AsProgress<double>() may interpret the
                // reported value as a 0-1 fraction of MaxTicks, while ProgressPercentage
                // looks like a 0-100 value - confirm and divide by 100 if so.
                progress.Report(e.ProgressPercentage);
            }
        }
    }
}

  4.Images.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Spider
{
    /// <summary>
    /// A named album: a display name plus the addresses of the images it contains.
    /// </summary>
    public class ImageList
    {
        /// <summary>Display name of the album; defaults to an empty string.</summary>
        public string Name { get; set; } = string.Empty;

        /// <summary>
        /// Number of images in the album. This value is maintained by the caller and
        /// is not derived from <see cref="Images"/>.
        /// </summary>
        public int ImageCount { get; set; }

        /// <summary>
        /// Image addresses of the album. Defaults to an empty list so that callers can
        /// add to or enumerate it without a null check.
        /// </summary>
        public List<string> Images { get; set; } = new();
    }
}

  四、源碼下載

鏈接:https://pan.baidu.com/s/1VnnH05Har9hUhxAsIfKSMw?pwd=paws
提取碼:paws

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章