C# Multithreading 06 - Using Concurrent Collections 04 - Creating a Scalable Crawler with ConcurrentBag

🏷️ C# Multithreading

Creating a Scalable Crawler with ConcurrentBag

This recipe simulates a scalable web crawler: a shared ConcurrentBag<CrawlingTask> holds the URLs waiting to be visited, and each of four crawler tasks acts as both consumer and producer, taking a task from the bag and adding a new task for every link it discovers on that page.

Sample Code

csharp
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Threading.Tasks;

// Simulated page contents: maps each URL to the links found on that page
static Dictionary<string, string[]> _contentEmulation = new Dictionary<string, string[]>();

/// <summary>
/// Creates a scalable crawler using ConcurrentBag
/// </summary>
/// <param name="args"></param>
static void Main(string[] args)
{
    CreateLinks();
    Task t = RunProgram();
    t.Wait();

    Console.ReadLine();
}

/// <summary>
/// Creates the simulated link data
/// </summary>
private static void CreateLinks()
{
    _contentEmulation["http://liujiajia.me"] = new[] { "http://liujiajia.me/#/blog/it", "http://liujiajia.me/#/blog/game" };

    _contentEmulation["http://liujiajia.me/#/blog/it"] = new[] {
        "http://liujiajia.me/#/blog/details/csharp-multi-threading-06-concurrent-00-summary",
        "http://liujiajia.me/#/blog/details/cookie-http-only" };

    _contentEmulation["http://liujiajia.me/#/blog/game"] = new[] {
        "http://liujiajia.me/#/blog/details/wow-7-3-ptr",
        "http://liujiajia.me/#/blog/details/63b737b6-7663-43f6-acd2-dc6e020c14ba" };
}

static async Task RunProgram()
{
    var bag = new ConcurrentBag<CrawlingTask>();
    // Define four site root URLs; one crawler task is created per URL
    string[] urls =
    {
        "http://liujiajia.me",
        "http://weibo.com",
        "http://sf.gg",
        "http://ngacn.cc"
    };

    var crawlers = new Task[4];
    for (int i = 1; i <= 4; i++)
    {
        // Capture the crawler name in a per-iteration local so each lambda
        // gets its own value rather than the loop variable's final value
        string crawlerName = $"Crawler {i}";
        bag.Add(new CrawlingTask { UrlToCrawl = urls[i - 1], ProducerName = "root" });
        crawlers[i - 1] = Task.Run(() => Crawl(bag, crawlerName));
    }

    await Task.WhenAll(crawlers);
}

/// <summary>
/// Simulates a crawler worker: takes tasks from the bag and adds new tasks for discovered links
/// </summary>
/// <param name="bag"></param>
/// <param name="crawlerName"></param>
/// <returns></returns>
static async Task Crawl(ConcurrentBag<CrawlingTask> bag, string crawlerName)
{
    CrawlingTask task;
    // Note: a crawler exits as soon as the bag is momentarily empty, even if
    // another crawler is about to add more tasks (see the note after the listing)
    while (bag.TryTake(out task))
    {
        // If the page contains URLs, add them to the bag as new crawling tasks
        IEnumerable<string> urls = await GetLinksFromContent(task);
        if (urls != null)
        {
            foreach (var url in urls)
            {
                var t = new CrawlingTask
                {
                    UrlToCrawl = url,
                    ProducerName = crawlerName
                };

                bag.Add(t);
            }

            Console.WriteLine($"Indexing url {task.UrlToCrawl} posted by {task.ProducerName} is completed by {crawlerName}");
        }
    }
}

/// <summary>
/// Gets the URLs found on the page (from the emulated content)
/// </summary>
/// <param name="task"></param>
/// <returns></returns>
static async Task<IEnumerable<string>> GetLinksFromContent(CrawlingTask task)
{
    await GetRandomDelay();

    // Return the emulated links for this URL, or null if the page has none
    return _contentEmulation.TryGetValue(task.UrlToCrawl, out var links) ? links : null;
}

static Task GetRandomDelay()
{
    // Simulate network latency. Note: seeding Random from the current millisecond
    // means near-simultaneous calls may produce identical delays.
    int delay = new Random(DateTime.Now.Millisecond).Next(150, 200);
    return Task.Delay(delay);
}

private class CrawlingTask
{
    public string UrlToCrawl { get; set; }
    public string ProducerName { get; set; }
}
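
One caveat with this recipe: a crawler exits as soon as TryTake fails, so a worker that catches the bag momentarily empty stops for good, leaving fewer crawlers to handle the links that the remaining workers add afterwards. Below is a minimal sketch of one way to keep all workers alive until the work genuinely drains, assuming a shared in-flight counter; _pendingTasks, AddTask, and CrawlUntilDrained are illustrative names, not part of the original sample, and the sketch also needs using System.Threading.

csharp
// Illustrative sketch, not part of the original sample (requires using System.Threading)
static int _pendingTasks; // tasks added but not yet fully processed

static void AddTask(ConcurrentBag<CrawlingTask> bag, CrawlingTask task)
{
    Interlocked.Increment(ref _pendingTasks);
    bag.Add(task);
}

static async Task CrawlUntilDrained(ConcurrentBag<CrawlingTask> bag, string crawlerName)
{
    // Keep polling while any task is still pending anywhere,
    // not just while this crawler's last TryTake succeeded
    while (Volatile.Read(ref _pendingTasks) > 0)
    {
        if (!bag.TryTake(out CrawlingTask task))
        {
            await Task.Delay(10); // bag momentarily empty; others may still add work
            continue;
        }

        IEnumerable<string> urls = await GetLinksFromContent(task);
        if (urls != null)
        {
            foreach (var url in urls)
            {
                AddTask(bag, new CrawlingTask { UrlToCrawl = url, ProducerName = crawlerName });
            }
        }

        // Decrement only after the task's links have been re-added, so the
        // counter reaches zero only once no URLs remain anywhere
        Interlocked.Decrement(ref _pendingTasks);
    }
}

RunProgram would then seed the four root URLs through AddTask instead of bag.Add, so they are counted before the crawlers start.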

Output

txt
Indexing url http://liujiajia.me posted by root is completed by Crawler 2
Indexing url http://liujiajia.me/#/blog/game posted by Crawler 2 is completed by Crawler 1
Indexing url http://liujiajia.me/#/blog/it posted by Crawler 2 is completed by Crawler 3
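
Which crawler indexes which URL varies from run to run. ConcurrentBag fits this recipe because it is optimized for scenarios where the same thread both produces and consumes items: each thread keeps a local list, Add goes to the calling thread's list, and TryTake serves the local list first, stealing from other threads only when it is empty. The standalone demo below (a hypothetical BagOrderDemo class, not part of the original sample) makes that locality visible: each worker usually takes back the items it added itself.

csharp
using System;
using System.Collections.Concurrent;
using System.Threading.Tasks;

class BagOrderDemo
{
    static void Main()
    {
        var bag = new ConcurrentBag<int>();

        // Two workers share one bag; each adds and then drains
        Task.WaitAll(
            Task.Run(() => Work(bag, "A")),
            Task.Run(() => Work(bag, "B")));
    }

    static void Work(ConcurrentBag<int> bag, string name)
    {
        for (int i = 0; i < 3; i++) bag.Add(i);

        // TryTake prefers this thread's local list, so each worker
        // typically prints the items it added itself (order is unspecified)
        while (bag.TryTake(out int item))
        {
            Console.WriteLine($"{name} took {item}");
        }
    }
}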