
A Detailed Look at a Basic Crawler Architecture


This is an original article from 痛点就是起点; feel free to repost it, but please credit the source.

A basic crawler framework consists of five main modules: a URL manager, an HTML downloader, an HTML parser, a data store, and a crawler scheduler. Their relationship is shown in the figure below.

The URL manager manages URLs: it maintains the set of already-crawled URLs and the set of URLs yet to be crawled, and exposes an interface to the other modules.

The HTML downloader takes un-crawled URLs from the URL manager and downloads the corresponding HTML pages.

The HTML parser parses the pages downloaded by the HTML downloader, hands the extracted data to the data store, and hands any newly discovered URLs to the URL manager.

The data store saves the data extracted by the HTML parser to a file or a database.

The crawler scheduler coordinates the work of the four modules above.

As an example, let's crawl 100 Baidu Baike entries and collect each entry's title, summary, and link.

URL Manager

The URL manager maintains two variables, the set of crawled URLs and the set of un-crawled URLs, and exposes four kinds of methods for accessing them: checking whether there are URLs left to crawl, fetching an un-crawled URL, adding new URLs to the un-crawled set, and reporting the sizes of the two sets.

The URL manager also has to deduplicate the URLs it crawls. There are three common deduplication approaches: in-memory deduplication, relational-database deduplication, and cache-database deduplication.
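
The URLManager implemented below uses the simplest of these, in-memory deduplication with two Python sets. For larger or long-running crawls, a cache database such as Redis is a common choice, since the set of seen URLs survives restarts and can be shared by several crawler processes. The following is only a minimal sketch of that idea, assuming a local Redis instance and the redis-py package; the class name RedisURLDeduper and the key name seen_urls are made up for illustration and are not part of this article's crawler.

# A minimal sketch of cache-database deduplication, not part of the crawler below.
import redis


class RedisURLDeduper(object):
    def __init__(self, host="localhost", port=6379):
        # Assumes a Redis server reachable at host:port.
        self.conn = redis.Redis(host=host, port=port, db=0)

    def is_seen(self, url):
        # SISMEMBER checks whether the URL is already in the "seen_urls" set.
        return self.conn.sismember("seen_urls", url)

    def mark_seen(self, url):
        # SADD returns 1 if the URL was newly added, 0 if it was already present.
        return self.conn.sadd("seen_urls", url) == 1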

Two variables

self.new_urls = set()
self.old_urls = set()

Six methods

have_new_url(self)
get_new_url(self)
add_new_url(self, url)
add_new_urls(self, urls)
new_url_size(self)
old_url_size(self)

# coding: utf-8


class URLManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs that have already been crawled

    def have_new_url(self):
        """
        Check whether there are URLs left to crawl.
        :return: True if the un-crawled set is not empty
        """
        return self.new_url_size() != 0

    def get_new_url(self):
        """
        Fetch an un-crawled URL and move it to the crawled set.
        :return: an un-crawled URL, or None if none is left
        """
        if not self.new_urls:
            return None
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        """
        Add a single URL to the un-crawled set, skipping duplicates.
        :param url: URL to add
        """
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """
        Add a collection of URLs to the un-crawled set.
        :param urls: iterable of URLs to add
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        """
        :return: size of the un-crawled URL set
        """
        return len(self.new_urls)

    def old_url_size(self):
        """
        :return: size of the crawled URL set
        """
        return len(self.old_urls)

HTML Downloader

# coding: utf-8

import requests


class HtmlDownloader(object):
    def download(self, url):
        """
        Download the HTML page at the given URL.
        :param url: page URL
        :return: page source as text, or None on failure
        """
        if url is None:
            return None
        # Send a desktop browser User-Agent so the request is less likely to be rejected.
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " \
                     "Chrome/71.0.3578.98 Safari/537.36"
        headers = {"User-Agent": user_agent}
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            req.encoding = "utf-8"
            return req.text
        return None
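
As a quick sanity check, the downloader can be run on its own before wiring it into the scheduler. The snippet below is just a usage sketch; the entry URL is the same one the scheduler uses later, and in practice you may also want to pass a timeout to requests.get so a stalled connection does not block the crawl.

if __name__ == "__main__":
    downloader = HtmlDownloader()
    html = downloader.download("https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB")
    if html is not None:
        # Print the page size as a rough indication that the download worked.
        print("downloaded %d characters" % len(html))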


HTML Parser

# coding: utf-8

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parser(self, page_url, html_content):
        """
        Parse a downloaded page.
        :param page_url: URL of the page
        :param html_content: HTML source of the page
        :return: (new URLs found on the page, data extracted from the page)
        """
        if page_url is None or html_content is None:
            return None
        soup = BeautifulSoup(html_content, "lxml")
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        """
        Extract links to other entries from the page.
        :param page_url: URL of the page, used to resolve relative links
        :param soup: parsed page
        :return: set of absolute entry URLs
        """
        new_urls = set()
        links = soup.find_all("a", href=re.compile(r'/item/(%\w+)+/\d+'))
        for link in links:
            new_url = link["href"]
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def get_new_data(self, page_url, soup):
        """
        Extract the entry's URL, title, and summary.
        :param page_url: URL of the page
        :param soup: parsed page
        :return: dict with keys "url", "title", and "summary"
        """
        data = {}
        data["url"] = page_url

        title = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        data["title"] = title.get_text()

        summary = soup.find("div", class_="lemma-summary")
        data["summary"] = summary.get_text()

        return data

Data Store

# coding: utf-8

import csv


class DataOutput(object):
    def __init__(self):
        self.datas = []  # buffered rows, written out in one pass at the end

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # Write the buffered rows to a CSV file.
        headers = ["url", "title", "summary"]
        with open("baike.csv", "w", encoding="utf-8", newline="") as fp:
            fp_csv = csv.DictWriter(fp, headers)
            fp_csv.writeheader()
            fp_csv.writerows(self.datas)
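
DataOutput buffers every row in memory and writes the CSV in one pass at the end, which is fine for 100 entries. For longer crawls, an append-as-you-go variant avoids losing everything if the crawl is interrupted. The class below is only a sketch of that alternative; the name StreamingDataOutput is made up, and it keeps the same method names and the same baike.csv filename so it could stand in for DataOutput in the scheduler.

# coding: utf-8

import csv


class StreamingDataOutput(object):
    def __init__(self, path="baike.csv"):
        # Open the CSV up front and write the header once.
        self.fp = open(path, "w", encoding="utf-8", newline="")
        self.writer = csv.DictWriter(self.fp, ["url", "title", "summary"])
        self.writer.writeheader()

    def store_data(self, data):
        # Append each row as soon as it is parsed instead of buffering it.
        if data is None:
            return
        self.writer.writerow(data)
        self.fp.flush()

    def output_html(self):
        # Kept for interface compatibility with the scheduler; just closes the file.
        self.fp.close()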

Crawler Scheduler

# coding: utf-8

from URLManager import URLManager
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
from DataOutput import DataOutput


class Spider(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # Stop once 100 entries have been crawled or no un-crawled URLs remain.
        while self.manager.have_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed.")
                print(e)
        self.output.output_html()


if __name__ == "__main__":
    spider = Spider()
    spider.crawl("https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB")


You're welcome to follow this blog; let's keep iterating on our understanding together!

