欢迎光临!
若无相欠,怎会相见

爬虫学习之爬取伯乐在线文章的元素

介绍

学习Python爬虫学习了一段时间了,也知道一点原理了。下面想把自己学习的成果记录下来。虽然只是初级教程,希望对初学者有帮助。

我自己使用的元素定位方法是CSS选择器法,因为之前学习PHP改写主题的时候,了解一些前端知识,用CSS更方便。

成果

本成果只是爬取伯乐在线文章的标题,创建时间,赞,收藏,评论,文章分类及标签,目前就写到这里。代码如下:

# -*- coding: utf-8 -*-
import scrapy
import re

from scrapy.http import Request
# python 3.x
from urllib import parse

# import urlparse        python 2.x

class JobboleSpider(scrapy.Spider):
    """Spider that scrapes the metadata of a blog.jobbole.com article page:
    title, publication date, up-votes, bookmarks, comment count, and the
    category/tag list. Results are printed to stdout (tutorial code)."""

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/112109/']
    # start_urls = ['http://blog.jobbole.com/all-posts/']

    # Matches the first run of digits in strings such as " 2 收藏" / "3 评论".
    # Precompiled once; raw string avoids the invalid-escape warning that
    # the original non-raw "\d" patterns trigger on modern Python.
    _NUM_RE = re.compile(r".*?(\d+).*")

    """
    List-page variant of parse(), kept from the tutorial for reference:

    def parse(self, response):
        # 1. Extract every article URL on the list page and hand each one
        #    to parse_detail for field extraction.
        # 2. Extract the next-page URL and feed it back to scrapy.

        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # Follow the next list page.
        # NOTE(review): the original code mistakenly yielded post_url here;
        # it must be next_url or pagination never advances.
        next_url = response.css(".navigation .next.page-numbers::attr(href)").extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
    """

    @classmethod
    def _first_int(cls, text, default=0):
        """Return the first integer embedded in *text*, or *default* when
        *text* is None/empty or contains no digits."""
        match = cls._NUM_RE.match(text or "")
        return int(match.group(1)) if match else default

    def parse(self, response):
        """Extract and print the metadata fields of one article page.

        Uses extract_first() with defaults throughout so a missing node
        yields None/0 instead of the IndexError/AttributeError the original
        ``.extract()[0]`` / ``.extract_first().strip()`` calls could raise.
        """
        # --- CSS selectors -------------------------------------------------
        article_title = response.css(".entry-header h1::text").extract_first()

        article_time = response.css(
            ".entry-meta-hide-on-mobile::text").extract_first("").strip()

        # Links inside the meta line mix categories/tags with the comment
        # counter; entries ending in "评论" are the counter, so drop them.
        tag_list = response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        tag_list = [element for element in tag_list
                    if not element.strip().endswith("评论")]
        article_tags = ",".join(tag_list)

        # Numeric counters; all parsed through the same digit-extraction
        # helper instead of three hand-rolled copies of the regex logic.
        raise_num = self._first_int(
            response.css(".post-adds h10::text").extract_first())
        fav_num = self._first_int(
            response.css(".bookmark-btn ::text").extract_first())
        comment_num = self._first_int(
            response.css("a[href='#article-comment'] span::text").extract_first())

        print("文章标题:", article_title)
        print("文章日期:", article_time)
        print("赞:", raise_num)
        print("收藏:", fav_num)
        print("评论:", comment_num)
        print("分类及标签:", article_tags)

后面会继续更新的。

jobboleArticleSpider V0.1.py

赞(0) 打赏
转载请注明:飘零博客 » 爬虫学习之爬取伯乐在线文章的元素
分享到: 更多 (0)

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

欢迎光临