What I put together after using Scrapy to crawl every joke on Qiushibaike, page by page...
Target page: https://www.qiushibaike.com/text/
Create the project and generate the spider skeleton:
scrapy startproject qiushibaike
scrapy genspider spider_bk www.qiushibaike.com/text/
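These two commands produce the standard Scrapy skeleton; the layout below is the usual startproject output and is shown only for orientation:

qiushibaike/
├── scrapy.cfg
└── qiushibaike/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── spider_bk.py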
# qiushibaike/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class QiushibaikeItem(scrapy.Item):
    # define the fields for your item here
    author = scrapy.Field()   # author name
    level = scrapy.Field()    # author level
    context = scrapy.Field()  # joke text
    star = scrapy.Field()     # number of upvotes
    comment = scrapy.Field()  # number of comments
# qiushibaike/spiders/spider_bk.py
import scrapy

from qiushibaike.items import QiushibaikeItem


class SpiderBkSpider(scrapy.Spider):
    name = 'spider_bk'
    # allowed_domains takes bare domains, not URLs with paths
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # every joke on the page is a div under the main column
        divs = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in divs:
            # build a fresh item for each joke
            item = QiushibaikeItem()
            item['author'] = div.xpath('./div[@class="author clearfix"]/a/h2/text()').get().strip()  # author
            item['level'] = div.xpath('./div[@class="author clearfix"]/div/text()').get()  # author level
            # the joke body can be split across several text nodes
            content = div.xpath(".//div[@class='content']//text()").getall()
            item['context'] = " ".join(content).strip()  # joke text
            item['star'] = div.xpath('./div/span/i/text()').get()  # number of upvotes
            item['comment'] = div.xpath('./div/span/a/i/text()').get()  # number of comments
            yield item
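The XPath expressions above are tied to the page markup at the time of writing, so it is worth checking them interactively before running the spider. A quick session in scrapy shell (the selectors are copied from the spider; if the site changes its class names they will need adjusting):

scrapy shell https://www.qiushibaike.com/text/
>>> divs = response.xpath("//div[@class='col1 old-style-col1']/div")
>>> len(divs)  # should equal the number of jokes on the page
>>> divs[0].xpath('./div[@class="author clearfix"]/a/h2/text()').get().strip()
>>> " ".join(divs[0].xpath(".//div[@class='content']//text()").getall()).strip()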
# qiushibaike/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json


class QiushibaikePipeline:
    def process_item(self, item, spider):
        # quick sanity check while the crawl is running
        print(item['author'])
        print(item['level'])
        print(item['context'])
        print(item['star'])
        print(item['comment'])
        # append each item to a local JSON-lines file
        with open('./糗事百科.json', 'a+', encoding='utf-8') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
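Reopening the file for every item works, but Scrapy pipelines also expose open_spider and close_spider hooks, so the file can be opened once for the whole crawl. A minimal sketch of that variant (QiushibaikeJsonLinesPipeline is a made-up name, not the class used above):

import json


class QiushibaikeJsonLinesPipeline:
    def open_spider(self, spider):
        # open the output file once when the crawl starts
        self.file = open('./糗事百科.json', 'a', encoding='utf-8')

    def close_spider(self, spider):
        # close it when the crawl finishes
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

If you switch to it, remember to register this class in ITEM_PIPELINES instead of QiushibaikePipeline.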
A small run script at the project root (main.py here, the name is up to you) starts the crawl and writes the log to a file:

from scrapy import cmdline

cmdline.execute('scrapy crawl spider_bk -s LOG_FILE=all.log'.split())
# qiushibaike/settings.py (only the lines that were changed)
from fake_useragent import UserAgent

BOT_NAME = 'qiushibaike'

SPIDER_MODULES = ['qiushibaike.spiders']
NEWSPIDER_MODULE = 'qiushibaike.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': str(UserAgent().random),
}

ITEM_PIPELINES = {
    'qiushibaike.pipelines.QiushibaikePipeline': 300,
}
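If the site starts throttling or banning requests, a few standard Scrapy settings can be added to the same file to slow the crawl down; the values below are illustrative, not tuned for this site:

# optional politeness settings (illustrative values)
DOWNLOAD_DELAY = 1                    # wait one second between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # limit parallelism per domain
RETRY_TIMES = 3                       # retry failed pages a few times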
So far only the first page is crawled. To fetch all of them, add a start_requests method to SpiderBkSpider; once it is defined, Scrapy ignores start_urls and requests each of the 13 list pages directly:

    def start_requests(self):
        # build the URL of every list page
        for page in range(1, 13 + 1):
            url = 'https://www.qiushibaike.com/text/page/{}/'.format(page)
            yield scrapy.Request(url, callback=self.parse)
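Hard-coding 13 pages matches the site at the time of writing, but the page count can change. An alternative is to keep start_urls and let parse follow the "next page" link with response.follow; the pager XPath below is a guess and would need to be verified against the live page:

    def parse(self, response):
        # ... yield the items exactly as above, then follow the pager ...
        # NOTE: the selector for the 'next' link is an assumption, check it in scrapy shell
        next_page = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)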