Other
Comparing Different Crawling Approaches
There is more than one way to write a Python crawler: from the crawling framework, to parsing and extraction, to data storage, every stage can be handled by different techniques and libraries. No single approach is best in every case, since the requirements and application scenario of a project ultimately decide which one to use, but a side-by-side comparison still shows large differences.
01 Three crawling frameworks
1. Conventional crawler
Implement three functions: one that parses a page, one that stores the results, and one that chains the two together (a short sketch of the chained call follows the timing below). The main program then loops over the pages one by one.
import requests
from lxml import etree
import pymysql
import time

def get_info(url):
    # fetch and parse one listing page; full implementations are shown in the parsing section below
    infos = []
    return infos

def save_info(infos):
    # persist the parsed records; full implementations are shown in the storage section below
    pass

def getANDsave(url):
    # glue function: parse one page, then store the result
    pass

if __name__ == '__main__':
    urls = [f'https://hangzhou.anjuke.com/sale/p{page}/' for page in range(1, 51)]
    start = time.time()
    # conventional single-threaded crawl, one page after another
    for url in urls:
        getANDsave(url)
    tt = time.time() - start
    print("Total time:", tt, "seconds")
Elapsed time: 64.9 seconds.
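The function bodies above are left as placeholders; a minimal sketch of how getANDsave might tie parsing and storage together (the real get_info and save_info appear in the parsing and storage sections further down):

def getANDsave(url):
    # parse one page and store whatever was successfully extracted
    infos = get_info(url)
    if infos:
        save_info(infos)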
2. Scrapy framework
Scrapy is a widely used crawling framework and is very convenient: you only implement the core extraction and storage logic, without having to manage the internal flow of requests and items yourself, and the framework comes with built-in concurrency and exception handling.
import scrapy

class anjukeSpider(scrapy.Spider):
    name = 'anjuke'
    allowed_domains = ['anjuke.com']
    start_urls = [f'https://hangzhou.anjuke.com/sale/p{page}/' for page in range(1, 51)]

    def parse(self, response):
        # extraction logic elided; each listing is packed into an item
        # and yielded to the item pipeline for storage
        pass
        yield item
Elapsed time: 14.4 seconds.
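The spider only yields items; in Scrapy's architecture the storage step lives in an item pipeline registered in settings.py. A minimal sketch of such a pipeline, assuming the item fields mirror the MySQL columns used later (the class name and wiring here are illustrative, not taken from the original write-up):

import pymysql

class MysqlPipeline:
    # illustrative pipeline: open a MySQL connection when the spider starts,
    # insert each yielded item, close the connection when the spider finishes
    def open_spider(self, spider):
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="123456", db="ajkhzesf")
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        sql = ('insert into hzesfmulti8(title, houseType, area, floor, buildYear, '
               'adrres, tags, broker, totalPrice, price) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (item['title'], item['houseType'], item['area'],
                                  item['floor'], item['buildYear'], item['adrres'],
                                  item['tags'], item['broker'], item['totalPrice'],
                                  item['price']))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()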
3. Multithreaded crawler
Crawling is an I/O-bound task, so multithreading brings a clear speed-up. There are several ways to run threads in Python; here we use the concurrent.futures module with the thread pool capped at 8 workers.
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

def get_info(url):
    # fetch and parse one listing page; full implementations are shown in the parsing section below
    infos = []
    return infos

def save_info(infos):
    # persist the parsed records; full implementations are shown in the storage section below
    pass

def getANDsave(url):
    # parse one page, then store the result
    pass

if __name__ == '__main__':
    urls = [f'https://hangzhou.anjuke.com/sale/p{page}/' for page in range(1, 51)]
    start = time.time()
    # submit every page to an 8-worker thread pool and wait for all tasks to finish
    executor = ThreadPoolExecutor(max_workers=8)
    future_tasks = [executor.submit(getANDsave, url) for url in urls]
    wait(future_tasks, return_when=ALL_COMPLETED)
    tt = time.time() - start
    print("Total time:", tt, "seconds")
Elapsed time: 8.1 seconds.
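An equivalent, slightly more compact form (not part of the timing run above) treats the executor as a context manager, which also guarantees the pool is shut down cleanly:

from concurrent.futures import ThreadPoolExecutor
import time

if __name__ == '__main__':
    urls = [f'https://hangzhou.anjuke.com/sale/p{page}/' for page in range(1, 51)]
    start = time.time()
    # the with-block waits until every mapped call has finished
    with ThreadPoolExecutor(max_workers=8) as executor:
        # getANDsave: the glue function defined in the skeleton above;
        # list() forces evaluation so any exception inside a worker surfaces here
        list(executor.map(getANDsave, urls))
    print("Total time:", time.time() - start, "seconds")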
Side by side, the multithreaded crawler is the fastest, several times quicker than the conventional one, and Scrapy also performs well. It should be noted that all three versions use the same XPath parsing and MySQL storage, so the timing differences come from how the requests are driven rather than from parsing or storage.
02 Three parsing approaches
The same listing page can be parsed with XPath (lxml), with regular expressions, or with BeautifulSoup; the three equivalent get_info implementations follow.

1. XPath (lxml)

import requests
from lxml import etree

def get_info(url):
    # headers: request headers (e.g. a browser User-Agent) defined at module level
    response = requests.get(url, headers=headers)
    html = response.text
    html = etree.HTML(html)
    items = html.xpath("//li[@class = 'list-item']")
    infos = []
    for item in items:
        try:
            title = item.xpath(".//div[@class='house-title']/a/text()")[0].strip()
            houseType = item.xpath(".//div[@class='house-details']/div[2]/span[1]/text()")[0]
            area = item.xpath(".//div[@class='house-details']/div[2]/span[2]/text()")[0]
            floor = item.xpath(".//div[@class='house-details']/div[2]/span[3]/text()")[0]
            buildYear = item.xpath(".//div[@class='house-details']/div[2]/span[4]/text()")[0]
            adrres = item.xpath(".//div[@class='house-details']/div[3]/span[1]/text()")[0]
            adrres = "|".join(adrres.split())
            tags = item.xpath(".//div[@class='tags-bottom']//text()")
            tags = '|'.join(tags).strip()
            broker = item.xpath(".//div[@class='broker-item']/span[2]/text()")[0]
            totalPrice = item.xpath(".//div[@class='pro-price']/span[1]//text()")
            totalPrice = "".join(totalPrice).strip()
            price = item.xpath(".//div[@class='pro-price']/span[2]/text()")[0]
            values = (title, houseType, area, floor, buildYear, adrres, tags, broker, totalPrice, price)
            infos.append(values)
        except Exception:
            print('Failed to parse one listing')
    return infos
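All three get_info variants assume a headers dict is defined at module level; a typical minimal definition (the User-Agent string here is only an illustrative placeholder):

# illustrative request headers; any current desktop browser User-Agent will do
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}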
2. Regular expressions

import re
import requests

def get_info(url):
    response = requests.get(url, headers=headers)
    html = response.text
    html = html.replace('\n', '')
    # first grab the raw HTML of each listing <li> block, then pull the fields out of it
    pattern = r'<li class="list-item" data-from="">.*?</li>'
    results = re.compile(pattern).findall(html)
    infos = []
    for result in results:
        values = [''] * 10
        titles = re.compile('title="(.*?)"').findall(result)
        values[0] = titles[0]
        values[5] = titles[1].replace(' ', '')
        # the alternation yields 4-group tuples with only one non-empty group per match,
        # so joining each tuple recovers the span text
        spans = re.compile('<span>(.*?)</span><em class="spe-lines">|</em><span>(.*?)</span><em class="spe-lines">|</em><span>(.*?)</span><em class="spe-lines">|</em><span>(.*?)</span>').findall(result)
        values[1] = ''.join(spans[0])
        values[2] = ''.join(spans[1])
        values[3] = ''.join(spans[2])
        values[4] = ''.join(spans[3])
        values[7] = re.compile('<span class="broker-name broker-text">(.*?)</span>').findall(result)[0]
        tagRE = re.compile('<span class="item-tags tag-others">(.*?)</span>').findall(result)
        if tagRE:
            values[6] = '|'.join(tagRE)
        values[8] = re.compile('<span class="price-det"><strong>(.*?)</strong>万</span>').findall(result)[0] + '万'
        values[9] = re.compile('<span class="unit-price">(.*?)</span>').findall(result)[0]
        infos.append(tuple(values))
    return infos
3. BeautifulSoup

import requests
from bs4 import BeautifulSoup

def get_info(url):
    response = requests.get(url, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all('li', attrs={'class': "list-item"})
    infos = []
    for item in items:
        try:
            title = item.find('a', attrs={'class': "houseListTitle"}).get_text().strip()
            details = item.find_all('div', attrs={'class': "details-item"})[0]
            houseType = details.find_all('span')[0].get_text().strip()
            area = details.find_all('span')[1].get_text().strip()
            floor = details.find_all('span')[2].get_text().strip()
            buildYear = details.find_all('span')[3].get_text().strip()
            addres = item.find_all('div', attrs={'class': "details-item"})[1].get_text().replace(' ', '').replace('\n', '')
            tag_spans = item.find('div', attrs={'class': 'tags-bottom'}).find_all('span')
            tags = [span.get_text() for span in tag_spans]
            tags = '|'.join(tags)
            broker = item.find('span', attrs={'class': 'broker-name broker-text'}).get_text().strip()
            totalPrice = item.find('span', attrs={'class': 'price-det'}).get_text()
            price = item.find('span', attrs={'class': 'unit-price'}).get_text()
            values = (title, houseType, area, floor, buildYear, addres, tags, broker, totalPrice, price)
            infos.append(values)
        except Exception:
            print('Failed to parse one listing')
    return infos
03 Three storage approaches
The parsed records can be written to MySQL, to MongoDB, or to a local CSV file; the three save_info variants follow.

1. MySQL (pymysql)

import pymysql

def save_info(infos):
    # infos is a list whose elements are 10-field tuples, matching the column order below
    db = pymysql.connect(host="localhost", user="root", password="123456", db="ajkhzesf")
    sql_insert = 'insert into hzesfmulti8(title, houseType, area, floor, buildYear, adrres, tags, broker, totalPrice, price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor = db.cursor()
    cursor.executemany(sql_insert, infos)
    db.commit()
    cursor.close()
    db.close()
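The insert assumes the hzesfmulti8 table already exists in the ajkhzesf database; a one-off setup sketch (the column types here are assumptions, not taken from the original):

import pymysql

# run once before the first crawl to create the target table
ddl = """
create table if not exists hzesfmulti8(
    title varchar(255), houseType varchar(64), area varchar(64),
    floor varchar(64), buildYear varchar(64), adrres varchar(255),
    tags varchar(255), broker varchar(64), totalPrice varchar(64),
    price varchar(64)
)"""
db = pymysql.connect(host="localhost", user="root", password="123456", db="ajkhzesf")
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()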
2. MongoDB (pymongo)

import pymongo

def save_info(infos):
    # infos is a list whose elements are dicts with the same 10 fields
    client = pymongo.MongoClient()
    collection = client.anjuke.hzesfmulti
    collection.insert_many(infos)
    client.close()
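The parsers above return tuples rather than dicts, so the MongoDB version needs a small conversion step; a sketch using the field names from the MySQL column list:

FIELDS = ('title', 'houseType', 'area', 'floor', 'buildYear',
          'adrres', 'tags', 'broker', 'totalPrice', 'price')

def to_docs(infos):
    # turn each 10-field tuple into a dict keyed by the column names above
    return [dict(zip(FIELDS, values)) for values in infos]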
3. CSV file

import csv

def save_info(infos):
    # infos is a list whose elements are lists/tuples with the same 10 fields;
    # append mode, gb18030 encoding for the Chinese text
    with open(r"D:\PyFile\HZhouse\anjuke.csv", 'a', encoding='gb18030', newline="") as f:
        writer = csv.writer(f)
        writer.writerows(infos)
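Since the file is opened in append mode, the column header can be written once before the crawl starts; a sketch reusing the same path and field names:

import csv

# write the header row a single time, then let save_info append data rows
with open(r"D:\PyFile\HZhouse\anjuke.csv", 'w', encoding='gb18030', newline="") as f:
    csv.writer(f).writerow(['title', 'houseType', 'area', 'floor', 'buildYear',
                            'adrres', 'tags', 'broker', 'totalPrice', 'price'])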
Execution-time comparison of the different crawlers (from the runs above): the conventional single-threaded crawl took 64.9 s, Scrapy 14.4 s, and the 8-thread pool 8.1 s.
Whichever combination is used, a crawl always comes down to three stages:
fetching the page source,
parsing the target information,
storing the data locally.
This concludes part one.
In part two, we will use Pandas to analyze and visualize the scraped housing listings.