其他
今日头条街拍爬取练习
(点击上方Python爱好者社区,可快速关注)
蒋蜀黍,Python爱好者社区专栏作者
博客地址:https://ask.hellobi.com/blog/JiangYiXin
本文参考文末崔老师课程创作而来
本次练习使用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
1、项目流程分析
2、中心调度
# Central dispatcher: fetch one listing page, then fetch, parse and
# persist every detail page it links to.
def main(offset):
    """Crawl one listing page identified by *offset* and store its galleries.

    offset -- paging offset passed to the Toutiao search API (an int).
    """
    # Fetch the listing page.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # BUG FIX: offset is an int (see the __main__ block), so it must be
        # converted before string concatenation.
        print("offset:" + str(offset) + "异常")
        return
    # Parse the listing page to get every detail-page URL.
    for url in parse_page_index(index_data):
        # Fetch the detail page.
        detail_data = get_page_detail(url)
        if detail_data is None:
            # BUG FIX: original mixed '%s' with str.format (printing the
            # literal '%s') and used `pass`, which fell through and handed
            # None to parse_page_detail. Skip this URL instead.
            print('url:{}异常'.format(url))
            continue
        # Parse the detail page into a record dict.
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
# Request the listing page for one (offset, keyword) pair.
def get_page_index(offset, keywords):
    """Return the listing-page response body as text, or None on any failure.

    offset   -- paging offset for the search API.
    keywords -- search keyword to query for.
    """
    params = {
        'offset': offset,
        'format': 'json',
        # BUG FIX: the `keywords` parameter was ignored and the global
        # KEYWORDS was used instead; honor the argument.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/', params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: treat the same as a bad status code.
        return None
# Parse the listing-page JSON and yield every article URL it contains.
def parse_page_index(text):
    """Generator over the 'article_url' of each entry in the listing JSON.

    Yields nothing (and prints a notice) when *text* is not valid JSON or
    has no usable 'data' list.
    """
    try:
        data = json.loads(text)
        # ROBUSTNESS FIX: original `'data' in data.keys()` crashed with a
        # TypeError when 'data' was present but null; require a truthy list.
        if data and data.get('data'):
            for item in data['data']:
                yield item.get('article_url')
    except JSONDecodeError:
        print('解析异常')
        return
4、请求和解析详情页
# Parse one detail page: extract the title and the gallery image URLs,
# download each image, and return the record, or None when nothing usable.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} for a gallery page, else None.

    html -- detail-page source text.
    url  -- the page's own URL, stored in the returned record.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The page <title> doubles as the record title.
    title = soup.title.string
    # The gallery payload is embedded as a JS assignment in the page source.
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result:
        try:
            data = json.loads(result.group(1))
            if data and 'sub_images' in data.keys():
                # Collect every image URL in the gallery.
                images = [item.get('url') for item in data.get('sub_images')]
                for image in images:
                    # Side effect: persist each image to disk.
                    download_image(image)
                return {'title': title, 'url': url, 'images': images}
        except JSONDecodeError:
            # Embedded gallery JSON was malformed.
            return None
    return None
5、下载图片和保存至Mongodb
# Fetch one image over HTTP and hand its bytes to save_image.
def download_image(url):
    """Best-effort download: network errors are logged and swallowed."""
    try:
        print('图片' + url + '正在下载')
        response = requests.get(url)
        if response.status_code == 200:
            # Persist the raw bytes to disk.
            save_image(response.content)
    except RequestException:
        # Deliberately non-fatal: report and move on to the next image.
        print('异常image:' + url)
# Save raw image bytes to ./images/<md5(content)>.jpg; identical content
# hashes to the same name and is written only once.
def save_image(content):
    """Write *content* (bytes) under ./images, keyed by its MD5 digest."""
    file_path = '{0}/images/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    # ROBUSTNESS FIX: create the images directory; the original crashed
    # with FileNotFoundError when it did not exist.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # `with` closes the file; the original's explicit close() was redundant.
        with open(file_path, 'wb') as f:
            f.write(content)
# Insert one record into the configured MongoDB collection.
def save_to_mongo(data):
    """Return True when *data* was stored, False otherwise."""
    # insert_one replaces Collection.insert, removed in pymongo 4.
    if db[MONGO_TABLE].insert_one(data):
        print('成功保存' + data['title'])
        return True
    return False
6、完整代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Toutiao street-photo crawler: fetch listing pages, parse gallery
detail pages, download the images, and store records in MongoDB."""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from setting import *

# Shared MongoDB handle used by save_to_mongo(); configuration
# (MONGO_URL, MONGO_DB, ...) comes from setting.py.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Request the listing page for one (offset, keyword) pair.
def get_page_index(offset, keywords):
    """Return the listing-page response body as text, or None on any failure.

    offset   -- paging offset for the search API.
    keywords -- search keyword to query for.
    """
    params = {
        'offset': offset,
        'format': 'json',
        # BUG FIX: the `keywords` parameter was ignored and the global
        # KEYWORDS was used instead; honor the argument.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/', params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: treat the same as a bad status code.
        return None
# Parse the listing-page JSON and yield every article URL it contains.
def parse_page_index(text):
    """Generator over the 'article_url' of each entry in the listing JSON.

    Yields nothing (and prints a notice) when *text* is not valid JSON or
    has no usable 'data' list.
    """
    try:
        data = json.loads(text)
        # ROBUSTNESS FIX: original `'data' in data.keys()` crashed with a
        # TypeError when 'data' was present but null; require a truthy list.
        if data and data.get('data'):
            for item in data['data']:
                yield item.get('article_url')
    except JSONDecodeError:
        print('解析异常')
        return
# Request one detail page and return its body text.
def get_page_detail(url):
    """Return the detail-page body as text, or None on any failure."""
    # BUG FIX: requests.get() was outside the try block, so a
    # RequestException raised by the request itself was never caught.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
# Parse one detail page: extract the title and the gallery image URLs,
# download each image, and return the record, or None when nothing usable.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} for a gallery page, else None.

    html -- detail-page source text.
    url  -- the page's own URL, stored in the returned record.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The page <title> doubles as the record title.
    title = soup.title.string
    # The gallery payload is embedded as a JS assignment in the page source.
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result:
        try:
            data = json.loads(result.group(1))
            if data and 'sub_images' in data.keys():
                # Collect every image URL in the gallery.
                images = [item.get('url') for item in data.get('sub_images')]
                for image in images:
                    # Side effect: persist each image to disk.
                    download_image(image)
                return {'title': title, 'url': url, 'images': images}
        except JSONDecodeError:
            # Embedded gallery JSON was malformed.
            return None
    return None
# Fetch one image over HTTP and hand its bytes to save_image.
def download_image(url):
    """Best-effort download: network errors are logged and swallowed."""
    try:
        print('图片' + url + '正在下载')
        response = requests.get(url)
        if response.status_code == 200:
            # Persist the raw bytes to disk.
            save_image(response.content)
    except RequestException:
        # Deliberately non-fatal: report and move on to the next image.
        print('异常image:' + url)
# Save raw image bytes to ./images/<md5(content)>.jpg; identical content
# hashes to the same name and is written only once.
def save_image(content):
    """Write *content* (bytes) under ./images, keyed by its MD5 digest."""
    file_path = '{0}/images/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    # ROBUSTNESS FIX: create the images directory; the original crashed
    # with FileNotFoundError when it did not exist.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # `with` closes the file; the original's explicit close() was redundant.
        with open(file_path, 'wb') as f:
            f.write(content)
# Insert one record into the configured MongoDB collection.
def save_to_mongo(data):
    """Return True when *data* was stored, False otherwise."""
    # insert_one replaces Collection.insert, removed in pymongo 4.
    if db[MONGO_TABLE].insert_one(data):
        print('成功保存' + data['title'])
        return True
    return False
# Central dispatcher: fetch one listing page, then fetch, parse and
# persist every detail page it links to.
def main(offset):
    """Crawl one listing page identified by *offset* and store its galleries.

    offset -- paging offset passed to the Toutiao search API (an int).
    """
    # Fetch the listing page.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # BUG FIX: offset is an int (see the __main__ block), so it must be
        # converted before string concatenation.
        print("offset:" + str(offset) + "异常")
        return
    # Parse the listing page to get every detail-page URL.
    for url in parse_page_index(index_data):
        # Fetch the detail page.
        detail_data = get_page_detail(url)
        if detail_data is None:
            # BUG FIX: original mixed '%s' with str.format (printing the
            # literal '%s') and used `pass`, which fell through and handed
            # None to parse_page_detail. Skip this URL instead.
            print('url:{}异常'.format(url))
            continue
        # Parse the detail page into a record dict.
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
if __name__ == '__main__':
    # One offset per listing page; the API serves 20 entries per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Crawl the offsets in parallel worker processes.
    pool = Pool()
    pool.map(main, groups)
    # ROBUSTNESS FIX: shut the pool down cleanly so workers exit.
    pool.close()
    pool.join()
7、运行结果
点击文章末尾阅读原文即可购买
点击上图可查看详细课程说明信息
关注公众号,“Python爱好者社区”,回复“爬虫”即可获取崔老师爬虫免费学习视频。
为大家提供与Python相关的最新技术和资讯。
长按指纹 > 识别图中二维码 > 添加关注