Selenium 爬取淘宝实战练习

查看原文

其他

Selenium 爬取淘宝实战练习

2017-05-12 蒋蜀黍 Python爱好者社区

(点击上方Python爱好者社区，可快速关注)

蒋蜀黍，Python爱好者社区专栏作者

博客地址：https://ask.hellobi.com/blog/JiangYiXin

提示：本文参考文末崔老师课程创作而来

1、项目流程

2、中心调度

# Central dispatcher
def main():
    """Run the whole crawl: search, then walk every remaining result page.

    search() scrapes page one and returns the pager text; the first run of
    digits in that text is taken as the page count, and pages 2..count are
    visited with next_page(). The browser is always shut down at the end.
    """
    try:
        total = search()
        if total is None:
            # search() timed out and has already reported it.
            return
        match = re.search(r'\d+', total)
        if match is None:
            print('异常', 'no page count found in %r' % total)
            return
        page_count = int(match.group())
        for i in range(2, page_count + 1):
            next_page(i)
    except Exception as e:
        # Top-level boundary: report the failure instead of hiding its cause.
        print('异常', e)
    finally:
        # quit() ends the driver session; close() would only close the window.
        browser.quit()

3、模拟查询

# Search by keyword
def search():
    """Open the Taobao home page, search for KEYWORDS and scrape page one.

    Returns:
        The text of the element matched by 'div.wraper  div.total' (main()
        extracts the page count from it), or None if a wait timed out.
    """
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box is present in the DOM.
        input_search = wait.until(EC.presence_of_element_located((By.ID, 'q')))
        # Wait until the search button can be clicked.
        submit_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.wraper  div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('响应超时')
        return None

4、下一页的操作

# Crawl the next page
def next_page(index, retries=3):
    """Jump to result page ``index`` through the pager form and scrape it.

    Args:
        index: Page number to type into the pager's input box.
        retries: How many additional attempts to make after a failed one.

    Raises:
        TimeoutException, NoSuchElementException: re-raised once every
            attempt has failed.
    """
    for attempt in range(retries + 1):
        try:
            page_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager div.form > input')))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(str(index))
            submit.click()
            # Proceed only once the highlighted pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(index)))
            get_products()
            return
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait reports a missing element as TimeoutException, so
            # both must be caught for the retry to ever trigger.
            print('元素未加载')
            if attempt == retries:
                raise

5、商品信息的解析

# Collect every product on the current page
def get_products():
    """Parse the current result page and hand each product to save_to_mongo().

    Waits until at least one item is present, parses browser.page_source with
    PyQuery and builds one dict per item. 'price' is a float, or None when no
    number can be found in the price text.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    doc = PyQuery(browser.page_source)
    for item in doc('.m-itemlist .items  .item').items():
        # Extract the number instead of slicing off a fixed-width prefix, so
        # the result does not depend on how text() spaces the currency sign.
        price_match = re.search(r'\d[\d,]*(?:\.\d+)?', item.find('.price').text())
        price = float(price_match.group().replace(',', '')) if price_match else None
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            'price': price,
            # Drops the last three characters — presumably a fixed text
            # suffix after the count; confirm against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS,
        }
        save_to_mongo(product=product)

6、完整代码

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from setting import *

# MongoDB client and target database. MONGO_HOST and MONGO_DB are not defined
# in this file — presumably supplied by the `setting` star import; confirm.
client = pymongo.MongoClient(MONGO_HOST)
db = client[MONGO_DB]
# One Chrome session shared by every function below, plus an explicit-wait
# helper bound to it with a 10-second timeout.
browser = webdriver.Chrome()
wait = WebDriverWait(browser,10)


# Search by keyword
def search():
    """Open the Taobao home page, search for KEYWORDS and scrape page one.

    Returns:
        The text of the element matched by 'div.wraper  div.total' (main()
        extracts the page count from it), or None if a wait timed out.
    """
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box is present in the DOM.
        input_search = wait.until(EC.presence_of_element_located((By.ID, 'q')))
        # Wait until the search button can be clicked.
        submit_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.wraper  div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('响应超时')
        return None


# Crawl the next page
def next_page(index, retries=3):
    """Jump to result page ``index`` through the pager form and scrape it.

    Args:
        index: Page number to type into the pager's input box.
        retries: How many additional attempts to make after a failed one.

    Raises:
        TimeoutException, NoSuchElementException: re-raised once every
            attempt has failed.
    """
    for attempt in range(retries + 1):
        try:
            page_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager div.form > input')))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(str(index))
            submit.click()
            # Proceed only once the highlighted pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(index)))
            get_products()
            return
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait reports a missing element as TimeoutException, so
            # both must be caught for the retry to ever trigger.
            print('元素未加载')
            if attempt == retries:
                raise


# Collect every product on the current page
def get_products():
    """Parse the current result page and hand each product to save_to_mongo().

    Waits until at least one item is present, parses browser.page_source with
    PyQuery and builds one dict per item. 'price' is a float, or None when no
    number can be found in the price text.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    doc = PyQuery(browser.page_source)
    for item in doc('.m-itemlist .items  .item').items():
        # Extract the number instead of slicing off a fixed-width prefix, so
        # the result does not depend on how text() spaces the currency sign.
        price_match = re.search(r'\d[\d,]*(?:\.\d+)?', item.find('.price').text())
        price = float(price_match.group().replace(',', '')) if price_match else None
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            'price': price,
            # Drops the last three characters — presumably a fixed text
            # suffix after the count; confirm against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS,
        }
        save_to_mongo(product=product)


# Save to MongoDB
def save_to_mongo(product):
    """Insert one product document into the MONGO_TABLE collection.

    Args:
        product: Dict describing a single product, as built by get_products().

    Failures are printed and swallowed so that one rejected document does not
    stop the crawl.
    """
    try:
        # insert_one() replaces Collection.insert(), which PyMongo 3
        # deprecated and PyMongo 4 removed.
        db[MONGO_TABLE].insert_one(product)
        print('保存成功', product)
    except Exception as exc:
        print('保存失败', exc)


# Central dispatcher
def main():
    """Run the whole crawl: search, then walk every remaining result page.

    search() scrapes page one and returns the pager text; the first run of
    digits in that text is taken as the page count, and pages 2..count are
    visited with next_page(). The browser is always shut down at the end.
    """
    try:
        total = search()
        if total is None:
            # search() timed out and has already reported it.
            return
        match = re.search(r'\d+', total)
        if match is None:
            print('异常', 'no page count found in %r' % total)
            return
        page_count = int(match.group())
        for i in range(2, page_count + 1):
            next_page(i)
    except Exception as e:
        # Top-level boundary: report the failure instead of hiding its cause.
        print('异常', e)
    finally:
        # quit() ends the driver session; close() would only close the window.
        browser.quit()


# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

7、运行结果

点击文章末尾阅读原文即可购买

点击上图可查看详细课程说明信息

关注公众号，“Python爱好者社区”，回复“爬虫”即可获取崔老师爬虫免费学习视频。

Python爱好者社区

为大家提供与Python相关的最新技术和资讯

长按指纹 > 识别图中二维码 > 添加关注

李光耀：过早翘起尾巴与美国对抗是中国厄运的开始！

谁会想到，裁员会裁到总编辑头上

“环评”提质增效助力高质量发展？

13岁男孩杀害8岁女童案今日开庭，女童父亲：侮辱遗体、没有悔罪

法官累积受贿929万：介绍案源、解冻账户、提取管理人报酬

Selenium 爬取淘宝实战练习

提示：本文参考文末崔老师课程创作而来

1、项目流程

2、中心调度

3、模拟查询

4、下一页的操作

5、商品信息的解析

6、完整代码

7、运行结果

您可能也对以下帖子感兴趣

李光耀：过早翘起尾巴与美国对抗 是中国厄运的开始！

谁会想到，裁员会裁到总编辑头上

“环评”提质增效 助力高质量发展？

13岁男孩杀害8岁女童案今日开庭，女童父亲：侮辱遗体、没有悔罪

法官累积受贿929万：介绍案源、解冻账户、提取管理人报酬

生成图片，分享到微信朋友圈

Selenium 爬取淘宝实战练习

提示：本文参考文末崔老师课程创作而来

1、项目流程

2、中心调度

3、模拟查询

4、下一页的操作

5、商品信息的解析

6、完整代码

7、运行结果

您可能也对以下帖子感兴趣

李光耀：过早翘起尾巴与美国对抗是中国厄运的开始！

“环评”提质增效助力高质量发展？