Studying Economics Means Writing Code Too: An Econ Newbie Collects Data on Their Own
Editor's Note
The author of this article is a 2017-cohort economics undergraduate, and the topic is web crawling. Empirical economics will rely more and more on big-data resources and the techniques for analyzing them. As economics students, we need not only to understand economic theory and the relevant econometric methods, but also to be determined to roll up our sleeves and gather data ourselves.
The following article is from 数据Seminar, by Wallace.
Data Overview and Overall Approach
# Import the required libraries
import pandas as pd
import requests
import os

# Request the target page on the customs website
html = requests.get('http://www.customs.gov.cn/customs/302249/302274/302278/302280/2546702/index.html')
# read_html parses every table on the page and returns a list; take the last one
df = pd.read_html(html.text, skiprows=3)[-1]
df.columns = ['Year&Month', 'Total', 'Export', 'Import', 'Balance',
              '1to12 total', '1to12 export', '1to12 import', '1to12 balance']
print(df)
# Append to the CSV; write the header only if the file does not exist yet
df.to_csv(r'E:\月度数据.csv', encoding='utf_8_sig', mode='a', sep=',',
          header=not os.path.exists(r'E:\月度数据.csv'), index=False)
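One practical note on this snippet: pd.read_html returns one DataFrame per table found on the page, which is why the code indexes [-1]. If the request ever comes back empty, some government sites reject clients that do not look like a browser; passing a User-Agent header may help. The header value below is an illustrative assumption, not something the original article uses:

import pandas as pd
import requests

url = 'http://www.customs.gov.cn/customs/302249/302274/302278/302280/2546702/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}  # illustrative browser-like header (assumption)
resp = requests.get(url, headers=headers)
tables = pd.read_html(resp.text, skiprows=3)  # one DataFrame per <table> on the page
print(len(tables))        # confirm how many tables were found
print(tables[-1].head())  # the last table holds the monthly import/export figures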
UTF-8 uses the byte as its encoding unit, and its byte order is the same on every system, so it has no byte-order issue and strictly speaking needs no BOM ("Byte Order Mark"). UTF-8 with BOM, i.e. the utf_8_sig codec, does prepend a BOM, which is what lets programs such as Excel detect the encoding of the exported CSV.
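A quick way to see the difference is to encode the same text both ways and inspect the raw bytes; utf_8_sig prepends the three BOM bytes EF BB BF:

text = '月度数据'
print(text.encode('utf_8').hex())      # no BOM: the text's own bytes only
print(text.encode('utf_8_sig').hex())  # same bytes preceded by the BOM: efbbbf...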
pip install selenium
engine_path = r'D:/Users/hong/Desktop/chromedriver_win32.exe'
import selenium.webdriver as webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import traceback
import codecs
import re
import pandas as pd
import random
import os
import requests
Any library you are missing can be installed the same way, with pip install followed by the library name.

page_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]'  # XPath of the module that contains the table
row_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]/div/ul/li[1]/a'  # XPath of the link in row 1
row_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]/div/ul/li[2]/a'  # row 2: only the li index changes
row_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]/div/ul/li[{}]/a'.format(row_num)  # so the row number can be templated
element_list = driver.find_elements_by_xpath(row_xpath)  # returns an empty list when the row does not exist
driver.find_element_by_xpath(row_xpath).click()  # click through to the detail page
driver.switch_to.window(driver.window_handles[-1])  # each row's detail page opens in a new tab, so switch to the newest window
driver.find_element_by_xpath(page_xpath).click()  # click "next page"
bool_func = lambda x: driver.find_elements_by_xpath(page_xpath)
WebDriverWait(driver, 20).until(bool_func)
# Wait until the page we need has finished opening, for at most 20 seconds
title = driver.title  # read the page title
driver.get(root_url)  # load the page
print(driver.current_url)  # print the current URL
driver.close()  # close the current tab
driver.quit()  # shut down the browser
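Note that the script imports By and expected_conditions (as EC) but never uses them; the lambda handed to WebDriverWait is a hand-rolled version of the same idea. For reference, an equivalent wait with the built-in condition would look like this (a sketch reusing the driver and page_xpath defined above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Wait up to 20 seconds for the table container to appear in the DOM;
# raises TimeoutException if it never shows up
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, page_xpath))
)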
import selenium.webdriver as webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import traceback
import codecs
import re
import pandas as pd
import random
import os
import requests
# Path to the browser driver (chromedriver)
engine_path = r'D:/Users/hong/Desktop/chromedriver_win32.exe'
# Starting URL
root_url = 'http://www.customs.gov.cn/eportal/ui?pageId=302280&currentPage=5&moduleId=708ff4f3b78e4b29a3b73c4fe3b64f3a&staticRequest=yes'
# Configure the headless browser
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=engine_path, options=options)
driver.set_window_size(1920, 1080)  # mainly to make screenshots easier
driver.get(root_url)
page_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]'  # XPath of the table module, copied from the page source
bool_func = lambda x: driver.find_elements_by_xpath(page_xpath)
WebDriverWait(driver, 20).until(bool_func)  # wait up to 20 seconds for the table to load
row_num = 1
page_num = 1  # start from the first row of the first page
num = 1  # running count of detail pages processed
while True:
    page_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]'
    bool_func = lambda x: driver.find_elements_by_xpath(page_xpath)
    WebDriverWait(driver, 20).until(bool_func)
    # Every row's XPath follows the same pattern; only the li index changes
    row_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]/div/ul/li[{}]/a'.format(row_num)
    element_list = driver.find_elements_by_xpath(row_xpath)
    if element_list:
        element_list[0].click()  # click through to the detail page
        # The detail page opens in a new tab, so switch to the newest window
        driver.switch_to.window(driver.window_handles[-1])
        # Make sure the page we need has finished opening
        bool_func = lambda x: driver.find_elements_by_id('easysiteText')
        WebDriverWait(driver, 20).until(bool_func)
        title = driver.title
        html = driver.page_source
        # page_source is already a string, so pass it to read_html directly;
        # take the last table in the returned list
        df = pd.read_html(html, skiprows=3)[-1]
        df.columns = ['Year&Month', 'Total', 'Export', 'Import', 'Balance',
                      '1to12 total', '1to12 export', '1to12 import', '1to12 balance']
        # File name = page title plus a random digit
        df.to_csv(r'E:/{}{}.csv'.format(title, random.randint(1, 10)),
                  encoding='utf_8_sig', mode='a', sep=',', index=False)
        time.sleep(1)
        print(num)
        print(driver.current_url)
        driver.close()  # close the detail tab
        driver.switch_to.window(driver.window_handles[-1])  # back to the list page
        num = num + 1
        row_num = row_num + 1
    else:
        # No more rows on this page: click "next page" and start over at row 1
        page_xpath = '//*[@id="708ff4f3b78e4b29a3b73c4fe3b64f3a"]/div/div/div[1]/a[3]'  # XPath of the "next page" link
        driver.find_element_by_xpath(page_xpath).click()
        row_num = 1
        page_num = page_num + 1
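One loose end: traceback is imported at the top of the script but never called. Presumably it was meant for error handling, since the loop above only ends when the "next page" link disappears on the last page, at which point .click() raises NoSuchElementException. A common pattern, sketched below as an assumption rather than code from the original, is to wrap the loop so that this normal stop and any genuine failure both get logged, and the browser is always shut down:

try:
    while True:
        ...  # body of the crawl loop shown above, unchanged (placeholder)
except Exception:
    # On the last page the "next page" link no longer exists, so the click
    # raises NoSuchElementException and execution lands here; printing the
    # trace distinguishes that normal stop from a real error.
    traceback.print_exc()
finally:
    driver.quit()  # always release the headless browser process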