其他
爬虫实战程序的函数封装
本文作者:陈志玲
文字编辑:余术玲
技术总编:张 邯
def <函数名>(<参数列表>):
    <函数体>
    return <返回值列表>
import requests
import time
import json
# First define a function name, GetPageProjectInfo (letters and underscores
# only), and pass in the parameters the function needs to do its job -- here
# that is the page number, `page`.
def GetPageProjectInfo(page):
    """Fetch one page of the dreammove.cn project listing.

    Args:
        page: Value substituted into the ``offset`` query parameter of the
            list endpoint.

    Returns:
        Whatever the JSON response stores under ``["data"]["list"]``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the JSON lacks the ``data``/``list`` nesting.
    """
    # The walkthrough listing left `headers` undefined (NameError); these are
    # the headers the complete program later in this file uses for this call.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.dreammove.cn/list/index.html?industry=0&type=8&city=0",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100Mobile Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # Current time in milliseconds, sent as the "_" query parameter
    # (presumably a cache-buster -- confirm against the site's own requests).
    timestamp = int(round(time.time() * 1000))
    url = f"https://www.dreammove.cn/list/get_list.html?type=8&industry=0&city=0&offset={page}&keyword=&_={timestamp}"
    # Without a timeout a stalled connection would block the crawl forever.
    raw_html = requests.get(url, headers=headers, timeout=30)
    html_text = raw_html.text
    # The return statement hands back just the part of the payload we need.
    return json.loads(html_text)["data"]["list"]
def GetAllProjectInfo(StartPage=1, StopPage=23, FetchPage=None):
    """Collect the project records from a range of listing pages.

    Called with no arguments it behaves as the original did: pages 1
    through 22 are fetched with GetPageProjectInfo.

    Args:
        StartPage: First page value passed to the fetcher.
        StopPage: Exclusive upper bound of the page range.
        FetchPage: Callable taking a page value and returning an iterable of
            records; defaults to GetPageProjectInfo.

    Returns:
        A single list holding the records of every page, in page order.
    """
    if FetchPage is None:
        # Looked up at call time so the module-level function is used.
        FetchPage = GetPageProjectInfo
    ProjectInfo = []
    for i in range(StartPage, StopPage):
        # Append this page's records to the running list.
        ProjectInfo.extend(FetchPage(i))
    return ProjectInfo
def Json2csv_Project(Info, VarName, FileName):
    """Write project records to a tab-separated, gb18030-encoded text file.

    The first line is the header (the names in VarName joined by tabs); each
    following line is one record with its values in VarName order.

    Args:
        Info: Iterable of mapping-like records (e.g. dicts parsed from JSON).
        VarName: Ordered list of keys to export; also used as the header.
        FileName: Path of the file to create or overwrite.
    """
    # Newlines, tabs and carriage returns inside a value would break the
    # one-record-per-line / tab-separated layout, so they are removed.
    strip_breaks = str.maketrans("", "", "\n\t\r")
    with open(FileName, "w", encoding="gb18030") as f:
        f.write("\t".join(VarName) + "\n")
        for EachInfo in Info:
            # A key missing from the record becomes an empty cell.
            tempInfo = [
                str(EachInfo[key]).translate(strip_breaks) if key in EachInfo else ""
                for key in VarName
            ]
            f.write("\t".join(tempInfo) + "\n")
def GetId(FileName):
    """Read the first column of a tab-separated file, skipping the header.

    Args:
        FileName: Path of a gb18030-encoded, tab-separated file whose first
            line is a header and whose first column holds the project id.

    Returns:
        List of the first-column values (as strings), one per data row.
        Completely blank lines are skipped.
    """
    ProjectId = []
    with open(FileName, "r", encoding="gb18030") as f:
        next(f, None)  # skip the header line (no error on an empty file)
        for line in f:
            # Drop the line ending first; otherwise a single-column row would
            # return its id with the newline still attached.
            row = line.rstrip("\r\n")
            if not row:
                continue
            ProjectId.append(row.split("\t")[0])
    return ProjectId  # ids of all projects
def GetTeamInfo(ProjectId):
    """Fetch the team list of one dreammove.cn project.

    Args:
        ProjectId: Project id inserted into the team endpoint URL.

    Returns:
        Whatever the JSON response stores under ``["data"]["team_list"]``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the JSON lacks the ``data``/``team_list`` nesting.
    """
    # Current time in milliseconds, sent as the "_" query parameter.
    timestamp = int(round(time.time() * 1000))
    url = f"https://www.dreammove.cn/project/project_team/id/{ProjectId}?_={timestamp}"
    # The walkthrough listing left `headers` undefined (NameError); these are
    # the headers the complete program later in this file uses for this call.
    # NOTE(review): the Cookie is a captured session value and has probably
    # expired -- confirm whether the endpoint needs it at all.
    headers = {
        "Accept": "application/json, text/javascript, */*;q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Cookie": "PHPSESSID=bk25qacnlg8g68d205h4pqeq56;Hm_lvt_c18b08cac9b94bf4628c0277d3a4d7de=1562549437;,jumu_web_idu=MDAwMDAwMDAwMLGGhpiGr36zsa96r7WEvXE;jumu_web_idp=MDAwMDAwMDAwMMafpd-afJ2NtZ9-r7OXoXE;Hm_lpvt_c18b08cac9b94bf4628c0277d3a4d7de=1562561558",
        "Host": "www.dreammove.cn",
        "Referer": "https://www.dreammove.cn/project/detail/id/97",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100Mobile Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # Without a timeout a stalled connection would block the crawl forever.
    raw_html = requests.get(url, headers=headers, timeout=30)
    html_text = raw_html.text
    return json.loads(html_text)["data"]["team_list"]
# Team_Id: all project ids; VarNamem: the variable names, i.e. the final
# header columns; FileName: the resulting csv file.
def Json2csv_Team(Team_Id, VarNamem, FileName):
    """Fetch every project's team and write it to a tab-separated file.

    The header is ``id`` followed by the names in VarNamem. For each project
    id, GetTeamInfo is called; one line is written per team member, starting
    with the project id.

    Args:
        Team_Id: Iterable of project ids.
        VarNamem: Ordered list of member fields to export.
        FileName: Path of the gb18030-encoded file to create or overwrite.
    """
    # Remove characters that would break the tab-separated, line-per-row layout.
    strip_breaks = str.maketrans("", "", "\n\t\r")
    with open(FileName, "w", encoding="gb18030") as f:
        # Use the VarNamem parameter: the original read a global `VarName`,
        # silently ignoring the argument (NameError when no such global exists).
        f.write("id\t" + "\t".join(VarNamem) + "\n")
        for Eachid in Team_Id:
            TeamInfo = GetTeamInfo(Eachid)
            # Only a list is treated as a team; any other value is skipped.
            if not isinstance(TeamInfo, list):
                continue
            for Eachperson in TeamInfo:
                print(Eachperson)
                # str() keeps the join below safe if ids arrive as numbers.
                tempInfo = [str(Eachid)]
                for key in VarNamem:
                    if key in Eachperson:
                        tempInfo.append(str(Eachperson[key]).translate(strip_breaks))
                    else:
                        tempInfo.append("")  # field missing for this member
                f.write("\t".join(tempInfo) + "\n")
# Lets the script tell whether it is being run directly or imported as a
# module; when imported, the code under this `if` is not executed.
if __name__ == "__main__":
    # Step 1: call GetAllProjectInfo() to fetch the project records, then
    # Json2csv_Project to write them to the file Project_FileName.
    VarName = ['id','update_time','province_name','subsite_id','is_open','industry','type','open_flag','project_name','step','seo_string','abstract','cover','project_phase','member_count','province','city','address','company_name','project_url','uid','over_time','vote_leader_step','stage','is_agree','is_del','agreement_id','barcode','sort','display_subsite_id','need_fund','real_fund','project_valuation','final_valuation','min_lead_fund','min_follow_fund','total_fund','agree_total_fund','leader_flag','leader_id','read_cnt','follow_cnt','inverstor_cnt','comment_cnt','nickname','short_name','site_url','site_logo','storelevel','industry_name']
    # The assignment and the call below were fused onto one line in the
    # original listing, which is a SyntaxError.
    Project_FileName = "C:\\CrowdFunding\\dreammove\\ProjectInfo.csv"
    Json2csv_Project(GetAllProjectInfo(), VarName, Project_FileName)
    # Step 2: fetch the second-level team information and store it in
    # TeamInfo.csv. VarName is rebound to the team-member fields.
    VarName = ['name','duty','src','intro','is_fulltime','relationship','short_intro','shared_rate','amount','member_type']
    Team_FileName = "C:\\CrowdFunding\\dreammove\\TeamInfo.csv"
    # GetId reads the project ids back from Project_FileName; Json2csv_Team
    # then fetches and writes the team information for each of them.
    Json2csv_Team(GetId(Project_FileName), VarName, Team_FileName)
import requests
import time
import json
def GetPageProjectInfo(page):
    """Fetch one page of the dreammove.cn project listing.

    Args:
        page: Value substituted into the ``offset`` query parameter of the
            list endpoint.

    Returns:
        Whatever the JSON response stores under ``["data"]["list"]``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the JSON lacks the ``data``/``list`` nesting.
    """
    # Headers imitating the site's own XMLHttpRequest from a mobile browser.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.dreammove.cn/list/index.html?industry=0&type=8&city=0",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100Mobile Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # Current time in milliseconds, sent as the "_" query parameter
    # (presumably a cache-buster -- confirm against the site's own requests).
    timestamp = int(round(time.time() * 1000))
    url = f"https://www.dreammove.cn/list/get_list.html?type=8&industry=0&city=0&offset={page}&keyword=&_={timestamp}"
    # Without a timeout a stalled connection would block the crawl forever.
    raw_html = requests.get(url, headers=headers, timeout=30)
    html_text = raw_html.text
    return json.loads(html_text)["data"]["list"]
#print(GetPageProjectInfo(1))
def GetAllProjectInfo(StartPage=1, StopPage=23, FetchPage=None):
    """Collect the project records from a range of listing pages.

    Called with no arguments it fetches pages 1 through 22 with
    GetPageProjectInfo, as the original listing intended.

    Args:
        StartPage: First page value passed to the fetcher.
        StopPage: Exclusive upper bound of the page range.
        FetchPage: Callable taking a page value and returning an iterable of
            records; defaults to GetPageProjectInfo.

    Returns:
        A single list holding the records of every page, in page order.
    """
    if FetchPage is None:
        # Looked up at call time so the module-level function is used.
        FetchPage = GetPageProjectInfo
    ProjectInfo = []
    # The original line read "for i inrange(1,23):" -- a SyntaxError.
    for i in range(StartPage, StopPage):
        ProjectInfo.extend(FetchPage(i))
    return ProjectInfo
#print(GetAllProjectInfo())
def Json2csv_Project(Info, VarName, FileName):
    """Write project records to a tab-separated, gb18030-encoded text file.

    The first line is the header (the names in VarName joined by tabs); each
    following line is one record with its values in VarName order.

    Args:
        Info: Iterable of mapping-like records (e.g. dicts parsed from JSON).
        VarName: Ordered list of keys to export; also used as the header.
        FileName: Path of the file to create or overwrite.
    """
    # Newlines, tabs and carriage returns inside a value would break the
    # one-record-per-line / tab-separated layout, so they are removed.
    strip_breaks = str.maketrans("", "", "\n\t\r")
    with open(FileName, "w", encoding="gb18030") as f:
        f.write("\t".join(VarName) + "\n")
        for EachInfo in Info:
            # A key missing from the record becomes an empty cell.
            tempInfo = [
                str(EachInfo[key]).translate(strip_breaks) if key in EachInfo else ""
                for key in VarName
            ]
            f.write("\t".join(tempInfo) + "\n")
def GetId(FileName):
    """Read the first column of a tab-separated file, skipping the header.

    Args:
        FileName: Path of a gb18030-encoded, tab-separated file whose first
            line is a header and whose first column holds the project id.

    Returns:
        List of the first-column values (as strings), one per data row.
        Completely blank lines are skipped.
    """
    ProjectId = []
    with open(FileName, "r", encoding="gb18030") as f:
        next(f, None)  # skip the header line (no error on an empty file)
        for line in f:
            # Drop the line ending first; otherwise a single-column row would
            # return its id with the newline still attached.
            row = line.rstrip("\r\n")
            if not row:
                continue
            ProjectId.append(row.split("\t")[0])
    return ProjectId
def GetTeamInfo(ProjectId):
    """Fetch the team list of one dreammove.cn project.

    Args:
        ProjectId: Project id inserted into the team endpoint URL.

    Returns:
        Whatever the JSON response stores under ``["data"]["team_list"]``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the JSON lacks the ``data``/``team_list`` nesting.
    """
    # Current time in milliseconds, sent as the "_" query parameter.
    timestamp = int(round(time.time() * 1000))
    url = f"https://www.dreammove.cn/project/project_team/id/{ProjectId}?_={timestamp}"
    # NOTE(review): the Cookie is a captured session value and has probably
    # expired -- confirm whether the endpoint needs it at all.
    # NOTE(review): advertising "br" asks for Brotli responses; verify the
    # installed requests stack can decode them.
    headers = {
        "Accept": "application/json, text/javascript, */*;q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Cookie": "PHPSESSID=bk25qacnlg8g68d205h4pqeq56;Hm_lvt_c18b08cac9b94bf4628c0277d3a4d7de=1562549437;,jumu_web_idu=MDAwMDAwMDAwMLGGhpiGr36zsa96r7WEvXE;jumu_web_idp=MDAwMDAwMDAwMMafpd-afJ2NtZ9-r7OXoXE;Hm_lpvt_c18b08cac9b94bf4628c0277d3a4d7de=1562561558",
        "Host": "www.dreammove.cn",
        "Referer": "https://www.dreammove.cn/project/detail/id/97",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100Mobile Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # Without a timeout a stalled connection would block the crawl forever.
    raw_html = requests.get(url, headers=headers, timeout=30)
    html_text = raw_html.text
    return json.loads(html_text)["data"]["team_list"]
def Json2csv_Team(Team_Id, VarNamem, FileName):
    """Fetch every project's team and write it to a tab-separated file.

    The header is ``id`` followed by the names in VarNamem. For each project
    id, GetTeamInfo is called; one line is written per team member, starting
    with the project id.

    Args:
        Team_Id: Iterable of project ids.
        VarNamem: Ordered list of member fields to export.
        FileName: Path of the gb18030-encoded file to create or overwrite.
    """
    # Remove characters that would break the tab-separated, line-per-row layout.
    strip_breaks = str.maketrans("", "", "\n\t\r")
    with open(FileName, "w", encoding="gb18030") as f:
        # Use the VarNamem parameter: the original read a global `VarName`,
        # silently ignoring the argument (NameError when no such global exists).
        f.write("id\t" + "\t".join(VarNamem) + "\n")
        for Eachid in Team_Id:
            TeamInfo = GetTeamInfo(Eachid)
            # Only a list is treated as a team; any other value is skipped.
            if not isinstance(TeamInfo, list):
                continue
            for Eachperson in TeamInfo:
                print(Eachperson)
                # str() keeps the join below safe if ids arrive as numbers.
                tempInfo = [str(Eachid)]
                for key in VarNamem:
                    if key in Eachperson:
                        tempInfo.append(str(Eachperson[key]).translate(strip_breaks))
                    else:
                        tempInfo.append("")  # field missing for this member
                f.write("\t".join(tempInfo) + "\n")
if __name__ == "__main__":
    # Step 1: fetch every project record and write the selected fields to
    # Project_FileName.
    VarName = ['id','update_time','province_name','subsite_id','is_open','industry','type','open_flag','project_name','step','seo_string','abstract','cover','project_phase','member_count','province','city','address','company_name','project_url','uid','over_time','vote_leader_step','stage','is_agree','is_del','agreement_id','barcode','sort','display_subsite_id','need_fund','real_fund','project_valuation','final_valuation','min_lead_fund','min_follow_fund','total_fund','agree_total_fund','leader_flag','leader_id','read_cnt','follow_cnt','inverstor_cnt','comment_cnt','nickname','short_name','site_url','site_logo','storelevel','industry_name']
    Project_FileName = "C:\\CrowdFunding\\dreammove\\ProjectInfo.csv"
    Json2csv_Project(GetAllProjectInfo(), VarName, Project_FileName)
    # Step 2: read the project ids back from that file, then fetch each
    # project's team and write it to Team_FileName. VarName is rebound to the
    # team-member fields.
    VarName = ['name','duty','src','intro','is_fulltime','relationship','short_intro','shared_rate','amount','member_type']
    Team_FileName = "C:\\CrowdFunding\\dreammove\\TeamInfo.csv"
    Json2csv_Team(GetId(Project_FileName), VarName, Team_FileName)
关于我们
微信公众号“Stata and Python数据分析”分享实用的Stata、Python等软件的数据处理知识,欢迎转载、打赏。我们是由李春涛教授领导下的研究生及本科生组成的大数据处理和分析团队。
1)必须原创,禁止抄袭;
2)必须准确,详细,有例子,有截图;
注意事项:
1)所有投稿都会经过本公众号运营团队成员的审核,审核通过才可录用,一经录用,会在该推文里为作者署名,并有赏金分成。
2)邮件请注明投稿,邮件名称为“投稿+推文名称”。
3)应广大读者要求,现开通有偿问答服务,如果大家遇到有关数据处理、分析等问题,可以在公众号中提出,只需支付少量赏金,我们会在后期的推文里给予解答。