其他
玩转Python之“手把手”教你爬数据(二)
本文作者:钱梦璇
文字编辑:余术玲
技术总编:张 邯
此时点击Response旁的Headers,找到其请求头(Request headers)和Payload报文,如下图所示:
# Second-level request demo: ask the API for one project's summary and print
# its team list. The snippet imports what it uses so it runs on its own.
import json

import requests

url = "https://pcapi.duocaitou.com/pc/project/getProjectSummary"
headers = {
    "Accept": "application/json,text/plain, */*",
    "Content-Type": "application/json;charset=UTF-8",
    "Referer": "https://www.duocaitou.com/project/detail/NHdWaS81RHRueGs9",
    "Sec-Fetch-Mode": "cors",
    # Space restored between "x64)" and "AppleWebKit" so the value matches the
    # User-Agent string a real Chrome 76 sends.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}
data = {"id": "NHdWaS81RHRueGs9"}  # id of the project being queried

# The payload is serialised by hand because the Content-Type header declares a
# JSON body. A timeout keeps the script from hanging forever on a dead
# connection, and raise_for_status() surfaces HTTP errors before parsing.
response = requests.post(url, headers=headers, data=json.dumps(data), timeout=30)
response.raise_for_status()
req = json.loads(response.text)["data"]["projectTeamList"]
print(req)
if __name__ == "__main__":  # main program
    # Header row: project columns first, then team-member columns, in the same
    # order in which Json2csv emits the cells of each row.
    Varname = [
        "imageS", "endDate", "initiator", "imageM", "initiatorLogo", "title",
        "financingRate", "preheatdateEnd", "investUpMoney", "remainDays",
        "payType", "projectProgress", "investFinancingRate", "pType", "raise",
        "statusName", "id", "address", "launch", "upMoney", "financingType",
        "financingMoney", "pjtBreif", "finishDate", "investmentType", "status",
        "brief", "picUrl", "name", "logo", "Person_id", "sort", "content",
    ]
    csv_path = "E:\\多彩投.csv"

    # "w" truncates any previous output. The handle is closed before the loop
    # starts because Json2csv re-opens the same file in append mode.
    with open(csv_path, "w", encoding="gb18030") as f:
        f.write("\t".join(Varname) + "\n")

    for i in range(1, 98):  # list pages 1 through 97
        ProjectInfo = GetSinglePage(i)
        for EachProject in ProjectInfo:
            PersonInfo = GetSingleProject(EachProject["id"])
            # One output row per (project, team member) pair.
            for EachPerson in PersonInfo:
                Json2csv(csv_path, EachProject, EachPerson)
三、信息的获取与导入
# Append one project's fields to the CSV as a single tab-separated row.
with open("E://多彩投.csv", "a", encoding="gb18030") as f:
    tempInfo = []
    for EachVar in Project_VarName:  # Project_VarName: list of project field names
        if EachVar in ProjectInfo:  # the record actually carries this field
            # Line breaks and tabs are removed so a value cannot split the row
            # or shift the columns.
            tempInfo.append(str(ProjectInfo[EachVar]).replace("\n", "").replace("\t", "").replace("\r", ""))
        else:
            # Missing field: emit an empty cell to keep the columns aligned.
            tempInfo.append("")
    tempInfo = "\t".join(tempInfo) + "\n"
    print(tempInfo)
    f.write(tempInfo)
import requests
import json
def GetSinglePage(i, timeout=30):  # first-level crawler wrapped in a function
    """Fetch one page of the project list.

    Args:
        i: Page number; sent to the API as the string value of ``pageNum``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``["data"]["list"]`` in the JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks the ``data``/``list`` keys.
    """
    url = "https://pcapi.duocaitou.com/pc/project/goProject"
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        # The Referer stays fixed at page 1 regardless of the page requested.
        "Referer": "https://www.duocaitou.com/project?pageNum=1",
        "Sec-Fetch-Mode": "cors",
        # "likeGecko" corrected to "like Gecko" so the value matches the
        # User-Agent string a real Chrome 76 sends.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    }
    data = {"pageNum": str(i), "pageSize": "9"}  # a different i selects a different page
    # Without a timeout a stalled connection would block the crawl forever;
    # raise_for_status() reports HTTP errors before the body is parsed.
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
    response.raise_for_status()
    return json.loads(response.text)["data"]["list"]
def GetSingleProject(id, timeout=30):  # second-level crawler wrapped in a function
    """Fetch the team list of a single project.

    Args:
        id: Project identifier sent to the API as ``id``. The parameter name
            shadows the builtin but is kept so keyword callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``["data"]["projectTeamList"]`` in the JSON
        response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks the ``data``/``projectTeamList`` keys.
    """
    url = "https://pcapi.duocaitou.com/pc/project/getProjectSummary"
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        # The Referer stays fixed at one project's detail page for every id.
        "Referer": "https://www.duocaitou.com/project/detail/NHdWaS81RHRueGs9",
        "Sec-Fetch-Mode": "cors",
        # Space restored between "x64)" and "AppleWebKit" so the value matches
        # the User-Agent string a real Chrome 76 sends.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    }
    data = {"id": id}  # a different project has a different id
    # Without a timeout a stalled connection would block the crawl forever;
    # raise_for_status() reports HTTP errors before the body is parsed.
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
    response.raise_for_status()
    return json.loads(response.text)["data"]["projectTeamList"]
# Deletes the characters that would break the row/column layout of the output.
_CELL_STRIP = str.maketrans("", "", "\n\t\r")


def _clean_cell(record, key):
    """Return ``record[key]`` as text without tabs or line breaks, or "" if absent."""
    if key not in record:
        return ""
    return str(record[key]).translate(_CELL_STRIP)


def Json2csv(FileName, ProjectInfo, PersonInfo):  # file-writing step wrapped in a function
    """Append one tab-separated row describing a project and one team member.

    The row holds the project fields followed by the person fields, in the
    fixed order of the two name lists below. A field missing from its record
    becomes an empty cell so that columns stay aligned. The row is also
    echoed to stdout.

    Args:
        FileName: Path of the output file; opened in append mode with the
            gb18030 encoding.
        ProjectInfo: Mapping holding the project fields.
        PersonInfo: Mapping holding the team-member fields.
    """
    Project_VarName = [
        "imageS", "endDate", "initiator", "imageM", "initiatorLogo", "title",
        "financingRate", "preheatdateEnd", "investUpMoney", "remainDays",
        "payType", "projectProgress", "investFinancingRate", "pType", "raise",
        "statusName", "id", "address", "launch", "upMoney", "financingType",
        "financingMoney", "pjtBreif", "finishDate", "investmentType", "status",
    ]
    Person_VarName = ["brief", "picUrl", "name", "logo", "id", "sort", "content"]

    # Project cells first, then person cells.
    tempInfo = [_clean_cell(ProjectInfo, EachVar) for EachVar in Project_VarName]
    tempInfo += [_clean_cell(PersonInfo, EachVar) for EachVar in Person_VarName]
    row = "\t".join(tempInfo) + "\n"

    print(row)
    with open(FileName, "a", encoding="gb18030") as f:
        f.write(row)
if __name__ == "__main__":  # main program
    # Header row: project columns first, then team-member columns, in the same
    # order in which Json2csv emits the cells of each row.
    Varname = [
        "imageS", "endDate", "initiator", "imageM", "initiatorLogo", "title",
        "financingRate", "preheatdateEnd", "investUpMoney", "remainDays",
        "payType", "projectProgress", "investFinancingRate", "pType", "raise",
        "statusName", "id", "address", "launch", "upMoney", "financingType",
        "financingMoney", "pjtBreif", "finishDate", "investmentType", "status",
        "brief", "picUrl", "name", "logo", "Person_id", "sort", "content",
    ]
    csv_path = "E:\\多彩投.csv"

    # "w" truncates any previous output. The handle is closed before the loop
    # starts because Json2csv re-opens the same file in append mode.
    with open(csv_path, "w", encoding="gb18030") as f:
        f.write("\t".join(Varname) + "\n")

    for i in range(1, 98):  # list pages 1 through 97
        ProjectInfo = GetSinglePage(i)
        for EachProject in ProjectInfo:
            PersonInfo = GetSingleProject(EachProject["id"])
            # One output row per (project, team member) pair.
            for EachPerson in PersonInfo:
                Json2csv(csv_path, EachProject, EachPerson)
关于我们
微信公众号“Stata and Python数据分析”分享实用的Stata、Python等软件的数据处理知识,欢迎转载、打赏。我们是由李春涛教授领导、研究生及本科生组成的大数据处理和分析团队。
1)必须原创,禁止抄袭;
2)必须准确,详细,有例子,有截图;
注意事项:
1)所有投稿都会经过本公众号运营团队成员的审核,审核通过才可录用,一经录用,会在该推文里为作者署名,并有赏金分成。
2)邮件请注明投稿,邮件名称为“投稿+推文名称”。
3)应广大读者要求,现开通有偿问答服务,如果大家遇到有关数据处理、分析等问题,可以在公众号中提出,只需支付少量赏金,我们会在后期的推文里给予解答。