其他
只会JavaScript,也可以在浏览器里写爬虫
今天在udemy里看到一个PostgreSQL的教程,Learn SQL Using PostgreSQL: From Zero to Hero(通过PostgreSQL学SQL,从入门到精通),参与的学生还不少。
因为对他的课程章节安排比较感兴趣,就想着copy下来研究,又懒得一条条拷贝、黏贴,因为是一次性的工作,加之又只有简单的脚本交互,就懒得在Python里折腾了,于是有了下面这段脚本。
第一步,在浏览器里打开网页
本次的目标网址,https://www.udemy.com/course/postgresql-from-zero-to-hero/
第二步,鼠标右键,点Inspect
第三步,切换到Console Tab
第四步,写JavaScript,实现你的功能,
// Scrapes the curriculum of the Udemy course page that is currently open.
// Meant to be pasted into the browser DevTools console; the result is left in
// the global `dict` and, when available, copied to the clipboard.

/**
 * Returns the text of the first descendant of `root` that carries `className`,
 * or '' when there is no such descendant.
 *
 * @param {Element|Document} root - node to search below
 * @param {string} className - class name(s) passed to getElementsByClassName
 * @returns {string} textContent of the first match, or ''
 */
function firstText(root, className) {
  var matches = root.getElementsByClassName(className);
  return matches.length > 0 ? matches[0].textContent : '';
}

/**
 * Clicks the "sections-toggle" control when its label reads '展开' (expand),
 * i.e. when the course content is not fully expanded yet. Does nothing when
 * the control is missing or shows any other label.
 *
 * @param {Document} doc - document of the course page
 */
function expandAllSections(doc) {
  var toggle = doc.getElementsByClassName('sections-toggle')[0];
  // The label comparison only matches the Chinese UI text.
  if (toggle && toggle.innerText === '展开') {
    toggle.click();
  }
}

/**
 * Builds one record per lecture found in the expanded lecture containers.
 *
 * @param {Document} doc - document of the course page
 * @returns {Array<{section: string, title: string, desc: string, timing: string}>}
 *   records in page order; a field is '' when its element is absent
 */
function collectLectures(doc) {
  var containers = doc.getElementsByClassName('lectures-container collapse in');
  var records = [];
  for (var s = 0; s < containers.length; s++) {
    var container = containers[s];
    // Element-only accessors: childNodes/previousSibling also yield whitespace
    // text nodes, which have no getElementsByClassName and would throw.
    var header = container.previousElementSibling;
    var section = header ? firstText(header, 'section-title-text') : '';
    var rows = container.children;
    for (var r = 0; r < rows.length; r++) {
      records.push({
        section: section,
        title: firstText(rows[r], 'title'),
        desc: firstText(rows[r], 'description collapse'),
        timing: firstText(rows[r], 'content-summary')
      });
    }
  }
  return records;
}

// `dict` is declared with var on purpose: it must remain a global so that
// later console snippets can read it, and the script can be pasted repeatedly.
var dict = [];
if (typeof document !== 'undefined') {
  // NOTE(review): if the page renders the expanded sections asynchronously
  // after the click, the scrape below may need to wait — confirm on the page.
  expandAllSections(document);
  dict = collectLectures(document);
  // copy() is a DevTools console utility; skip it where it does not exist.
  if (typeof copy === 'function') {
    copy(dict);
  }
}
结果这样(显示部分内容),可以黏贴到记事本,并保存为json,然后用Excel读取,当然也可以生成简单的csv格式,这个看个人需要。
[
{
"section": "Introduction",
"title": "Introduction ",
"desc": "Why learn SQL and what are the major databases. Explains what tables, fields and rows are in the context of databases.",
"timing": "05:04"
},
{
"section": "Installing PostgreSQL and pgAdmin",
"title": "Installing PostgreSQL on Mac and Windows ",
"desc": "How to install PostgreSQL using EnterpriseDB installer",
"timing": "02:47"
},
{
"section": "Installing PostgreSQL and pgAdmin",
"title": "Installing PostgreSQL on Ubuntu ",
"desc": "Get up and running with PostgreSQL 11 and pgAdmin 4 on Ubuntu.",
"timing": "05:59"
},
{
"section": "Installing PostgreSQL and pgAdmin",
"title": "Install Northwind Database ",
"desc": "How to install Northwind database using pgAdmin using the restore feature and northwind.tar file.",
"timing": "03:16"
},
{
"section": "Installing PostgreSQL and pgAdmin",
"title": "Install Some Additional Databases. ",
"desc": "Add 3 more databases to learn from.",
"timing": "00:19"
}
]
这个代码,经过完善,可以配合PhantomJS实现自动化
/**
 * Installs console.save(data, filename): downloads `data` as a local file
 * from the browser.
 *
 * - data: value to save; objects are serialized with JSON.stringify using a
 *   4-space indent, anything else is written as-is. Falsy data logs an error
 *   and nothing is downloaded.
 * - filename: download name; defaults to 'console.json'.
 */
(function (console) {
  // Delay before the blob URL is released. Revoking it right after the click
  // could cancel a download that has not started reading the blob yet.
  var REVOKE_DELAY_MS = 40000;

  console.save = function (data, filename) {
    if (!data) {
      console.error('Console.save: No data');
      return;
    }

    var name = filename || 'console.json';
    var payload = typeof data === 'object' ? JSON.stringify(data, undefined, 4) : data;

    var blob = new Blob([payload], { type: 'text/json' });
    var url = window.URL.createObjectURL(blob);

    var link = document.createElement('a');
    link.download = name;
    link.href = url;
    link.dataset.downloadurl = ['text/json', link.download, link.href].join(':');

    // MouseEvent constructor replaces the deprecated createEvent/initMouseEvent
    // pair; same event settings (bubbling, non-cancelable click on this window).
    link.dispatchEvent(new MouseEvent('click', {
      bubbles: true,
      cancelable: false,
      view: window
    }));

    // Release the object URL so the blob is not kept alive for the page's lifetime.
    setTimeout(function () {
      window.URL.revokeObjectURL(url);
    }, REVOKE_DELAY_MS);
  };
})(console);
利用上面这个函数,还可以实现自动保存为json文件到本地,在浏览器里实现自动化。
// Download the scraped lecture records held in `dict` as a local JSON file via the console.save helper.
console.save(dict, "udemy-postgresql-from-zero-to-hero.json")