其他
使用 R 语言爬取宗祠网
内容比较简单,感兴趣的小伙伴可以对照讲义材料自学一下~有问题可以及时跟李老师说。
最近有小伙伴想爬取宗祠网上的宗祠列表数据。
该数据爬取比较简单,使用 rvest 包即可完成:
library(tidyverse)
library(rvest)
# Fetch the listing page of the ancestral-hall site.
html <- read_html("http://www.100citang.cn/zongciweb/citang")

# Build one row per hall from the name and address nodes of the page.
df <- tibble(
  name = html_text(html_elements(html, "h5.name")),
  address = html_text(html_elements(html, "p.address"))
)
df
然后我们可以使用地理编码获取每个祠堂的经纬度:
这部分内容可以通过学习 R 语言地理编码课程来掌握:
使用 R 语言进行地理编码:地址解析经纬度、坐标转换 & 根据经纬度判断所处的省市区县:https://rstata.duanshu.com/#/brief/course/6f8633b486ed49e398c486d0d9f0f59a
# Resolve coordinates: query the Baidu geocoding API once per hall and cache
# each JSON response as rds/<id>.json so that re-runs only fetch missing rows.
df <- df %>%
  mutate(id = row_number())

library(parallel)

# Run the downloads in parallel on a cluster of 5 workers. `cl` is left open
# because later parLapply() calls in this script reuse it.
cl <- makeCluster(5)

# The cache directory may already exist from an earlier run; do not warn.
dir.create("rds", showWarnings = FALSE)

# Make the pipe operator and the hall table available on the workers.
clusterExport(cl, "%>%")
clusterExport(cl, "df")

# seq_len() yields an empty index for a zero-row table, where 1:nrow(df)
# would iterate over c(1, 0).
res <- parLapply(cl, seq_len(nrow(df)), function(x) {
  # try() keeps one failed request from aborting the whole batch.
  try({
    json_path <- paste0("rds/", df$id[x], ".json")
    # Skip rows whose response has already been cached.
    if (!file.exists(json_path)) {
      # Percent-encode the hall name (as UTF-8) so that non-ASCII and
      # reserved characters cannot corrupt the query string.
      address <- utils::URLencode(enc2utf8(df$name[x]), reserved = TRUE)
      # The `ak` value below is a placeholder for a Baidu geocoding key.
      download.file(
        paste0("https://api.map.baidu.com/geocoding/v3/?address=", address,
               "&output=json&ak=百度地理编码密钥&ret_coordtype=gcj02ll"),
        json_path
      )
    }
  })
})
# List every cached geocoding response and report how many there are.
ls <- fs::dir_ls("rds")
length(ls)

# Parse each response on the workers: one row per file holding the file path
# and the `result$location` values joined by a comma.
df2 <- parLapply(cl, ls, function(x) {
  lst <- jsonlite::fromJSON(x)
  loc <- lst$result$location %>%
    paste0(collapse = ",")
  tibble::tibble(id = x, location = loc)
}) %>%
  bind_rows()

# Recover the numeric row id from the file name, keep only rows whose location
# contains a comma and no "rror" text, then sort by id.
df2 <- df2 %>%
  mutate(addid = str_remove_all(basename(id), "\\.json")) %>%
  filter(str_detect(location, ","), !str_detect(location, "rror")) %>%
  select(id = addid, location) %>%
  type_convert() %>%
  arrange(id)

# NOTE: this overwrites `df` with only the halls that have no usable
# geocoding result in `df2`.
df <- anti_join(df, df2)
# Load the coordinate-conversion helpers.
# NOTE(review): GCJ02_WGS84() is presumably defined in this file and returns
# a "lng,lat" string — confirm.
source("坐标转换.R")

# Convert each "lng,lat" pair with GCJ02_WGS84() and keep id, lat and lng
# (plus any other columns carried in df2).
dfall2 <- df2 %>%
  separate(location, into = c("lng", "lat"), sep = ",", remove = FALSE) %>%
  mutate(value2 = map2_chr(as.numeric(lng), as.numeric(lat), GCJ02_WGS84)) %>%
  separate(value2, into = c("经度", "纬度"), sep = ",") %>%
  type_convert() %>%
  # Drop the original pair and the raw string; only converted values remain.
  select(-c(lat, lng, location)) %>%
  select(id, lat = 纬度, lng = 经度, everything())
dfall2
然后还可以根据经纬度判断所处的省市区县:
library(sf)
# Albers equal-area projection string shared by the county layer and the points.
mycrs <- "+proj=aea +lat_0=0 +lon_0=105 +lat_1=25 +lat_2=47 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

# County-level boundaries, projected into `mycrs`.
county <- st_transform(read_sf("2021行政区划/县.shp"), mycrs)

# Turn the coordinates into points (EPSG:4326), project them into `mycrs`,
# intersect them with the county polygons, then drop the geometry column.
dfall3 <- dfall2 %>%
  st_as_sf(coords = c("lng", "lat"), crs = 4326, remove = FALSE) %>%
  st_transform(mycrs) %>%
  st_intersection(county) %>%
  st_drop_geometry()
dfall3

# Drop the columns whose names contain "类型", join with dfall2 and save as a
# Stata file.
dfall3 %>%
  select(-contains("类型")) %>%
  left_join(dfall2) %>%
  haven::write_dta("经纬度解析结果.dta")
# Read a table with 经度/纬度 columns, attach county attributes by spatial
# intersection, and export the final Stata file.
# NOTE(review): this reads "含经度纬度结果.dta", not the "经纬度解析结果.dta"
# file written just above — confirm which input is intended.
dfall3 <- haven::read_dta("含经度纬度结果.dta") %>%
  st_as_sf(coords = c("经度", "纬度"), crs = 4326, remove = FALSE) %>%
  # Project into the same CRS as `county`: st_intersection() stops when the
  # two layers have different CRS, which is what st_transform(3055) caused.
  st_transform(mycrs) %>%
  st_intersection(county) %>%
  st_drop_geometry()

# NOTE(review): the select() below needs `id`, `name`, `address`, `lng` and
# `lat` to exist after the join — confirm the input file and `df` provide them.
dfall3 %>%
  select(-contains("类型")) %>%
  left_join(df) %>%
  select(ID = id, 宗祠名称 = name, 地址 = address, 经度 = lng, 纬度 = lat, everything()) %>%
  haven::write_dta("宗祠网宗祠列表、经纬度及其所处的省市区县.dta", label = "数据处理:微信公众号 RStata")

# Read the exported file back to inspect it.
haven::read_dta("宗祠网宗祠列表、经纬度及其所处的省市区县.dta")
另外,该网站也有可以直接爬取经纬度的页面:http://www.100citang.cn/zongciweb/ditu
library(tidyverse)
library(httr)
# Session cookie sent with the request.
cookies <- c("JSESSIONID" = "18B2BC3B8D1344DDA40E77CB96BA81CB")

# HTTP headers sent with the request.
headers <- c(
  "Accept" = "*/*",
  "Accept-Language" = "zh-CN,zh;q=0.9,en;q=0.8",
  "Connection" = "keep-alive",
  "Content-Type" = "application/json;charset=UTF-8",
  "Origin" = "http://www.100citang.cn",
  "Referer" = "http://www.100citang.cn/zongciweb/ditu",
  "User-Agent" = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "X-Requested-With" = "XMLHttpRequest"
)

# JSON request body that looks up one hall by name.
data <- '{"address":"","name":"长乐安氏宗祠"}'

res <- httr::POST(
  url = "http://www.100citang.cn/zongciweb/findClanHallByAddress",
  httr::add_headers(.headers = headers),
  httr::set_cookies(.cookies = cookies),
  body = data,
  config = httr::config(ssl_verifypeer = FALSE)
)

# Browse the parsed response interactively.
listviewer::jsonedit(content(res))
然后就可以循环爬取所有的了:
library(tidyverse)
library(rvest)
# Fetch the listing page again and rebuild the full hall table.
html <- read_html("http://www.100citang.cn/zongciweb/citang")

# One row per hall: name and address taken from their page nodes.
df <- tibble(
  name = html_text(html_elements(html, "h5.name")),
  address = html_text(html_elements(html, "p.address"))
)
# Query the site's lookup endpoint once per hall name on the existing cluster
# `cl`, then tidy the responses into one row per hall id.

# The worker function reads `headers` and `cookies`, so ship them first.
clusterExport(cl, "headers")
clusterExport(cl, "cookies")

df2 <- parLapply(cl, df$name, function(x) {
  data <- paste0('{"address":"","name":"', x, '"}')
  res <- httr::POST(
    url = "http://www.100citang.cn/zongciweb/findClanHallByAddress",
    httr::add_headers(.headers = headers),
    httr::set_cookies(.cookies = cookies),
    body = data,
    config = httr::config(ssl_verifypeer = FALSE)
  )
  # Nested calls instead of a pipe, so the worker does not depend on `%>%`
  # having been exported to it beforehand.
  tibble::as_tibble(purrr::transpose(httr::content(res)))
}) %>%
  bind_rows()

# This is the last use of the cluster in the script; release the workers.
parallel::stopCluster(cl)

df2a <- df2 %>%
  select(id, name, fulladdress, coordinate) %>%
  # Name the list-columns explicitly: calling unnest() without `cols` is
  # deprecated and raises a warning.
  unnest(cols = c(id, name, fulladdress, coordinate)) %>%
  # Keep the first row for each hall id.
  distinct(id, .keep_all = TRUE) %>%
  separate(coordinate, into = c("lng", "lat"), sep = ",") %>%
  type_convert()
# Intersect the halls that have a longitude with the county polygons, drop the
# geometry, and remove the columns whose names contain "类型".
df3a <- df2a %>%
  filter(!is.na(lng)) %>%
  st_as_sf(coords = c("lng", "lat"), crs = 4326, remove = FALSE) %>%
  st_transform(mycrs) %>%
  st_intersection(county) %>%
  st_drop_geometry() %>%
  select(-contains("类型"))
df3a

# Start from every hall in df2a, attach the county attributes from df3a where
# available, and save the result as a Stata file.
df2a %>%
  left_join(df3a) %>%
  haven::write_dta(
    "宗祠网直接爬取到的经纬度数据.dta",
    label = "数据爬取:微信公众号 RStata"
  )
这个数据可以结合之前这个课程里面的结果使用:
使用 Stata 爬取全国宗庙祠堂数据并绘制地图展示:https://rstata.duanshu.com/#/brief/course/1a86473ee3c14a1c8f4a1ac55d35d05d
课程信息
内容比较简单,感兴趣的小伙伴可以对照讲义材料自学一下~有问题可以及时跟李老师说。就不再视频讲解了~
直播地址:腾讯会议(需要报名 RStata 培训班参加) 讲义材料:需要购买 RStata 名师讲堂会员,详情可阅读:一起来学习 R 语言和 Stata 啦!学习过程中遇到的问题也可以随时提问!
关于 RStata 会员的更多信息可添加微信号 r_stata 咨询:
附件下载(点击文末的阅读原文即可跳转):
https://rstata.duanshu.com/#/brief/course/2c499d2843b34885a8b034d280182044