其他
R可视化04|ggplot2图层-统计变换图层(stat layer)
"pythonic生物人"的第102篇分享
本文介绍ggplot2的一些统计变换方式,续前几篇。
目录
1、误差线和误差范围(Revealing uncertainty)
2、加权数据(Weighted Data)
3、展示数据分布(Displaying Distributions)
histogram及density展示一维连续数据分布
boxplot和violin展示多个连续&&离散变量的分布
三维数据分布
4、解决散点图遮盖问题(overplotting)
统计变换(stat):以特定的方式对数据进行汇总,然后再绘图。
1、误差线和误差范围(Revealing uncertainty)
ggplot2依据变量是连续型(continuous)还是离散型(discrete),以及是只展示区间还是同时展示区间和中间值,提供了四类geom来解决不确定性展示(Revealing uncertainty)问题。
离散型变量&&区间: geom_errorbar()、geom_linerange()
离散型变量&&区间&&中间值: geom_crossbar()、geom_pointrange()
连续型变量&&区间: geom_ribbon()
连续型变量&&区间&&中间值: geom_smooth(stat = "identity")
# Attach ggplot2 explicitly: every plot below calls ggplot()/geom_*(), and the
# original snippet only attached gridExtra.
library(ggplot2)
library(gridExtra)

# Toy data: three values (y) with a standard error (se) for each.
y <- c(18, 11, 16)
df <- data.frame(x = 1:3, y = y, se = c(1.2, 0.5, 1.0))

# The interval geoms used below need ymin/ymax, so map them once here.
base <- ggplot(df, aes(x, y, ymin = y - se, ymax = y + se))

# Interval together with the centre value.
p2 <- base + geom_crossbar()
p3 <- base + geom_pointrange()
p4 <- base + geom_smooth(stat = "identity")

# Interval only.
p5 <- base + geom_errorbar()
p6 <- base + geom_linerange()
p7 <- base + geom_ribbon()

# Lay the six plots out on a 3-row grid.
grid.arrange(p2, p3, p4, p5, p6, p7, nrow = 3)
2、加权数据(Weighted Data)
处理整合数据(aggregated data)的时候,数据的每一行代表了多种特征【一行为多列,每列为一个特征】,这时我们需要通过某种方式把某个特征(例如人口总数)的影响考虑在内,这就是加权。下面举几个例子,可能会更清楚什么叫做加权。这里使用数据集midwest,为一个437x28的数据集,包含28个特征。
> head(midwest)
# A tibble: 6 x 28
PID county state area poptotal popdensity popwhite popblack
<int> <chr> <chr> <dbl> <int> <dbl> <int> <int>
1 561 ADAMS IL 0.052 66090 1271. 63917 1702
2 562 ALEXA~ IL 0.014 10626 759 7054 3496
3 563 BOND IL 0.022 14991 681. 14477 429
4 564 BOONE IL 0.017 30806 1812. 29344 127
5 565 BROWN IL 0.018 5836 324. 5264 547
6 566 BUREAU IL 0.05 35688 714. 35157 50
# ... with 20 more variables: popamerindian <int>, popasian <int>,
# popother <int>, percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
# percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
# percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
# percpovertyknown <dbl>, percbelowpoverty <dbl>,
# percchildbelowpovert <dbl>, percadultpoverty <dbl>,
# percelderlypoverty <dbl>, inmetro <int>, category <chr>
> dim(midwest)
[1] 437 28
For simple geoms like lines and points, use the size aesthetic.
# For simple geoms such as lines and points, weighting is expressed through
# the size aesthetic.

# Unweighted scatter plot.
p1 <- ggplot(data = midwest, mapping = aes(x = percwhite, y = percbelowpoverty)) +
  geom_point()

# Same scatter plot, point area scaled by population (in millions).
p2 <- ggplot(data = midwest, mapping = aes(x = percwhite, y = percbelowpoverty)) +
  geom_point(mapping = aes(size = poptotal / 1e6)) +
  scale_size_area(name = "Population\n(millions)", breaks = c(0.5, 1, 2, 4))

# Show both plots side by side.
grid.arrange(p1, p2, nrow = 1)
For more complicated grobs which involve some statistical transformation, we specify weights with the weight aesthetic.
# For geoms that involve a statistical transformation, weights are supplied
# through the `weight` aesthetic.

# Unweighted linear fit.
p1 <- ggplot(midwest, aes(percwhite, percbelowpoverty)) +
  geom_point() +
  geom_smooth(method = lm, size = 1)
#> `geom_smooth()` using formula 'y ~ x'

# Linear fit weighted by population; point area also shows population.
p2 <- ggplot(midwest, aes(percwhite, percbelowpoverty)) +
  geom_point(aes(size = poptotal / 1e6)) +
  geom_smooth(aes(weight = poptotal), method = lm, size = 1) +
  scale_size_area(guide = "none")
#> `geom_smooth()` using formula 'y ~ x'

# Unweighted histogram: each bar counts counties.
p3 <- ggplot(midwest, aes(percbelowpoverty)) +
  geom_histogram(binwidth = 1) +
  ylab("Counties")

# Weighted histogram: each bar sums population. The weights are divided by
# 1e3 so the bar heights really are in thousands, as the axis label states
# (the original summed raw head counts under a "(1000s)" label).
p4 <- ggplot(midwest, aes(percbelowpoverty)) +
  geom_histogram(aes(weight = poptotal / 1e3), binwidth = 1) +
  ylab("Population (1000s)")

# 2 x 2 grid: fits on the top row, histograms on the bottom row.
grid.arrange(p1, p2, p3, p4, nrow = 2)
3、展示数据分布(Displaying Distributions)
这里使用diamonds数据集,为一个53940x10的数据集,每一行为一颗钻石,记录钻石属性数据,如4个C(克拉、切工、颜色、净度,为钻石最重要的质量指标)及其它6个属性。
> head(diamonds)
# A tibble: 6 x 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
> dim(diamonds)
[1] 53940 10
histogram及density展示一维连续数据分布
# Histogram with the default binning.
p1 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram()

# Histogram with narrow bins, restricting the x axis to 55-70 to zoom in on
# that part of the data.
p2 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(binwidth = 0.1) +
  xlim(55, 70)

# Frequency polygons, one line per cut; legend hidden.
p3 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 0.1, na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# Conditional density plot: histogram bars stacked and rescaled with
# position = "fill".
p4 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(
    mapping = aes(fill = cut),
    binwidth = 0.1,
    position = "fill",
    na.rm = TRUE
  ) +
  xlim(58, 68) +
  theme(legend.position = "none")

# Kernel density estimate of depth.
p5 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_density(na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# Kernel density estimate per cut, semi-transparent so the groups overlap.
p6 <- ggplot(data = diamonds, mapping = aes(x = depth, fill = cut, colour = cut)) +
  geom_density(alpha = 0.2, na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# Arrange the six plots on a 3-row grid.
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)
boxplot和violin展示多个连续&&离散变量的分布
# Boxplot of depth for each (discrete) clarity level.
p1 <- ggplot(data = diamonds, mapping = aes(x = clarity, y = depth)) +
  geom_boxplot()

# Boxplot of depth against continuous carat: cut_width() bins carat into
# intervals of width 0.1, and each interval becomes one box.
p2 <- ggplot(data = diamonds, mapping = aes(x = carat, y = depth)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, width = 0.1))) +
  xlim(NA, 2.05)

# Violin plot of depth for each clarity level.
p3 <- ggplot(data = diamonds, mapping = aes(x = clarity, y = depth)) +
  geom_violin()

# Violin plot of depth against carat, binned the same way as p2.
p4 <- ggplot(data = diamonds, mapping = aes(x = carat, y = depth)) +
  geom_violin(mapping = aes(group = cut_width(carat, width = 0.1))) +
  xlim(NA, 2.05)
#> Warning: Removed 997 rows containing non-finite values (stat_ydensity).

# 2 x 2 grid: boxplots on top, violins below.
grid.arrange(p1, p2, p3, p4, nrow = 2)
三维数据分布
# Figure size/resolution options (repr.plot.* — presumably read by the
# Jupyter R kernel; confirm for other front ends).
options(repr.plot.width = 8, repr.plot.height = 3, repr.plot.res = 250)

# Bubble plots work better with fewer observations, so keep every 10th row.
small <- faithfuld[seq(1, nrow(faithfuld), by = 10), ]
p1 <- ggplot(small, aes(eruptions, waiting)) +
  geom_point(aes(size = density), alpha = 1/3) +
  scale_size_area()

# Contour plot. after_stat(level) refers to the `level` variable computed by
# the stat; it replaces the deprecated `..level..` notation
# (requires ggplot2 >= 3.3.0).
p2 <- ggplot(faithfuld, aes(eruptions, waiting)) +
  geom_contour(aes(z = density, colour = after_stat(level)))

# Raster (heat-map) plot with density mapped to fill.
p3 <- ggplot(faithfuld, aes(eruptions, waiting)) +
  geom_raster(aes(fill = density))

# Show the three plots side by side.
grid.arrange(p1, p2, p3, nrow = 1)
4、解决散点图遮盖问题(overplotting)
# Figure size/resolution options (repr.plot.* — presumably read by the
# Jupyter R kernel; confirm for other front ends).
options(repr.plot.width = 7, repr.plot.height = 8, repr.plot.res = 250)

# 2000 random points drawn from two standard normals; shared base plot with
# both axis titles removed.
df <- data.frame(x = rnorm(n = 2000), y = rnorm(n = 2000))
norm <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  xlab(NULL) +
  ylab(NULL)

# Smaller or hollow point shapes.
p1 <- norm + geom_point()
p2 <- norm + geom_point(shape = 1)   # hollow circles
p3 <- norm + geom_point(shape = ".") # pixel-sized points

# Transparency.
p4 <- norm + geom_point(alpha = 1 / 3)
p5 <- norm + geom_point(alpha = 1 / 5)
p6 <- norm + geom_point(alpha = 1 / 10)

# 2D binning: count points in rectangular (bin2d) or hexagonal (hex) bins.
p7 <- norm + geom_bin2d()
p8 <- norm + geom_bin2d(bins = 10)
p9 <- norm + geom_hex()
p10 <- norm + geom_hex(bins = 10)

# Arrange all ten plots on a 4-row grid.
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, nrow = 4)
参考资料:https://ggplot2-book.org/statistical-summaries.html
本文结束,往期文章
有任何意见请移步到QQ群629562529反馈,一起进步哈!