友链
导航
These are the good times in your life,
so put on a smile and it'll be alright
友链
导航
# 原始数据 > states <- state.x77[,1:6] > head(states) Population Income Illiteracy Life Exp Murder HS Grad Alabama 3615 3624 2.1 69.05 15.1 41.3 Alaska 365 6315 1.5 69.31 11.3 66.7 Arizona 2212 4530 1.8 70.55 7.8 58.1 Arkansas 2110 3378 1.9 70.66 10.1 39.9 California 21198 5114 1.1 71.71 10.3 62.6 Colorado 2541 4884 0.7 72.06 6.8 63.9 # 计算协方差 > cov(states) Population Income Illiteracy Life Exp Murder Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714 Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286 Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776 Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480 Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465 HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616 HS Grad Population -3551.509551 Income 3076.768980 Illiteracy -3.235469 Life Exp 6.312685 Murder -14.549616 HS Grad 65.237894 # 计算相关度 Correlation # cor(x, y = NULL, use = "everything", # method = c("pearson", "kendall", "spearman")) # 默认 method = "pearson" # "pearson" 积差相关系数 衡量两个 定量变量间 的线性相关程度 > cor(states) Population Income Illiteracy Life Exp Murder HS Grad Population 1.00000000 0.2082276 0.1076224 -0.06805195 0.3436428 -0.09848975 Income 0.20822756 1.0000000 -0.4370752 0.34025534 -0.2300776 0.61993232 Illiteracy 0.10762237 -0.4370752 1.0000000 -0.58847793 0.7029752 -0.65718861 Life Exp -0.06805195 0.3402553 -0.5884779 1.00000000 -0.7808458 0.58221620 Murder 0.34364275 -0.2300776 0.7029752 -0.78084575 1.0000000 -0.48797102 HS Grad -0.09848975 0.6199323 -0.6571886 0.58221620 -0.4879710 1.00000000 # "spearman" 等级相关系数 衡量分级 定序变量间 的相关程度 > cor(states, method="spearman") Population Income Illiteracy Life Exp Murder HS Grad Population 1.0000000 0.1246098 0.3130496 -0.1040171 0.3457401 -0.3833649 Income 0.1246098 1.0000000 -0.3145948 0.3241050 -0.2174623 0.5104809 Illiteracy 0.3130496 -0.3145948 1.0000000 -0.5553735 0.6723592 -0.6545396 Life Exp -0.1040171 0.3241050 -0.5553735 1.0000000 -0.7802406 0.5239410 Murder 0.3457401 -0.2174623 0.6723592 -0.7802406 1.0000000 -0.4367330 HS Grad -0.3833649 0.5104809 -0.6545396 0.5239410 -0.4367330 1.0000000 # 显著性检验(文盲率与谋杀率) > cor.test(states[,3], states[,5]) Pearson's product-moment correlation data: states[, 3] and states[, 5] t = 6.8479, df = 48, p-value = 1.258e-08 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.5279280 0.8207295 sample estimates: cor 0.7029752
R 中提供了一些测试数据, 一些文档中会使用, 所以在文档中看到 context 中没定义的变量, 可以在 R 里 help() 下, 看看是不是测试数据. 比如:
# 用 help 方法 > help(axis)
# 对于无 th 的文件 > mydataframe <- read.table('Desktop/r1.csv', header=FALSE, sep=",") > mydataframe V1 V2 1 Mar 16 2014 09:50:46 0.7 2 Mar 16 2014 09:50:56 0.7 3 Mar 16 2014 09:51:06 0.7 4 Mar 16 2014 09:51:16 0.7 5 Mar 16 2014 09:51:26 0.7 # 对于有 th 的文件 # 导入表格一般就用这种方法, 对之后的操作更方便 > mydataframe <- read.table('Desktop/r1.csv', header=TRUE, sep=",") > mydataframe datetime write 1 Mar 16 2014 09:50:46 0.7 2 Mar 16 2014 09:50:56 0.7 3 Mar 16 2014 09:51:06 0.7 4 Mar 16 2014 09:51:16 0.7 5 Mar 16 2014 09:51:26 0.7 # 以某行为 key > mydataframe <- read.table('Desktop/r1.csv', header=TRUE, sep=",", row.names="datetime") > mydataframe write Mar 16 2014 09:50:46 0.7 Mar 16 2014 09:50:56 0.7 Mar 16 2014 09:51:06 0.7 Mar 16 2014 09:51:16 0.7 Mar 16 2014 09:51:26 0.7
# 读入数据 > d <- read.table('Desktop/r2.csv', header=TRUE, sep=",") > d datetime kb_wps 1 Mar 16 2014 09:50:46 0.7 2 Mar 16 2014 09:50:56 0.7 3 Mar 16 2014 09:51:06 0.7 4 Mar 16 2014 09:51:16 0.7 5 Mar 16 2014 09:51:26 0.7 # 查看第一列 > d[1] datetime 1 Mar 16 2014 09:50:46 2 Mar 16 2014 09:50:56 3 Mar 16 2014 09:51:06 4 Mar 16 2014 09:51:16 5 Mar 16 2014 09:51:26 # 按 th 读取列 > d$kb_wps [1] 0.7 0.7 0.7 0.7 0.7 # 统计列 > summary(d$kb_wps) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.7 0.7 0.7 0.7 0.7 0.7 # 变量重编码 # 日期时间说明详见: http://www.stat.berkeley.edu/classes/s133/dates.html > d2 <- within(d, ( + dt <- strptime(datetime, format='%b %d %Y %H:%M:%S') + )) > d2 datetime kb_wps dt 1 Mar 16 2014 09:50:46 0.7 2014-03-16 09:50:46 2 Mar 16 2014 09:50:56 0.7 2014-03-16 09:50:56 3 Mar 16 2014 09:51:06 0.7 2014-03-16 09:51:06 4 Mar 16 2014 09:51:16 0.7 2014-03-16 09:51:16 5 Mar 16 2014 09:51:26 0.7 2014-03-16 09:51:26 # 画图 > plot(d$datetime, d$kb_wps) # 默认 plot 在 x axis 不会正常显示时间 # 如果要再 x axis 显示时间 # 画图时, 先不画 x axis > plot(d2$dt, d2$kb_wps, xaxt="n") # 画完再画 x axis > axis.POSIXct(1, d2$dt, format="%m/%d %H:%M") # 当数据太多时, 可以用 head preview > head(d) datetime X197_kb_wps X198_kb_wps X205_kb_wps X206_kb_wps 1 Mar 16 2014 09:50:46 0.7 0.7 0.73 0.7 2 Mar 16 2014 09:50:56 0.7 0.7 0.73 0.7 3 Mar 16 2014 09:51:06 0.7 0.7 0.73 0.7 4 Mar 16 2014 09:51:16 0.7 0.7 0.73 0.7 5 Mar 16 2014 09:51:26 0.7 0.7 0.73 0.7 6 Mar 16 2014 09:51:36 0.7 0.7 0.73 0.7 X207_kb_wps X208_kb_wps 1 0.73 0.71 2 0.73 0.71 3 0.73 0.71 4 0.73 0.71 5 0.73 0.71 6 0.73 0.71 # 直接让 axis.POSIXct 以 data.frame 中的数据画轴, 可能只有两个 label, label 不够细. # 可以自己按间隔要求制作 seq, 再用 seq 做 x 轴 > tickpos <- seq(d2$dt[1], d2$dt[length(d2$dt)], by="4 hours") > axis.POSIXct(side=1, at=tickpos, format="%H:%M", las=2) # las=2 设置 label 垂直与 axis
opar <- par(no.readonly=TRUE) par(mfrow=c(2,3)) plot(d2$dt, d2$X197_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="197") plot(d2$dt, d2$X198_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="198") plot(d2$dt, d2$X205_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="205") plot(d2$dt, d2$X206_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="206") plot(d2$dt, d2$X207_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="207") plot(d2$dt, d2$X208_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="208") par(opar)
d2 <- within(d, (dt <- as.POSIXct(timestamp, origin="1970-01-01"))) # 过滤太旧的数据 d2 <- d2[which(d2$dt > as.POSIXct("2014-01-01")),]
# 只提供行下标, 将列下标留空 (故选取所有列) send_data <- d[which(d$direction == 'SEND'), ]
R: Add Straight Lines to a Plot
> plot(d$dt, d$mem_used, xlab="", ylab="", las=2, main="mem used") > abline(h = 300000, col = "red")
trendline - How do I add different trend lines in R? - Stack Overflow
glm 需要用 1:n 做 x 轴, 不可用时间
# set the margins tmpmar <- par("mar") tmpmar[3] <- 0.5 par(mar=tmpmar) # get underlying plot x <- 1:10 y <- jitter(x^2) plot(x, y, pch=20) # basic straight line of fit fit <- glm(y~x) co <- coef(fit) abline(fit, col="blue", lwd=2) # exponential f <- function(x,a,b) {a * exp(b * x)} fit <- nls(y ~ f(x,a,b), start = c(a=1, b=1)) co <- coef(fit) curve(f(x, a=co[1], b=co[2]), add = TRUE, col="green", lwd=2) # logarithmic f <- function(x,a,b) {a * log(x) + b} fit <- nls(y ~ f(x,a,b), start = c(a=1, b=1)) co <- coef(fit) curve(f(x, a=co[1], b=co[2]), add = TRUE, col="orange", lwd=2) # polynomial f <- function(x,a,b,d) {(a*x^2) + (b*x) + d} fit <- nls(y ~ f(x,a,b,d), start = c(a=1, b=1, d=1)) co <- coef(fit) curve(f(x, a=co[1], b=co[2], d=co[3]), add = TRUE, col="pink", lwd=2) Add a descriptive legend: # legend legend("topleft", legend=c("linear","exponential","logarithmic","polynomial"), col=c("blue","green","orange","pink"), lwd=2, )
基础用法: R CMD BATCH -option foo.r
虽然也能用 -option
接受参数, 但参数处理较麻烦, 如果需要处理参数, 更恰当是用 Rscript 命令
## myScript.R args <- commandArgs(trailingOnly = TRUE) rnorm(n=as.numeric(args[1]), mean=as.numeric(args[2])) ## bash $ Rscript myScript.R 5 100 [1] 98.46435 100.04626 99.44937 98.52910 100.78853
### Project Euler 1 ### 找到1000以下,所有能被3或5整除的数,将它们相加 # 赋值 x <- 1:999 sum(x[x %% 3 == 0 | x %% 5 == 0 ]) ### Project Euler 2 ### 找到4000000以下的斐波纳契数列 ### 将其中的偶数进行求和 i <- 2 x <- 1:2 while (x[i] < 4e6) { x[i+1] <- x[i-1] + x[i] i <- i + 1 } # 删除第 i 个元素 x <- x[-i] sum(x[x %% 2 == 0])