Xiaopei's DokuWiki

These are the good times in your life,
so put on a smile and it'll be alright

User Tools

Site Tools


it:r

R

data structures

统计

相关

# 原始数据
> states <- state.x77[,1:6]
> head(states)
           Population Income Illiteracy Life Exp Murder HS Grad
Alabama          3615   3624        2.1    69.05   15.1    41.3
Alaska            365   6315        1.5    69.31   11.3    66.7
Arizona          2212   4530        1.8    70.55    7.8    58.1
Arkansas         2110   3378        1.9    70.66   10.1    39.9
California      21198   5114        1.1    71.71   10.3    62.6
Colorado         2541   4884        0.7    72.06    6.8    63.9
 
# 计算协方差
> cov(states)
              Population      Income   Illiteracy     Life Exp      Murder
Population 19931683.7588 571229.7796  292.8679592 -407.8424612 5663.523714
Income       571229.7796 377573.3061 -163.7020408  280.6631837 -521.894286
Illiteracy      292.8680   -163.7020    0.3715306   -0.4815122    1.581776
Life Exp       -407.8425    280.6632   -0.4815122    1.8020204   -3.869480
Murder         5663.5237   -521.8943    1.5817755   -3.8694804   13.627465
HS Grad       -3551.5096   3076.7690   -3.2354694    6.3126849  -14.549616
                HS Grad
Population -3551.509551
Income      3076.768980
Illiteracy    -3.235469
Life Exp       6.312685
Murder       -14.549616
HS Grad       65.237894
 
# 计算相关度 Correlation 
# cor(x, y = NULL, use = "everything",
#   method = c("pearson", "kendall", "spearman"))
# 默认 method = "pearson"
# "pearson" 积差相关系数 衡量两个 定量变量间 的线性相关程度
> cor(states)
            Population     Income Illiteracy    Life Exp     Murder     HS Grad
Population  1.00000000  0.2082276  0.1076224 -0.06805195  0.3436428 -0.09848975
Income      0.20822756  1.0000000 -0.4370752  0.34025534 -0.2300776  0.61993232
Illiteracy  0.10762237 -0.4370752  1.0000000 -0.58847793  0.7029752 -0.65718861
Life Exp   -0.06805195  0.3402553 -0.5884779  1.00000000 -0.7808458  0.58221620
Murder      0.34364275 -0.2300776  0.7029752 -0.78084575  1.0000000 -0.48797102
HS Grad    -0.09848975  0.6199323 -0.6571886  0.58221620 -0.4879710  1.00000000
# "spearman" 等级相关系数 衡量分级 定序变量间 的相关程度
> cor(states, method="spearman")
           Population     Income Illiteracy   Life Exp     Murder    HS Grad
Population  1.0000000  0.1246098  0.3130496 -0.1040171  0.3457401 -0.3833649
Income      0.1246098  1.0000000 -0.3145948  0.3241050 -0.2174623  0.5104809
Illiteracy  0.3130496 -0.3145948  1.0000000 -0.5553735  0.6723592 -0.6545396
Life Exp   -0.1040171  0.3241050 -0.5553735  1.0000000 -0.7802406  0.5239410
Murder      0.3457401 -0.2174623  0.6723592 -0.7802406  1.0000000 -0.4367330
HS Grad    -0.3833649  0.5104809 -0.6545396  0.5239410 -0.4367330  1.0000000
 
# 显著性检验(文盲率与谋杀率)
> cor.test(states[,3], states[,5])
 
	Pearson's product-moment correlation
 
data:  states[, 3] and states[, 5] 
t = 6.8479, df = 48, p-value = 1.258e-08
alternative hypothesis: true correlation is not equal to 0 
95 percent confidence interval:
 0.5279280 0.8207295 
sample estimates:
      cor 
0.7029752 

tips

示例数据

R 中提供了一些测试数据, 一些文档中会使用, 所以在文档中看到 context 中没定义的变量, 可以在 R 里 help() 下, 看看是不是测试数据. 比如:

  • mtcars: Motor Trend Car Road Tests

使用帮助

# 用 help 方法
> help(axis)

读 csv

# 对于无 th 的文件
> mydataframe <- read.table('Desktop/r1.csv', header=FALSE, sep=",")
> mydataframe
                     V1  V2
1  Mar 16 2014 09:50:46 0.7
2  Mar 16 2014 09:50:56 0.7
3  Mar 16 2014 09:51:06 0.7
4  Mar 16 2014 09:51:16 0.7
5  Mar 16 2014 09:51:26 0.7
 
# 对于有 th 的文件
# 导入表格一般就用这种方法, 对之后的操作更方便
> mydataframe <- read.table('Desktop/r1.csv', header=TRUE, sep=",")
> mydataframe
               datetime write
1  Mar 16 2014 09:50:46   0.7
2  Mar 16 2014 09:50:56   0.7
3  Mar 16 2014 09:51:06   0.7
4  Mar 16 2014 09:51:16   0.7
5  Mar 16 2014 09:51:26   0.7
 
# 以某行为 key
> mydataframe <- read.table('Desktop/r1.csv', header=TRUE, sep=",", row.names="datetime")
> mydataframe
                     write
Mar 16 2014 09:50:46   0.7
Mar 16 2014 09:50:56   0.7
Mar 16 2014 09:51:06   0.7
Mar 16 2014 09:51:16   0.7
Mar 16 2014 09:51:26   0.7

data frame 的基本操作

# 读入数据
> d <- read.table('Desktop/r2.csv', header=TRUE, sep=",")
> d
              datetime kb_wps
1 Mar 16 2014 09:50:46    0.7
2 Mar 16 2014 09:50:56    0.7
3 Mar 16 2014 09:51:06    0.7
4 Mar 16 2014 09:51:16    0.7
5 Mar 16 2014 09:51:26    0.7
 
# 查看第一列
> d[1]
              datetime
1 Mar 16 2014 09:50:46
2 Mar 16 2014 09:50:56
3 Mar 16 2014 09:51:06
4 Mar 16 2014 09:51:16
5 Mar 16 2014 09:51:26
 
# 按 th 读取列
> d$kb_wps
[1] 0.7 0.7 0.7 0.7 0.7
 
# 统计列
> summary(d$kb_wps)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    0.7     0.7     0.7     0.7     0.7     0.7
 
# 变量重编码
# 日期时间说明详见: http://www.stat.berkeley.edu/classes/s133/dates.html
> d2 <- within(d, (
+   dt <- strptime(datetime, format='%b %d %Y %H:%M:%S')
+ ))
> d2
              datetime kb_wps                  dt
1 Mar 16 2014 09:50:46    0.7 2014-03-16 09:50:46
2 Mar 16 2014 09:50:56    0.7 2014-03-16 09:50:56
3 Mar 16 2014 09:51:06    0.7 2014-03-16 09:51:06
4 Mar 16 2014 09:51:16    0.7 2014-03-16 09:51:16
5 Mar 16 2014 09:51:26    0.7 2014-03-16 09:51:26
 
# 画图    
> plot(d$datetime, d$kb_wps)
# 默认 plot 在 x axis 不会正常显示时间
 
# 如果要再 x axis 显示时间
# 画图时, 先不画 x axis
> plot(d2$dt, d2$kb_wps, xaxt="n")
# 画完再画 x axis
> axis.POSIXct(1, d2$dt, format="%m/%d %H:%M")
 
# 当数据太多时, 可以用 head preview
> head(d)
              datetime X197_kb_wps X198_kb_wps X205_kb_wps X206_kb_wps
1 Mar 16 2014 09:50:46         0.7         0.7        0.73         0.7
2 Mar 16 2014 09:50:56         0.7         0.7        0.73         0.7
3 Mar 16 2014 09:51:06         0.7         0.7        0.73         0.7
4 Mar 16 2014 09:51:16         0.7         0.7        0.73         0.7
5 Mar 16 2014 09:51:26         0.7         0.7        0.73         0.7
6 Mar 16 2014 09:51:36         0.7         0.7        0.73         0.7
  X207_kb_wps X208_kb_wps
1        0.73        0.71
2        0.73        0.71
3        0.73        0.71
4        0.73        0.71
5        0.73        0.71
6        0.73        0.71
 
# 直接让 axis.POSIXct 以 data.frame 中的数据画轴, 可能只有两个 label, label 不够细.
# 可以自己按间隔要求制作 seq, 再用 seq 做 x 轴
> tickpos <- seq(d2$dt[1], d2$dt[length(d2$dt)], by="4 hours")
> axis.POSIXct(side=1, at=tickpos, format="%H:%M", las=2)
# las=2 设置 label 垂直与 axis

组合多个图片

opar <- par(no.readonly=TRUE)
par(mfrow=c(2,3))
 
plot(d2$dt, d2$X197_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="197")
plot(d2$dt, d2$X198_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="198")
plot(d2$dt, d2$X205_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="205")
plot(d2$dt, d2$X206_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="206")
plot(d2$dt, d2$X207_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="207")
plot(d2$dt, d2$X208_kb_wps, xaxt="n", ylab="kb/s", xlab="", main="208")
 
par(opar)

处理 unix timestamp

d2 <- within(d, (dt <- as.POSIXct(timestamp, origin="1970-01-01")))
 
# 过滤太旧的数据
d2 <- d2[which(d2$dt > as.POSIXct("2014-01-01")),]

筛选行

# 只提供行下标, 将列下标留空 (故选取所有列)
send_data <- d[which(d$direction == 'SEND'), ]

在图表中增加直线 abline

R: Add Straight Lines to a Plot

> plot(d$dt, d$mem_used,  xlab="", ylab="", las=2, main="mem used")
> abline(h = 300000, col = "red")

trendline

trendline - How do I add different trend lines in R? - Stack Overflow

glm 需要用 1:n 做 x 轴, 不可用时间

# set the margins
tmpmar <- par("mar")
tmpmar[3] <- 0.5
par(mar=tmpmar)

# get underlying plot
x <- 1:10
y <- jitter(x^2)
plot(x, y, pch=20)

# basic straight line of fit
fit <- glm(y~x)
co <- coef(fit)
abline(fit, col="blue", lwd=2)

# exponential
f <- function(x,a,b) {a * exp(b * x)}
fit <- nls(y ~ f(x,a,b), start = c(a=1, b=1)) 
co <- coef(fit)
curve(f(x, a=co[1], b=co[2]), add = TRUE, col="green", lwd=2) 

# logarithmic
f <- function(x,a,b) {a * log(x) + b}
fit <- nls(y ~ f(x,a,b), start = c(a=1, b=1)) 
co <- coef(fit)
curve(f(x, a=co[1], b=co[2]), add = TRUE, col="orange", lwd=2) 

# polynomial
f <- function(x,a,b,d) {(a*x^2) + (b*x) + d}
fit <- nls(y ~ f(x,a,b,d), start = c(a=1, b=1, d=1)) 
co <- coef(fit)
curve(f(x, a=co[1], b=co[2], d=co[3]), add = TRUE, col="pink", lwd=2) 

Add a descriptive legend:

# legend
legend("topleft",
    legend=c("linear","exponential","logarithmic","polynomial"),
    col=c("blue","green","orange","pink"),
    lwd=2,
    )

作为 TCP 服务

使用脚本 (而非交互命令行) 运行 aka batch script

R CMD BATCH

基础用法: R CMD BATCH -option foo.r

虽然也能用 -option 接受参数, 但参数处理较麻烦, 如果需要处理参数, 更恰当是用 Rscript 命令

Rscript

## myScript.R
args <- commandArgs(trailingOnly = TRUE)
rnorm(n=as.numeric(args[1]), mean=as.numeric(args[2]))

## bash
$ Rscript myScript.R 5 100
[1]  98.46435 100.04626  99.44937  98.52910 100.78853

TODO

笨办法学R编程

### Project Euler 1
### 找到1000以下,所有能被3或5整除的数,将它们相加
# 赋值
x <- 1:999
sum(x[x %% 3 == 0 | x %% 5 == 0 ])
 
 
### Project Euler 2
### 找到4000000以下的斐波纳契数列
### 将其中的偶数进行求和
i <- 2
x <- 1:2
while (x[i] < 4e6) {
  x[i+1] <- x[i-1] + x[i]
  i <- i + 1
}
# 删除第 i 个元素
x <- x[-i]
sum(x[x %% 2 == 0])
it/r.txt · Last modified: 2014/09/12 16:32 by admin