Introduction and basics
Install R and R studio
Please make sure that you have R version 3.2.1 or later, and RStudio version 1.0.136 or later.
R is installed from here: https://cran.r-project.org/ R studio is installed from here: https://www.rstudio.com/products/rstudio/download/
The simplest data type: vector
Let's create some vectors:
# Build an integer vector and try a few element-wise (vectorised) operations.
x <- 1:5
x        # print the vector
x * 2    # every element multiplied by 2
x > 4    # logical vector: is each element greater than 4?
x == 7   # logical vector: is each element equal to 7?
x * x    # element-wise product
Other options:
# Other ways to build a vector.
c(1, 2, 3)                      # combine explicit values
seq(from = 1, to = 8, by = 2)   # arithmetic sequence: 1, 3, 5, 7
rep(0.5, 6)                     # the value 0.5 repeated six times
Vectors can also be generated from a distribution:
# Fix the random number generator state so the draws below are reproducible.
set.seed(100)
rnorm(5)               # 5 draws from the standard normal distribution
rbinom(10, 100, 0.5)   # 10 draws from a binomial with size 100 and probability 0.5
rpois(10, 4)           # 10 draws from a Poisson with mean 4
Subsets:
# Example vector used for the subsetting demonstrations that follow.
x <- c(1, 5, 7, 9, 15, 3)
According to conditions:
# Subsetting by position, by exclusion and by logical condition.
x[1]                # first element
x[2:4]              # elements 2 through 4
x[c(2, 5)]          # elements 2 and 5
x[-1]               # everything except the first element
x[-(1:3)]           # everything except the first three elements
x[x > 5]            # elements greater than 5
x[x > 5 & x < 10]   # elements greater than 5 and less than 10
The simplest statistical analysis:
# Summary statistics of a sample of 100 standard normal draws.
x <- rnorm(100)
mean(x)       # arithmetic mean
sd(x)         # standard deviation
min(x)        # smallest value
max(x)        # largest value
quantile(x)   # quantiles at the default probabilities
Simple plots
The simplest plot:
# Eleven points on the line y = x + 5, drawn as filled red circles.
x_data <- 0:10
y_data <- x_data + 5
plot(
  x_data, y_data,
  main = "My Chart Title", xlab = "X", ylab = "Y",
  pch = 16, col = "red"
)
Let's add axis restrictions:
# Same plot with the displayed axis ranges set through xlim and ylim.
# The original call was missing its closing parenthesis.
plot(
  x_data, y_data,
  main = "My Chart Title", xlab = "X", ylab = "Y",
  pch = 16, col = "red",
  xlim = c(1, 7), ylim = c(0, 20)
)
Or more lines:
# Draw y against x as a red line, then add y against the shifted values z in blue.
x <- 1:10
y <- x * x
z <- x - 5
plot(y ~ x, type = "l", col = "red")
lines(y ~ z, col = "blue")   # lines() adds to the plot that is already open
Let's make a scatterplot and add lines:
# Noisy quadratic relationship between x and y.
x <- rnorm(1000)
y <- x * x + rnorm(1000, sd = 2)
plot(x, y, pch = 19, cex = 0.3)
abline(lm(y ~ x), col = "red")       # straight line from a linear model fit
lines(lowess(y ~ x), col = "blue")   # lowess smoother through the same points
Or histogram:
# Histogram of 1000 standard normal draws, with blue bars.
x <- rnorm(1000)
hist(x, col = "blue")
And a boxplot:
# One box of mpg values per distinct value of cyl in the mtcars data set.
boxplot(
  mpg ~ cyl,
  data = mtcars,
  main = "Car Milage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon"
)
Save plot to file:
# Make sure the target directory exists; png() cannot write into a missing one.
dir.create("Pictures", showWarnings = FALSE)
# Open a PNG device (argument spelled out as `filename`, not the partial `file`).
png(filename = "Pictures/boxplot.png", width = 400, height = 350, res = 72)
boxplot(x, y)
# Close the device so the image is written to disk.
dev.off()
Data frames
Let's create our first data frame:
# Three equal-length vectors of different types become the columns n, s and b.
n <- c(2, 3, 5)
s <- c("aa", "bb", "cc")
b <- c(TRUE, FALSE, TRUE)
df <- data.frame(n, s, b)
Refer column by name:
df$n  # extract the column named n from df
Data frame dimension:
dim(df)  # number of rows, then number of columns
Let's use preloaded data sets:
data()    # list the data sets that are available
?mtcars   # open the help page describing the mtcars data set
Get rows, columns, cells:
# Indexing is [rows, columns]; leaving one side empty selects everything on it.
mtcars[12, 2]                 # single cell: row 12, column 2
mtcars[8, ]                   # all columns of row 8
mtcars[1:3, ]                 # first three rows
mtcars[c(1, 13), ]            # rows 1 and 13
mtcars[c(1, 3, 7, 13), 1]     # first column of rows 1, 3, 7 and 13
Add a column:
# Work on a copy of mtcars; the original snippet used mtnew before defining it.
mtnew <- mtcars
dim(mtnew)
# One running number per row. The length must equal the number of rows,
# otherwise cbind() fails (mtcars has 32 rows, not 33).
num <- seq_len(nrow(mtnew))
mtnew <- cbind(mtnew, num)
# Show the last rows together with the new column.
mtnew[30:nrow(mtnew), ]
Logical conditions and order:
# Keep the rows whose cyl is greater than 4 and less than 8.
mtcars1 <- mtcars[mtcars$cyl > 4 & mtcars$cyl < 8, ]
mtcars1
# Reorder those rows by increasing drat.
mtcars1[order(mtcars1$drat), ]
Factors:
# A factor stores categorical values together with the set of allowed levels.
f <- factor(c("yes", "yes", "no", "yes", "no"))
levels(f)
# Append a level that does not occur in the data yet.
levels(f) <- c(levels(f), "maybe")
table(f)   # counts per level
# Recreate the factor with an explicit level order.
f <- factor(c("yes", "yes", "no", "yes", "no"), levels = c("yes", "no"))
Split by factor:
boxplot(mtcars$mpg ~ mtcars$cyl)  # one box of mpg values per value of cyl
Files:
http://makarich.fbb.msu.ru/artemov/R/FBBRStudents.txt
Working with directory:
getwd()         # show the current working directory
setwd("Day1")   # switch to the directory "Day1", relative to the current one
Read file:
# NOTE(review): the download link above ends in .txt while this code reads
# "FBBRStudents.tab" - confirm the file name on disk matches.
# Read the tab-separated table; header = TRUE takes column names from line 1.
students <- read.table("FBBRStudents.tab", sep = "\t", header = TRUE)
students[101:102, ]
# Read it again, declaring the class of each column up front.
# (The stray console continuation "+" in the original call was a syntax error.)
students <- read.table(
  "FBBRStudents.tab",
  sep = "\t", header = TRUE,
  colClasses = c("character", "factor", "factor", "integer")
)
str(students)   # compact overview of the columns and their types
Save R workspace:
# Store the named objects in a file.
# NOTE(review): no object called `lines` is created in the visible part of this
# tutorial - confirm which object was meant to be saved.
save(students, lines, file = "Students.RData")
rm(list = ls())   # remove every object from the workspace
ls()              # list what is left
load("Students.RData")
ls()              # list the objects brought back by load()
Better plots
Install packages:
install.packages("reshape2")
# Plain ASCII quotes: the typographic quotes in the original are a syntax error.
library("reshape2")
# Lower-case the column names of airquality.
names(airquality) <- tolower(names(airquality))
head(airquality)
# Reshape with melt() at its default settings.
aql <- melt(airquality)
head(aql)
Control id variables:
# Keep month and day as identifier columns and name the two melted columns.
# The column names were lower-cased earlier in this tutorial, so the
# identifiers must be "month" and "day" rather than "Month" and "Day".
aql <- melt(
  airquality,
  id.vars = c("month", "day"),
  variable.name = "climate_variable",
  value.name = "climate_value"
)
head(aql)
And back:
# Melt with month and day as identifiers ...
aql <- melt(airquality, id.vars = c("month", "day"))
# ... and cast back: month and day define the rows, `variable` the columns.
aqw <- dcast(aql, month + day ~ variable)
head(aqw)
Install ggplot
install.packages("ggplot2")   # download and install the package
library("ggplot2")            # attach it for this session
We will need data from gapminder
install.packages("gapminder")   # download and install the package
library("gapminder")            # attach it for this session
Make scatterplot
# Base plot object: gdpPercap on the x axis, lifeExp on the y axis, no layers yet.
p <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))
# Adding a point layer draws the scatterplot.
p + geom_point()
Add axes:
p + geom_point() + scale_x_log10()  # same scatterplot with a log10-scaled x axis
Color the plot:
# Map the point colour to the continent column.
ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()
Additional lines:
# Points plus a smoother at its default settings.
p + geom_point() + geom_smooth()
# Points plus a thick linear-model fit without the confidence band.
p +
  geom_point() +
  geom_smooth(lwd = 3, se = FALSE, method = "lm")
Plot for one country only:
# Life expectancy by year for the rows where country is "Zimbabwe".
ggplot(
  subset(gapminder, country == "Zimbabwe"),
  aes(x = year, y = lifeExp)
) +
  geom_line() +
  geom_point()
Or for several countries:
# Countries to compare; each one gets its own colour.
jCountries <- c("Canada", "Rwanda", "Cambodia", "Mexico")
ggplot(
  subset(gapminder, country %in% jCountries),
  aes(x = year, y = lifeExp, color = country)
) +
  geom_line() +
  geom_point()
Life expectancy per continent?
# Raw points: heavy overplotting within each continent.
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_point()
# Jittered points spread the observations out.
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_jitter()
# One boxplot per continent.
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_boxplot()
# Boxplots with highlighted outliers, overlaid with semi-transparent points
# that are jittered horizontally only.
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_boxplot(outlier.colour = "hotpink") +
  geom_jitter(
    position = position_jitter(width = 0.1, height = 0),
    alpha = 1 / 4
  )
We can also produce density plots and histograms:
# Distribution of life expectancy as a histogram and as a density curve.
ggplot(gapminder, aes(x = lifeExp)) +
  geom_histogram()
ggplot(gapminder, aes(x = lifeExp)) +
  geom_density()
# One density per continent, told apart by line colour ...
ggplot(gapminder, aes(x = lifeExp, color = continent)) +
  geom_density()
# ... or by semi-transparent fill.
ggplot(gapminder, aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.2)
# One histogram panel per continent, leaving out Oceania.
ggplot(
  subset(gapminder, continent != "Oceania"),
  aes(x = lifeExp, fill = continent)
) +
  geom_histogram() +
  facet_grid(continent ~ .)
# NOTE(review): `p` was created earlier in this tutorial as a gdpPercap vs
# lifeExp plot without layers, so the title below does not match its axes and
# no geom is added here - confirm which plot object was intended.
p + ggtitle("Life expectancy over time by continent")
p + theme_bw()