Kodomo

Пользователь

Introduction and basics

Install R and R studio

Please make sure that you have R version 3.2.1 or later, and RStudio version 1.0.136 or later.

R is installed from here: https://cran.r-project.org/ R studio is installed from here: https://www.rstudio.com/products/rstudio/download/

The simplest data type: vector

Let's create some vectors:

# Build the integer vector 1, 2, ..., 5 and print it.
x <- 1:5
x
# Arithmetic and comparisons work element by element on the whole vector.
x * 2
x > 4
x == 7
x * x

Other options:

# Combine explicit values into one vector.
c(1, 2, 3)
# Regular sequence from 1 towards 8 in steps of 2.
seq(from = 1, to = 8, by = 2)
# The value 0.5 repeated six times.
rep(0.5, times = 6)

Vectors can also be generated from a distribution:

# Fix the seed so that the random draws below can be reproduced.
set.seed(100)
# 5 draws from the standard normal distribution.
rnorm(n = 5)
# 10 binomial draws: 100 trials each, success probability 0.5.
rbinom(n = 10, size = 100, prob = 0.5)
# 10 Poisson draws with mean 4.
rpois(n = 10, lambda = 4)

Subsets:

# Example vector used by the subsetting expressions that follow.
x <- c(1, 5, 7, 9, 15, 3)

By position or by condition:

# Positive indices select elements by position.
x[1]
x[2:4]
x[c(2, 5)]
# Negative indices drop the given positions.
x[-1]
x[-(1:3)]
# A logical vector keeps the elements for which it is TRUE.
x[x > 5]
x[x > 5 & x < 10]

The simplest statistical analysis:

# Sample of 100 standard-normal values.
x <- rnorm(n = 100)
# Location and spread.
mean(x)
sd(x)
# Extremes and the default quantiles (0%, 25%, 50%, 75%, 100%).
min(x)
max(x)
quantile(x)

Simple plots

The simplest plot:

# Eleven x values and a y shifted up by 5.
x_data <- 0:10
y_data <- x_data + 5
# Scatter plot with a title, axis labels and filled red points (pch 16).
plot(
  x_data, y_data,
  main = "My Chart Title", xlab = "X", ylab = "Y",
  pch = 16, col = "red"
)

Let's add axis restrictions:

# Same plot, but with the visible range restricted on both axes.
# (The closing parenthesis was missing, so the call never terminated.)
plot(
  x_data, y_data,
  main = "My Chart Title", xlab = "X", ylab = "Y",
  pch = 16, col = "red",
  xlim = c(1, 7), ylim = c(0, 20)
)

Or more lines:

x <- 1:10
y <- x * x
z <- x - 5
# Draw y against x as a red line ...
plot(y ~ x, type = "l", col = "red")
# ... then add the same y values against the shifted x (z) in blue.
lines(y ~ z, col = "blue")

Let's make a scatterplot and add lines:

# Noisy quadratic relationship between x and y.
x <- rnorm(n = 1000)
y <- x * x + rnorm(n = 1000, sd = 2)
# Small filled points.
plot(x, y, pch = 19, cex = 0.3)
# Straight-line least-squares fit in red.
abline(lm(y ~ x), col = "red")
# Locally weighted smoother in blue.
lines(lowess(y ~ x), col = "blue")

Or histogram:

# Histogram of 1000 standard-normal values with blue bars.
x <- rnorm(n = 1000)
hist(x, col = "blue")

And a boxplot:

# One box of mpg values per distinct value of cyl in mtcars.
boxplot(
  mpg ~ cyl,
  data = mtcars,
  main = "Car Milage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon"
)

Save plot to file:

# Make sure the output directory exists before opening the device;
# showWarnings = FALSE keeps this quiet when it is already there.
dir.create("Pictures", showWarnings = FALSE)
# Open a PNG device. The argument is spelled out as `filename`
# (`file` only worked through partial argument matching).
png(filename = "Pictures/boxplot.png", width = 400, height = 350, res = 72)
# Everything drawn now goes to the file instead of the screen.
boxplot(x, y)
# Close the device so that the file is written out.
dev.off()

Data frames

Let's create our first data frame:

# Three vectors of equal length: numeric, character and logical.
n <- c(2, 3, 5)
s <- c("aa", "bb", "cc")
b <- c(TRUE, FALSE, TRUE)
# Each vector becomes one column, named after the variable.
df <- data.frame(n, s, b)

Refer column by name:

# `$` extracts the column called n as a plain vector.
df$n

Data frame dimension:

# Number of rows, then number of columns.
dim(df)

Let's use preloaded data sets:

# List the data sets that are available in the attached packages.
data()
# Open the help page of the mtcars data set.
?mtcars

Get rows, columns, cells:

# [row, column]: a single cell.
mtcars[12, 2]
# An empty column index means "all columns": whole rows.
mtcars[8, ]
mtcars[1:3, ]
mtcars[c(1, 13), ]
# Several rows of the first column, returned as a vector.
mtcars[c(1, 3, 7, 13), 1]

Add a column:

# Work on a copy so that the built-in mtcars data set stays untouched.
# (mtnew was used here without ever being created.)
mtnew <- mtcars
dim(mtnew)
# One running number per row; derived from the actual row count so that
# the new column always has the right length for cbind().
num <- seq_len(nrow(mtnew))
mtnew <- cbind(mtnew, num)
# Show the last three rows together with the new column.
tail(mtnew, 3)

Logical conditions and order:

# Keep the rows whose cyl is strictly between 4 and 8.
mtcars1 <- mtcars[mtcars$cyl > 4 & mtcars$cyl < 8, ]
mtcars1
# order() returns the row permutation that sorts by drat (ascending).
mtcars1[order(mtcars1$drat), ]

Factors:

# A factor stores categorical values together with their set of levels.
f <- factor(c("yes", "yes", "no", "yes", "no"))
levels(f)
# Append a level that does not (yet) occur in the data.
levels(f) <- c(levels(f), "maybe")
# Counts per level, including the level with no observations.
table(f)
# Recreate the factor with an explicit level order.
f <- factor(c("yes", "yes", "no", "yes", "no"), levels = c("yes", "no"))

Split by factor:

# Formula interface: mpg values are grouped by the values of cyl,
# giving one box per group.
boxplot(mtcars$mpg ~ mtcars$cyl)

Files:

http://makarich.fbb.msu.ru/artemov/R/FBBRStudents.txt

Working with directory:

# Print the current working directory.
getwd()
# Switch to the sub-directory "Day1" (relative to the current directory);
# relative file paths used afterwards are resolved against it.
setwd("Day1")  

Read file:

# Read the tab-separated table; the first line holds the column names.
# NOTE(review): the download link above names the file "FBBRStudents.txt" --
# confirm that the local file name matches the one used here.
students <- read.table("FBBRStudents.tab", sep = "\t", header = TRUE)
# Peek at rows 101 and 102.
students[101:102, ]
# Read again, this time fixing the type of each of the four columns.
# (The "+" console continuation prompt that had been pasted into the call
# is removed; it is not part of the code.)
students <- read.table(
  "FBBRStudents.tab",
  sep = "\t",
  header = TRUE,
  colClasses = c("character", "factor", "factor", "integer")
)
# Compact overview: column names, types and first values.
str(students)

Save R workspace:

# Write the students table to an .RData file. The original call also listed
# `lines`, which is never created in this tutorial and would resolve to the
# graphics function of that name, so it is left out.
save(students, file = "Students.RData")
# Empty the workspace to show that the data can be restored from the file.
rm(list = ls())
ls()
# Load the saved object back and check that it is there again.
load("Students.RData")
ls()

Better plots

Install packages:

# Install (needed once) and attach reshape2.
install.packages("reshape2")
# Plain ASCII quotes are required; typographic quotes are a syntax error.
library("reshape2")
# Lower-case the column names of the built-in airquality data set.
names(airquality) <- tolower(names(airquality))
head(airquality)
# Wide -> long without explicit id variables.
aql <- melt(airquality)
head(aql)

Control id variables:

# The column names of airquality were lower-cased above, so the id columns
# have to be given as "month" and "day".
aql <- melt(
  airquality,
  id.vars = c("month", "day"),
  variable.name = "climate_variable",
  value.name = "climate_value"
)
head(aql)

And back:

# Long format again, keeping month and day as id columns.
aql <- melt(airquality, id.vars = c("month", "day"))
# Long -> wide: one row per month/day, one column per value of `variable`.
aqw <- dcast(aql, month + day ~ variable)
head(aqw)

Install ggplot

# Install (needed once) and attach ggplot2.
install.packages("ggplot2")
library("ggplot2")

We will need data from gapminder

# Install (needed once) and attach the gapminder data package.
install.packages("gapminder")
library("gapminder")

Make scatterplot

# `p` holds the data and the aesthetic mapping only; it has no layer yet.
p <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))
# Adding a point layer turns it into a scatterplot.
p + geom_point()

Use a log scale on the x axis:

# Same scatterplot with a log10-transformed x axis.
p + geom_point() + scale_x_log10()

Color the plot:

# Map continent to the point colour; keep the log10 x axis.
ggplot(
  gapminder,
  aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()

Additional lines:

# Scatterplot plus a smoothed trend using geom_smooth()'s defaults.
p + geom_point() + geom_smooth()
# Thick straight-line fit (method = "lm") without the confidence band.
p + geom_point() + geom_smooth(lwd = 3, se = FALSE, method = "lm")

Plot for one country only:

# Restrict the data to one country, then draw life expectancy by year
# as a line with a point at every observation.
ggplot(subset(gapminder, country == "Zimbabwe"),
       aes(x = year, y = lifeExp)) + geom_line() + geom_point()

Or for several countries:

# Countries to keep.
jCountries <- c("Canada", "Rwanda", "Cambodia", "Mexico")
# %in% keeps the rows whose country is one of the names above;
# colour distinguishes the countries.
ggplot(subset(gapminder, country %in% jCountries),
       aes(x = year, y = lifeExp, color = country)) + geom_line() + geom_point()

Life expectancy per continent?

# Raw points: one column of points per continent.
ggplot(gapminder, aes(x = continent, y = lifeExp)) + geom_point()
# Jittered points.
ggplot(gapminder, aes(x = continent, y = lifeExp)) + geom_jitter()
# One box per continent.
ggplot(gapminder, aes(x = continent, y = lifeExp)) + geom_boxplot()
# Boxes with pink outliers, overlaid with semi-transparent points that are
# jittered horizontally only (height = 0).
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_boxplot(outlier.colour = "hotpink") +
  geom_jitter(position = position_jitter(width = 0.1, height = 0), alpha = 1/4)

We can also produce density plots and histograms:

# Histogram of life expectancy.
ggplot(gapminder, aes(x = lifeExp)) + geom_histogram()
# Density curve of the same variable.
ggplot(gapminder, aes(x = lifeExp)) + geom_density()
# One density curve per continent, distinguished by line colour.
ggplot(gapminder, aes(x = lifeExp, color = continent)) + geom_density()
# Filled, semi-transparent densities.
ggplot(gapminder, aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.2)
# Histograms without Oceania, one facet row per continent.
ggplot(subset(gapminder, continent != "Oceania"),
       aes(x = lifeExp, fill = continent)) + geom_histogram() +
  facet_grid(continent ~ .)
# `p` carries only the data and the aesthetic mapping, so a geom is added
# here; without a layer nothing would be drawn under the title or theme.
# NOTE(review): the title says "over time by continent" while `p` maps
# gdpPercap to x -- confirm which plot the title was meant for.
p + geom_point() + ggtitle("Life expectancy over time by continent")
# Same scatterplot with the black-and-white theme.
p + geom_point() + theme_bw()