# Section 1.3 -------------------------------------------------
# First steps: use R as a calculator, store results and look up help.

2 + 2 # the result is printed but not stored

x <- 2 + 2 # store the result in x with the assignment arrow
y = 2 + 2 # `=` also assigns at the top level; `<-` is the usual R idiom

ls() # list the objects in the current workspace

x # typing the name of an object prints its value

x <- 2 + 3 # assigning again overwrites the previous value of x

# Two equivalent ways of opening the help page for mean()
help("mean")
?mean

# Section 1.4 -------------------------------------------------
# On a university computer, create a folder called math1024 and, inside it,
# a sub-folder called data where you keep all the data files.
# Do not use spaces in sub-folder names.

setwd("H:/math1024/data/") 
# Change the path above to the location of your data files (the unzipped
# files, not the zip folder).
# Alternatively, in RStudio use the top menu: Session, then Set Working Directory.

# On your own personal computer you may need something like:
# setwd("C:/math1024/data/") 

## Note that the paths use forward slashes (/),
## not backslashes, even on Windows!

# Section 1.5 -------------------------------------------------
# Read the course data sets from the working directory.
# header = TRUE tells R that the first row of the data file contains the
# column names. It is written out in full: T is an ordinary variable that
# can be overwritten, and `head` only works through partial matching.

## Read the computer failure data set (scan() returns a numeric vector)
cfail <- scan("compfail.txt")
## Read the fast food data set
# csv stands for comma separated value file
ffood <- read.csv("servicetime.csv", header = TRUE)
wgain <- read.table("wtgain.txt", header = TRUE)

bill <- read.table("billionaires.txt", header = TRUE)

## Try the following if you were not able to read the data already:
## the same four files are read from the web address in `path` instead.
path <- "http://www.personal.soton.ac.uk/sks/teach/math1024/"

cfail <- scan(paste0(path, "compfail.txt"))
ffood <- read.csv(paste0(path, "servicetime.csv"), header = TRUE)
wgain <- read.table(paste0(path, "wtgain.txt"), header = TRUE)
bill <- read.table(paste0(path, "billionaires.txt"), header = TRUE)


# A first look at the fast food data frame

ffood # typing the name prints the whole data set

head(ffood) # the first few rows

tail(ffood) # the last few rows

dim(ffood) # number of rows followed by number of columns

colnames(ffood) # the column names of the data set
ffood$AM # the values in the AM column
ffood[1, ] # the first row and all columns
ffood[[1]] # the first column and all rows
ffood[c(1, 2), ] # the first two rows and all columns
ffood[1, 2] # the entry in the first row and second column

# Section 1.6 -------------------------------------------------
# Numerical summaries of the data sets read in Section 1.5

summary(ffood)

summary(cfail)

summary(wgain)

summary(bill)

table(cfail) # how often each distinct value occurs in cfail

# Variance of the AM data
var(ffood$AM)

# Variance of the AM and PM data combined into one vector
var(c(ffood$AM, ffood$PM))

table(bill$region) # number of rows for each value of region

# Section 1.7 -------------------------------------------------
# Graphical summaries of the data sets

stem(ffood$AM) # stem and leaf plot of the AM data

barplot(table(cfail)) # bar plot of the frequency table of cfail

barplot(table(bill$region), col = 2:6) # bars drawn with colours 2 to 6

hist(cfail) # histogram with the default labels

hist(cfail, xlab = "Number of weekly computer failures") # relabel the x axis

# Scatterplot of wgain$final (y axis) against wgain$initial (x axis)
plot(wgain$initial, wgain$final)

# Add the red line with intercept 0 and slope 1, i.e. the 45 degree line
abline(a = 0, b = 1, col = "red")

# Nicer and more informative plot: both axes are relabelled, stars are used
# as plotting symbols and las = 1 sets the style of the axis labels
with(
  wgain,
  plot(initial, final,
       xlab = "Wt in Week 1", ylab = "Wt in Week 12", pch = "*", las = 1)
)
abline(a = 0, b = 1, col = "red")
# Add a title
title(main = "A scatterplot of the weights in Week 12 against the weights in Week 1")

boxplot(cfail)

boxplot(ffood) # one box for each column of ffood

boxplot(wealth ~ region, data = bill, col = 2:6) # wealth split by region

?par # help page for the graphical parameters

# Section 1.8 -------------------------------------------------
## Highlight from below 
# Draw a "butterfly" curve as a line plot without axes or axis labels.
#
# Arguments:
#   color  colour of the curve, passed to plot() as col (default 2).
#   p1     multiplier of the cosine term subtracted from the radius (default 2).
#   p2     multiplier of theta inside that cosine term (default 4).
#
# Called for its side effect of drawing on the current graphics device;
# the value returned is that of plot().
butterfly <- function(color = 2, p1 = 2, p2 = 4) {
  # 2000 equally spaced angles between 0 and 24 * pi; length.out is spelled
  # out in full rather than relying on partial matching of `len`
  theta <- seq(from = 0.0, to = 24 * pi, length.out = 2000)
  radius <- exp(cos(theta)) - p1 * cos(p2 * theta)
  radius <- radius + sin(theta / 12)
  # Convert (radius, theta) to the x and y coordinates that are plotted
  x <- radius * sin(theta)
  y <- -radius * cos(theta)
  # FALSE instead of F: F is an ordinary variable that can be overwritten
  plot(x, y, type = "l", axes = FALSE, xlab = "", ylab = "", col = color)
}
## Up to the line above
# Then press the Run button
## If there are no error messages, run the following

# Single butterflies with different shape parameters and colours
butterfly(p1 = 20, p2 = 4)

butterfly(color = 6)

# Four butterflies on one page in a 2 by 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# plots drawn later in the session are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
butterfly(color = 6)
butterfly(p1 = 5, p2 = 5, color = 2)
butterfly(p1 = 10, p2 = 1.5, color = "seagreen")
butterfly(p1 = 20, p2 = 4, color = "blue")
par(old_par)

## Finish R commands for lab session 1


## Start R commands for lab session 2

# Section 2.1 -------------------------------------------------
# Vectors and matrices: creating, indexing and updating them.

x <- c(1, 4, 7, 10, 13)
x[1] # gives the first element of x.
x[2:4] # gives the elements x[2], x[3], x[4].
x[-(2:4)] # gives all but x[2], x[3], x[4].

y <- 5:15 # the integers from 5 to 15

x <- seq(from = 1, to = 13, by = 3) # a better way of inputting the x above.

?seq # prints out the help file

a1 <- c(1, 3, 5, 6, 8, 21) # if you have to input irregular data.

# The argument of seq() is called length.out; it is written out in full
# instead of relying on partial matching of `length`.
a2 <- seq(5, 25, length.out = 5)
a3 <- c(a1, a2)
a4 <- seq(from = min(a1), to = max(a1), length.out = 10)
a5 <- rep(2, 5)
a6 <- c(1, 3, 9)
a7 <- rep(a6, times = 2)
a8 <- rep(a6, each = 2)
a9 <- rep(a6, c(2, 3, 1))
cbind(a7, a8, a9) # Can you see the differences between a7, a8 and a9?


y <- matrix(1:6, nrow = 3, ncol = 2) # creates a 3 by 2 matrix, called y.
# You can access parts of y by calling things like:
y[1, 2] # gives the first row second column entry of y
y[1, ] # gives the first row of y
y[, 2] # gives the second column of y and so on.

# Individual elements of vectors or matrices, or whole rows or columns of
# matrices may be updated by assigning them new values, e.g.

a1[1] <- 3

y[1, 2] <- 3

y[, 2] <- c(2, 2, 2)

# You can do arithmetic with the matrices, for example suppose
x <- -matrix(1:6, nrow = 3, ncol = 2)
# Now you can simply write

z <- x + y

# Section 2.2 -------------------------------------------------
# Create a data frame called dframe by issuing the command:
dframe <- data.frame(x = 1:10, y = rnorm(10))

# You can add a new column to a data frame, dframe say, by issuing:
dframe$xy <- dframe$x * dframe$y

# Certain statistical operations on vectors result in scalars
mean(dframe$x)
var(dframe$x)

# View() opens a data viewer window, so it is only called in an interactive
# session; this lets the script also run non-interactively without stopping.
if (interactive()) {
  View(dframe)
}

# A list can hold components of different types and lengths
myresults <- list(mean = 10, sd = 3.32, values = 5:15)

# Section 2.3 -------------------------------------------------
# Factors and logical subsetting.

citizen <- factor(c("uk", "us", "no", "in", "es", "in"))

table(citizen) # counts for each level

levels(citizen) # the distinct categories of the factor

# Assuming you read the billionaire data set already.
# Since R 4.0.0 read.table() keeps text columns as character vectors, for
# which levels() returns NULL and `levels<-` renames nothing. The region
# column is therefore converted to a factor first.
bill$region <- factor(bill$region)

levels(bill$region)

# Replace the region codes by full names.
# NOTE: this assumes the five levels printed above are the codes for Asia,
# Europe, Mid-East, Other and USA in exactly this order -- confirm before
# relying on the new names.
levels(bill$region) <- c("Asia", "Europe", "Mid-East", "Other", "USA")

a1[a1 > 5] # the elements of a1 that are larger than 5

bill$wealth > 5 # TRUE or FALSE for each row

# The levels were renamed above, so the comparison uses the new name "Asia"
# (the old code "A" no longer occurs in the factor).
bill$region == "Asia"

# Rows with wealth strictly greater than 5
bill.wealth.ge5 <- bill[bill$wealth > 5, ]

bill.wealth.ge5

# Rows from the first region, now called "Asia"
bill.region.A <- bill[bill$region == "Asia", ]

bill.region.A

x <- 1:10

x > 3 & x < 7 # element-wise AND

x < 3 | x > 7 # element-wise OR

a <- seq(1, 10, by = 2) # 1, 3, 5, 7, 9

oddrows <- bill[a, ] # the rows of bill at the positions held in a
## Finish R commands for lab session 2


## Start R commands for lab session 3
  
# Set working directory and then read the data set.
# header = TRUE (not T, which can be overwritten): the first row holds the column names.
bill <- read.table("billionaires.txt", header = TRUE)

# Section 3.1 -------------------------------------------------
# A matrix with 4 columns holding 1 to 12, filled row by row (byrow = TRUE,
# written out in full because T can be overwritten); type x to see it.
x <- matrix(1:12, byrow = TRUE, ncol = 4)

apply(x, 2, mean) # produces four column means of x

apply(x, 1, mean) # produces three row means of x

# Mean and standard deviation of wealth within each region
tapply(bill$wealth, bill$region, mean)
tapply(bill$wealth, bill$region, sd)

# The same means rounded to two decimal places, which look nicer
round(tapply(bill$wealth, bill$region, mean), digits = 2)

?tapply # help page for tapply()


# Section 3.2 -------------------------------------------------

hist(bill$wealth) # produces a dull looking plot.

hist(bill$wealth, nclass = 20) # produces a more detailed plot.

# The same detailed plot with an x axis label and a title.
hist(bill$wealth, nclass = 20, xlab = "Wealth",
     main = "Histogram of wealth of billionaires")

boxplot(wealth ~ region, data = bill, col = 2:6) # Side by side box plots of wealth by region.

boxplot(age ~ region, data = bill, col = 2:6) # Age distribution of the wealthy by region.

plot(bill$age, bill$wealth) # Very dull plot.

plot(bill$age, bill$wealth, xlab = "Age", ylab = "Wealth", pch = "*") # A bit better.

# type = "n" lays out the plot area but does not plot anything.
plot(bill$age, bill$wealth, xlab = "Age", ylab = "Wealth", type = "n")

# Add the points to the empty plot as text labels showing the region.
# col = 2:6 on its own is recycled down the rows, so the colour would depend
# on the row number rather than on the region. Indexing the colours by the
# region gives every region one colour, in the same order as the boxes in the
# box plots above. NOTE: assumes at most five distinct regions -- confirm.
text(bill$age, bill$wealth, labels = bill$region, cex = 0.7,
     col = (2:6)[as.integer(factor(bill$region))])
# Definitely a better looking plot where we can grasp a bit more information.

# Install ggplot2 only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(ggplot2)

# Since R 4.0.0 read.table() keeps text columns as character vectors, for
# which `levels<-` renames nothing, so region is converted to a factor first.
# NOTE: the renaming assumes the five region codes sort into the order
# Asia, Europe, Mid-East, Other, USA -- confirm with levels(bill$region).
bill$region <- factor(bill$region)
levels(bill$region) <- c("Asia", "Europe", "Mid-East", "Other", "USA")

# Scatterplot of wealth against age; colour shows region, size shows wealth
g1 <- ggplot(data = bill, aes(x = age, y = wealth)) +
  geom_point(aes(col = region, size = wealth))
g1

# Add a loess smooth without its confidence band (se = FALSE)
g2 <- g1 + geom_smooth(method = "loess", se = FALSE)
g2

# Add a subtitle, axis labels and a caption
g3 <- g2 + labs(subtitle = "Wealth vs Age of Billionaires",
                x = "Age", y = "Wealth (Billion US $)",
                caption = "Source: Fortune Magazine, 1992.")

g3


# Section 3.3 -------------------------------------------------
# header = TRUE is written out in full: `head` only works through partial
# matching of the argument name and T can be overwritten.
errors <- read.csv("2019ageguess.csv", header = TRUE)
head(errors) # the first few rows

# Q1: How many rows and columns are there in the data set?
dim(errors)

# Q2: Number of students in class
# NOTE: the division by 10 presumably undoes each group being repeated on
# 10 rows of the data set -- confirm against the data.
sum(errors$size) / 10 # total number of students
sum(errors$females) / 10 # total number of females
sum(errors$size) / 10 - sum(errors$females) / 10 # number of male students

# Q3: Note down the number of photographed persons for each unique value of age.
# Divided by 55 since each of the 55 groups guessed the photos.
table(errors$tru_age) / 55

# Q4: Cross tabulation of sex by race.
# Divided by 55 since each of the 55 groups guessed the photos.
table(errors$sex, errors$race) / 55

# Q5: What are the minimum and maximum true ages of the photographed mathematicians?
# The minimum and the maximum can be found directly this way:
min(errors$tru_age)
max(errors$tru_age)

# Q6: Obtain a barplot of the true age distribution.
barplot(table(errors$tru_age) / 55)

# Q7: Obtain a histogram of the estimated age column and compare this with
# the true age distribution seen in the barplot drawn above.
hist(errors$est_age)

# Q8: Plot estimated ages against true ages.
plot(errors$tru_age, errors$est_age)

# Q9: What are the means and standard deviations for the columns:
# size, females, est_age, tru_age, error and abs_error?
summary(errors)

# Alternatively, column by column. First the means:
mean(errors$size)
mean(errors$females)
mean(errors$est_age)
mean(errors$tru_age)
mean(errors$error)
mean(errors$abs_error)
# Then the standard deviations:
sd(errors$size)
sd(errors$females)
sd(errors$est_age)
sd(errors$tru_age)
sd(errors$error)
sd(errors$abs_error)

# Q10: What is the mean number of males in each group? What is the mean number of females in each group?
# Mean number of males in each group - first, create a new variable containing
# the number of males in a group. The column is called females; the full name
# is used instead of relying on partial matching of `female` by `$`.
errors$males <- errors$size - errors$females
head(errors$males) # the first few values of the new variable
mean(errors$males)

# Mean number of females in each group
mean(errors$females)

# Q11: How many of the photographs were of each race?
table(errors$race) / 55

# Q12: Note down the frequency table of the sign of the errors.
# sign() gives -1, 0 or 1 for negative, zero and positive errors.
errors$sign <- sign(errors$error)
table(errors$sign)

# Q13: Obtain a histogram for the errors and another for the absolute errors.
# Which one is bell-shaped and why?
hist(errors$error)
hist(errors$abs_error)


# Q14: Obtain a histogram for the square-root of absolute errors.
hist(sqrt(errors$abs_error))


# Q15: Draw a boxplot of the absolute errors.
boxplot(errors$abs_error)


# Q16: Is it easier to guess the ages of female mathematicians?
errors$sex <- factor(errors$sex) # turn sex into a factor variable
levels(errors$sex)
tapply(errors$abs_error, errors$sex, mean) # mean absolute error for each sex

# Also draw a side by side boxplot of the absolute errors for the two groups
# of mathematicians: males and females.
boxplot(abs_error ~ sex, data = errors, col = c(2, 4))

# Q17: Is it easier to guess the ages of black mathematicians?
tapply(errors$abs_error, errors$race, mean) # mean absolute error for each race

# Q18: How would you order the mean absolute error by race?
boxplot(abs_error ~ race, data = errors, col = c(7, 8, 5, 0))

# Q19: Is it easier to guess the ages of younger mathematicians?
tapply(errors$abs_error, errors$tru_age, mean) # mean absolute error for each true age

# Q20: Which person's age is the most difficult to guess?
boxplot(abs_error ~ photo, data = errors, col = heat.colors(n = 8))