Tutorial 13.3 - Transformations and standardizations

12 Mar 2015

View R code for preliminaries.

> library(vegan)
> library(ggplot2)
> library(grid)
> #define my common ggplot options
> murray_opts <- opts(panel.grid.major=theme_blank(),
+                            panel.grid.minor=theme_blank(),
+                            panel.border = theme_blank(),
+                            panel.background = theme_blank(),
+                            axis.title.y=theme_text(size=15, vjust=0,angle=90),
+                            axis.text.y=theme_text(size=12),
+                            axis.title.x=theme_text(size=15, vjust=-1),
+                            axis.text.x=theme_text(size=12),
+                            axis.line = theme_segment(),
+                            plot.margin=unit(c(0.5,0.5,1,2),"lines")
+ )

Error: Use 'theme' instead. (Defunct;
last used in version 0.9.1)

> coenocline <- function(x,A0,m,r,a,g, int=T, noise=T) {
+ #Computes the abundance of one species along an environmental gradient (a coenocline response curve)
+ #x - values of the environmental gradient at which the abundance is evaluated
+ #A0 - maximum abundance of the species, reached at the optimum (x=m) before noise and rounding
+ #m - value of the environmental gradient that represents the optimum conditions for the species
+ #r - the species range over the environmental gradient (niche width)
+ #a and g - shape parameters representing the skewness and kurtosis of the response curve
+ # a=g: symmetrical; a>g: negative skew (large left tail); a<g: positive skew (large right tail)
+ #int - if TRUE, the responses are rounded to whole numbers
+ #noise - if TRUE, random normal noise (mean 0, sd A0/10) is added (reflecting random sampling)
+ #NOTE.  negative and NA (including NaN) responses are converted to 0 before any rounding
+ #Returns a numeric vector of abundances, one value per element of x
+          b <- a/(a+g)
+          d <- (b^a)*(1-b)^g
+          cc <- (A0/d)*((((x-m)/r)+b)^a)*((1-(((x-m)/r)+b))^g)
+          if (noise) {n <- A0/10; n[n<0]<-0; cc<-cc+rnorm(length(cc),0,n)}
+          cc[cc<0] <- 0
+          cc[is.na(cc)]<-0
+          if (int) cc<-round(cc,0)
+          cc
+ }
> #plot(coenocline(0:100,40,40,20,1,1, int=T, noise=T), ylim=c(0,100))

Tutorial 13.1 discussed the idea of amalgamating variables together so as to create new, condensed insights into the composition of communities (objects). However, the success and appropriateness of the amalgamations is very much dependent on the characteristics of the original variables (e.g. species).

Some of the techniques are parametric and assume that the variables are:

normally distributed
linearly related
measured on the same scale

Furthermore, the amalgams of many of the techniques can be overly biased towards the patterns of highly abundant species or variables whose scales yield values of large magnitude (such as units of force or pressure). Rare species or variables with values small in magnitude tend to have almost no influence at all.

Ecological/biological multivariate data typically comprise the following:

species abundances (counts). Whilst species abundances are usually expressed in the same units and on the same scale, they tend to be positively skewed (since they are truncated at zero).
environmental data (measurements). Environmental data tend to be measured on disparate scales (pH, degrees C, mm, kg, etc) and thus can present issues of non-normality and non-linearity
morphometric data (counts, measurements). Morphometric data as used in taxonomic studies often represent a mixture of measurements (lengths, masses), binary variables (feature present/absent) and counts, and thus normality and linearity can also pose issues.

Hence, the first step of most multivariate analyses is to transform the data so that the data used to create the amalgamations (and thus the amalgamations themselves) best represent the broad characteristics of the communities.

Again we will use a simulated data set introduced in the previous Tutorial. This multivariate dataset comprises the abundances of 10 species from each of 10 sites located throughout a landscape.

Details of data generation

> set.seed(1)
> x <- seq(0,50,l=10)
> n <- 10
> sp1<-coenocline(x=x,A0=5,m=0,r=2,a=1,g=1,int=T, noise=T)
> sp2<-coenocline(x=x,A0=70,m=7,r=30,a=1,g=1,int=T, noise=T)
> sp3<-coenocline(x=x,A0=50,m=15,r=30,a=1,g=1,int=T, noise=T)
> sp4<-coenocline(x=x,A0=7,m=25,r=20,a=0.4,g=0.1,int=T, noise=T)
> sp5<-coenocline(x=x,A0=40,m=30,r=30,a=0.6,g=0.5,int=T, noise=T)
> sp6<-coenocline(x=x,A0=15,m=35,r=15,a=0.2,g=0.3,int=T, noise=T)
> sp7<-coenocline(x=x,A0=20,m=45,r=25,a=0.5,g=0.9,int=T, noise=T)
> sp8<-coenocline(x=x,A0=5,m=45,r=5,a=1,g=1,int=T, noise=T)
> sp9<-coenocline(x=x,A0=20,m=45,r=15,a=1,g=1,int=T, noise=T)
> sp10<-coenocline(x=x,A0=30,m=50,r=5,a=1,g=1,int=T, noise=T)
> X <- cbind(sp1, sp10,sp9,sp2,sp3,sp8,sp4,sp5,sp7,sp6)
> #X<-X[c(1,10,9,2,3,8,4,5,7,6),] 
> colnames(X) <- paste("Sp",1:10,sep="")
> rownames(X) <- paste("Site", c(1,10,9,2,3,8,4,5,7,6), sep="")
> X <- X[c(1,4,5,7,8,10,9,6,3,2),]
> data <- data.frame(Sites=factor(rownames(X),levels=rownames(X)), X)

Sites	Sp1	Sp2	Sp3	Sp4	Sp5	Sp6	Sp7	Sp8	Sp9	Sp10
Site1	5	0	0	65	5	0	0	0	0	0
Site2	0	0	0	25	39	0	6	23	0	0
Site3	0	0	0	6	42	0	6	31	0	0
Site4	0	0	0	0	0	0	0	40	0	14
Site5	0	0	6	0	0	0	0	34	18	12
Site6	0	29	12	0	0	0	0	0	22	0
Site7	0	0	21	0	0	5	0	0	20	0
Site8	0	0	0	0	13	0	6	37	0	0
Site9	0	0	0	60	47	0	4	0	0	0
Site10	0	0	0	72	34	0	0	0	0	0

I will also introduce another simulated data set that comprises five biophysical measurements made from the 10 sites. These biophysical environmental data include pH (log scale), Pressure (Pa), Altitude (m), Slope (degrees) and Substrate (categorical: Quartz or Shale).

Details of data generation

> set.seed(1)
> Site <- gl(10,1,10,lab=paste('Site',1:10, sep=""))
> Y <- matrix(c(
+ 6.1,4.2,101325,2,
+ 6.7,9.2,101352,510,
+ 6.8,8.6,101356,546,
+ 7.0,7.4,101372,758,
+ 7.2,5.8,101384,813,
+ 7.5,8.4,101395,856,
+ 7.5,0.5,101396,854,
+ 7.0,11.8,101370,734,
+ 8.4,8.2,101347,360,
+ 6.2,1.5,101345,356
+   ),10,4, byrow=TRUE)
> colnames(Y) <- c('pH','Slope', 'Pressure', 'Altitude')
> Substrate <- factor(c('Quartz','Shale','Shale','Shale','Shale','Quartz','Quartz','Shale','Quartz','Quartz'))
> enviro <- data.frame(Site,Y,Substrate)

Site	pH	Slope	Pressure	Altitude	Substrate
Site1	6.1	4.2	101325	2	Quartz
Site2	6.7	9.2	101352	510	Shale
Site3	6.8	8.6	101356	546	Shale
Site4	7.0	7.4	101372	758	Shale
Site5	7.2	5.8	101384	813	Shale
Site6	7.5	8.4	101395	856	Quartz
Site7	7.5	0.5	101396	854	Quartz
Site8	7.0	11.8	101370	734	Shale
Site9	8.4	8.2	101347	360	Quartz
Site10	6.2	1.5	101345	356	Quartz

Simple transformations

Simple transformations such as those outlined in Tutorial 4.1 can be used to help address issues of normality and non-linearity. Indeed, species abundance data are routinely fourth-root ($\sqrt[4]{x}$) or $\log(x+1)$ transformed prior to multivariate analysis.

In addition, a rather drastic transformation is that which transforms count data into binary (presence/absence) data. When performed on the entire data frame, such a transformation removes the distinction between dominant and rare species. It can be useful when applied to a single variable whose values are predominantly 0's and 1's.

Many of the common transformation/standardization routines used in ecology are supported in R by the decostand function within the vegan package.

> library(vegan)
> decostand(data[,-1], method="pa")

       Sp1 Sp2 Sp3 Sp4 Sp5 Sp6 Sp7 Sp8
Site1    1   0   0   1   1   0   0   0
Site2    0   0   0   1   1   0   1   1
Site3    0   0   0   1   1   0   1   1
Site4    0   0   0   0   0   0   0   1
Site5    0   0   1   0   0   0   0   1
Site6    0   1   1   0   0   0   0   0
Site7    0   0   1   0   0   1   0   0
Site8    0   0   0   0   1   0   1   1
Site9    0   0   0   1   1   0   1   0
Site10   0   0   0   1   1   0   0   0
       Sp9 Sp10
Site1    0    0
Site2    0    0
Site3    0    0
Site4    0    1
Site5    1    1
Site6    1    0
Site7    1    0
Site8    0    0
Site9    0    0
Site10   0    0

Standardizations

Close inspection of the above species abundance data reveals that whilst some of the species are relatively abundant (Species 4, 5 and 8), other species (such as Species 7 and especially 1 and 6) are relatively rare. As previously stated, most numerical methods of amalgamating will be more heavily influenced by the more abundant species. Sometimes this is a desired outcome - you might want your measures of the community to reflect this dominance. That is, you may want a numerical description of what is visibly obvious in the field. Yet for other purposes, you may want the patterns to be more representative of the subtleties and complexities of the communities.

The environmental data highlight a different set of common issues:

some variables (particularly Pressure) have values of much greater magnitude than others
despite having values of very high magnitude, the Pressure measurements have relatively little spread (variation).
the altitude measurements have relatively high levels of spread

Rather than simply transforming the variables in isolation, standardizations transform the values relative to other variables, objects or both. There are numerous ways that multivariate data can be standardized in an attempt to alter the balance of weightings and inter-relationships. Most work by adjusting the values such that some property such as means, maximums, totals and/or spread are equivalent for each species.

Total per row. Each value in a row is divided by the total for the row. This is a simple standardization that can dampen the influence of objects (sites) that have very high abundances.

> library(vegan)
> decostand(data[,-1], method="total")

           Sp1    Sp2     Sp3     Sp4
Site1  0.06667 0.0000 0.00000 0.86667
Site2  0.00000 0.0000 0.00000 0.26882
Site3  0.00000 0.0000 0.00000 0.07059
Site4  0.00000 0.0000 0.00000 0.00000
Site5  0.00000 0.0000 0.08571 0.00000
Site6  0.00000 0.4603 0.19048 0.00000
Site7  0.00000 0.0000 0.45652 0.00000
Site8  0.00000 0.0000 0.00000 0.00000
Site9  0.00000 0.0000 0.00000 0.54054
Site10 0.00000 0.0000 0.00000 0.67925
           Sp5    Sp6     Sp7    Sp8
Site1  0.06667 0.0000 0.00000 0.0000
Site2  0.41935 0.0000 0.06452 0.2473
Site3  0.49412 0.0000 0.07059 0.3647
Site4  0.00000 0.0000 0.00000 0.7407
Site5  0.00000 0.0000 0.00000 0.4857
Site6  0.00000 0.0000 0.00000 0.0000
Site7  0.00000 0.1087 0.00000 0.0000
Site8  0.23214 0.0000 0.10714 0.6607
Site9  0.42342 0.0000 0.03604 0.0000
Site10 0.32075 0.0000 0.00000 0.0000
          Sp9   Sp10
Site1  0.0000 0.0000
Site2  0.0000 0.0000
Site3  0.0000 0.0000
Site4  0.0000 0.2593
Site5  0.2571 0.1714
Site6  0.3492 0.0000
Site7  0.4348 0.0000
Site8  0.0000 0.0000
Site9  0.0000 0.0000
Site10 0.0000 0.0000

Maximum per column. Each value in a column is divided by the maximum of the column. This is a simple standardization that can even up the influence of each of the columns. This is effective for variables that are measured on the same scale and that have similar spread (variance). Standardized values will range from 0 to 1. Note this standardization will not address changes in the spread or variability of variables measured on different scales.

> library(vegan)
> decostand(data[,-1], method="max")

       Sp1 Sp2    Sp3     Sp4    Sp5
Site1    1   0 0.0000 0.90278 0.1064
Site2    0   0 0.0000 0.34722 0.8298
Site3    0   0 0.0000 0.08333 0.8936
Site4    0   0 0.0000 0.00000 0.0000
Site5    0   0 0.2857 0.00000 0.0000
Site6    0   1 0.5714 0.00000 0.0000
Site7    0   0 1.0000 0.00000 0.0000
Site8    0   0 0.0000 0.00000 0.2766
Site9    0   0 0.0000 0.83333 1.0000
Site10   0   0 0.0000 1.00000 0.7234
       Sp6    Sp7   Sp8    Sp9   Sp10
Site1    0 0.0000 0.000 0.0000 0.0000
Site2    0 1.0000 0.575 0.0000 0.0000
Site3    0 1.0000 0.775 0.0000 0.0000
Site4    0 0.0000 1.000 0.0000 1.0000
Site5    0 0.0000 0.850 0.8182 0.8571
Site6    0 0.0000 0.000 1.0000 0.0000
Site7    1 0.0000 0.000 0.9091 0.0000
Site8    0 1.0000 0.925 0.0000 0.0000
Site9    0 0.6667 0.000 0.0000 0.0000
Site10   0 0.0000 0.000 0.0000 0.0000

Wisconsin double standardization. Each value is first standardized by the column maximum before being standardized by the row total. This standardization tends to enhance the patterns in the data and therefore is a popular choice.

> library(vegan)
> wisconsin(data[,-1])

          Sp1    Sp2    Sp3     Sp4
Site1  0.4977 0.0000 0.0000 0.44933
Site2  0.0000 0.0000 0.0000 0.12617
Site3  0.0000 0.0000 0.0000 0.03028
Site4  0.0000 0.0000 0.0000 0.00000
Site5  0.0000 0.0000 0.1016 0.00000
Site6  0.0000 0.3889 0.2222 0.00000
Site7  0.0000 0.0000 0.3438 0.00000
Site8  0.0000 0.0000 0.0000 0.00000
Site9  0.0000 0.0000 0.0000 0.33333
Site10 0.0000 0.0000 0.0000 0.58025
           Sp5    Sp6    Sp7    Sp8
Site1  0.05295 0.0000 0.0000 0.0000
Site2  0.30152 0.0000 0.3634 0.2089
Site3  0.32472 0.0000 0.3634 0.2816
Site4  0.00000 0.0000 0.0000 0.5000
Site5  0.00000 0.0000 0.0000 0.3024
Site6  0.00000 0.0000 0.0000 0.0000
Site7  0.00000 0.3438 0.0000 0.0000
Site8  0.12563 0.0000 0.4542 0.4201
Site9  0.40000 0.0000 0.2667 0.0000
Site10 0.41975 0.0000 0.0000 0.0000
          Sp9   Sp10
Site1  0.0000 0.0000
Site2  0.0000 0.0000
Site3  0.0000 0.0000
Site4  0.0000 0.5000
Site5  0.2911 0.3049
Site6  0.3889 0.0000
Site7  0.3125 0.0000
Site8  0.0000 0.0000
Site9  0.0000 0.0000
Site10 0.0000 0.0000

Range. Each value in a column is standardized into a range of 0 to 1. This is one way to adjust for differences in the spread of values. For example, it could be used to even up the spread of measurements in the simulated environmental data.

> library(vegan)
> #we need to first convert the categorical variable (Substrate) into a numeric
> enviro1 <- within(enviro, Substrate <- as.numeric(Substrate))
> decostand(enviro1[,-1], method="range")

        pH  Slope Pressure Altitude
1  0.00000 0.3274   0.0000   0.0000
2  0.26087 0.7699   0.3803   0.5948
3  0.30435 0.7168   0.4366   0.6370
4  0.39130 0.6106   0.6620   0.8852
5  0.47826 0.4690   0.8310   0.9496
6  0.60870 0.6991   0.9859   1.0000
7  0.60870 0.0000   1.0000   0.9977
8  0.39130 1.0000   0.6338   0.8571
9  1.00000 0.6814   0.3099   0.4192
10 0.04348 0.0885   0.2817   0.4145
   Substrate
1          0
2          1
3          1
4          1
5          1
6          0
7          0
8          1
9          0
10         0

Centre. Each value in a column is standardized to have a mean of 0 by subtracting the column mean from each of the values in the column. This is useful when the variables all have similar absolute spreads of values yet vastly different magnitudes.

> #we need to first convert the categorical variable (Substrate) into a numeric
> enviro1 <- within(enviro, Substrate <- as.numeric(Substrate))
> apply(enviro1[,-1],2,scale, scale=FALSE)

         pH Slope Pressure Altitude
 [1,] -0.94 -2.36    -39.2   -576.9
 [2,] -0.34  2.64    -12.2    -68.9
 [3,] -0.24  2.04     -8.2    -32.9
 [4,] -0.04  0.84      7.8    179.1
 [5,]  0.16 -0.76     19.8    234.1
 [6,]  0.46  1.84     30.8    277.1
 [7,]  0.46 -6.06     31.8    275.1
 [8,] -0.04  5.24      5.8    155.1
 [9,]  1.36  1.64    -17.2   -218.9
[10,] -0.84 -5.06    -19.2   -222.9
      Substrate
 [1,]      -0.5
 [2,]       0.5
 [3,]       0.5
 [4,]       0.5
 [5,]       0.5
 [6,]      -0.5
 [7,]      -0.5
 [8,]       0.5
 [9,]      -0.5
[10,]      -0.5

Standardize. Each value in a column is standardized to a mean of 0 and standard deviation of 1. That is, each of the variables is normalized. This is another way to adjust for differences in the spread of values.

> #we need to first convert the categorical variable (Substrate) into a numeric
> enviro1 <- within(enviro, Substrate <- as.numeric(Substrate))
> apply(enviro1[,-1],2,scale)

> #OR
> library(vegan)
> decostand(enviro1[,-1], method="standardize")

         pH   Slope Pressure Altitude
1  -1.39885 -0.6636  -1.6863  -2.0691
2  -0.50597  0.7423  -0.5248  -0.2471
3  -0.35715  0.5736  -0.3527  -0.1180
4  -0.05953  0.2362   0.3355   0.6424
5   0.23810 -0.2137   0.8517   0.8396
6   0.68455  0.5173   1.3249   0.9938
7   0.68455 -1.7039   1.3679   0.9867
8  -0.05953  1.4733   0.2495   0.5563
9   2.02387  0.4611  -0.7399  -0.7851
10 -1.25004 -1.4227  -0.8259  -0.7995
   Substrate
1    -0.9487
2     0.9487
3     0.9487
4     0.9487
5     0.9487
6    -0.9487
7    -0.9487
8     0.9487
9    -0.9487
10   -0.9487

As with simple transformations in statistical models, it is usually advised that multivariate analyses be repeated with a range of transformation and standardization options so as to gain an appreciation of the influence of dominant/rare species, populous and sparse sites, large or varied measurements. If the various standardizations ultimately yield similar patterns amongst communities, then it suggests that the patterns are stable within the scale of your observations and that any one of the outcomes can be used to describe the patterns. If the patterns are substantially different, then it is likely that the different standardizations are drawing out different scales of community patterns.

Worked Examples

Basic statistics references

Legendre and Legendre
Quinn & Keough (2002) - Chpt 17

Standardizations

The following community data represent the abundances of three species of gastropods in five quadrats (ranging from high shore marsh - Quadrat 1, to low shore marsh - Quadrat 5) in a saltmarsh.

Download gastropod data set

Format of the gastropod data set

Salinator	Ophicardelus	Marinula
4	0	1
9	3	0
9	4	1
6	2	0
0	1	1

Salinator	Number of Salinator gastropods - variable
Ophicardelus	Number of Ophicardelus gastropods - variable
Marinula	Number of Marinula gastropods - variable
Q1-Q5	Quadrats - these are the objects

Open the gastropod data set.

Show code

> gastropod <- read.csv('../downloads/data/gastropod.csv')
> gastropod

  Salinator Ophicardelus Marinula
1         4            0        1
2         9            3        0
3         9            4        1
4         6            2        0
5         0            1        1

Before proceeding with any multivariate analyses, it is a good idea to get a 'feel' for your data. The gastropod data set is intentionally very small so that we can help relate various calculated properties to what we can see by simply inspecting the counts.

To build up a picture of these data, generate the following exploratory properties:

Scale of each of the species (column maximums)

Show code

> apply(gastropod,2,max)

   Salinator Ophicardelus     Marinula 
           9            4            1

Scale of each of the species (column means)

Show code

> apply(gastropod,2,mean)

   Salinator Ophicardelus     Marinula 
         5.6          2.0          0.6

Variability of each of the species (column variance)

Show code

> apply(gastropod,2,var)

   Salinator Ophicardelus     Marinula 
        14.3          2.5          0.3

Abundances in each quadrat (row totals)
Show code
> apply(gastropod,1,sum)
[1] 5 12 14 8 2

Correlations between species

Show code

> cor(gastropod)

             Salinator Ophicardelus
Salinator       1.0000       0.7944
Ophicardelus    0.7944       1.0000
Marinula       -0.4587      -0.2887
             Marinula
Salinator     -0.4587
Ophicardelus  -0.2887
Marinula       1.0000

We intend to use these data in some sort of multivariate analysis. Typically, before doing so, we standardize the data in order to ensure that certain features are honored in the analysis. Standardize the gastropod data to achieve the following:

ensure that the rare and abundant species alike have similar weighting and are constrained to the range of 0-1

Show code

> library(vegan)
> gast1 <- decostand(gastropod,"max")
> gast1

  Salinator Ophicardelus Marinula
1    0.4444         0.00        1
2    1.0000         0.75        0
3    1.0000         1.00        1
4    0.6667         0.50        0
5    0.0000         0.25        1

> apply(gast1,2,max)

   Salinator Ophicardelus     Marinula 
           1            1            1

> apply(gast1,2,range)

     Salinator Ophicardelus Marinula
[1,]         0            0        0
[2,]         1            1        1

ensure that all species have similar weighting yet maintain their variability. This could be important if you want multivariate patterns to reflect heterogeneity (many analyses are drawn towards higher variability).

Show code

> #center the data
> gast2<-apply(gastropod,2,scale,scale=FALSE)
> gast2

     Salinator Ophicardelus Marinula
[1,]      -1.6           -2      0.4
[2,]       3.4            1     -0.6
[3,]       3.4            2      0.4
[4,]       0.4            0     -0.6
[5,]      -5.6           -1      0.4

> apply(gast2,2,mean)

   Salinator Ophicardelus     Marinula 
   3.554e-16    0.000e+00    2.220e-17

> apply(gast2,2,var)

   Salinator Ophicardelus     Marinula 
        14.3          2.5          0.3

ensure that all species have similar weighting. The influences of highly abundant and/or variable species are suppressed and those of rare species are enhanced so that all have similar influence.

Show code

> #scale data to mean=0 and variance of 1
> gast3<-apply(gastropod,2,scale)
> #OR
> library(vegan)
> gast3<-decostand(gastropod,method="standardize")
> gast3

  Salinator Ophicardelus Marinula
1   -0.4231      -1.2649   0.7303
2    0.8991       0.6325  -1.0954
3    0.8991       1.2649   0.7303
4    0.1058       0.0000  -1.0954
5   -1.4809      -0.6325   0.7303

> apply(gast3,2,mean)

   Salinator Ophicardelus     Marinula 
   1.193e-16    0.000e+00    0.000e+00

> apply(gast3,2,var)

   Salinator Ophicardelus     Marinula 
           1            1            1

ensure that all sites have similar weightings and are constrained to a range of 0-1.

Show code

> library(vegan)
> gast4 <- decostand(gastropod,"total")
> gast4

  Salinator Ophicardelus Marinula
1    0.8000       0.0000  0.20000
2    0.7500       0.2500  0.00000
3    0.6429       0.2857  0.07143
4    0.7500       0.2500  0.00000
5    0.0000       0.5000  0.50000

> apply(gast4,1,sum)

[1] 1 1 1 1 1

> cor(gast4)

             Salinator Ophicardelus
Salinator       1.0000      -0.8353
Ophicardelus   -0.8353       1.0000
Marinula       -0.8852       0.4836
             Marinula
Salinator     -0.8852
Ophicardelus   0.4836
Marinula       1.0000

ensure that all species and sites have similar weightings and yet enhance any underlying patterns (increases species correlations for example). This can improve the success of any resulting multivariate analyses.

Show code

> library(vegan)
> # Wisconsin double standardization
> gast5 <- wisconsin(gastropod)
> gast5

  Salinator Ophicardelus Marinula
1    0.3077       0.0000   0.6923
2    0.5714       0.4286   0.0000
3    0.3333       0.3333   0.3333
4    0.5714       0.4286   0.0000
5    0.0000       0.2000   0.8000

> cor(gast5)

             Salinator Ophicardelus
Salinator       1.0000       0.6123
Ophicardelus    0.6123       1.0000
Marinula       -0.9241      -0.8680
             Marinula
Salinator     -0.9241
Ophicardelus  -0.8680
Marinula       1.0000

Transformation	Syntax
log_e	> new_var <- log(old_var)
log₁₀	> new_var <- log10(old_var)
square root	> new_var <- sqrt(old_var)
arcsin	> new_var <- asin(sqrt(old_var))
scale (mean=0, unit variance)	> new_var <- scale(old_var)

Transformation

Syntax

log_e

> new_var <- log(old_var)

log₁₀

> new_var <- log10(old_var)

square root

> new_var <- sqrt(old_var)

arcsin

> new_var <- asin(sqrt(old_var))

scale (mean=0, unit variance)

> new_var <- scale(old_var)

Sample number	Sample mean
1	12.1
2	12.7
3	12.5
Mean of sample means	12.433
SD of sample means	0.306