Ellipses for bivariate normal

Multivariate Analysis

Freddy Hernández-Barajas

MASS package to simulate from a \(N_p\)

library(MASS)

Generating one sample

n <- 100
Media <- c(-1, 2)
Sigma <- matrix(c(10, -4,
                  -4, 20), byrow=TRUE, ncol=2)

df <- mvrnorm(n=n, mu=Media, Sigma=Sigma)
df <- as.data.frame(df)

The simulated data

head(df)
          V1         V2
1 -0.8061237 -0.4915237
2 -0.6249777  1.7525209
3  7.8307234 -2.0325077
4 -4.4661228 13.7029644
5  0.3082988 -6.8614756
6 -3.3198190 -6.6730524

The scatterplot

require(ggplot2)
ggplot(df, aes(V1, V2)) + geom_point()

Ellipse

ggplot(df, aes(V1, V2)) +
  geom_point() + stat_ellipse(level=0.97)

Example with medidas_cuerpo2 dataset

The medidas_cuerpo2 dataset contains measures of weight, height, neck circumference and wrist circumference for students at UNAL. Below a figure with an illustration of the measures collected.

The dataset

url <- "https://raw.githubusercontent.com/fhernanb/datos/master/medidas_cuerpo2"
datos <- read.table(file=url, header=TRUE)
Ano Semestre Peso Sexo Estatura circun_cuello circun_muneca
2020 1 47.60 F 1.57 29.5 13.9
2020 1 68.10 M 1.66 38.4 16.0
2020 1 68.00 M 1.90 36.5 16.6
2020 1 80.00 M 1.76 38.0 17.1
2020 1 68.10 M 1.83 38.0 17.1
2020 1 56.10 F 1.66 33.0 14.7
2020 1 54.20 F 1.65 32.5 15.4
2020 1 69.20 M 1.78 40.5 16.5
2020 1 74.30 M 1.68 38.0 16.1
2020 1 73.30 M 1.69 37.5 16.3
2020 1 102.20 M 1.79 41.5 17.1
2020 1 46.70 F 1.49 31.5 13.8
2020 1 63.80 M 1.74 38.0 16.4
2020 1 76.90 M 1.73 39.5 17.6
2020 1 52.50 F 1.52 32.5 14.4
2020 1 67.30 M 1.76 36.5 16.1
2020 1 79.10 M 1.82 38.0 18.0
2020 1 58.40 F 1.62 33.0 14.3
2020 1 59.30 F 1.68 32.0 14.2
2020 1 57.30 F 1.61 32.0 14.7
2020 1 67.60 F 1.64 34.5 15.3
2020 1 62.70 F 1.67 33.0 15.3
2020 1 71.90 M 1.64 38.5 16.8
2020 1 74.90 M 1.75 40.0 16.8
2020 1 73.00 M 1.85 37.2 16.4
2020 1 63.80 M 1.71 35.0 15.6
2023 1 78.10 M 1.69 40.0 17.7
2023 1 53.85 M 1.69 34.1 17.5
2023 1 51.45 F 1.63 30.1 14.4
2023 1 47.95 F 1.59 30.1 14.6
2023 1 70.65 M 1.66 37.6 17.5
2023 1 82.55 M 1.85 40.0 17.5
2023 1 78.00 M 1.66 37.6 17.5
2023 1 71.70 M 1.76 37.1 16.9
2023 1 70.75 F 1.52 38.5 16.4
2023 1 52.80 M 1.75 33.4 14.6
2023 1 48.80 F 1.52 31.1 14.4
2023 1 68.70 M 1.85 36.0 16.0
2023 1 77.70 M 1.79 38.0 15.0
2023 1 82.95 M 1.74 37.0 17.0
2023 1 57.55 F 1.62 30.5 15.4
2023 1 64.75 M 1.83 34.8 16.9
2023 1 62.65 F 1.62 33.1 15.2
2023 1 62.45 M 1.75 35.7 16.0
2023 1 62.75 M 1.75 35.5 16.1
2023 1 84.75 M 1.73 40.2 20.0
2023 1 47.35 F 1.54 31.5 14.6
2023 1 55.45 F 1.54 31.7 15.0
2023 1 67.30 M 1.82 37.1 17.5
2023 1 55.05 F 1.59 31.0 13.9
2023 1 49.85 M 1.67 25.5 15.0
2023 1 55.80 M 1.74 36.0 16.5

Using only weight and height variables from 2023

library(dplyr)
datos %>% filter(Ano == 2023) %>% select(Peso, Estatura, Sexo) -> dt
head(dt)
   Peso Estatura Sexo
1 78.10     1.69    M
2 53.85     1.69    M
3 51.45     1.63    F
4 47.95     1.59    F
5 70.65     1.66    M
6 82.55     1.85    M

Checking normality

library(MVN)
mvn(data=dt[, 1:2], mvnTest="mardia")
$multivariateNormality
             Test        Statistic            p value Result
1 Mardia Skewness  1.0685561773063  0.899228488052355    YES
2 Mardia Kurtosis -1.6477228050437 0.0994095661074923    YES
3             MVN             <NA>               <NA>    YES

$univariateNormality
              Test  Variable Statistic   p value Normality
1 Anderson-Darling   Peso       0.4972    0.1937    YES   
2 Anderson-Darling Estatura     0.3325    0.4900    YES   

$Descriptives
          n      Mean   Std.Dev Median   Min   Max  25th    75th       Skew
Peso     26 63.909615 11.863490  62.70 47.35 84.75 54.15 71.4625  0.2464979
Estatura 26  1.688462  0.101654   1.69  1.52  1.85  1.62  1.7500 -0.1065326
          Kurtosis
Peso     -1.323122
Estatura -1.169331

The scatterplot

require(ggplot2)
ggplot(dt, aes(Peso, Estatura)) + geom_point()

The ellipse

ggplot(dt, aes(Peso, Estatura)) +
  geom_point() + stat_ellipse(level=0.90)

Conditioning

ggplot(dt, aes(Peso, Estatura, color = Sexo == "M")) +
  geom_point() + stat_ellipse(level=0.90)

Other

ggplot(dt, aes(Peso, Estatura, color = Sexo == "M")) + geom_point() +
  stat_ellipse(type = "norm", linetype="solid") +
  stat_ellipse(type = "t", linetype="dashed")