I’m giving a talk for the Kansas City R Users Group on how to get a preliminary impression of relationships between pairs of variables. Here is the R code and output that I will use.
Simple measures of association
There are several different ways of measuring bivariate relationships in a descriptive fashion prior to data analysis. The methods can be largely grouped into measures of relationship between two continuous variables, two categorical variables and measures of a relationship between a categorical variable and a continuous variable.
fn <- "http://lib.stat.cmu.edu/DASL/Datafiles/homedat.html"
h0 <- readLines(fn)
n <- length(h0)
print(n)
## [1] 165
head(h0)
## [1] "<TITLE>Home Prices</TITLE>"
## [2] ""
## [3] ""
## [4] "<hr size=2><center><table border=1 cellpadding=0 cellspacing=0><tr><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"../DataArchive.html\"><IMG SRC=\"../InlineImages/mainmenu.gif\" alt=\"Go to Main Menu\"></a></td></tr></table></td><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"/cgi-bin/dasl.cgi\"><IMG SRC=\"../InlineImages/powersearchsmall.gif\" alt=\"Go to Power Search\"></a></td></tr></table></td><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"../allsubjects.html\"><IMG SRC=\"../InlineImages/allsubjects.gif\" alt=\"Go to Datafile Subjects\"></a></td></tr></table></td></tr></table></center><hr size=2>"
## [5] "<B><DT>Datafile Name:</B> Home Prices"
## [6] ""
tail(h0)
## [1] "872\t1229\t6\t3\t0\t0\t0\t721" "870\t1273\t4\t4\t0\t0\t0\t638"
## [3] "869\t1165\t7\t4\t0\t0\t0\t694" "766\t1200\t7\t4\t0\t0\t1\t634"
## [5] "739\t970\t4\t4\t0\t0\t1\t541" "</PRE>"
the.data.line <- grep("The Data:",h0)
print(the.data.line)
## [1] 44
h1 <- h0[(the.data.line+4):(n-1)]
head(h1)
## [1] "2050\t2650\t13\t7\t1\t1\t0\t1639" "2080\t2600\t*\t4\t1\t1\t0\t1088"
## [3] "2150\t2664\t6\t5\t1\t1\t0\t1193" "2150\t2921\t3\t6\t1\t1\t0\t1635"
## [5] "1999\t2580\t4\t4\t1\t1\t0\t1732" "1900\t2580\t4\t4\t1\t0\t0\t1534"
tail(h1)
## [1] "874\t1236\t3\t4\t0\t0\t0\t707" "872\t1229\t6\t3\t0\t0\t0\t721"
## [3] "870\t1273\t4\t4\t0\t0\t0\t638" "869\t1165\t7\t4\t0\t0\t0\t694"
## [5] "766\t1200\t7\t4\t0\t0\t1\t634" "739\t970\t4\t4\t0\t0\t1\t541"
h2 <- strsplit(h1,"\t")
head(h2)
## [[1]]
## [1] "2050" "2650" "13" "7" "1" "1" "0" "1639"
##
## [[2]]
## [1] "2080" "2600" "*" "4" "1" "1" "0" "1088"
##
## [[3]]
## [1] "2150" "2664" "6" "5" "1" "1" "0" "1193"
##
## [[4]]
## [1] "2150" "2921" "3" "6" "1" "1" "0" "1635"
##
## [[5]]
## [1] "1999" "2580" "4" "4" "1" "1" "0" "1732"
##
## [[6]]
## [1] "1900" "2580" "4" "4" "1" "0" "0" "1534"
tail(h2)
## [[1]]
## [1] "874" "1236" "3" "4" "0" "0" "0" "707"
##
## [[2]]
## [1] "872" "1229" "6" "3" "0" "0" "0" "721"
##
## [[3]]
## [1] "870" "1273" "4" "4" "0" "0" "0" "638"
##
## [[4]]
## [1] "869" "1165" "7" "4" "0" "0" "0" "694"
##
## [[5]]
## [1] "766" "1200" "7" "4" "0" "0" "1" "634"
##
## [[6]]
## [1] "739" "970" "4" "4" "0" "0" "1" "541"
h3 <- as.numeric(unlist(h2))
## Warning: NAs introduced by coercion
head(h3)
## [1] 2050 2650 13 7 1 1
tail(h3)
## [1] 4 4 0 0 1 541
h4 <- matrix(h3,ncol=8,byrow=TRUE)
head(h4)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 2050 2650 13 7 1 1 0 1639
## [2,] 2080 2600 NA 4 1 1 0 1088
## [3,] 2150 2664 6 5 1 1 0 1193
## [4,] 2150 2921 3 6 1 1 0 1635
## [5,] 1999 2580 4 4 1 1 0 1732
## [6,] 1900 2580 4 4 1 0 0 1534
tail(h4)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [112,] 874 1236 3 4 0 0 0 707
## [113,] 872 1229 6 3 0 0 0 721
## [114,] 870 1273 4 4 0 0 0 638
## [115,] 869 1165 7 4 0 0 0 694
## [116,] 766 1200 7 4 0 0 1 634
## [117,] 739 970 4 4 0 0 1 541
var.names.line <- the.data.line+2
print(h0[var.names.line])
## [1] "PRICE SQFT AGE FEATS NE CUST COR TAX"
var.names <- strsplit(h0[var.names.line]," ")
print(var.names)
## [[1]]
## [1] "PRICE" "" "SQFT" "" "AGE" "FEATS" "" ""
## [9] "NE" "" "" "" "CUST" "" "" ""
## [17] "" "COR" "" "" "" "" "" ""
## [25] "" "" "TAX"
var.names <- unlist(var.names)
print(var.names)
## [1] "PRICE" "" "SQFT" "" "AGE" "FEATS" "" ""
## [9] "NE" "" "" "" "CUST" "" "" ""
## [17] "" "COR" "" "" "" "" "" ""
## [25] "" "" "TAX"
var.names <- var.names[nchar(var.names)>0]
print(var.names)
## [1] "PRICE" "SQFT" "AGE" "FEATS" "NE" "CUST" "COR" "TAX"
var.names <- tolower(var.names)
print(var.names)
## [1] "price" "sqft" "age" "feats" "ne" "cust" "cor" "tax"
dimnames(h4)[[2]] <- var.names
head(h4)
## price sqft age feats ne cust cor tax
## [1,] 2050 2650 13 7 1 1 0 1639
## [2,] 2080 2600 NA 4 1 1 0 1088
## [3,] 2150 2664 6 5 1 1 0 1193
## [4,] 2150 2921 3 6 1 1 0 1635
## [5,] 1999 2580 4 4 1 1 0 1732
## [6,] 1900 2580 4 4 1 0 0 1534
tail(h4)
## price sqft age feats ne cust cor tax
## [112,] 874 1236 3 4 0 0 0 707
## [113,] 872 1229 6 3 0 0 0 721
## [114,] 870 1273 4 4 0 0 0 638
## [115,] 869 1165 7 4 0 0 0 694
## [116,] 766 1200 7 4 0 0 1 634
## [117,] 739 970 4 4 0 0 1 541
h5 <- data.frame(h4)
head(h5)
## price sqft age feats ne cust cor tax
## 1 2050 2650 13 7 1 1 0 1639
## 2 2080 2600 NA 4 1 1 0 1088
## 3 2150 2664 6 5 1 1 0 1193
## 4 2150 2921 3 6 1 1 0 1635
## 5 1999 2580 4 4 1 1 0 1732
## 6 1900 2580 4 4 1 0 0 1534
tail(h5)
## price sqft age feats ne cust cor tax
## 112 874 1236 3 4 0 0 0 707
## 113 872 1229 6 3 0 0 0 721
## 114 870 1273 4 4 0 0 0 638
## 115 869 1165 7 4 0 0 0 694
## 116 766 1200 7 4 0 0 1 634
## 117 739 970 4 4 0 0 1 541
The best graphical summary of two continuous variables is a scatterplot. You should include a smooting curve or spline model to the graph to emphasize the general trend and any departures from linearity.
plot(h5$sqft,h5$price)
lines(lowess(h5$price~h5$sqft))
plot(h5$age,h5$price)
sb <- is.finite(h5$age)
lines(lowess(h5$price[sb]~h5$age[sb]))
The best numeric summary of two continuous variables is a correlation coefficient.
cor(h5[,c("price","sqft","age")],use="pairwise.complete.obs")
## price sqft age
## price 1.0000000 0.84479510 -0.16867888
## sqft 0.8447951 1.00000000 -0.03965489
## age -0.1686789 -0.03965489 1.00000000
Correlations should always be rounded to two or maybe even just one significant digit.
round(cor(h5[,c("price","sqft","age")],use="pairwise.complete.obs"),1)
## price sqft age
## price 1.0 0.8 -0.2
## sqft 0.8 1.0 0.0
## age -0.2 0.0 1.0
Anyhting larger than 0.7 or smaller than -0.7 is a strong linear relationship. Anything between 0.3 and 0.7 or between -0.3 and -0.7 is a weak linear relationship. Anything between -0.3 and 0.3 represents little or no linear relationship.
The best graphical summary between a continuous variable and a categorical variable is a boxplot.
boxplot(h5$price~h5$feats)
boxplot(h5$price~h5$ne)
boxplot(h5$price~h5$cust)
boxplot(h5$price~h5$cor)
If your categorical variable is binary, you can also use a scatterplot. The binary variable goes on the y axis and a trend line is critical.
plot(h5$price,h5$ne)
lines(lowess(h5$ne~h5$price))
plot(h5$price,h5$cust)
lines(lowess(h5$cust~h5$price))
plot(h5$price,h5$cor)
lines(lowess(h5$cor~h5$price))
You can also compute a correlaton between a binary variable and a categorical variable. It is equivalent to the point-bisearial correlation.
round(cor(h5[,c("ne","cust","cor")],h5$price),1)
## [,1]
## ne 0.2
## cust 0.6
## cor -0.1
Let’s save the display of a relationship involving two categorical variables until another day.