#load directly from chmullig.com bdata <- read.csv("http://chmullig.com/wp-content/uploads/2012/06/births.csv") #filter bdata<-bdata[(bdata$births > 1000),] bdata$smoothbirths <- bdata$births bdata$smoothbirths[bdata$month==2 & bdata$day==29] <- bdata$births[bdata$month==2 & bdata$day==29]*4 bdata$order <- rank(bdata$month + bdata$day/100) birthloess = loess(births ~ order, span=1, iter=6, bdata) bdata$predict <- predict(birthloess, bdata$order) bdata$flag <- ifelse(bdata$births - bdata$predict > sd(bdata$smoothbirths)*2.3, 3, ifelse(bdata$births - bdata$predict < sd(bdata$smoothbirths)*-2.3, 1, 0)) #special days we might care about bdata$flag[bdata$month==2 & bdata$day==14] <- 3 bdata$flag[bdata$month==10 & bdata$day==31] <- 1 monthstarts <- by(bdata$order, list(bdata$month), min) plot(births ~ order, data=bdata, xaxt="n", xlab="Day", ylab="Births", main="Births by Day of Year", type="l", lwd=0) axis(1, at=monthstarts, labels=c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), tck=1, lwd=0.5, lty=2, col="grey", cex.axis=0.99) title(sub="Source: National Vital Statistics Natality Data, as provided by Google BigQuery. Graph by Chris Mulligan @ chmullig.com", cex.sub=0.75, adj=1) #add the mean line abline(a=sum(bdata$births)/365.25, b=0, col="red", lwd=0.5) #add the smoothed line lines(bdata$order, bdata$predict, col="blue", lwd=0.5) legend("bottomright", c("Births", "Smoothed", "Mean"), col=c("black", "blue", "red"), lty=1, lwd=2, bty="n") #label some outliers with(bdata, text(order[flag==1], births[flag==1], paste(month[flag==1], "/", day[flag==1]), pos=1, cex=1)) with(bdata, text(order[flag==3], births[flag==3], paste(month[flag==3], "/", day[flag==3]), pos=3, cex=1)) #write the actual line lines(births ~ order, data=bdata, xaxt="n", xlab="Day", ylab="Births", type="l", lwd=1.5)