Exploaratory analysis of COVID-19 infections in US states

General remarks

This analysis is based on the numbers of confirmed infections and deaths provided by the New York Times at . Testing data is obtained from the covidtracking project at Johns Hopkins university, data at , for more information see .

data <- read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
data2 <- read.csv("https://api.covidtracking.com/v1/states/daily.csv")
states <- unique(data$state)[c(1:50,54)]
statesabr <- unique(data2$state)[c(53,17,6,5,22,
                                   54,48,33,49,41,
                                   11,38,44,12,34,
                                   31,35,7,23,37,
                                   47,14,18,20,26,
                                   40,42,45,9,19,
                                   27,52,51,8,15,
                                   21,39,25,46,3,
                                   10,29,36,32,56,
                                   1,24,2,16,30,55)] 
data <- data[data$state%in%states,-3]
data2 <- data2[data2$state%in%statesabr,-3]
nentries <- dim(data)[1]
udates <- as.Date(unique(data[,1]))
udates <- udates[udates > "2020-02-29"]
ndates <- length(udates)
dates <- 1:ndates
nc <- length(states)
pop <- c(7614,12672,39512,7278,6950,
         5822,28996,1934,3206,4218,
         21478,19453,1059,10617,1360,
         10488,8882,5759,6046,3080,
         6833,1416,6732,4468,5640,
         3957,12802,5149,706,2913,
         6137,624,8536,3565,3155,
         4649,11689,9987,885,3018,
         974,2976,2097,762,579,
         732,1344,4903,1787,1069,1792)
cases <- deaths <- matrix(0,ndates,nc)
for(i in 1:nentries){
   j <- (1:ndates)[udates==as.Date(data$date[i])]
   k <- (1:nc)[states==data$state[i]]
   cases[j,k] <- data$cases[i]
   deaths[j,k] <- data$deaths[i]
}
nentries2 <- dim(data2)[1]
conv2date <- function(x){
   year <- x %/% 10000
   month <- (x %% 10000)%/%100
   day <- x %% 100
   as.Date(paste0(year,"-",month,"-",day))
}
udates2 <- conv2date(unique(data2[,1]))
udates2 <- udates2[udates2 > "2020-02-29"]
ndates2 <- length(udates2)
udates2 <- udates2[ndates2:1]
dates2 <- 1:ndates2
positive <- negative <- testincr <- matrix(0,ndates2,nc)
for(i in 1:nentries2){
   j <- (1:ndates2)[udates2==conv2date(data2$date[i])]
   k <- (1:nc)[statesabr==data2$state[i]]
   positive[j,k] <- if(is.null(data2$positiveIncrease[i])) 0 else data2$positiveIncrease[i]
   negative[j,k] <- if(is.null(data2$negativeIncrease[i])) 0 else data2$negativeIncrease[i]
   testincr[j,k] <- if(is.null(data2$totalTestResultsIncrease[i])) 0 else data2$totalTestResultsIncrease[i]
}
positive[is.na(positive)] <- 0
negative[is.na(negative)] <- 0
tests <- pmax(0,positive) + pmax(0,negative)
tests[testincr>tests] <- testincr[testincr>tests]
dim(tests) <- dim(positive)
# take moving averages over weeks
tests <- rbind(matrix(0,7,51),apply(apply(tests,2,cumsum),2,diff,7)/7)
positive <- rbind(matrix(0,7,51),apply(apply(positive,2,cumsum),2,diff,7)/7)

posprop <-  positive/tests
posprop[is.na(posprop)] <- 0
posprop[posprop>=1] <- 0 # no negatives recorded
posprop[posprop<0] <- 0
tests[tests<0] <- 0
testincr[testincr<0] <- 0
testspermill <- sweep(testincr,2,pop,"/")*1000

deaths[120:121,5] <- 8054 # Massechusets reported -41 deaths on day 122
rate <- sweep(cases,2,pop,"/") 
rate[rate < 1e-4] <- 1e-4
diffrate <- apply(rate,2,diff)
ind <- order(rate[ndates,],decreasing = TRUE)

For modeling we 2use local polynomial regression as implemented in function locpoly in R package KernSmooth. We employ a Gaussian kernel with bandwidth (kernel standard deviation) 1.5 days for estimating rates and 2.5 for estimating derivatives. Polynomial orders are 1 and 2, respectiviley, to minimize boundary bias. dates are in days after March 1.

Please note that the estimated curves for small number of reported cases carry a high variability. This is especially due for early dates and states with low population (ISL, EST). Problems in the data like underreporting and reporting bias around weekends are not modeled.

Active cases and estimated effective reproduction number R

Active cases are here defined as cases that were reported within the last 14 days. This is more conservative than the rule used, e.g., by the German RKI (corresponding to approximately the last 11 days) and was chosen to minimize effects due to a weekly cycly of reporting biases observed in the German reporting data.

bw <- 1.5
dbw <- 2.5
inc <- 14 # this is the length of the time period used in the definition of active cases
rlcurves <- dlcurves <- list(NULL)
rlylim <- dlylim <- NULL
for (i in 1:nc) {
rlcurves[[i]]<- locpoly(dates,log(pmax(1e-4,diff(c(rep(0,inc),rate[,i]),inc))),bandwidth=bw,degree=1) 
rlylim <- range(rlylim,rlcurves[[i]]$y)
dlcurves[[i]] <- locpoly(dates,log(pmax(1e-4,diff(c(rep(0,inc),rate[,i]),inc))),bandwidth=dbw,drv=1,degree=1) 
dlylim <- range(dlylim,dlcurves[[i]]$y)
}
rlylim1 <- dlylim1 <- NULL
for(i in ind[1:10]){
   rlylim1 <- range(rlylim1,rlcurves[[i]]$y)
   dlylim1 <- range(dlylim1,dlcurves[[i]]$y)
}
rlylim2 <- dlylim2 <- NULL
for(i in ind[11:20]){
   rlylim2 <- range(rlylim2,rlcurves[[i]]$y)
   dlylim2 <- range(dlylim2,dlcurves[[i]]$y)
}
rlylim3 <- dlylim3 <- NULL
for(i in ind[21:30]){
   rlylim3 <- range(rlylim3,rlcurves[[i]]$y)
   dlylim3 <- range(dlylim3,dlcurves[[i]]$y)
}
rlylim4 <- dlylim4 <- NULL
for(i in ind[31:40]){
   rlylim4 <- range(rlylim4,rlcurves[[i]]$y)
   dlylim4 <- range(dlylim4,dlcurves[[i]]$y)
}
rlylim5 <- dlylim5 <- NULL
for(i in ind[41:51]){
   rlylim5 <- range(rlylim5,rlcurves[[i]]$y)
   dlylim5 <- range(dlylim5,dlcurves[[i]]$y)
}

The following figure shows quantities derived as functions of the logarithmic rate of active cases and its derivative. The latter value is closely related to R, i.e., \(R \approx exp(C*value)\) with \(C\approx 4\). Please note that what is shown is not the R reported by,regulatory b7odies like e.g., the RKI and that the value provided here is calculated from a different model. Also note that the amount and criteria of testing used differ substantially between states which makes comparisons difficult. For interpretation: keeping the effective reproduction number below 1 means that the number of active cases is decreasing.

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates,rate[,1],ylim = exp(rlylim1),type="n",xlab="day starting March 1",ylab="")
for(i in 1:10)
  lines(rlcurves[[ind[i]]]$x,exp(rlcurves[[ind[i]]]$y),col=i,lty=1+(i-1)%/%8)
  legend(1,exp(rlylim1[2]),states[ind[1:10]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Active cases per 1000 inhabitants")

plot(dates,c(0,diffrate[,1]),ylim = c(0.5,2.5),type="n",xlab="day starting March 1",ylab="")
for(i in 1:10) lines(dlcurves[[ind[i]]]$x,exp(4*dlcurves[[ind[i]]]$y),col=i,lty=1+(i-1)%/%8)
title(paste("Approximate reproduction rate R",as.character(udates[ndates])))
lines(range(dates),c(1,1),lty=2,col=2)
for(lvl in seq(.5,1.2,.1)[-6]) lines(range(dates),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates,rate[,1],ylim = exp(rlylim2),type="n",xlab="day starting March 1",ylab="")
for(i in 11:20)
  lines(rlcurves[[ind[i]]]$x,exp(rlcurves[[ind[i]]]$y),col=i-10,lty=1+(i-10-1)%/%8)
  legend(1,exp(rlylim2[2]),states[ind[11:20]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Active cases per 1000 inhabitants")

plot(dates,c(0,diffrate[,1]),ylim = c(0.5,2.5),type="n",xlab="day starting March 1",ylab="")
for(i in 11:20) lines(dlcurves[[ind[i]]]$x,exp(4*dlcurves[[ind[i]]]$y),col=i-10,lty=1+(i-10-1)%/%8)
title(paste("Approximate reproduction rate R",as.character(udates[ndates])))
lines(range(dates),c(1,1),lty=2,col=2)
for(lvl in seq(.5,1.2,.1)[-6]) lines(range(dates),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates,rate[,1],ylim = exp(rlylim3),type="n",xlab="day starting March 1",ylab="")
for(i in 21:30)
  lines(rlcurves[[ind[i]]]$x,exp(rlcurves[[ind[i]]]$y),col=i-20,lty=1+(i-20-1)%/%8)
  legend(1,exp(rlylim3[2]),states[ind[21:30]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Active cases per 1000 inhabitants")

plot(dates,c(0,diffrate[,1]),ylim = c(0.5,2.5),type="n",xlab="day starting March 1",ylab="")
for(i in 21:30) lines(dlcurves[[ind[i]]]$x,exp(4*dlcurves[[ind[i]]]$y),col=i-20,lty=1+(i-20-1)%/%8)
title(paste("Approximate reproduction rate R",as.character(udates[ndates])))
lines(range(dates),c(1,1),lty=2,col=2)
for(lvl in seq(.5,1.2,.1)[-6]) lines(range(dates),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates,rate[,1],ylim = exp(rlylim4),type="n",xlab="day starting March 1",ylab="")
for(i in 31:40)
  lines(rlcurves[[ind[i]]]$x,exp(rlcurves[[ind[i]]]$y),col=i-30,lty=1+(i-30-1)%/%8)
  legend(1,exp(rlylim4[2]),states[ind[31:40]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Active cases per 1000 inhabitants")

plot(dates,c(0,diffrate[,1]),ylim = c(0.5,2.5),type="n",xlab="day starting March 1",ylab="")
for(i in 31:40) lines(dlcurves[[ind[i]]]$x,exp(4*dlcurves[[ind[i]]]$y),col=i-30,lty=1+(i-30-1)%/%8)
title(paste("Approximate reproduction rate R",as.character(udates[ndates])))
lines(range(dates),c(1,1),lty=2,col=2)
for(lvl in seq(.5,1.2,.1)[-6]) lines(range(dates),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates,rate[,1],ylim = exp(rlylim5),type="n",xlab="day starting March 1",ylab="")
for(i in 41:51)
  lines(rlcurves[[ind[i]]]$x,exp(rlcurves[[ind[i]]]$y),col=i-40,lty=1+(i-40-1)%/%8)
  legend(1,exp(rlylim5[2]),states[ind[41:51]],col=1:11,lty=1+(0:10)%/%8,lwd=rep(1,11))
title("Active cases per 1000 inhabitants")

plot(dates,c(0,diffrate[,1]),ylim = c(0.5,2.5),type="n",xlab="day starting March 1",ylab="")
for(i in 41:51) lines(dlcurves[[ind[i]]]$x,exp(4*dlcurves[[ind[i]]]$y),col=i-40,lty=1+(i-40-1)%/%8)
title(paste("Approximate reproduction rate R",as.character(udates[ndates])))
lines(range(dates),c(1,1),lty=2,col=2)
for(lvl in seq(.5,1.2,.1)[-6]) lines(range(dates),c(lvl,lvl),lty=3)

The following figure illustrates the current status. States that occur on the right side currently have a high infection load. Note that the estimated reproduction rate is highly variable for small (by population) states dispayed in the left. For states in the upper in the upper right we observe a very dynamic and deteriorating situation.

arate <- arepr <- numeric(nc)
larate <- larepr <- matrix(0,7,nc)
lweek <- trunc(401/ndates*((ndates-7):(ndates-1)))

for(i in 1:nc) {
   arate[i] <- exp(rlcurves[[ind[i]]]$y[401])
   larate[,i] <- exp(rlcurves[[ind[i]]]$y[lweek])
   arepr[i] <- exp(4*dlcurves[[ind[i]]]$y[401])
   larepr[,i] <- exp(4*dlcurves[[ind[i]]]$y[lweek])
}
par(mfrow=c(1,1),mar=c(3,3,3,.1),mgp=c(2,1,0))
xlim <- pmax(5e-2,range(arate)) # no reported case for Massachusetts 
ylim <- pmin(2.5,pmax(5e-1,range(arepr)))
plot(arate,arepr,xlim=xlim,ylim=ylim,col=rep(1:6,9)[ind], xlab="Active cases per 1000 inhabitants", 
                        ylab="Approximate reproduction rate", log="")
for(i in 1:nc){
  lines(c(larate[,i],arate[i]),c(larepr[,i],arepr[i]),col=rep(1:6,9)[ind[i]])
  text(arate[i],arepr[i],states[ind[i]],pos=3,col=rep(1:6,9)[ind[i]])
}
title(paste("Situation on",as.character(udates[ndates])))

Circles illustrate the current situation with coordinates given by (smoothed) infections over the last 2 weeks (x-axis) and estimated reproduction number (y-axis). Lines follow the situation over the last week. Note that the x-axis is on a log scale.

Testing

The following analysis uses data on tests from the Johns Hopkins university coronavirus tracking project. Data from days with inadequate reporting (tests/million inhabitants < 25) are not used in the analysis.

bw <- 3
nd <- length(dates2)
testcurves <- tposcurves <- list(NULL)
testylim <- tposylim <- NULL
for (i in 1:nc) {
indp <- (1:nd)[posprop[,i]>0 & testspermill[,i] > 25]
testcurves[[i]]<- locpoly(dates2[indp],testspermill[indp,i]/1000,bandwidth=bw, degree=0) 
testcurves[[i]]$x <- testcurves[[i]]$x[!is.na(testcurves[[i]]$y)]
testcurves[[i]]$y <- testcurves[[i]]$y[!is.na(testcurves[[i]]$y)]
testylim <- range(testylim,testcurves[[i]]$y)
tposcurves[[i]] <- locpoly(dates2[indp],posprop[indp,i],bandwidth=bw, degree=0) 
tposcurves[[i]]$x <- tposcurves[[i]]$x[!is.na(tposcurves[[i]]$y)]
tposcurves[[i]]$y <- tposcurves[[i]]$y[!is.na(tposcurves[[i]]$y)]
tposylim <- range(tposylim,tposcurves[[i]]$y)
#cat(i,states[i],range(testcurves[[i]]$y),range(tposcurves[[i]]$y),"\n")
  }
testylim1 <- tposylim1 <- NULL
for(i in ind[1:10]){
   testylim1 <- range(testylim1,testcurves[[i]]$y)
   tposylim1 <- range(tposylim1,tposcurves[[i]]$y)
}
testylim2 <- tposylim2 <- NULL
for(i in ind[11:20]){
   testylim2 <- range(testylim2,testcurves[[i]]$y)
   tposylim2 <- range(tposylim2,tposcurves[[i]]$y)
}
testylim3 <- tposylim3 <- NULL
for(i in ind[21:30]){
   testylim3 <- range(testylim3,testcurves[[i]]$y)
   tposylim3 <- range(tposylim3,tposcurves[[i]]$y)
}
testylim4 <- tposylim4 <- NULL
for(i in ind[31:40]){
   testylim4 <- range(testylim4,testcurves[[i]]$y)
   tposylim4 <- range(tposylim4,tposcurves[[i]]$y)
}
testylim5 <- tposylim5 <- NULL
for(i in ind[41:51]){
   testylim5 <- range(testylim5,testcurves[[i]]$y)
   tposylim5 <- range(tposylim5,tposcurves[[i]]$y)
}

The following figures shows smoothed number of tests per million inhabitants and smoothed positivity rates.

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates2, testspermill[,1], ylim = testylim1,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 1:10)
  lines(testcurves[[ind[i]]]$x,testcurves[[ind[i]]]$y,col=i,lty=1+(i-1)%/%8)
  legend(1,testylim1[2],states[ind[1:10]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Tests per 1000 inhabitants")

plot(dates2,posprop[,1],ylim = tposylim1,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 1:10) lines(tposcurves[[ind[i]]]$x,tposcurves[[ind[i]]]$y,col=i,lty=1+(i-1)%/%8)
title(paste("Test positivity rate",as.character(udates[ndates])))
lines(range(dates2),c(1,1),lty=2,col=2)
for(lvl in seq(.05,.3,.05)) lines(range(dates2),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates2, testspermill[,1], ylim = testylim2,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 11:20)
  lines(testcurves[[ind[i]]]$x,testcurves[[ind[i]]]$y,col=i-10,lty=1+(i-11)%/%8)
  legend(1,testylim2[2],states[ind[11:20]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Tests per 1000 inhabitants")

plot(dates2,posprop[,1],ylim = tposylim2,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 11:20) lines(tposcurves[[ind[i]]]$x,tposcurves[[ind[i]]]$y,col=i-10,lty=1+(i-11)%/%8)
title(paste("Test positivity rate",as.character(udates[ndates])))
lines(range(dates2),c(1,1),lty=2,col=2)
for(lvl in seq(.05,.3,.05)) lines(range(dates2),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates2, testspermill[,1], ylim = testylim3,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 21:30)
  lines(testcurves[[ind[i]]]$x,testcurves[[ind[i]]]$y,col=i-20,lty=1+(i-21)%/%8)
  legend(1,testylim3[2],states[ind[21:30]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Tests per 1000 inhabitants")

plot(dates2,posprop[,1],ylim = tposylim3,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 21:30) lines(tposcurves[[ind[i]]]$x,tposcurves[[ind[i]]]$y,col=i-20,lty=1+(i-21)%/%8)
title(paste("Test positivity rate",as.character(udates[ndates])))
lines(range(dates2),c(1,1),lty=2,col=2)
for(lvl in seq(.05,.3,.05)) lines(range(dates2),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates2, testspermill[,1], ylim = testylim4,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 31:40)
  lines(testcurves[[ind[i]]]$x,testcurves[[ind[i]]]$y,col=i-30,lty=1+(i-31)%/%8)
  legend(1,testylim4[2],states[ind[31:40]],col=1:10,lty=1+(0:9)%/%8,lwd=rep(1,10))
title("Tests per 1000 inhabitants")

plot(dates2,posprop[,1],ylim = tposylim4,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 31:40) lines(tposcurves[[ind[i]]]$x,tposcurves[[ind[i]]]$y,col=i-30,lty=1+(i-31)%/%8)
title(paste("Test positivity rate",as.character(udates[ndates])))
lines(range(dates2),c(1,1),lty=2,col=2)
for(lvl in seq(.05,.3,.05)) lines(range(dates2),c(lvl,lvl),lty=3)

par(mfrow=c(1,2),mar=c(3,3,3,1),mgp=c(2,1,0))
plot(dates2, testspermill[,1], ylim = testylim5,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 41:51)
  lines(testcurves[[ind[i]]]$x,testcurves[[ind[i]]]$y,col=i-40,lty=1+(i-41)%/%8)
  legend(1,testylim5[2],states[ind[41:51]],col=1:11,lty=1+(0:11)%/%8,lwd=rep(1,11))
title("Tests per 1000 inhabitants")

plot(dates2,posprop[,1],ylim = tposylim5,type="n",xlab=paste("day starting",as.character(udates[1])),ylab="")
for(i in 41:51) lines(tposcurves[[ind[i]]]$x,tposcurves[[ind[i]]]$y,col=i-40,lty=1+(i-41)%/%8)
title(paste("Test positivity rate",as.character(udates[ndates])))
lines(range(dates2),c(1,1),lty=2,col=2)
for(lvl in seq(.05,.3,.05)) lines(range(dates2),c(lvl,lvl),lty=3)

The following figure illustrates the current statusconcerning tests. States that occur on the right side currently have a high number of tests per 1000 inhabitants. The vertical coordinate corresponds to the positivity rate.

arate <- arepr <- numeric(nc)

for(i in 1:nc) {
   arate[i] <- testcurves[[ind[i]]]$y[401]
   arepr[i] <- tposcurves[[ind[i]]]$y[401]
}
par(mfrow=c(1,1),mar=c(3,3,3,.1),mgp=c(2,1,0))
plot(arate,arepr,col=rep(1:6,9)[ind], xlab="Number of tests per 1000 inhabitants", 
                        ylab="positivity rate")
for(i in 1:nc){
  text(arate[i],arepr[i],statesabr[ind[i]],col=rep(1:6,9)[ind[i]],pos=3)
}
title(paste("Situation on",as.character(udates[ndates])))

cases by state

library(ggplot2)
library(reshape2)
fcases <- as.data.frame(apply(cases,2,diff))
fcases[54,5] <- fcases[54,5] - 1500 # redistribute cases for Massechusets
fcases[44:53,5] <- fcases[44:53,5] + 150 
fcases[92,5] <- fcases[92,5] - 2500 # redistribute cases for Massechusets
fcases[67:91,5] <- fcases[67:91,5] + 100 
fcases[fcases<0] <- 0
fcases$dates <- dates[-1]
fd <- melt(fcases, id.vars="dates")
f1 <- ggplot(fd, aes(dates,value))
f2 <- f1 + geom_bar(aes(fill=rep(states,ndates-1)),position=position_stack(reverse=TRUE),stat="identity") +
           labs(title="COVIT-19 cases reported", x="day starting March 1", y="counts") +
           coord_flip() +  theme(legend.position = "bottom") #+ scale_fill_hue(h=c(0,720))
f2

fcases <- as.data.frame(apply(cases,2,diff))
fcases[54,5] <- fcases[54,5] - 1500 # redistribute late cases for Massechusets
fcases[44:53,5] <- fcases[44:53,5] + 150 
fcases[92,5] <- fcases[92,5] - 2500 # redistribute late cases for Massechusets
fcases[67:91,5] <- fcases[67:91,5] + 100 
fcases[218,14] <- fcases[218,14] - 20000 # redistribute late cases for Massechusets
fcases[208:217,14] <- fcases[208:217,14] + 200 
fcases[96,38] <- fcases[96,38] - 4000 # redistribute late cases for Georgia
fcases[56:95,38] <- fcases[56:95,38] + 100 
fcases[fcases<0] <- 0
fdpm <- sweep(fcases,2,pop,"/")*1000
fdpm$dates <- dates[-1]
fd <- melt(fdpm, id.vars="dates")
fd$states <- rep(states,rep(ndates-1,51))
f1 <- ggplot(fd, aes(dates,value))

f1 + geom_bar(stat="identity") + facet_wrap(~states,nrow=26)  + labs(title="COVIT-19 cases reported per 1 million inhabitants and day", x="day starting March 1", y="cases reported per 1 million inhabitants and day")

library(ggplot2)
library(reshape2)
fdeaths <- as.data.frame(apply(deaths,2,diff))
# Reallocate deaths reported for New Jersey on June 25 
fdeaths[35:115,17] <- fdeaths[35:115,17]+22
fdeaths[116,17] <- fdeaths[116,17]-81*22 
fdeaths$dates <- dates[-1]
fd <- melt(fdeaths, id.vars="dates")
f1 <- ggplot(fd, aes(dates,value))
f2 <- f1 + geom_bar(aes(fill=rep(states,ndates-1)),position=position_stack(reverse=TRUE),stat="identity") +
           labs(title="COVIT-19 deaths reported", x="day starting March 1", y="counts") +
           coord_flip() +  theme(legend.position = "bottom") 
f2

fdeaths <- as.data.frame(apply(deaths,2,diff))
# Reallocate deaths reported for New Jersey on June 25 
fdeaths[35:115,17] <- fdeaths[35:115,17]+22
fdeaths[116,17] <- fdeaths[116,17]-81*22
# Reallocate deaths reported for Delaware on June 24
fdeaths[49:113,41] <- fdeaths[49:113,41]+1
fdeaths[114,41] <- fdeaths[114,41]-65 
fdeaths[143,17] <- 0
fdeaths[129,17] <- fdeaths[129,17]-30
# New Yersey reported -30 on July 22
fdpm <- sweep(fdeaths,2,pop,"/")*1000
fdpm$dates <- dates[-1]
fd <- melt(fdpm, id.vars="dates")
fd$states <- rep(states,rep(ndates-1,51))
f1 <- ggplot(fd, aes(dates,value))

f1 + geom_bar(stat="identity") + facet_wrap(~states,nrow=26)  + labs(title="COVIT-19 deaths reported per 1 million inhabitants and day", x="day starting March 1", y="deaths reported per 1 million inhabitants and day")

Exploaratory analysis of COVID-19 infections in US states

Joerg Polzehl

02 06 2022

General remarks

Active cases and estimated effective reproduction number R

Testing

cases by state