---
title: "Analysis Code for: Statistics education in undergraduate psychology: A survey of UK curricula"
author: "TARG Meta-Research Group"
date: "`r Sys.Date()`"
output:
  pdf_document: default
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(include = FALSE) # this option stops the code chunks from being output from the knit
```

```{r loadPackages}
# Load packages and set working directory
library("tidyverse") #to calculate descriptives by subgroup.
library("samplingbook") #Sprop for  CI of proportions with finite population correction. 
library("purrr") #for map function in Block 6 
library("irr") #for the kappa2() function 
library("kableExtra") # for kable table styling
setwd(getwd())
```

```{r sampleFigure}
# Make Figure 1 that shows the sample selection process for the courses

#load data. This file MUST be in alphabetical order by university to ensure the Russell Group assignments get linked properly to their university
unis <- read.csv("StatsSyllabi_Data1ProgrammeSample_220710.csv")

#change characters to numerics
unis$psychDegree <- as.numeric(unis$psychDegree)
unis$russellGroup <- as.numeric(unis$russellGroup)
unis$public <- as.numeric(unis$public)
unis$detailed <- as.numeric(unis$detailed)
unis$coded <- as.numeric(unis$coded)

#group by Russell Group
unis <- unis %>% as.data.frame() %>% 
  group_by(russellGroup)

#summarise data for table 1
t1 <- unis %>% summarise(
  "Recognised Universities"=sum(psychDegree<2),
  "Universities with psychology degree"=sum(psychDegree),
  "Sufficient detail"=sum(detailed==1, na.rm=TRUE))

t2 <- unis %>% summarise(
  "Insufficient detail"=sum(detailed==0, na.rm=TRUE),
  "Syllabus not publicly available"=sum(public==0, na.rm=TRUE),
  "Courses contacted"=sum(followup!="NA", na.rm=T),
  "Syllabus or details provided"=sum(followup=="syllabusProvided", na.rm=T),
  "Further details unavailable"=sum(followup=="furtherDetailsUnavailable", na.rm=T),
  "University denied access"=sum(followup=="deniedAccess", na.rm=T),
  "No response"=sum(followup=="noResponse", na.rm=T),
  "Courses sampled"=sum(coded, na.rm=TRUE)
  )

#there was an issue where i couldn't use $detail twice in the same summarise(), so I made 2 tables then bind them together
t2 <- t2 %>% subset(select=-c(russellGroup))
t1 <- cbind(t1, t2) %>% 
  as.data.frame() %>%
  arrange(desc(russellGroup)) %>%
  t()

#export t1 to .csv. Then manually format into Table 1.

#This data was originally going to be a table on it's own. We decided to not include this table in the final publication. The data is now used for Figure 1 (the flowchart)
write.csv(t1, "fig1a.csv")

#additional numbers to input into the bottom row of Figure 1, the flowchart
flow <- unis %>% summarise(
  "not public provided"=sum(followup=="syllabusProvided" & public==0, na.rm=T),
  "not public denied"=sum(followup=="deniedAccess" & public==0, na.rm=T),
  "not public no response"=sum(followup=="noResponse" & public==0, na.rm=T),
  "not public further details unavailable"=sum(followup=="furtherDetailsUnavailable" & public==0, na.rm=T),
  "public further details unavailable"=sum(followup=="furtherDetailsUnavailable" & public==1, na.rm=T),
  "public provided"=sum(followup=="syllabusProvided" & public==1, na.rm=T),
  "public no response"=sum(followup=="noResponse" & public==1, na.rm=T)
    ) %>% 
  arrange(desc(russellGroup)) %>%
  t()

#export flow to .csv. Then use these numbers to populate the bottom row of Figure 1, the flowchart
write.csv(flow, "fig1b.csv") 
```

```{r modulesToCourses}
# Collapse module coding into course coding

#load file. stringAsFactors = FALSE is necessary to not get errors
mod <- read.csv("StatsSyllabi_Data3ModuleTopics_220716.csv", header = TRUE, stringsAsFactors = FALSE)

#This is a workaround to get the column titles to have spaces. I do this to facilitate making tables and the data output.
itemsShort <- colnames(mod)
items <- mod[1,]
mod <- mod[-1,]
colnames(mod) <- items

#filter to single resolved coding
mod1 <- mod  %>% filter(codeNum==3) 

#make columns for guidelines and software
mod1 <- mod1 %>% mutate(APA = ifelse(grepl("APA", guideline), 1, 0))
mod1 <- mod1 %>% mutate(BPS = ifelse(grepl("BPS", guideline), 1, 0))
mod1 <- mod1 %>% mutate(SPSS = ifelse(grepl("SPSS", software), 1, 0))
mod1 <- mod1 %>% mutate(Excel = ifelse(grepl("Excel", software), 1, 0))
mod1 <- mod1 %>% mutate(R = ifelse(grepl("R", software), 1, 0))
mod1 <- mod1 %>% mutate(Stata = ifelse(grepl("Stata", software), 1, 0))
mod1 <- mod1 %>% mutate(Minitab = ifelse(grepl("Minitab", software), 1, 0))

#swap 'yes' and 'no' for 1 and 0 *Int* for integer
modInt <- mod1
for (i in 1:nrow(mod1)){
  for (j in 8:40){
    if (mod1[i,j] == "No"){
      modInt[i,j] <- 0
    } else if (mod1[i,j] == "Yes"){
      modInt[i,j] <- 1
    } else {
      modInt[i,j] <- mod1[i,j]
    }
  }
}

#messy code to convert the numbers stored as chars in modInt to numeric
modIntCut <- modInt[,8:40]
modIntCut <- apply(modIntCut, 2, as.numeric) %>% as.data.frame()
modInt[,8:40] <- modIntCut

#group by university so that we can collapse modules into courses
modInt <- modInt %>% group_by(university)
course <- modInt %>% summarise_if(is.numeric, sum) %>% 
          as.data.frame()

#get data from "StatsSyllabi_Data1CourseSample_220710.csv" on coded courses for russell group and module numbers
unisCoded <- unis %>% filter(coded==1)
course <- course %>% add_column(rg=unisCoded$russellGroup,
                                 quant=as.numeric(unisCoded$quantMod),
                                 methods=as.numeric(unisCoded$methodsMod),
                                 coding=as.numeric(unisCoded$codingMod))

#output cleaned data used for calculation of main analysis
write.csv(course, "StatsSyllabi_Data4ProgrammeTopics_220716.csv")
```

```{r table1}
# Output modules and software tables (Table 1)

#change name of course to data_by_rg (Russell Group status) because I coded this section before I coded section 2, and now don't want to change all the names
data_by_rg = group_by(course, rg)

#Calculate mean for the number of course modules, to 2 decimal places (courseMods_mean).  
courseMods_mean = round(summarise(data_by_rg,
                                  quant = mean(quant), 
                                  methods = mean(methods), 
                                  coding = mean(coding)),2)



#Calculate median
courseMods_median = summarise(data_by_rg, 
                              quant = median(quant), 
                              methods = median(methods), 
                              coding = median(coding))

#rename columns
courseMods <- rbind(courseMods_mean, courseMods_median) %>% 
  t() %>%
  as.data.frame %>%
  rename(nonRG_mean=V1,
         RG_mean=V2,
         nonRG_median=V3,
         RG_median=V4)

#rearrange column for Table 2 output
courseMods_out <- courseMods %>% select(RG_mean, RG_median, nonRG_mean, nonRG_median)

#Calculate totals for software and guidelines
#NOTE: Prior to analysis, the software and guidelines data was split into the individual categories listed below, to make the R analysis simpler.  
courseSoft = summarise(data_by_rg, 
                       APA = sum(APA >0),
                       BPS = sum(BPS >0), 
                       SPSS = sum(SPSS >0),
                       Excel = sum(Excel >0), 
                       R = sum(R >0),
                       Stata = sum(Stata >0),
                       Minitab = sum(Minitab >0))  %>%
              arrange(desc(rg)) %>%
              t()

#Export the results. Then manually combine these to make Table 1
write.csv(courseMods_out, "table1a.csv")
write.csv(courseSoft, "table1b.csv")
courseSoftPerc <- courseSoft
courseSoftPerc[,1] <- round(courseSoft[,1]/15, 2)
courseSoftPerc[,2] <- round(courseSoft[,2]/12, 2)
write.csv(courseSoftPerc, "table1c.csv")
```

```{r table2}
# Calculate percentage of courses that include each topic (Table 2)

#new dataframe with only the relevant measurements (ie. Remove everything already analysed in previous 2 sections).
binary_by_rg <- select(data_by_rg, -c("university", "quant", "methods", "coding", "BPS", "APA", "R", "Excel", "SPSS", "Minitab", "Stata"))

#change the sums to binary yes/no in 1/0s
binary_by_rg <- ifelse (binary_by_rg > 0, 1, 0) %>% as.data.frame %>% group_by(rg)

#Calculate the total number of course that contain each topic surveyed
binary_count = summarise_all(binary_by_rg, sum)

#Convert totals into proportions. (Total courses that contain the survey item/Total courses)
binary_perc_nonRG<- (binary_count[1,] / as.integer(count(binary_by_rg)[1,2]))
binary_perc_RG <- (binary_count[2,] / as.integer(count(binary_by_rg)[2,2]))
binary_perc <- rbind(binary_perc_nonRG, binary_perc_RG)
#move rg to last column, so that we can later bind this dataframe with the CIs
binary_perc <- binary_perc[, c(2:ncol(binary_perc),1)]

#Create subsets of non-RG and RG, and do the same CI calculation on each. 
binary_RG = subset(binary_by_rg, rg==1)
binary_nonRG = subset(binary_by_rg, rg==0)

#(Note N=95, total number of non-RG universities.)
#(Note N=23, total number of non-RG universities.)
CI_RG = lapply(binary_RG, function(x) Sprop(x, n=nrow(binary_RG), N=23, level = 0.95))
CI_nonRG = lapply(binary_nonRG, function(x) Sprop(x, n=nrow(binary_nonRG), N=95, level = 0.95))

#The results are double-nested and need some work to extract. 
#Result is nested as varname[4[2]] OR dataframe$varname$ci$exact[1:2]
CI_RG = map(map(CI_RG, 4),2) %>% as.data.frame()
CI_nonRG = map(map(CI_nonRG, 4),2) %>% as.data.frame()

#match column names so that rbind() works
colnames(CI_RG)=colnames(binary_RG)
colnames(CI_nonRG)=colnames(binary_RG)

#bind the point estimates and CIs. RG first with CIs, then nonRG same.
out <- rbind(binary_perc[2,], CI_RG, binary_perc[1,], CI_nonRG)
#collapse the 2 descriptive stats variables. They are the same at course level, so we can remove 1, and then rename it
#and remove he rg identifier, we add it in the colnames in a few lines
out <- out %>% select(-"Descriptive statistics (central tendency)", -rg) %>%
      rename("Descriptive statistics" = "Descriptive statistics (variability)")

#add a row to identify which category each item is in. This was slightly out of order because we added items to the Google form after coding the first 10 modules
out <- rbind(out, c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3)) %>% 
  t() %>% 
  as.data.frame() %>%
  round(2) %>%
  rownames_to_column()
#name the columns
colnames(out) <- c("item","rg","rg_low","rg_high","nrg","nrg_low","nrg_high","sec")

#store order before arrange() so I can use it to arrange TableS2 and kappa output.
ord <- out$rg
#sort by section and then rg higher percentage at top.
out <- out %>% arrange(sec, desc(rg))

table2 <- data.frame(paste0(out$item))
table2[,2] <- data.frame(paste0(out$rg*100, "% (", out$rg_low*100, "-", out$rg_high*100, ")"))
table2[,3] <- data.frame(paste0(out$nrg*100, "% (", out$nrg_low*100, "-", out$nrg_high*100, ")"))
colnames(table2) <- c("item", "Russell Group", "Non-Russell Group")

# remove the 2 topics: philosophy of science and qualitatve methods. This decision was made after receiving reviewer comments
table2Removed <- table2[c(8,29),]
table2 <- table2[c(1:7,9:28,30:32),]

write.csv(table2, "table2.csv")
```

```{r exams}
# Dot plot of exam percentages (Figure S1)

#read this file to get the RG codings
rgs <- read.csv("StatsSyllabi_Data2ModuleSample_220710.csv", fileEncoding="UTF-8-BOM", header = T)
#remove the modules we didn't code
rgs <- rgs %>% filter(coded==1)
#remove module #14, located in row 13. This was a Edge Hill module that we collapsed the coding for because it was in one document
# and sort by moduleID
rgs <- rgs %>% arrange(as.numeric(moduleID))
rgs <- rgs[-9,] 
#check to ensure the module IDs line up before transferring the RG coding from rgs to mod1
mod1 <- mod1 %>% arrange(as.numeric(moduleID))
check <- rgs$moduleID==mod1$moduleID
if (F %in% check){
  stop("modules don't align between data sheets")
}


modrg <- mod1 %>% mutate(rg = rgs$RussellGroup) 
modExam <- modrg %>% filter(exams!="notStated")

# These two lines are to calculate the number of upper and lower year courses (they are unrelated to the exams)
mod_RG <- modrg %>% filter(rg==1)
mod_nonRG <- modrg %>% filter(rg==0)

dot<- ggplot(modExam, aes(x = as.numeric(exams), 
                        fill=factor(rg))) +
        geom_dotplot(stackgroups=TRUE, 
                      method="histodot", 
                      binwidth=1,
                      dotsize=5
                     ) +
        ylim(0,30) +
        xlim(0,100) +
        labs(x="percent exams", y="module count") +
        theme(legend.position = "none") +
        scale_x_continuous(breaks=seq(0,100,10)) +
        theme(text = element_text(size=20))

#this graph must be scale when printing it so that the 0% column adds up to exactly 30. This is a quirk of geom_dotplot that I've tried to work around. 392*555 seems to work.

png("figS1.png", width = 392, height = 552)
plot(dot)
dev.off()

```

```{r textbooks}
#Chunk 9: textbooks (table S3)
textbook <- modrg %>% group_by(rg) %>% 
  count(textbook) 
#because we coded two modules for edge hill and collapsed them into one, and Field was mentioned in the overall document, we should add one.
textbook[5,3] <- textbook[5,3] + 1
write.csv(textbook,"tableS3.csv")
#finish organizing this file manually.
```

```{r stackedBarChart}
# Create the stacked bar chart for Figure 2
brks <- c(0, 0.25, 0.5, 0.75, 1)

#create an inverse of the dataset, so we can make a filled bar chart
outPlot <- out %>% mutate(real="real") 

#create section headers for bar graph
header1 <- data.frame("Statistical concepts",0,0,0,0,0,0,0,"real")
names(header1) <- names(outPlot)
header2 <- data.frame("Statistical tools / techniques",0,0,0,0,0,0,0,"real")
names(header2) <- names(outPlot)
header3 <- data.frame("Other methodological skills",0,0,0,0,0,0,0,"real")
names(header3) <- names(outPlot)

#match column names, so that we can bind these dataframes
colnames(header1) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real")
colnames(header2) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real")
colnames(header3) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real")

#insert section headers
outPlot <- rbind(header1, outPlot[1:12,], header2, outPlot[13:27,], header3, outPlot[28:32,])

#keep order from before
item35 <- c("Statistical concepts", out$item[1:12], "Statistical tools / techniques", out$item[13:27], "Other methodological skills", out$item[28:32])
outPlot$item <- ordered(item35,outPlot$item) %>% fct_rev()

#create inverse dataframe
outPlotInv <- outPlot
outPlotInv[,2:7] <- 1-outPlot[,2:7]
outPlotInv[,9] <- "inv"

#combine dataframes
outPlotAll <- rbind(outPlot, outPlotInv) %>%
  mutate(count_rg=round(rg*15),0) %>%
  mutate(count_nrg=round(nrg*12),0)

#make section headers 0 to remove colour from those sections in the graph
outPlotAll[c(36,49,65),c(2:7,10:12)] <- c(0L,0L,0L,0L,0L,0L,0L,0L,0L)
outPlotAll$real <- as.character(outPlotAll$real)

#bold the 3 section titles
breaks <- levels(outPlotAll$item)
labels <- as.expression(breaks)
labels[[35]] <- bquote(bolditalic(.(labels[[35]])))
labels[[22]] <- bquote(bolditalic(.(labels[[22]])))
labels[[6]] <- bquote(bolditalic(.(labels[[6]])))

# remove the two non-quantitative measures (in line with the reviewer comments)
outPlotAll <- outPlotAll %>% filter(item != "Philosophy of science")
outPlotAll <- outPlotAll %>% filter(item != "Qualitative methods")

# save the figures
plot_rg <- ggplot(outPlotAll, aes(x=item, y=rg)) +
  geom_bar(stat="identity", aes(fill=real)) + 
  geom_text(data=subset(outPlotAll,count_rg != 0),aes(label=count_rg), position=position_stack(vjust = 0.5)) +
  coord_flip() +
  scale_y_continuous(breaks = brks, labels = scales::percent(brks)) +
  #remove background grey colour
  theme(axis.line = element_line(colour = "white"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        text = element_text(size=18),
        axis.text.x = element_text(size=12)
        ) +
  scale_fill_manual(values=c("grey80", "goldenrod2")) +
  scale_x_discrete(label = labels, breaks = breaks)

png("fig2a.png", width = 335, height = 770, units = "px")
plot(plot_rg)
dev.off()

plot_nrg <- ggplot(outPlotAll, aes(x=item, y=nrg)) +
  geom_bar(stat="identity", aes(fill=real)) + 
  geom_text(data=subset(outPlotAll,count_nrg != 0),aes(label=count_nrg), position=position_stack(vjust = 0.5)) +
  coord_flip() +
  scale_y_continuous(breaks = brks, labels = scales::percent(brks)) +
  #remove background grey colour
  theme(axis.line = element_line(colour = "white"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        text = element_text(size=18),
        axis.text.x = element_text(size=12)
        ) +
  scale_fill_manual(values=c("grey80", "goldenrod2")) +
  scale_x_discrete(label = labels, breaks = breaks)

#manually cut and paste these graphs next to each other for the final image.

#print the image
png("fig2b.png", width = 335, height = 770, units = "px")
plot(plot_nrg)
dev.off()

#manually place the two stacked bar charts next to each other for the final image.

```

```{r IRRdetial}
# Calculate the IRR for coding whether the module syllabi had enough detail to code for this study
d <- read.csv("StatsSyllabi_Data2ModuleSample_220710.csv", fileEncoding="UTF-8-BOM", header = T)
kDetail <- kappa2(cbind(d$sufficientDetailCoder1, d$sufficientDetailCoder2))
```

```{r IRR}
# Calculating kappa, inter-rater agreement (Table S1)

#make a 2*n column to go into kappa2()
x1 <- mod %>% filter(codeNum == 1)
x2 <- mod %>% filter(codeNum == 2)
k <- mod1[1,]

#run kappa2() on all binary items
for (i in 8:40){
  temp <- cbind(x1[i], x2[i]) %>% 
    kappa2()
  k[i] = temp[5]
}
#rename columns so I can subset for an average kappa
k <- k[8:40]
kTemp <- k %>% as.data.frame()
colnames(kTemp) <- itemsShort[8:40]
kMeanAll <- mean(as.numeric(select(kTemp, -c(phil, qualitative))), na.rm=TRUE)

#remove items with 4 or less occurrences as well as applied and criticEval which were misunderstood.
kCut <- k %>% as.data.frame()
colnames(kCut) <- itemsShort[8:40]
kCut <- kCut %>% select(-c(phil, applied, freq, bayes, sig, mult, qualitative))
kMeanCut <- mean(as.numeric(kCut), na.rm=TRUE)


disc <- matrix(nrow=nrow(mod1), ncol=33)
#identify number of discrepancies
for (i in 1:nrow(mod1)){
  for (j in 8:40){
    if (mod[(i*3-2),j] == mod[(i*3-1),j]){
      disc[i,j-7] <- 0
    } else {
      disc[i,j-7] <- 1 
    }
  }
}

#output kappa and number of discrepancies
kOut <- k %>% round(2)
kOut[2,] <- colSums(as.data.frame(disc))
kOut <- kOut %>% t()

#order kappa output in same way as table. Put we need have the two versions of descriptive variables now, so some tweaks to the code are necessary
ord33 <- append(ord, ord[13],after=12)
kOut <- kOut %>% cbind(c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3), ord33) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  arrange(V3, desc(ord33)) %>%
  select(-c(V3, ord33))

colnames(kOut) <- c("topic", "kappa", "numDiscrepant")
write.csv(kOut, file="tableS1.csv")

#also need to organize it in the same order as everything else

```

This code outputs several tables and figure that we integrated into the manuscript. It also outputs a pdf with several key numbers we input into the manuscript, and is followed by the same tables and figures that are output to csv and png files.

<!--Code is above this line. Output is below this line -->
```{r}
# for use in the Key Numbers section
courseCut <- course[,2:34]
colnames(courseCut) <- itemsShort[8:40]
```

# Key numbers

Percent of courses where all the quantitative modules had publicly available syllabi or sufficient detail for our coding procedure: `r round(sum(unis$public==1 & unis$detailed==1, na.rm=T)/sum(unis$psychDegree==1), 2)`

Kappa for sufficient detail: `r kDetail$value %>% round(2)`

Number coded different: `r sum(d$sufficientDetailCoder1 != d$sufficientDetailCoder2, na.rm=T)`

Total number coded for sufficient detail: `r nrow(d)`

Number of universities contacted: `r sum(flow) - 1` <!--minus one because there is a 1 in that dataframe which identifies the RG univerisites-->

Mean Cohen's Kappa for topics with philosophy and qual removed: `r kMeanAll %>% round(2)`

Mean Cohen's Kappa for topics with philosophy and qual removed + 5 other rare topics excluded: `r kMeanCut %>% round(2)`

Range of # of quant modules for Russell Group universities: `r range(filter(course, rg==1)$quant)`

Range of # of quant modules for NON-Russell Group universities: `r range(filter(course, rg==0)$quant)`

Percent of courses using APA guidelines: `r round(sum(course$APA > 0)/nrow(course), 2)`

Number of modules with 100% exams: `r sum(modExam$exams == 100)`

Number of modules with >0% & <100% exams: `r sum(modExam$exams > 0) - sum(modExam$exams == 100)`

Number of modules with 0% exams: `r sum(modExam$exams == 0)`

Number of modules that don't mention grading: `r sum(modrg$exams == "notStated")`

critical evaluation: `r round(sum(courseCut$criticEval > 0)/nrow(course),2)`

probability and randomness: `r round(sum(courseCut$probRand > 0)/nrow(course),2)`

power: `r round(sum(courseCut$power > 0)/nrow(course),2)`

effect size: `r round(sum(courseCut$es > 0)/nrow(course),2)`

data treatment: `r round(sum(courseCut$dataTreat > 0)/nrow(course),2)`

confidence intervals: `r round(sum(courseCut$ci > 0)/nrow(course),2)`

multiple comparisons: `r round(sum(courseCut$mult > 0)/nrow(course),2)`

practical significance: `r round(sum(courseCut$sig > 0)/nrow(course),2)`

correlation: `r round(sum(courseCut$corr > 0)/nrow(course),2)`

t-test: `r round(sum(courseCut$tTest > 0)/nrow(course),2)`

Number of first year Russell Group modules: `r sum(grepl("1", mod_RG$year))`

Number of second year Russell Group modules: `r sum(grepl("2", mod_RG$year))`

Number of third year Russell Group modules: `r sum(grepl("3", mod_RG$year))`

<!--plus on for first year modules because there were 2 Edge Hill modules coded together and they were both for first year modules-->
Number of first year Non-Russell Group modules: `r sum(grepl("1", mod_nonRG$year)) + 1`

Number of second year Non-Russell Group modules: `r sum(grepl("2", mod_nonRG$year))`

Number of third year Non-Russell Group modules: `r sum(grepl("3", mod_nonRG$year))`

Number module RC coded: `r sum(mod$coder == "RC")`

Number module RT coded: `r sum(mod$coder == "RT")`

Number module KD coded: `r sum(mod$coder == "KD")`

Number courses with Philosophy of science: `r sum(courseCut$phil > 0)`

Number courses with Qualitative methods: `r sum(courseCut$qualitative > 0)`

\newpage
```{r, fig1a, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(t1, caption = "Data for Figure 1 (1/2)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, fig1b, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(t(t2), caption = "Data for Figure 1 (2/2)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table1a, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseMods_out, caption = "Data for Table 1 (1/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table1b, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseSoft, caption = "Data for Table 1 (2/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table1c, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseSoftPerc, caption = "Data for Table 1 (3/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table2out, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(table2, caption = "Data for Table 2", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, tableS1out, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(kOut, caption = "Data for Table S1", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, tableS3out, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(textbook, caption = "Data for Table S3", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

\newpage
![Supplementary Figure 1. Percentage of grade given in exams.](figS1.png)
\newpage

```{r optionalRemoved}
# This chunk repeats the chunks "modulesToCourses", "table1" and "table2", but excludes the optional modules.

########### repeat "modulesToCourses ###########

#load file. stringAsFactors = FALSE is necessary to not get errors
mod <- read.csv("StatsSyllabi_Data3ModuleTopics_220716.csv", header = TRUE, stringsAsFactors = FALSE)

#This is a workaround to get the column titles to have spaces. I do this to facilitate making tables and the data output.
itemsShort <- colnames(mod)
items <- mod[1,]
mod <- mod[-1,]
colnames(mod) <- items

#filter to single resolved coding
mod1 <- mod  %>% filter(codeNum==3) 

mod1 <- mod1 %>% filter(optional=="Compulsory")

#make columns for guidelines and software
mod1 <- mod1 %>% mutate(APA = ifelse(grepl("APA", guideline), 1, 0))
mod1 <- mod1 %>% mutate(BPS = ifelse(grepl("BPS", guideline), 1, 0))
mod1 <- mod1 %>% mutate(SPSS = ifelse(grepl("SPSS", software), 1, 0))
mod1 <- mod1 %>% mutate(Excel = ifelse(grepl("Excel", software), 1, 0))
mod1 <- mod1 %>% mutate(R = ifelse(grepl("R", software), 1, 0))
mod1 <- mod1 %>% mutate(Stata = ifelse(grepl("Stata", software), 1, 0))
mod1 <- mod1 %>% mutate(Minitab = ifelse(grepl("Minitab", software), 1, 0))

#swap 'yes' and 'no' for 1 and 0 *Int* for integer
modInt <- mod1
for (i in 1:nrow(mod1)){
  for (j in 8:40){
    if (mod1[i,j] == "No"){
      modInt[i,j] <- 0
    } else if (mod1[i,j] == "Yes"){
      modInt[i,j] <- 1
    } else {
      modInt[i,j] <- mod1[i,j]
    }
  }
}

#messy code to convert the numbers stored as chars in modInt to numeric
modIntCut <- modInt[,8:40]
modIntCut <- apply(modIntCut, 2, as.numeric) %>% as.data.frame()
modInt[,8:40] <- modIntCut

#group by university so that we can collapse modules into courses
modInt <- modInt %>% group_by(university)
course <- modInt %>% summarise_if(is.numeric, sum) %>% 
          as.data.frame()

#get data from "StatsSyllabi_Data1CourseSample_220710.csv" on coded courses for russell group and module numbers
unisCoded <- unis %>% filter(coded==1)
course <- course %>% add_column(rg=unisCoded$russellGroup,
                                 quant=as.numeric(unisCoded$quantMod),
                                 methods=as.numeric(unisCoded$methodsMod),
                                 coding=as.numeric(unisCoded$codingMod))

# remove the 3 optional modules from Greenwich, Southampton, and York
course$quant[18] <- course$quant[18] - 1
course$quant[23] <- course$quant[23] - 1
course$quant[27] <- course$quant[27] - 1


########### repeat table1 chunk ##############
# Output modules and software tables (Table 1)

#change name of course to data_by_rg (Russell Group status) because I coded this section before I coded section 2, and now don't want to change all the names
data_by_rg = group_by(course, rg)

#Calculate mean for the number of course modules, to 2 decimal places (courseMods_mean).  
courseMods_mean = round(summarise(data_by_rg,
                                  quant = mean(quant), 
                                  methods = mean(methods), 
                                  coding = mean(coding)),2)


#Calculate median
courseMods_median = summarise(data_by_rg, 
                              quant = median(quant), 
                              methods = median(methods), 
                              coding = median(coding))

#rename columns
courseMods <- rbind(courseMods_mean, courseMods_median) %>% 
  t() %>%
  as.data.frame %>%
  rename(nonRG_mean=V1,
         RG_mean=V2,
         nonRG_median=V3,
         RG_median=V4)

#rearrange column for Table 2 output
courseMods_out_opt <- courseMods %>% select(RG_mean, RG_median, nonRG_mean, nonRG_median)

#Calculate totals for software and guidelines
#NOTE: Prior to analysis, the software and guidelines data was split into the individual categories listed below, to make the R analysis simpler.  
courseSoft_opt = summarise(data_by_rg, 
                       APA = sum(APA >0),
                       BPS = sum(BPS >0), 
                       SPSS = sum(SPSS >0),
                       Excel = sum(Excel >0), 
                       R = sum(R >0),
                       Stata = sum(Stata >0),
                       Minitab = sum(Minitab >0))  %>%
              arrange(desc(rg)) %>%
              t()

#Export the results. Then manually combine these to make Table 1
write.csv(courseMods_out_opt, "table1a_opt.csv")
write.csv(courseSoft_opt, "table1b_opt.csv")
courseSoftPerc_opt <- courseSoft_opt
courseSoftPerc_opt[,1] <- round(courseSoft_opt[,1]/15, 2)
courseSoftPerc_opt[,2] <- round(courseSoft_opt[,2]/12, 2)
write.csv(courseSoftPerc_opt, "table1c_opt.csv")

########### repeat table2 chunk ##############

# Calculate percentage of courses that include each topic (Table 2)

#new dataframe with only the relevant measurements (ie. Remove everything already analysed in previous 2 sections).
binary_by_rg <- select(data_by_rg, -c("university", "quant", "methods", "coding", "BPS", "APA", "R", "Excel", "SPSS", "Minitab", "Stata"))

#change the sums to binary yes/no in 1/0s
binary_by_rg <- ifelse (binary_by_rg > 0, 1, 0) %>% as.data.frame %>% group_by(rg)

#Calculate the total number of course that contain each topic surveyed
binary_count = summarise_all(binary_by_rg, sum)

#Convert totals into proportions. (Total courses that contain the survey item/Total courses)
binary_perc_nonRG <- (binary_count[1,] / as.integer(count(binary_by_rg)[1,2]))
binary_perc_RG <- (binary_count[2,] / as.integer(count(binary_by_rg)[2,2]))
binary_perc <- rbind(binary_perc_nonRG, binary_perc_RG)
#move rg to last column, so that we can later bind this dataframe with the CIs
binary_perc <- binary_perc[, c(2:ncol(binary_perc),1)]

#Create subsets of non-RG and RG, and do the same CI calculation on each. 
binary_RG = subset(binary_by_rg, rg==1)
binary_nonRG = subset(binary_by_rg, rg==0)

#(Note N=95, total number of non-RG universities.)
#(Note N=23, total number of non-RG universities.)
CI_RG = lapply(binary_RG, function(x) Sprop(x, n=nrow(binary_RG), N=23, level = 0.95))
CI_nonRG = lapply(binary_nonRG, function(x) Sprop(x, n=nrow(binary_nonRG), N=95, level = 0.95))

#The results are double-nested and need some work to extract. 
#Result is nested as varname[4[2]] OR dataframe$varname$ci$exact[1:2]
CI_RG = map(map(CI_RG, 4),2) %>% as.data.frame()
CI_nonRG = map(map(CI_nonRG, 4),2) %>% as.data.frame()

#match column names so that rbind() works
colnames(CI_RG)=colnames(binary_RG)
colnames(CI_nonRG)=colnames(binary_RG)

#bind the point estimates and CIs. RG first with CIs, then nonRG same.
out <- rbind(binary_perc[2,], CI_RG, binary_perc[1,], CI_nonRG)
#collapse the 2 descriptive stats variables. They are the same at course level, so we can remove 1, and then rename it
#and remove he rg identifier, we add it in the colnames in a few lines
out <- out %>% select(-"Descriptive statistics (central tendency)", -rg) %>%
      rename("Descriptive statistics" = "Descriptive statistics (variability)")

#add a row to identify which category each item is in. This was slightly out of order because we added items to the Google form after coding the first 10 modules
out <- rbind(out, c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3)) %>% 
  t() %>% 
  as.data.frame() %>%
  round(2) %>%
  rownames_to_column()
#name the columns
colnames(out) <- c("item","rg","rg_low","rg_high","nrg","nrg_low","nrg_high","sec")

#store order before arrange() so I can use it to arrange TableS2 and kappa output.
ord <- out$rg
#sort by section and then rg higher percentage at top.
out <- out %>% arrange(sec, desc(rg))

table2 <- data.frame(paste0(out$item))
table2[,2] <- data.frame(paste0(out$rg*100, "% (", out$rg_low*100, "-", out$rg_high*100, ")"))
table2[,3] <- data.frame(paste0(out$nrg*100, "% (", out$nrg_low*100, "-", out$nrg_high*100, ")"))
colnames(table2) <- c("item", "Russell Group", "Non-Russell Group")

# remove the 2 topics: philosophy of science and qualitatve methods. This decision was made after receiving reviewer comments
table2Removed <- table2[c(8,29),]
table2 <- table2[c(1:7,9:28,30:32),]

write.csv(table2, "tableS2.csv")

#manually format table to highlight differences between with and without including optional modules

```

```{r, table1a_opt, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseMods_out_opt, caption = "Data for Table 1 (4/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table1b_opt, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseSoft_opt, caption = "Data for Table 1 (5/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, table1c_opt, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(courseSoftPerc_opt, caption = "Data for Table 1 (6/6)", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```

```{r, tableS2out, include = TRUE, echo = FALSE, results = "asis"}
knitr::kable(table2, caption = "Data for Table S2", booktabs = T, linesep = "", align = "c") %>% 
  kable_styling(latex_options = "striped")
```