--- title: "Analysis Code for: Statistics education in undergraduate psychology: A survey of UK curricula" author: "TARG Meta-Research Group" date: "`r Sys.Date()`" output: pdf_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(include = FALSE) # this option stops the code chunks from being output from the knit ``` ```{r loadPackages} # Load packages and set working directory library("tidyverse") #to calculate descriptives by subgroup. library("samplingbook") #Sprop for CI of proportions with finite population correction. library("purrr") #for map function in Block 6 library("irr") #for the kappa2() function library("kableExtra") # for kable table styling setwd(getwd()) ``` ```{r sampleFigure} # Make Figure 1 that shows the sample selection process for the courses #load data. This file MUST be in alphabetical order by university to ensure the Russell Group assignments get linked properly to their university unis <- read.csv("StatsSyllabi_Data1ProgrammeSample_220710.csv") #change characters to numerics unis$psychDegree <- as.numeric(unis$psychDegree) unis$russellGroup <- as.numeric(unis$russellGroup) unis$public <- as.numeric(unis$public) unis$detailed <- as.numeric(unis$detailed) unis$coded <- as.numeric(unis$coded) #group by Russell Group unis <- unis %>% as.data.frame() %>% group_by(russellGroup) #summarise data for table 1 t1 <- unis %>% summarise( "Recognised Universities"=sum(psychDegree<2), "Universities with psychology degree"=sum(psychDegree), "Sufficient detail"=sum(detailed==1, na.rm=TRUE)) t2 <- unis %>% summarise( "Insufficient detail"=sum(detailed==0, na.rm=TRUE), "Syllabus not publicly available"=sum(public==0, na.rm=TRUE), "Courses contacted"=sum(followup!="NA", na.rm=T), "Syllabus or details provided"=sum(followup=="syllabusProvided", na.rm=T), "Further details unavailable"=sum(followup=="furtherDetailsUnavailable", na.rm=T), "University denied access"=sum(followup=="deniedAccess", na.rm=T), "No response"=sum(followup=="noResponse", na.rm=T), "Courses sampled"=sum(coded, na.rm=TRUE) ) #there was an issue where i couldn't use $detail twice in the same summarise(), so I made 2 tables then bind them together t2 <- t2 %>% subset(select=-c(russellGroup)) t1 <- cbind(t1, t2) %>% as.data.frame() %>% arrange(desc(russellGroup)) %>% t() #export t1 to .csv. Then manually format into Table 1. #This data was originally going to be a table on it's own. We decided to not include this table in the final publication. The data is now used for Figure 1 (the flowchart) write.csv(t1, "fig1a.csv") #additional numbers to input into the bottom row of Figure 1, the flowchart flow <- unis %>% summarise( "not public provided"=sum(followup=="syllabusProvided" & public==0, na.rm=T), "not public denied"=sum(followup=="deniedAccess" & public==0, na.rm=T), "not public no response"=sum(followup=="noResponse" & public==0, na.rm=T), "not public further details unavailable"=sum(followup=="furtherDetailsUnavailable" & public==0, na.rm=T), "public further details unavailable"=sum(followup=="furtherDetailsUnavailable" & public==1, na.rm=T), "public provided"=sum(followup=="syllabusProvided" & public==1, na.rm=T), "public no response"=sum(followup=="noResponse" & public==1, na.rm=T) ) %>% arrange(desc(russellGroup)) %>% t() #export flow to .csv. Then use these numbers to populate the bottom row of Figure 1, the flowchart write.csv(flow, "fig1b.csv") ``` ```{r modulesToCourses} # Collapse module coding into course coding #load file. stringAsFactors = FALSE is necessary to not get errors mod <- read.csv("StatsSyllabi_Data3ModuleTopics_220716.csv", header = TRUE, stringsAsFactors = FALSE) #This is a workaround to get the column titles to have spaces. I do this to facilitate making tables and the data output. itemsShort <- colnames(mod) items <- mod[1,] mod <- mod[-1,] colnames(mod) <- items #filter to single resolved coding mod1 <- mod %>% filter(codeNum==3) #make columns for guidelines and software mod1 <- mod1 %>% mutate(APA = ifelse(grepl("APA", guideline), 1, 0)) mod1 <- mod1 %>% mutate(BPS = ifelse(grepl("BPS", guideline), 1, 0)) mod1 <- mod1 %>% mutate(SPSS = ifelse(grepl("SPSS", software), 1, 0)) mod1 <- mod1 %>% mutate(Excel = ifelse(grepl("Excel", software), 1, 0)) mod1 <- mod1 %>% mutate(R = ifelse(grepl("R", software), 1, 0)) mod1 <- mod1 %>% mutate(Stata = ifelse(grepl("Stata", software), 1, 0)) mod1 <- mod1 %>% mutate(Minitab = ifelse(grepl("Minitab", software), 1, 0)) #swap 'yes' and 'no' for 1 and 0 *Int* for integer modInt <- mod1 for (i in 1:nrow(mod1)){ for (j in 8:40){ if (mod1[i,j] == "No"){ modInt[i,j] <- 0 } else if (mod1[i,j] == "Yes"){ modInt[i,j] <- 1 } else { modInt[i,j] <- mod1[i,j] } } } #messy code to convert the numbers stored as chars in modInt to numeric modIntCut <- modInt[,8:40] modIntCut <- apply(modIntCut, 2, as.numeric) %>% as.data.frame() modInt[,8:40] <- modIntCut #group by university so that we can collapse modules into courses modInt <- modInt %>% group_by(university) course <- modInt %>% summarise_if(is.numeric, sum) %>% as.data.frame() #get data from "StatsSyllabi_Data1CourseSample_220710.csv" on coded courses for russell group and module numbers unisCoded <- unis %>% filter(coded==1) course <- course %>% add_column(rg=unisCoded$russellGroup, quant=as.numeric(unisCoded$quantMod), methods=as.numeric(unisCoded$methodsMod), coding=as.numeric(unisCoded$codingMod)) #output cleaned data used for calculation of main analysis write.csv(course, "StatsSyllabi_Data4ProgrammeTopics_220716.csv") ``` ```{r table1} # Output modules and software tables (Table 1) #change name of course to data_by_rg (Russell Group status) because I coded this section before I coded section 2, and now don't want to change all the names data_by_rg = group_by(course, rg) #Calculate mean for the number of course modules, to 2 decimal places (courseMods_mean). courseMods_mean = round(summarise(data_by_rg, quant = mean(quant), methods = mean(methods), coding = mean(coding)),2) #Calculate median courseMods_median = summarise(data_by_rg, quant = median(quant), methods = median(methods), coding = median(coding)) #rename columns courseMods <- rbind(courseMods_mean, courseMods_median) %>% t() %>% as.data.frame %>% rename(nonRG_mean=V1, RG_mean=V2, nonRG_median=V3, RG_median=V4) #rearrange column for Table 2 output courseMods_out <- courseMods %>% select(RG_mean, RG_median, nonRG_mean, nonRG_median) #Calculate totals for software and guidelines #NOTE: Prior to analysis, the software and guidelines data was split into the individual categories listed below, to make the R analysis simpler. courseSoft = summarise(data_by_rg, APA = sum(APA >0), BPS = sum(BPS >0), SPSS = sum(SPSS >0), Excel = sum(Excel >0), R = sum(R >0), Stata = sum(Stata >0), Minitab = sum(Minitab >0)) %>% arrange(desc(rg)) %>% t() #Export the results. Then manually combine these to make Table 1 write.csv(courseMods_out, "table1a.csv") write.csv(courseSoft, "table1b.csv") courseSoftPerc <- courseSoft courseSoftPerc[,1] <- round(courseSoft[,1]/15, 2) courseSoftPerc[,2] <- round(courseSoft[,2]/12, 2) write.csv(courseSoftPerc, "table1c.csv") ``` ```{r table2} # Calculate percentage of courses that include each topic (Table 2) #new dataframe with only the relevant measurements (ie. Remove everything already analysed in previous 2 sections). binary_by_rg <- select(data_by_rg, -c("university", "quant", "methods", "coding", "BPS", "APA", "R", "Excel", "SPSS", "Minitab", "Stata")) #change the sums to binary yes/no in 1/0s binary_by_rg <- ifelse (binary_by_rg > 0, 1, 0) %>% as.data.frame %>% group_by(rg) #Calculate the total number of course that contain each topic surveyed binary_count = summarise_all(binary_by_rg, sum) #Convert totals into proportions. (Total courses that contain the survey item/Total courses) binary_perc_nonRG<- (binary_count[1,] / as.integer(count(binary_by_rg)[1,2])) binary_perc_RG <- (binary_count[2,] / as.integer(count(binary_by_rg)[2,2])) binary_perc <- rbind(binary_perc_nonRG, binary_perc_RG) #move rg to last column, so that we can later bind this dataframe with the CIs binary_perc <- binary_perc[, c(2:ncol(binary_perc),1)] #Create subsets of non-RG and RG, and do the same CI calculation on each. binary_RG = subset(binary_by_rg, rg==1) binary_nonRG = subset(binary_by_rg, rg==0) #(Note N=95, total number of non-RG universities.) #(Note N=23, total number of non-RG universities.) CI_RG = lapply(binary_RG, function(x) Sprop(x, n=nrow(binary_RG), N=23, level = 0.95)) CI_nonRG = lapply(binary_nonRG, function(x) Sprop(x, n=nrow(binary_nonRG), N=95, level = 0.95)) #The results are double-nested and need some work to extract. #Result is nested as varname[4[2]] OR dataframe$varname$ci$exact[1:2] CI_RG = map(map(CI_RG, 4),2) %>% as.data.frame() CI_nonRG = map(map(CI_nonRG, 4),2) %>% as.data.frame() #match column names so that rbind() works colnames(CI_RG)=colnames(binary_RG) colnames(CI_nonRG)=colnames(binary_RG) #bind the point estimates and CIs. RG first with CIs, then nonRG same. out <- rbind(binary_perc[2,], CI_RG, binary_perc[1,], CI_nonRG) #collapse the 2 descriptive stats variables. They are the same at course level, so we can remove 1, and then rename it #and remove he rg identifier, we add it in the colnames in a few lines out <- out %>% select(-"Descriptive statistics (central tendency)", -rg) %>% rename("Descriptive statistics" = "Descriptive statistics (variability)") #add a row to identify which category each item is in. This was slightly out of order because we added items to the Google form after coding the first 10 modules out <- rbind(out, c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3)) %>% t() %>% as.data.frame() %>% round(2) %>% rownames_to_column() #name the columns colnames(out) <- c("item","rg","rg_low","rg_high","nrg","nrg_low","nrg_high","sec") #store order before arrange() so I can use it to arrange TableS2 and kappa output. ord <- out$rg #sort by section and then rg higher percentage at top. out <- out %>% arrange(sec, desc(rg)) table2 <- data.frame(paste0(out$item)) table2[,2] <- data.frame(paste0(out$rg*100, "% (", out$rg_low*100, "-", out$rg_high*100, ")")) table2[,3] <- data.frame(paste0(out$nrg*100, "% (", out$nrg_low*100, "-", out$nrg_high*100, ")")) colnames(table2) <- c("item", "Russell Group", "Non-Russell Group") # remove the 2 topics: philosophy of science and qualitatve methods. This decision was made after receiving reviewer comments table2Removed <- table2[c(8,29),] table2 <- table2[c(1:7,9:28,30:32),] write.csv(table2, "table2.csv") ``` ```{r exams} # Dot plot of exam percentages (Figure S1) #read this file to get the RG codings rgs <- read.csv("StatsSyllabi_Data2ModuleSample_220710.csv", fileEncoding="UTF-8-BOM", header = T) #remove the modules we didn't code rgs <- rgs %>% filter(coded==1) #remove module #14, located in row 13. This was a Edge Hill module that we collapsed the coding for because it was in one document # and sort by moduleID rgs <- rgs %>% arrange(as.numeric(moduleID)) rgs <- rgs[-9,] #check to ensure the module IDs line up before transferring the RG coding from rgs to mod1 mod1 <- mod1 %>% arrange(as.numeric(moduleID)) check <- rgs$moduleID==mod1$moduleID if (F %in% check){ stop("modules don't align between data sheets") } modrg <- mod1 %>% mutate(rg = rgs$RussellGroup) modExam <- modrg %>% filter(exams!="notStated") # These two lines are to calculate the number of upper and lower year courses (they are unrelated to the exams) mod_RG <- modrg %>% filter(rg==1) mod_nonRG <- modrg %>% filter(rg==0) dot<- ggplot(modExam, aes(x = as.numeric(exams), fill=factor(rg))) + geom_dotplot(stackgroups=TRUE, method="histodot", binwidth=1, dotsize=5 ) + ylim(0,30) + xlim(0,100) + labs(x="percent exams", y="module count") + theme(legend.position = "none") + scale_x_continuous(breaks=seq(0,100,10)) + theme(text = element_text(size=20)) #this graph must be scale when printing it so that the 0% column adds up to exactly 30. This is a quirk of geom_dotplot that I've tried to work around. 392*555 seems to work. png("figS1.png", width = 392, height = 552) plot(dot) dev.off() ``` ```{r textbooks} #Chunk 9: textbooks (table S3) textbook <- modrg %>% group_by(rg) %>% count(textbook) #because we coded two modules for edge hill and collapsed them into one, and Field was mentioned in the overall document, we should add one. textbook[5,3] <- textbook[5,3] + 1 write.csv(textbook,"tableS3.csv") #finish organizing this file manually. ``` ```{r stackedBarChart} # Create the stacked bar chart for Figure 2 brks <- c(0, 0.25, 0.5, 0.75, 1) #create an inverse of the dataset, so we can make a filled bar chart outPlot <- out %>% mutate(real="real") #create section headers for bar graph header1 <- data.frame("Statistical concepts",0,0,0,0,0,0,0,"real") names(header1) <- names(outPlot) header2 <- data.frame("Statistical tools / techniques",0,0,0,0,0,0,0,"real") names(header2) <- names(outPlot) header3 <- data.frame("Other methodological skills",0,0,0,0,0,0,0,"real") names(header3) <- names(outPlot) #match column names, so that we can bind these dataframes colnames(header1) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real") colnames(header2) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real") colnames(header3) <- c("item", "rg", "rg_low", "rg_high", "nrg", "nrg_low", "nrg_high", "sec", "real") #insert section headers outPlot <- rbind(header1, outPlot[1:12,], header2, outPlot[13:27,], header3, outPlot[28:32,]) #keep order from before item35 <- c("Statistical concepts", out$item[1:12], "Statistical tools / techniques", out$item[13:27], "Other methodological skills", out$item[28:32]) outPlot$item <- ordered(item35,outPlot$item) %>% fct_rev() #create inverse dataframe outPlotInv <- outPlot outPlotInv[,2:7] <- 1-outPlot[,2:7] outPlotInv[,9] <- "inv" #combine dataframes outPlotAll <- rbind(outPlot, outPlotInv) %>% mutate(count_rg=round(rg*15),0) %>% mutate(count_nrg=round(nrg*12),0) #make section headers 0 to remove colour from those sections in the graph outPlotAll[c(36,49,65),c(2:7,10:12)] <- c(0L,0L,0L,0L,0L,0L,0L,0L,0L) outPlotAll$real <- as.character(outPlotAll$real) #bold the 3 section titles breaks <- levels(outPlotAll$item) labels <- as.expression(breaks) labels[[35]] <- bquote(bolditalic(.(labels[[35]]))) labels[[22]] <- bquote(bolditalic(.(labels[[22]]))) labels[[6]] <- bquote(bolditalic(.(labels[[6]]))) # remove the two non-quantitative measures (in line with the reviewer comments) outPlotAll <- outPlotAll %>% filter(item != "Philosophy of science") outPlotAll <- outPlotAll %>% filter(item != "Qualitative methods") # save the figures plot_rg <- ggplot(outPlotAll, aes(x=item, y=rg)) + geom_bar(stat="identity", aes(fill=real)) + geom_text(data=subset(outPlotAll,count_rg != 0),aes(label=count_rg), position=position_stack(vjust = 0.5)) + coord_flip() + scale_y_continuous(breaks = brks, labels = scales::percent(brks)) + #remove background grey colour theme(axis.line = element_line(colour = "white"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.title = element_blank(), legend.position = "none", axis.ticks.y = element_blank(), text = element_text(size=18), axis.text.x = element_text(size=12) ) + scale_fill_manual(values=c("grey80", "goldenrod2")) + scale_x_discrete(label = labels, breaks = breaks) png("fig2a.png", width = 335, height = 770, units = "px") plot(plot_rg) dev.off() plot_nrg <- ggplot(outPlotAll, aes(x=item, y=nrg)) + geom_bar(stat="identity", aes(fill=real)) + geom_text(data=subset(outPlotAll,count_nrg != 0),aes(label=count_nrg), position=position_stack(vjust = 0.5)) + coord_flip() + scale_y_continuous(breaks = brks, labels = scales::percent(brks)) + #remove background grey colour theme(axis.line = element_line(colour = "white"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.title = element_blank(), legend.position = "none", axis.ticks.y = element_blank(), text = element_text(size=18), axis.text.x = element_text(size=12) ) + scale_fill_manual(values=c("grey80", "goldenrod2")) + scale_x_discrete(label = labels, breaks = breaks) #manually cut and paste these graphs next to each other for the final image. #print the image png("fig2b.png", width = 335, height = 770, units = "px") plot(plot_nrg) dev.off() #manually place the two stacked bar charts next to each other for the final image. ``` ```{r IRRdetial} # Calculate the IRR for coding whether the module syllabi had enough detail to code for this study d <- read.csv("StatsSyllabi_Data2ModuleSample_220710.csv", fileEncoding="UTF-8-BOM", header = T) kDetail <- kappa2(cbind(d$sufficientDetailCoder1, d$sufficientDetailCoder2)) ``` ```{r IRR} # Calculating kappa, inter-rater agreement (Table S1) #make a 2*n column to go into kappa2() x1 <- mod %>% filter(codeNum == 1) x2 <- mod %>% filter(codeNum == 2) k <- mod1[1,] #run kappa2() on all binary items for (i in 8:40){ temp <- cbind(x1[i], x2[i]) %>% kappa2() k[i] = temp[5] } #rename columns so I can subset for an average kappa k <- k[8:40] kTemp <- k %>% as.data.frame() colnames(kTemp) <- itemsShort[8:40] kMeanAll <- mean(as.numeric(select(kTemp, -c(phil, qualitative))), na.rm=TRUE) #remove items with 4 or less occurrences as well as applied and criticEval which were misunderstood. kCut <- k %>% as.data.frame() colnames(kCut) <- itemsShort[8:40] kCut <- kCut %>% select(-c(phil, applied, freq, bayes, sig, mult, qualitative)) kMeanCut <- mean(as.numeric(kCut), na.rm=TRUE) disc <- matrix(nrow=nrow(mod1), ncol=33) #identify number of discrepancies for (i in 1:nrow(mod1)){ for (j in 8:40){ if (mod[(i*3-2),j] == mod[(i*3-1),j]){ disc[i,j-7] <- 0 } else { disc[i,j-7] <- 1 } } } #output kappa and number of discrepancies kOut <- k %>% round(2) kOut[2,] <- colSums(as.data.frame(disc)) kOut <- kOut %>% t() #order kappa output in same way as table. Put we need have the two versions of descriptive variables now, so some tweaks to the code are necessary ord33 <- append(ord, ord[13],after=12) kOut <- kOut %>% cbind(c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3), ord33) %>% as.data.frame() %>% rownames_to_column() %>% arrange(V3, desc(ord33)) %>% select(-c(V3, ord33)) colnames(kOut) <- c("topic", "kappa", "numDiscrepant") write.csv(kOut, file="tableS1.csv") #also need to organize it in the same order as everything else ``` This code outputs several tables and figure that we integrated into the manuscript. It also outputs a pdf with several key numbers we input into the manuscript, and is followed by the same tables and figures that are output to csv and png files. ```{r} # for use in the Key Numbers section courseCut <- course[,2:34] colnames(courseCut) <- itemsShort[8:40] ``` # Key numbers Percent of courses where all the quantitative modules had publicly available syllabi or sufficient detail for our coding procedure: `r round(sum(unis$public==1 & unis$detailed==1, na.rm=T)/sum(unis$psychDegree==1), 2)` Kappa for sufficient detail: `r kDetail$value %>% round(2)` Number coded different: `r sum(d$sufficientDetailCoder1 != d$sufficientDetailCoder2, na.rm=T)` Total number coded for sufficient detail: `r nrow(d)` Number of universities contacted: `r sum(flow) - 1` Mean Cohen's Kappa for topics with philosophy and qual removed: `r kMeanAll %>% round(2)` Mean Cohen's Kappa for topics with philosophy and qual removed + 5 other rare topics excluded: `r kMeanCut %>% round(2)` Range of # of quant modules for Russell Group universities: `r range(filter(course, rg==1)$quant)` Range of # of quant modules for NON-Russell Group universities: `r range(filter(course, rg==0)$quant)` Percent of courses using APA guidelines: `r round(sum(course$APA > 0)/nrow(course), 2)` Number of modules with 100% exams: `r sum(modExam$exams == 100)` Number of modules with >0% & <100% exams: `r sum(modExam$exams > 0) - sum(modExam$exams == 100)` Number of modules with 0% exams: `r sum(modExam$exams == 0)` Number of modules that don't mention grading: `r sum(modrg$exams == "notStated")` critical evaluation: `r round(sum(courseCut$criticEval > 0)/nrow(course),2)` probability and randomness: `r round(sum(courseCut$probRand > 0)/nrow(course),2)` power: `r round(sum(courseCut$power > 0)/nrow(course),2)` effect size: `r round(sum(courseCut$es > 0)/nrow(course),2)` data treatment: `r round(sum(courseCut$dataTreat > 0)/nrow(course),2)` confidence intervals: `r round(sum(courseCut$ci > 0)/nrow(course),2)` multiple comparisons: `r round(sum(courseCut$mult > 0)/nrow(course),2)` practical significance: `r round(sum(courseCut$sig > 0)/nrow(course),2)` correlation: `r round(sum(courseCut$corr > 0)/nrow(course),2)` t-test: `r round(sum(courseCut$tTest > 0)/nrow(course),2)` Number of first year Russell Group modules: `r sum(grepl("1", mod_RG$year))` Number of second year Russell Group modules: `r sum(grepl("2", mod_RG$year))` Number of third year Russell Group modules: `r sum(grepl("3", mod_RG$year))` Number of first year Non-Russell Group modules: `r sum(grepl("1", mod_nonRG$year)) + 1` Number of second year Non-Russell Group modules: `r sum(grepl("2", mod_nonRG$year))` Number of third year Non-Russell Group modules: `r sum(grepl("3", mod_nonRG$year))` Number module RC coded: `r sum(mod$coder == "RC")` Number module RT coded: `r sum(mod$coder == "RT")` Number module KD coded: `r sum(mod$coder == "KD")` Number courses with Philosophy of science: `r sum(courseCut$phil > 0)` Number courses with Qualitative methods: `r sum(courseCut$qualitative > 0)` \newpage ```{r, fig1a, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(t1, caption = "Data for Figure 1 (1/2)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, fig1b, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(t(t2), caption = "Data for Figure 1 (2/2)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table1a, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseMods_out, caption = "Data for Table 1 (1/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table1b, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseSoft, caption = "Data for Table 1 (2/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table1c, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseSoftPerc, caption = "Data for Table 1 (3/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table2out, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(table2, caption = "Data for Table 2", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, tableS1out, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(kOut, caption = "Data for Table S1", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, tableS3out, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(textbook, caption = "Data for Table S3", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` \newpage ![Supplementary Figure 1. Percentage of grade given in exams.](figS1.png) \newpage ```{r optionalRemoved} # This chunk repeats the chunks "modulesToCourses", "table1" and "table2", but excludes the optional modules. ########### repeat "modulesToCourses ########### #load file. stringAsFactors = FALSE is necessary to not get errors mod <- read.csv("StatsSyllabi_Data3ModuleTopics_220716.csv", header = TRUE, stringsAsFactors = FALSE) #This is a workaround to get the column titles to have spaces. I do this to facilitate making tables and the data output. itemsShort <- colnames(mod) items <- mod[1,] mod <- mod[-1,] colnames(mod) <- items #filter to single resolved coding mod1 <- mod %>% filter(codeNum==3) mod1 <- mod1 %>% filter(optional=="Compulsory") #make columns for guidelines and software mod1 <- mod1 %>% mutate(APA = ifelse(grepl("APA", guideline), 1, 0)) mod1 <- mod1 %>% mutate(BPS = ifelse(grepl("BPS", guideline), 1, 0)) mod1 <- mod1 %>% mutate(SPSS = ifelse(grepl("SPSS", software), 1, 0)) mod1 <- mod1 %>% mutate(Excel = ifelse(grepl("Excel", software), 1, 0)) mod1 <- mod1 %>% mutate(R = ifelse(grepl("R", software), 1, 0)) mod1 <- mod1 %>% mutate(Stata = ifelse(grepl("Stata", software), 1, 0)) mod1 <- mod1 %>% mutate(Minitab = ifelse(grepl("Minitab", software), 1, 0)) #swap 'yes' and 'no' for 1 and 0 *Int* for integer modInt <- mod1 for (i in 1:nrow(mod1)){ for (j in 8:40){ if (mod1[i,j] == "No"){ modInt[i,j] <- 0 } else if (mod1[i,j] == "Yes"){ modInt[i,j] <- 1 } else { modInt[i,j] <- mod1[i,j] } } } #messy code to convert the numbers stored as chars in modInt to numeric modIntCut <- modInt[,8:40] modIntCut <- apply(modIntCut, 2, as.numeric) %>% as.data.frame() modInt[,8:40] <- modIntCut #group by university so that we can collapse modules into courses modInt <- modInt %>% group_by(university) course <- modInt %>% summarise_if(is.numeric, sum) %>% as.data.frame() #get data from "StatsSyllabi_Data1CourseSample_220710.csv" on coded courses for russell group and module numbers unisCoded <- unis %>% filter(coded==1) course <- course %>% add_column(rg=unisCoded$russellGroup, quant=as.numeric(unisCoded$quantMod), methods=as.numeric(unisCoded$methodsMod), coding=as.numeric(unisCoded$codingMod)) # remove the 3 optional modules from Greenwich, Southampton, and York course$quant[18] <- course$quant[18] - 1 course$quant[23] <- course$quant[23] - 1 course$quant[27] <- course$quant[27] - 1 ########### repeat table1 chunk ############## # Output modules and software tables (Table 1) #change name of course to data_by_rg (Russell Group status) because I coded this section before I coded section 2, and now don't want to change all the names data_by_rg = group_by(course, rg) #Calculate mean for the number of course modules, to 2 decimal places (courseMods_mean). courseMods_mean = round(summarise(data_by_rg, quant = mean(quant), methods = mean(methods), coding = mean(coding)),2) #Calculate median courseMods_median = summarise(data_by_rg, quant = median(quant), methods = median(methods), coding = median(coding)) #rename columns courseMods <- rbind(courseMods_mean, courseMods_median) %>% t() %>% as.data.frame %>% rename(nonRG_mean=V1, RG_mean=V2, nonRG_median=V3, RG_median=V4) #rearrange column for Table 2 output courseMods_out_opt <- courseMods %>% select(RG_mean, RG_median, nonRG_mean, nonRG_median) #Calculate totals for software and guidelines #NOTE: Prior to analysis, the software and guidelines data was split into the individual categories listed below, to make the R analysis simpler. courseSoft_opt = summarise(data_by_rg, APA = sum(APA >0), BPS = sum(BPS >0), SPSS = sum(SPSS >0), Excel = sum(Excel >0), R = sum(R >0), Stata = sum(Stata >0), Minitab = sum(Minitab >0)) %>% arrange(desc(rg)) %>% t() #Export the results. Then manually combine these to make Table 1 write.csv(courseMods_out_opt, "table1a_opt.csv") write.csv(courseSoft_opt, "table1b_opt.csv") courseSoftPerc_opt <- courseSoft_opt courseSoftPerc_opt[,1] <- round(courseSoft_opt[,1]/15, 2) courseSoftPerc_opt[,2] <- round(courseSoft_opt[,2]/12, 2) write.csv(courseSoftPerc_opt, "table1c_opt.csv") ########### repeat table2 chunk ############## # Calculate percentage of courses that include each topic (Table 2) #new dataframe with only the relevant measurements (ie. Remove everything already analysed in previous 2 sections). binary_by_rg <- select(data_by_rg, -c("university", "quant", "methods", "coding", "BPS", "APA", "R", "Excel", "SPSS", "Minitab", "Stata")) #change the sums to binary yes/no in 1/0s binary_by_rg <- ifelse (binary_by_rg > 0, 1, 0) %>% as.data.frame %>% group_by(rg) #Calculate the total number of course that contain each topic surveyed binary_count = summarise_all(binary_by_rg, sum) #Convert totals into proportions. (Total courses that contain the survey item/Total courses) binary_perc_nonRG <- (binary_count[1,] / as.integer(count(binary_by_rg)[1,2])) binary_perc_RG <- (binary_count[2,] / as.integer(count(binary_by_rg)[2,2])) binary_perc <- rbind(binary_perc_nonRG, binary_perc_RG) #move rg to last column, so that we can later bind this dataframe with the CIs binary_perc <- binary_perc[, c(2:ncol(binary_perc),1)] #Create subsets of non-RG and RG, and do the same CI calculation on each. binary_RG = subset(binary_by_rg, rg==1) binary_nonRG = subset(binary_by_rg, rg==0) #(Note N=95, total number of non-RG universities.) #(Note N=23, total number of non-RG universities.) CI_RG = lapply(binary_RG, function(x) Sprop(x, n=nrow(binary_RG), N=23, level = 0.95)) CI_nonRG = lapply(binary_nonRG, function(x) Sprop(x, n=nrow(binary_nonRG), N=95, level = 0.95)) #The results are double-nested and need some work to extract. #Result is nested as varname[4[2]] OR dataframe$varname$ci$exact[1:2] CI_RG = map(map(CI_RG, 4),2) %>% as.data.frame() CI_nonRG = map(map(CI_nonRG, 4),2) %>% as.data.frame() #match column names so that rbind() works colnames(CI_RG)=colnames(binary_RG) colnames(CI_nonRG)=colnames(binary_RG) #bind the point estimates and CIs. RG first with CIs, then nonRG same. out <- rbind(binary_perc[2,], CI_RG, binary_perc[1,], CI_nonRG) #collapse the 2 descriptive stats variables. They are the same at course level, so we can remove 1, and then rename it #and remove he rg identifier, we add it in the colnames in a few lines out <- out %>% select(-"Descriptive statistics (central tendency)", -rg) %>% rename("Descriptive statistics" = "Descriptive statistics (variability)") #add a row to identify which category each item is in. This was slightly out of order because we added items to the Google form after coding the first 10 modules out <- rbind(out, c(1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,1,2,2,3,3)) %>% t() %>% as.data.frame() %>% round(2) %>% rownames_to_column() #name the columns colnames(out) <- c("item","rg","rg_low","rg_high","nrg","nrg_low","nrg_high","sec") #store order before arrange() so I can use it to arrange TableS2 and kappa output. ord <- out$rg #sort by section and then rg higher percentage at top. out <- out %>% arrange(sec, desc(rg)) table2 <- data.frame(paste0(out$item)) table2[,2] <- data.frame(paste0(out$rg*100, "% (", out$rg_low*100, "-", out$rg_high*100, ")")) table2[,3] <- data.frame(paste0(out$nrg*100, "% (", out$nrg_low*100, "-", out$nrg_high*100, ")")) colnames(table2) <- c("item", "Russell Group", "Non-Russell Group") # remove the 2 topics: philosophy of science and qualitatve methods. This decision was made after receiving reviewer comments table2Removed <- table2[c(8,29),] table2 <- table2[c(1:7,9:28,30:32),] write.csv(table2, "tableS2.csv") #manually format table to highlight differences between with and without including optional modules ``` ```{r, table1a_opt, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseMods_out_opt, caption = "Data for Table 1 (4/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table1b_opt, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseSoft_opt, caption = "Data for Table 1 (5/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, table1c_opt, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(courseSoftPerc_opt, caption = "Data for Table 1 (6/6)", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ``` ```{r, tableS2out, include = TRUE, echo = FALSE, results = "asis"} knitr::kable(table2, caption = "Data for Table S2", booktabs = T, linesep = "", align = "c") %>% kable_styling(latex_options = "striped") ```