### This script extracts the stay times of devices at each destination from ### Lamotte's data for the musical festival (Section 3.1.1 in the accompanying ### paper). It calculates the distribution of stay times by both day and ### destination, as well as giving the distribution for all days and destinations. ### Requires processed data files generated from data_explore.R. rm(list = ls()) # Clear variables from global environment before proceeding. # A character vector of colours used for plotting. hues <- c("black", "red", "blue", "green", "yellow", "purple", "grey", "orange", "pink", "cyan", "aliceblue", "antiquewhite", "aquamarine", "azure", "beige", "bisque", "blanchedalmond", "blueviolet", "brown", "burlywood", "cadetblue", "chartreuse", "chocolate", "coral", "cornflowerblue", "cornsilk", "darkblue", "darkcyan", "darkgoldenrod", "darkgray", "darkgreen", "darkkhaki", "darkmagenta", "darkolivegreen", "darkorange", "darkorchid", "darkred", "darksalmon", "darkseagreen", "darkslateblue", "darkslategray", "darkturquoise", "darkviolet", "deeppink", "deepskyblue", "dimgray", "dodgerblue", "firebrick", "floralwhite", "forestgreen", "gainsboro", "gold", "goldenrod", "gray", "greenyellow", "honeydew", "hotpink", "indianred", "khaki", "lavender", "lavenderblush", "lawngreen", "lemonchiffon", "lightblue", "lightcoral", "lightcyan", "lightgoldenrod", "lightgoldenrodyellow", "lightgray", "lightgreen", "lightpink", "lightsalmon", "lightseagreen", "lightskyblue", "lightslateblue", "lightslategray", "lightsteelblue", "lightyellow", "limegreen", "linen", "magenta", "maroon", "mediumaquamarine", "mediumblue", "mediumorchid", "mediumpurple", "mediumseagreen", "mediumslateblue", "mediumspringgreen", "mediumturquoise", "mediumvioletred", "midnightblue", "mistyrose", "moccasin", "navajowhite", "navy", "navyblue", "oldlace", "olivedrab", "orange", "orangered", "orchid", "palegoldenrod", "palegreen", "paleturquoise", "palevioletred", "papayawhip", "peachpuff", "peru", "plum", "purple", "rosybrown", "royalblue", "saddlebrown", "salmon", "sandybrown", "seagreen", "seashell", "sienna", "skyblue", "slateblue", "slategray", "springgreen", "steelblue", "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "yellowgreen") # Extract all the detector names from the file names from day 1. These # detectors are present in all other days. # Set current working directory to day 1. setwd("insertfilepathhere/Published_Data/Raw/day1") # Edit as needed # List the names of all files in this directory, they contain detector/destination names. all_files <- list.files() # All possible detector names come from the raw data files. data_files <- grep(pattern = "anon", all_files, value = T) # Split the names by '-' to separate information. name_split <- strsplit(data_files, split = "-") # Detector names are the first part. detectors <- sapply(name_split, "[", 1) # Remove detectors at thoroughfares. destinations <- detectors[-seq(5, 10)] # State which detectors are not destinations. not_dests <- detectors[seq(5, 10)] # Set current working directory to parent directory containing all data. setwd("insertfilepathhere/Published_Data/Raw") # Edit as needed # List each day available. all_days <- list.dirs(recursive = F, full.names = F) # Convert directory names into more human-friendly versions. day_strings <- c("Day 1", "Day 2", "Day 3", "Day -1") # Are we (re)calculating durations? calc <- F # must be T if this is the first time running this script. # Are we truncating plotted duration distributions? zoom <- F # If zoom = T, then this vector specifies the maximum durations to plot for each # day and destination. Note: some destinations for certain days have no # measurements, so length(xmax) != number of destinations x number of days. xmax <- c(10000, 15000, 10000, 6000, 16000, 7500, 9000, 5000, 1500, 10000, 15000, 10000, 7500, 10000, 8000, 6000, 1500, 9000, 15000, 8000, 10000, 7500, 8000, 5000, 1000, 10000, 15000, 10000, 4000, 450) # Initialise a counter for the xmax vector. iterator <- 1 # Initialise a matrix holding the 5% and 95% quantiles of the duration # distributions for all days. quantiles <- matrix(nrow = length(all_days), ncol = 2) # Set the colour of the line which marks the artificial durations on the # distribution plots. chroma <- col2rgb(col = "red", alpha = T) tint <- rgb((chroma[1] / 255), (chroma[2] / 255), (chroma[3] / 255), alpha = 0.5) # For each collection day... for (a in 1:length(all_days)) { #...set the current working directory to the correct day. setwd(paste0("insertfilepathhere/Published_Data/Processed/", all_days[a])) # Edit as needed # List all files present. There should be one for each detector for this day. al_file <- list.files() # If we wish to (re)calculate destination stay times... if (calc) { #...initialise the detector index. Resets for each day. ind <- 1 # Initialise a counter for each stay time across destinations and people. index <- 1 # Initialise a counter for each device that has no first detections ("I"). counter <- 1 # Numeric vector for the durations of each device at each destination. duration <- numeric() # Numeric vector for the destination visited for each duration. destins <- numeric() # Numeric vector of recordings to be deleted from the main data. del_devs <- numeric() # Initialise matrices of all data for this day. all_day_dests <- NULL all_duration <- NULL all_destins <- NULL # Select all processed data files from this directory/day. dat_file <- grep("processed_data", al_file, value = T) # Extract the names of all detectors with at least one measurement on this day. dat_fil_name <- strsplit(dat_file, split = "_") # Extract the names of all the detectors with at least one measurement. fil_nam <- sapply(dat_fil_name, "[", 1) # Find any processed files for detectors which are not destinations. imposter <- fil_nam %in% not_dests # Remove non-destinations from the detector names vector. Should now include # only destinations with at least one measurement. red_fil_nam <- fil_nam[!imposter] # For each detector... for (b in 1:length(dat_file)) { #...read the data collected. data <- read.table(file = dat_file[b], header = T, sep = ",") # If the detector is at a thoroughfare... if (imposter[b]) { #...do not include it in the destination data. next } # Collect data from all destinations for this day into one place. # If this is the first destination of the day... if (is.null(all_day_dests)) { #...initialise the object to hold all the data. all_day_dests <- data } # Otherwise, if this is not the first destination of the day... else { #...append the data from this destination to the collated data. all_day_dests <- rbind(all_day_dests, data) } # Increment the destination index. ind <- ind + 1 } # Rearrange data into chronological order. chron_day_dests <- all_day_dests[order(as.numeric(all_day_dests[, 1])), ] # Extract timestamps, which are written in unix format from the last epoch. unix_ts <- as.numeric(chron_day_dests[, 1]) # Convert the unix times into human-readable format. times <- as.POSIXlt(as.numeric(chron_day_dests[, 1]), origin = "1970-01-01", tz = "Europe/Brussels") # Extract the unique devices detected over the course of this day. people <- sort(unique(as.numeric(chron_day_dests[, 3]))) # Initialise a counter for the number of times a device appears without disappearing. no_absence <- 0 # For each device... for (f in 1:length(people)) { #...find the entries in the data which correspond to this device. entry_inds <- which(as.numeric(chron_day_dests[, 3]) == people[f]) # Extract the data for this device. entries <- chron_day_dests[entry_inds, ] # If the device has at least two measurements associated with it. This # means that we exclude devices which are either recorded at one # destination throughout data collection or disappear from one destination # without a corresponding appearance measurement... if (length(entry_inds) > 1) { #...find the destinations visited by this device. dests <- unique(entries[, 4]) # For each visited destination... for (g in 1:length(dests)) { #...isolate the measurements recorded at this destination. stay <- as.matrix(entries[which(entries[, 4] == dests[g]), ]) # Measurement timestamps. ts <- as.numeric(stay[, 1]) # Detection status (either "I" or "O"). detection <- stay[, 2] # Find the instances where a device arrives at the destination. starts <- which(detection == "I") # If this device is never recorded as arriving at this destination... if (length(starts) == 0) { #...then skip this destination. next } # For each appearance of this device at this destination... for (h in 1:length(starts)) { #...check whether the following measurement is the same type. same <- detection[starts[h] + 1] == detection[starts[h]] # Is the current measurement a detection? same2 <- same && detection[starts[h]] == "I" # If two successive appearances are recorded or the last measurement # is an appearance... if (starts[h] == nrow(stay) || same2) { #...then the start time is the measurement timestamp... start_time <- ts[starts[h]] #...and the end time is assumes to be the average measurement # update window for the detectors. end_time <- start_time + 130 no_absence <- no_absence + 1 } # Otherwise, if there is a disappearance measurement for this # appearance... else if (detection[starts[h] + 1] != detection[starts[h]]) { #...then the start time is the appearance timestamp. start_time <- ts[starts[h]] # The end time is the disappearance timestamp. Note: this assumes # that a device is not detected at two detectors at once. end_time <- ts[starts[h] + 1] } # Time spent by this device at this destination is end time - start time. duration[index] <- end_time - start_time # Record the destination that this device was recorded at. destins[index] <- as.numeric(dests[g]) # Increment the duration counter. index <- index + 1 } } } # Otherwise, if there is only one measurement from this device... else { #...then mark the device ID for removal. del_devs[counter] <- entry_inds # Increment the removed devices counter. counter <- counter + 1 } } # If this is the first day... if (is.null(all_duration)) { #...then create the data structure to hold durations for all days. all_duration <- duration # Also create the corresponding structure for destination IDs. all_destins <- destins } # Otherwise, if this is not the first day... else { #...append the durations for this day onto the existing structure... all_duration <- c(all_duration, duration) #...along with the destination IDs for this day. all_destins <- c(all_destins, destins) } # Remove the devices with only one measurement from the overall data structure. scrubbed_chron_day_dests <- chron_day_dests[-del_devs, ] # Combine the durations and their associated destinations into one structure. dur_out <- cbind(duration, destins) # Write this structure to an output file. write.table(dur_out, file = paste0(all_days[a], "_dest_durs.txt"), row.names = F, col.names = F, sep = ",") # For each destination... for (c in 1:length(detectors)) { #...isolate the durations for this destination. dest_dur <- duration[which(destins == c)] # If there are durations at this destination... if (length(dest_dur) > 0) { #...write them to their own output file. write.table(dest_dur, row.names = F, col.names = F, sep = ",", file = paste0(detectors[c], "_", all_days[a], "_dest_durs.txt")) } } } else { # Otherwise, if we are not (re)calculating destination stay times... #...then read in the durations for this day. duration <- read.table(file = paste0(all_days[a], "_dest_durs.txt"), sep = ",") # Calculate the 5% and 95% quantiles for the duration distribution. quantiles[a, ] <- quantile(duration[, 1], p = c(0.05, 0.95)) ## Plot the duration distribution over all destinations for this day. # If the extremely long durations for this destination are to be ignored when # plotting the distribution... if (zoom) { #...write the truncated distribution plot to a png file. png(filename = paste0("zoomed_dur_dist_", all_days[a], ".png")) # Plot the truncated distribution of durations spent at this destination. plot(density(duration[, 1]), xlim = c(0, xmax[iterator]), ylab = "Density", main = paste("Stay time distribution for", day_strings[a], "across destinations"), xlab = "Stay time / s") # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 3, col = tint) dev.off() #dev.copy2pdf(file = paste0("zoomed_dur_dist_", all_days[a], ".pdf")) # Increment the xmax counter. iterator <- iterator + 1 } else { # Otherwise, if we want to plot the full duration distribution... #...write the duration distribution plot to a png file. png(filename = paste0("dur_dist_", all_days[a], ".png")) # Plot the distribution of durations spent at this destination. plot(density(duration[, 1]), ylab = "Density", main = paste("Stay time distribution for", day_strings[a], "across destinations"), xlab = "Stay time / s") # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 1, col = tint) dev.off() #dev.copy2pdf(file = paste0("dur_dist_", all_days[a], ".pdf")) } ## Now he duration distributions for each destination for this day. # Select only destination duration files from this day. dat_file <- grep("dest_durs.txt", al_file, value = T) # Ignore the file containing durations for all detectors for this day. da_fil <- grep(paste0("_", all_days[a]), dat_file, value = T) da_fil_nam_piece <- strsplit(da_fil, split = "_") da_fil_nam <- sapply(da_fil_nam_piece, "[", 1) # For each destination... for (b in 1:length(da_fil)) { #...read the durations for this destination. dest_dur_dat <- read.table(file = da_fil[b], sep = ",") # The durations are actually the first column. dest_dur <- dest_dur_dat[, 1] # If the extremely long durations for this destination are to be ignored # when plotting the distribution... if (zoom) { #...write the truncated distribution plot to a png file. png(filename = paste0("zoomed_dur_dist_", da_fil_nam[b], "_", all_days[a], ".png")) # Plot the truncated distribution of durations spent at this destination. plot(density(dest_dur), xlab = "Stay times / s", ylab = "Density", main = paste("Stay time distribution for", da_fil_nam[b], "on", day_strings[a]), xlim = c(0, xmax[iterator])) # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 1, col = "red") dev.off() #dev.copy2pdf(file = paste0("zoomed_dur_dist_", destinations[b], "_", # all_days[a], ".pdf")) # Increment the xmax counter. iterator <- iterator + 1 } else { # Otherwise, if we want to plot the full duration distribution... #...write the duration distribution plot to a png file. png(filename = paste0("dur_dist_", da_fil_nam[b], "_", all_days[a], ".png")) # Plot the distribution of durations spent at this destination. plot(density(dest_dur), xlab = "Stay times / s", ylab = "Density", main = paste("Stay time distribution for", da_fil_nam[b], "on", day_strings[a])) # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 1, col = tint, lty = 2) dev.off() #dev.copy2pdf(file = paste0("dur_dist_", destinations[b], "_", all_days[a], # ".pdf")) } } } } ## Now read/write the durations for all destinations over all days from/to the ## parent WiFiPi directory. Used only to plot resulting distribution. # Set current working directory to the parent. setwd("insertfilepathhere/Published_data") # Edit as needed # If we are (re)calculating the durations... if (calc) { #...combine all durations with their corresponding destinations. all_dur_out <- cbind(all_duration, all_destins) # Write this data structure to an output file. write.table(all_dur_out, file = "all_durs.txt", row.names = F, col.names = F, sep = ",") } else { # Otherwise, if we are not (Re)calculating the durations... #...read in the durations from a file. all_duration <- read.table(file = "all_durs.txt", sep = ",") quant <- quantile(all_duration[, 1], p = c(0.05, 0.95)) ## Plot durations over all destinations and days. # If the extremely long durations for this destination are to be ignored when # plotting the distribution... if (zoom) { #...write the truncated distribution plot to a png file. png(filename = "zoomed_all_dur_dist.png") # Plot the truncated distribution of durations spent at this destination. plot(density(all_duration[, 1]), xlab = "Stay time / s", ylab = "Density", main = "Stay time distribution for all destinations over all days", xlim = c(0, 10000)) # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 1, col = "red") dev.off() #dev.copy2pdf(file = "zoomed_all_dur_dist.pdf") } else { # Otherwise, if we want to plot the full duration distribution... #...write the duration distribution plot to a png file. png(filename = "all_dur_dist.png") # Plot the distribution of durations spent at this destination. plot(density(all_duration[, 1]), xlab = "Stay time / s", ylab = "Density", main = "Stay time distribution for all destinations over all days") # Draw a red line where the 'artificial' duration is. abline(v = 130, lwd = 1, col = tint, lty = 2) dev.off() #dev.copy2pdf(file = "all_dur_dist.pdf") } }