# Downloads a zipped Fama/French data set, saves it under a date-stamped name,
# extracts it, and splits every extracted text file into separate part files
# wherever a blank line occurs.

# This can (in theory) be changed to point to other datasets without breaking
# anything
data.url <- "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors.zip"

# Some of this code is formatted using LISP-style indentation, with function
# arguments stacked vertically instead of listed horizontally. For example, in
# LISP style, this:
#   ifelse(x%%2 == 0, "even", "odd")
# would be rendered like this:
#   ifelse(
#     x%%2 == 0,
#     "even",
#     "odd"
#   )

# Build a date-stamped zip filename from a URL.
#
# url: character scalar; the last "/"-delimited component is taken as the
#      file name.
# Returns "<name>_<YYYY-MM-DD>.zip", where <name> is the file name up to (but
# not including) its first "." and the date is today's date (Sys.Date()).
get.zipname <- function(url) {
  # Split the url into /-delimited components and keep the last one.
  # (The argument is used here; the global data.url is not consulted.)
  components <- strsplit(url, "/", fixed = TRUE)[[1]]
  filename <- components[length(components)]

  # Separate name from .zip extension (everything after the first "." is cut)
  stem <- strsplit(filename, ".", fixed = TRUE)[[1]][1]

  # Insert today's date into the filename
  paste0(stem, "_", Sys.Date(), ".zip")
}

zipname <- get.zipname(data.url)

# Download the file and save it with a datestamp.
# mode = "wb" forces a binary transfer so the archive is not corrupted on
# platforms where the default mode is text (notably Windows).
download.file(data.url, zipname, mode = "wb")

# Splits x into sub-vectors at the indices contained in idxs.
#
# x:    a vector (here: the lines of a text file).
# idxs: ascending indices of the splitting elements, e.g. the result of
#       grep(); may be empty.
# Returns a list of length(idxs) + 1 sub-vectors. The splitting elements
# themselves are excluded (like strsplit). Adjacent splitting elements, or a
# splitting element at the very start or end of x, yield an empty sub-vector,
# so the number of parts is always length(idxs) + 1.
split.at <- function(x, idxs) {
  # Part n runs from just after the previous split point to just before the
  # next one; the first part starts at 1 and the last ends at length(x).
  starts <- c(1, idxs + 1)
  ends <- c(idxs - 1, length(x))

  # lapply (not sapply) so the result is always a list, even when every part
  # happens to have the same length.
  lapply(
    seq_along(starts),
    function(n) {
      if (ends[n] < starts[n]) {
        # start:end would count downwards here; return an empty part instead
        x[0]
      } else {
        x[starts[n]:ends[n]]
      }
    }
  )
}

# Iterative version of the above function, for reference
#split.at <- function(x, idxs) {
#  starts <- c(1, idxs + 1)
#  ends <- c(idxs - 1, length(x))
#  parts <- vector("list", length(starts))
#  for (n in seq_along(starts)) {
#    # Exclude delimiting elements
#    parts[[n]] <- if (ends[n] < starts[n]) x[0] else x[starts[n]:ends[n]]
#  }
#  parts
#}

# Extract the zipfile
files <- unzip(zipname)

# Split each file into individual tables and write each one to its own file,
# named "<input file>.part<N>"
for (f in files) {
  text <- readLines(f)
  parts <- split.at(text, grep("^$", text)) # divide the file on blank lines
  print(sprintf("Writing %d parts for input file %s", length(parts), f))
  for (pn in seq_along(parts)) {
    writeLines(parts[[pn]], paste0(f, ".part", pn))
  }
}

# Functional (as opposed to procedural) version of above code, with the
# additional feature that the filenames of the resulting parts are returned
# (as a matrix when every input file yields the same number of parts,
# otherwise as a list)
#outfiles <- sapply(
#  files,
#  function(f) {
#    text <- readLines(f)
#    parts <- split.at(text, grep("^$", text)) # divide the file on blank lines
#    print(sprintf("Writing %d parts for input file %s", length(parts), f))
#    ofs <- sapply(
#      seq_along(parts),
#      function(pn) {
#        of <- paste0(f, ".part", pn)
#        writeLines(parts[[pn]], of)
#        of
#      }
#    )
#    ofs
#  }
#)