Skip to content

Commit feaaad5

Browse files
committed
Autodetect symlink; disabled symlink by default; added header signatures; check header signatures when binding; doc fixes
1 parent b5abcc2 commit feaaad5

9 files changed

Lines changed: 205 additions & 78 deletions

File tree

DESCRIPTION

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: filearray
22
Type: Package
33
Title: File-Backed Array for Out-of-Memory Computation
4-
Version: 0.1.2.9000
4+
Version: 0.1.3
55
Language: en-US
66
Encoding: UTF-8
77
License: LGPL-3
@@ -17,7 +17,8 @@ Description: Stores large arrays in files to avoid occupying large
1717
reading/writing via 'OpenMP'. Supports multiple non-character data
1818
types (double, float, complex, integer, logical, and raw).
1919
Imports:
20-
methods,
20+
digest (>= 0.6.29),
21+
methods,
2122
Rcpp
2223
Suggests:
2324
rmarkdown,

NEWS.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
11
# filearray (development version)
22

3+
# filearray 0.1.3
4+
5+
* Automatically detect whether symbolic-link works and show warnings
6+
* Warnings can be suppressed
7+
* Allow extra headers to be set in `meta` file
8+
* Added header signature method
9+
* Fixed symbolic-link issues on `Windows` when partition sizes are 0
10+
* Added check-load function `filearray_checkload` to validate header
11+
* Fixed collapse method when `dimnames` are set
12+
* Fixed an unprotected variable in `C++` code
13+
* `filearray_bind` can use cache if the header signatures agree
14+
* `filearray_bind` can choose to force overwrite
15+
* Added package `digest` to `Imports`
16+
* Fixed a typo and several small bugs
17+
18+
319
# filearray 0.1.2
420

521
* Removed `flush` in saving data to let system decide when to flush to hard drive

R/bind.R

Lines changed: 87 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,13 @@
66
#' @param symlink whether to use \code{\link[base]{file.symlink}}; if true,
77
#' then partition files will be symbolic-linked to the original arrays,
88
#' otherwise the partition files will be copied over. If you want your data
9-
#' to be portable, do not use symbolic-links.
9+
#' to be portable, do not use symbolic-links. The default value is \code{FALSE}.
10+
#' @param overwrite whether to overwrite when \code{filebase} already exists;
11+
#' default is false, which raises errors
12+
#' @param cache_ok see 'Details', only used if \code{overwrite} is true.
13+
#'
14+
#' @return A bound array in \code{'FileArray'} class.
15+
#'
1016
#' @details The input arrays must share the same data type and partition size.
1117
#' The dimension for each partition should also be the same. For example
1218
#' an array \code{x1} has dimension \eqn{100x20x30} with partition size
@@ -16,9 +22,26 @@
1622
#' \eqn{100x20x40} and each partition size is \code{1}, then \code{x1} and
1723
#' \code{x2} can be merged.
1824
#'
19-
#' The \code{symlink} option should be used with caution. Creating symbolic
20-
#' links is definitely faster than copying partition files. However, since
21-
#' the partition files are simply linked to the original partition files,
25+
#' If \code{filebase} exists and \code{overwrite} is \code{FALSE}, an error will
26+
#' always be raised. If \code{overwrite=TRUE} and \code{cache_ok=FALSE}, then
27+
#' the existing \code{filebase} will be erased and any data stored within will
28+
#' be lost.
29+
#' If both \code{overwrite} and \code{cache_ok} are \code{TRUE}, then
30+
#' before erasing \code{filebase}, the function validates the existing
31+
#' array header and compares the header signatures. If the existing header
32+
#' signature is the same as the array to be created, then the existing array
33+
#' will be returned. This \code{cache_ok} could be extremely useful when
34+
#' binding large arrays with \code{symlink=FALSE} as the cache might avoid
35+
#' moving files around. However, \code{cache_ok} should be enabled with caution.
36+
#' This is because only the header information will be compared, but the
37+
#' partition data will not be compared. If the existing array was generated from
38+
#' an old version of the source arrays, but the data from the source arrays
39+
#' has been altered, then \code{cache_ok=TRUE} is rarely appropriate as the cache
40+
#' is outdated.
41+
#'
42+
#' The \code{symlink} option should be used with extra caution. Creating
43+
#' symbolic links is definitely faster than copying partition files. However,
44+
#' since the partition files are simply linked to the original partition files,
2245
#' changes to the input arrays will also affect the merged arrays, and
2346
#' vice versa; see 'Examples'. Also for arrays created from symbolic links, if
2447
#' the original
@@ -63,9 +86,12 @@
6386
#' @export
6487
filearray_bind <- function(
6588
..., .list = list(), filebase = tempfile(),
66-
symlink = FALSE
89+
symlink = FALSE, overwrite = FALSE, cache_ok = FALSE
6790
){
68-
if(symlink && !getOption("filearray.symlink_enabled", FALSE)){
91+
stopifnot(length(filebase) == 1)
92+
symlink <- as.logical(symlink)[[1]]
93+
# options("filearray.symlink_enabled" = TRUE)
94+
if(symlink && !getOption("filearray.symlink_enabled", symlink_enabled())){
6995
symlink <- FALSE
7096
quiet_warning("Symbolic link is disabled. Force `symlink` to be FALSE")
7197
}
@@ -110,16 +136,65 @@ filearray_bind <- function(
110136

111137
dim[[length(dim)]] <- sum(last_margin) * part_size
112138

113-
re <- filearray_create(filebase = filebase, dimension = dim, type = type, partition_size = part_size)
114-
139+
# Dry-run
115140
start <- 1
116141
end <- 1
117142

118143
bind_info <- list(
119144
is_bound = TRUE,
120-
symlink = as.logical(symlink)
145+
dimension = as.double(dim),
146+
type = type,
147+
partition_size = as.double(part_size),
148+
partition_header_signatures = sapply(arrays, function(y){ y$header_signature(include_path = TRUE) })
121149
)
150+
bind_signature <- digest::digest(bind_info, algo = "sha256")
151+
122152
source_info <- list()
153+
for(ii in seq_along(last_margin)){
154+
arr <- arrays[[ii]]
155+
end <- start -1 + last_margin[[ii]]
156+
idx <- seq.int(start, end)
157+
source_info[[ii]] <- arr$partition_path(seq_len(last_margin[[ii]]))
158+
start <- end + 1
159+
}
160+
bind_info$source_info <- source_info
161+
bind_info$symlink <- symlink
162+
163+
if(file.exists(filebase)) {
164+
165+
if(!overwrite){
166+
stop("Array has already existed at: ", filebase)
167+
}
168+
169+
if(cache_ok){
170+
check <- tryCatch({
171+
check <- FALSE
172+
if(file.exists(file.path(filebase, "meta"))){
173+
header <- load_meta(filebase)
174+
if(
175+
identical(header$filearray_bind_signature, bind_signature) &&
176+
identical(header$filearray_bind$symlink, symlink)
177+
){
178+
check <- TRUE
179+
}
180+
}
181+
check
182+
}, error = function(e){
183+
FALSE
184+
})
185+
if(check){
186+
re <- filearray_load(filebase = filebase, mode = "readonly")
187+
attr(re, "cached_bind") <- TRUE
188+
return(re)
189+
}
190+
}
191+
192+
unlink(filebase, recursive = TRUE)
193+
}
194+
re <- filearray_create(filebase = filebase, dimension = dim, type = type, partition_size = part_size)
195+
196+
start <- 1
197+
end <- 1
123198

124199
for(ii in seq_along(last_margin)){
125200
arr <- arrays[[ii]]
@@ -136,15 +211,12 @@ filearray_bind <- function(
136211
re$partition_path(idx)
137212
)
138213
}
139-
source_info[[ii]] <- arr$partition_path(seq_len(last_margin[[ii]]))
140-
141214
start <- end + 1
142215
}
143216

144-
if(symlink){
145-
bind_info$source_info <- source_info
146-
}
147-
re$set_header("filearray_bind", bind_info)
217+
re$.header$filearray_bind <- bind_info
218+
re$.header$filearray_bind_signature <- bind_signature
219+
re$.save_header()
148220

149221
if(symlink){
150222
re$.mode <- "readonly"

R/class_filearray.R

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,16 @@ setRefClass(
216216
.self$.header[[key]] <- value
217217
.self$.save_header()
218218
},
219+
header_signature = function(include_path = TRUE){
    # SHA-256 digest of the in-memory header list
    base_sig <- digest::digest(.self$.header, algo = "sha256")
    if( include_path ){
        # Fold the normalized file base into the signature so that the
        # result also depends on where the array is stored
        full_path <- normalizePath(.self$.filebase)
        return(digest::digest(c(base_sig, full_path), algo = "sha256"))
    }
    base_sig
},
219229
load = function(filebase, mode = c('readwrite', 'readonly')){
220230
mode <- match.arg(mode)
221231

R/filearray-package.R

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,49 @@ in_rcmdcheck <- function (...) {
4040
return(FALSE)
4141
}
4242

43+
# Probe whether symbolic links work in the session temporary directory.
#
# The probe writes a small file, links to it, and reads the content back
# through the link. The outcome is cached in the enclosing environment, so
# the file-system test runs at most once per session.
#
# Returns a single TRUE/FALSE: TRUE only if the link was created and the
# content read through it matches what was written.
symlink_enabled <- local({
    enabled <- NA
    function(){
        if(!is.na(enabled)){ return(enabled) }
        tempdir(check = TRUE)
        f1 <- tempfile(pattern = 'filearray_simlink_test_from')
        f2 <- tempfile(pattern = 'filearray_simlink_test_to')
        # Remove the link before its target and do it unconditionally:
        # `file.exists()` follows links on Unix-alikes, so guarding on it
        # after the target is gone would leave a dangling link behind.
        # `unlink()` silently ignores paths that do not exist.
        on.exit({
            unlink(c(f2, f1))
        }, add = TRUE)
        # NOTE: `sample()` advances the session RNG state
        s <- paste(sample(LETTERS), collapse = "")
        writeLines(s, con = f1)
        # Create the link inside `tryCatch` so that a failure to link
        # (signaled as a warning or an error) yields FALSE instead of
        # leaking the condition to the caller.
        en <- tryCatch({
            isTRUE(file.symlink(f1, to = f2)) && identical(readLines(f2), s)
        }, error = function(e){
            FALSE
        }, warning = function(e){
            FALSE
        })
        enabled <<- en

        return(enabled)
    }
})
85+
4386

4487
.onLoad <- function(libname, pkgname){
4588
if(hasOpenMP()){

R/zzz.R

Lines changed: 0 additions & 38 deletions
This file was deleted.

cran-comments.md

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,3 @@
1-
## Reasons:
2-
3-
* Bug fix
4-
* Fixed a violation of CRAN policy mentioned by Prof. Ripley by restricting
5-
number of CPU cores to 2 in R checks:
6-
7-
```
8-
using 8 threads is a serious violation of the CRAN policy,
9-
```
10-
111
## Dev environment
122
* osx (ARM), R 4.1.2
133

@@ -19,7 +9,7 @@ using 8 threads is a serious violation of the CRAN policy,
199

2010
## R CMD check results
2111

22-
On R-4.1 and `devel`
12+
On `release` and `devel`
2313
0 errors | 0 warnings | 0 notes
2414

2515
On R-4.0

man/filearray_bind.Rd

Lines changed: 37 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)