66# ' @param symlink whether to use \code{\link[base]{file.symlink}}; if true,
77# ' then partition files will be symbolic-linked to the original arrays,
88# ' otherwise the partition files will be copied over. If you want your data
9- # ' to be portable, do not use symbolic-links.
9+ # ' to be portable, do not use symbolic-links. The default value is \code{FALSE}.
10+ # ' @param overwrite whether to overwrite when \code{filebase} already exists;
11+ # ' default is \code{FALSE}, which raises an error if \code{filebase} exists
12+ # ' @param cache_ok see 'Details', only used if \code{overwrite} is true.
13+ # '
14+ # ' @return A bound array in \code{'FileArray'} class.
15+ # '
1016# ' @details The input arrays must share the same data type and partition size.
1117# ' The dimension for each partition should also be the same. For example
1218# ' an array \code{x1} has dimension \eqn{100x20x30} with partition size
1622# ' \eqn{100x20x40} and each partition size is \code{1}, then \code{x1} and
1723# ' \code{x2} can be merged.
1824# '
19- # ' The \code{symlink} option should be used with caution. Creating symbolic
20- # ' links is definitely faster than copying partition files. However, since
21- # ' the partition files are simply linked to the original partition files,
25+ # ' If \code{filebase} exists and \code{overwrite} is \code{FALSE}, an error will
26+ # ' always be raised. If \code{overwrite=TRUE} and \code{cache_ok=FALSE}, then
27+ # ' the existing \code{filebase} will be erased and any data stored within will
28+ # ' be lost.
29+ # ' If both \code{overwrite} and \code{cache_ok} are \code{TRUE}, then,
30+ # ' before erasing \code{filebase}, the function validates the existing
31+ # ' array header and compares the header signatures. If the existing header
32+ # ' signature is the same as the array to be created, then the existing array
33+ # ' will be returned. This \code{cache_ok} could be extremely useful when
34+ # ' binding large arrays with \code{symlink=FALSE} as the cache might avoid
35+ # ' moving files around. However, \code{cache_ok} should be enabled with caution.
36+ # ' This is because only the header information will be compared, but the
37+ # ' partition data will not be compared. If the existing array was generated from
38+ # ' an old version of the source arrays, but the data from the source arrays
39+ # ' has been altered, then \code{cache_ok=TRUE} is rarely proper as the cache
40+ # ' is outdated.
41+ # '
42+ # ' The \code{symlink} option should be used with extra caution. Creating
43+ # ' symbolic links is definitely faster than copying partition files. However,
44+ # ' since the partition files are simply linked to the original partition files,
2245# ' changes to the input arrays will also affect the merged arrays, and
2346# ' vice versa; see 'Examples'. Also for arrays created from symbolic links, if
2447# ' the original
6386# ' @export
6487filearray_bind <- function (
6588 ... , .list = list (), filebase = tempfile(),
66- symlink = FALSE
89+ symlink = FALSE , overwrite = FALSE , cache_ok = FALSE
6790){
68- if (symlink && ! getOption(" filearray.symlink_enabled" , FALSE )){
91+ stopifnot(length(filebase ) == 1 )
92+ symlink <- as.logical(symlink )[[1 ]]
93+ # options("filearray.symlink_enabled" = TRUE)
94+ if (symlink && ! getOption(" filearray.symlink_enabled" , symlink_enabled())){
6995 symlink <- FALSE
7096 quiet_warning(" Symbolic link is disabled. Force `symlink` to be FALSE" )
7197 }
@@ -110,16 +136,65 @@ filearray_bind <- function(
110136
111137 dim [[length(dim )]] <- sum(last_margin ) * part_size
112138
113- re <- filearray_create(filebase = filebase , dimension = dim , type = type , partition_size = part_size )
114-
139+ # Dry-run
115140 start <- 1
116141 end <- 1
117142
118143 bind_info <- list (
119144 is_bound = TRUE ,
120- symlink = as.logical(symlink )
145+ dimension = as.double(dim ),
146+ type = type ,
147+ partition_size = as.double(part_size ),
148+ partition_header_signatures = sapply(arrays , function (y ){ y $ header_signature(include_path = TRUE ) })
121149 )
150+ bind_signature <- digest :: digest(bind_info , algo = " sha256" )
151+
122152 source_info <- list ()
153+ for (ii in seq_along(last_margin )){
154+ arr <- arrays [[ii ]]
155+ end <- start - 1 + last_margin [[ii ]]
156+ idx <- seq.int(start , end )
157+ source_info [[ii ]] <- arr $ partition_path(seq_len(last_margin [[ii ]]))
158+ start <- end + 1
159+ }
160+ bind_info $ source_info <- source_info
161+ bind_info $ symlink <- symlink
162+
163+ if (file.exists(filebase )) {
164+
165+ if (! overwrite ){
166+ stop(" Array has already existed at: " , filebase )
167+ }
168+
169+ if (cache_ok ){
170+ check <- tryCatch({
171+ check <- FALSE
172+ if (file.exists(file.path(filebase , " meta" ))){
173+ header <- load_meta(filebase )
174+ if (
175+ identical(header $ filearray_bind_signature , bind_signature ) &&
176+ identical(header $ filearray_bind $ symlink , symlink )
177+ ){
178+ check <- TRUE
179+ }
180+ }
181+ check
182+ }, error = function (e ){
183+ FALSE
184+ })
185+ if (check ){
186+ re <- filearray_load(filebase = filebase , mode = " readonly" )
187+ attr(re , " cached_bind" ) <- TRUE
188+ return (re )
189+ }
190+ }
191+
192+ unlink(filebase , recursive = TRUE )
193+ }
194+ re <- filearray_create(filebase = filebase , dimension = dim , type = type , partition_size = part_size )
195+
196+ start <- 1
197+ end <- 1
123198
124199 for (ii in seq_along(last_margin )){
125200 arr <- arrays [[ii ]]
@@ -136,15 +211,12 @@ filearray_bind <- function(
136211 re $ partition_path(idx )
137212 )
138213 }
139- source_info [[ii ]] <- arr $ partition_path(seq_len(last_margin [[ii ]]))
140-
141214 start <- end + 1
142215 }
143216
144- if (symlink ){
145- bind_info $ source_info <- source_info
146- }
147- re $ set_header(" filearray_bind" , bind_info )
217+ re $ .header $ filearray_bind <- bind_info
218+ re $ .header $ filearray_bind_signature <- bind_signature
219+ re $ .save_header()
148220
149221 if (symlink ){
150222 re $ .mode <- " readonly"
0 commit comments